From 1419af35f01f7326459ecfd7c0bb0c89e54b4e11 Mon Sep 17 00:00:00 2001 From: Manul from Pathway Date: Fri, 15 Dec 2023 21:53:42 +0100 Subject: [PATCH] Release 0.7.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Michał Bartoszkiewicz Co-authored-by: Jan Chorowski Co-authored-by: Xavier Gendre Co-authored-by: Adrian Kosowski Co-authored-by: Jakub Kowalski Co-authored-by: Sergey Kulik Co-authored-by: Mateusz Lewandowski Co-authored-by: Mohamed Malhou Co-authored-by: Krzysztof Nowicki Co-authored-by: Richard Pelgrim Co-authored-by: Kamil Piechowiak Co-authored-by: Paweł Podhajski Co-authored-by: Olivier Ruas Co-authored-by: Przemysław Uznański Co-authored-by: Sebastian Włudzik GitOrigin-RevId: 91ccb907441defe174d6e0b31342822357bfe4bf --- .github/workflows/release.yml | 22 +- CHANGELOG.md | 21 + Cargo.lock | 136 +++---- Cargo.toml | 12 +- integration_tests/kafka/test_backfilling.py | 10 +- integration_tests/kafka/test_simple.py | 14 +- .../webserver/test_rest_connector.py | 23 +- pyproject.toml | 2 + python/pathway/__init__.py | 2 +- python/pathway/debug/__init__.py | 50 ++- python/pathway/engine.pyi | 8 +- python/pathway/internals/__init__.py | 2 +- python/pathway/internals/asynchronous.py | 13 +- python/pathway/internals/column.py | 8 +- python/pathway/internals/common.py | 25 +- python/pathway/internals/custom_reducers.py | 287 ++++++++++++++ python/pathway/internals/datasink.py | 7 +- python/pathway/internals/datasource.py | 20 + python/pathway/internals/decorators.py | 2 +- python/pathway/internals/dtype.py | 164 ++++++-- python/pathway/internals/expression.py | 12 +- .../internals/graph_runner/__init__.py | 76 ++-- .../graph_runner/expression_evaluator.py | 22 +- .../graph_runner/operator_handler.py | 1 + .../internals/graph_runner/path_evaluator.py | 23 +- .../internals/graph_runner/scope_context.py | 14 +- .../pathway/internals/graph_runner/state.py | 9 +- .../internals/graph_runner/storage_graph.py | 269 ++++--------- python/pathway/internals/operator.py | 10 +- python/pathway/internals/operator_mapping.py | 30 +- python/pathway/internals/reducers.py | 285 +------------- python/pathway/internals/run.py | 8 +- .../pathway/internals/runtime_type_check.py | 2 +- python/pathway/internals/schema.py | 59 ++- python/pathway/internals/sql.py | 4 +- python/pathway/internals/table.py | 124 ++++-- python/pathway/internals/table_like.py | 8 +- python/pathway/internals/table_slice.py | 10 +- .../pathway/internals/table_subscription.py | 40 +- python/pathway/internals/type_interpreter.py | 49 ++- python/pathway/io/_subscribe.py | 9 +- python/pathway/io/_utils.py | 8 +- python/pathway/io/csv/__init__.py | 21 +- python/pathway/io/debezium/__init__.py | 4 +- python/pathway/io/elasticsearch/__init__.py | 4 +- python/pathway/io/fs/__init__.py | 21 +- python/pathway/io/http/__init__.py | 6 +- python/pathway/io/http/_server.py | 6 +- python/pathway/io/jsonlines/__init__.py | 21 +- python/pathway/io/kafka/__init__.py | 11 +- python/pathway/io/logstash/__init__.py | 4 +- python/pathway/io/minio/__init__.py | 4 +- python/pathway/io/null/__init__.py | 4 +- python/pathway/io/plaintext/__init__.py | 19 +- python/pathway/io/postgres/__init__.py | 4 +- python/pathway/io/python/__init__.py | 94 ++++- python/pathway/io/redpanda/__init__.py | 6 +- python/pathway/io/s3/__init__.py | 23 +- python/pathway/io/s3_csv/__init__.py | 4 +- python/pathway/io/sqlite/__init__.py | 5 +- python/pathway/reducers.py | 8 +- .../stdlib/graphs/bellman_ford/impl.py | 4 +- 
.../stdlib/graphs/louvain_communities/impl.py | 8 +- python/pathway/stdlib/graphs/pagerank/impl.py | 4 +- python/pathway/stdlib/indexing/sorting.py | 16 +- .../pathway/stdlib/ml/classifiers/_knn_lsh.py | 55 ++- python/pathway/stdlib/ml/index.py | 45 ++- python/pathway/stdlib/ordered/diff.py | 4 +- python/pathway/stdlib/temporal/_asof_join.py | 10 +- .../pathway/stdlib/temporal/_asof_now_join.py | 8 +- .../pathway/stdlib/temporal/_interval_join.py | 12 +- python/pathway/stdlib/temporal/_window.py | 24 +- .../pathway/stdlib/temporal/_window_join.py | 12 +- python/pathway/stdlib/utils/col.py | 12 +- python/pathway/stdlib/viz/plotting.py | 42 +- python/pathway/stdlib/viz/table_viz.py | 20 +- python/pathway/tests/__init__.py | 4 +- .../tests/examples/realtime-log-slack.py | 66 ++++ python/pathway/tests/ml/test_index.py | 48 ++- .../pathway/tests/temporal/test_asof_joins.py | 21 +- .../tests/temporal/test_interval_joins.py | 62 +-- .../tests/temporal/test_window_joins.py | 26 +- python/pathway/tests/temporal/test_windows.py | 101 ++++- .../tests/temporal/test_windows_stream.py | 73 ++-- python/pathway/tests/test_api.py | 8 +- python/pathway/tests/test_build_and_run.py | 58 ++- .../pathway/tests/test_column_properties.py | 96 ++++- python/pathway/tests/test_common.py | 72 ++-- python/pathway/tests/test_dtypes.py | 15 + python/pathway/tests/test_error_messages.py | 13 +- python/pathway/tests/test_flatten.py | 16 +- python/pathway/tests/test_io.py | 323 ++++++++++++---- python/pathway/tests/test_json.py | 37 ++ python/pathway/tests/test_operators.py | 48 +-- python/pathway/tests/test_utils.py | 53 ++- python/pathway/tests/utils.py | 103 +++-- python/pathway/xpacks/__init__.py | 0 python/pathway/xpacks/llm/__init__.py | 0 python/pathway/xpacks/llm/parser.py | 116 ++++++ python/pathway/xpacks/llm/splitter.py | 19 + python/pathway/xpacks/llm/vector_store.py | 254 +++++++++++++ src/connectors/data_storage.rs | 35 +- src/engine/dataflow.rs | 231 +++++------ src/engine/dataflow/operators.rs | 140 +++---- src/engine/dataflow/operators/output.rs | 118 ++++++ src/engine/dataflow/operators/time_column.rs | 359 +++++++++++------- src/engine/dataflow/operators/utils.rs | 19 +- src/engine/expression.rs | 13 + src/engine/graph.rs | 84 +++- src/engine/time.rs | 36 ++ src/python_api.rs | 95 ++--- tests/test_time_column.rs | 148 +++++++- tests/test_upsert_session.rs | 24 +- 113 files changed, 3487 insertions(+), 1892 deletions(-) create mode 100644 python/pathway/internals/custom_reducers.py create mode 100644 python/pathway/tests/examples/realtime-log-slack.py create mode 100644 python/pathway/tests/test_dtypes.py create mode 100644 python/pathway/xpacks/__init__.py create mode 100644 python/pathway/xpacks/llm/__init__.py create mode 100644 python/pathway/xpacks/llm/parser.py create mode 100644 python/pathway/xpacks/llm/splitter.py create mode 100644 python/pathway/xpacks/llm/vector_store.py create mode 100644 src/engine/dataflow/operators/output.rs diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8af0f336..a93712ae 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -52,7 +52,7 @@ jobs: timeout-minutes: 45 steps: - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 - name: Git checkout uses: actions/checkout@v3.3.0 @@ -150,7 +150,7 @@ jobs: timeout-minutes: 45 steps: - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: 
python-version: ${{ matrix.python-version }} @@ -158,7 +158,7 @@ jobs: run: | mkdir -p wheels - - uses: actions/download-artifact@master + - uses: actions/download-artifact@v2.1.1 with: name: pathway-x86-x64 path: ./wheels/ @@ -173,7 +173,7 @@ jobs: WHEEL=(./wheels/pathway-*.whl) pip install --prefer-binary "${WHEEL}[tests]" # --confcutdir anything below to avoid picking REPO_TOP_DIR/conftest.py - python -m pytest --confcutdir "${ENV_NAME}" --doctest-modules --pyargs pathway + python -m pytest -v --confcutdir "${ENV_NAME}" --doctest-modules --pyargs pathway Verify_ARM_ARCH: needs: @@ -194,13 +194,13 @@ jobs: run: | mkdir -p wheels - - uses: actions/download-artifact@master + - uses: actions/download-artifact@v2.1.1 if: ${{ matrix.os == 'ec2-macOS' }} with: name: pathway-arm64 path: ./wheels/ - - uses: actions/download-artifact@master + - uses: actions/download-artifact@v2.1.1 if: ${{ matrix.os == needs.start-runner.outputs.label }} with: name: pathway-arch64 @@ -217,7 +217,7 @@ jobs: PATHWAY_MONITORING_HTTP_PORT=20099 pip install --prefer-binary "${WHEEL}[tests]" # --confcutdir anything below to avoid picking REPO_TOP_DIR/conftest.py - python -m pytest --confcutdir "${ENV_NAME}" --doctest-modules --pyargs pathway + python -m pytest -v --confcutdir "${ENV_NAME}" --doctest-modules --pyargs pathway env: MACOSX_DEPLOYMENT_TARGET: "10.15" DEVELOPER_DIR: /Library/Developer/CommandLineTools @@ -251,22 +251,22 @@ jobs: run: | mkdir -p wheels - - uses: actions/download-artifact@master + - uses: actions/download-artifact@v2.1.1 with: name: pathway-x86-x64 path: ./wheels/ - - uses: actions/download-artifact@master + - uses: actions/download-artifact@v2.1.1 with: name: pathway-arch64 path: ./wheels/ - - uses: actions/download-artifact@master + - uses: actions/download-artifact@v2.1.1 with: name: pathway-arm64 path: ./wheels/ - - uses: actions/download-artifact@master + - uses: actions/download-artifact@v2.1.1 with: name: CHANGELOG.md path: . diff --git a/CHANGELOG.md b/CHANGELOG.md index 3261cd3b..046a928c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,29 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +## [0.7.5] - 2023-12-15 + +### Added +- Added `Table.split()` method for splitting a table into two tables based on an expression. +- Columns with datatype duration can now be multiplied and divided by floats. +- Columns with datatype duration now support both true and floor division (`/` and `//`) by integers. + +### Changed +- Pathway is better at typing `if_else` expressions when optional types are involved. +- The `table.flatten()` operator now supports Json arrays. +- Buffers (used to delay outputs, configured via `delay` in `common_behavior`) now flush the data when the computation is finished. The effect of this change can be seen when run in bounded (batch / multi-revision) mode. +- `pw.io.subscribe()` takes an additional argument `on_time_end` - a callback function called on each closed time of the computation. +- `pw.io.subscribe()` is now a single-worker operator, guaranteeing that `on_end` is triggered at most once. +- `KNNIndex` now supports metadata filtering. Each query can specify its own filter in the JMESPath format. + +### Fixed +- Resolved an optimization bug causing `pw.iterate` to malfunction when handling columns effectively pointing to the same data.
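A minimal sketch of how the new `on_time_end` hook listed above might be wired up, assuming the public callback keyword names match those added to `CallbackDataSink` in this patch (`on_change`, `on_time_end`, `on_end`) and that `on_change` keeps its documented `(key, row, time, is_addition)` shape; the table and callback bodies are illustrative only, not part of the release.

```python
import pathway as pw

# Small illustrative table; any Pathway table works here.
table = pw.debug.table_from_markdown(
    """
    value
    1
    2
    """
)

def on_change(key, row, time, is_addition):
    # Called for every changed row, together with the logical time of the change.
    print(f"{row} changed at time {time} (insertion={is_addition})")

def on_time_end(time):
    # New in 0.7.5: called once for every closed (finalized) logical time.
    print(f"time {time} closed")

def on_end():
    # With subscribe now being a single-worker operator, this fires at most once.
    print("computation finished")

pw.io.subscribe(table, on_change=on_change, on_time_end=on_time_end, on_end=on_end)
pw.run()
```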
+ ## [0.7.4] - 2023-12-05 +### Changed +- Pathway now keeps track of `array` columntype better - it is able to keep track of Array dtype and number of dimensions, wherever applicable. + ### Fixed - Fixed issues with standalone panel+Bokeh dashboards to ensure optimal functionality and performance. diff --git a/Cargo.lock b/Cargo.lock index 068acd49..dcc2dd57 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -129,7 +129,7 @@ checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -213,9 +213,9 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" [[package]] name = "base64" -version = "0.21.3" +version = "0.21.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "414dcefbc63d77c526a76b3afcf6fbb9b5e2791c19c3aa2297733208750c6e53" +checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" [[package]] name = "bincode" @@ -454,7 +454,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -476,7 +476,7 @@ checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" dependencies = [ "darling_core 0.20.3", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -614,9 +614,9 @@ dependencies = [ [[package]] name = "eyre" -version = "0.6.9" +version = "0.6.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80f656be11ddf91bd709454d15d5bd896fbaf4cc3314e69349e4d1569f5b46cd" +checksum = "8bbb8258be8305fb0237d7b295f47bb24ff1b136a535f473baf40e70468515aa" dependencies = [ "indenter", "once_cell", @@ -748,7 +748,7 @@ checksum = "53b153fd91e4b0147f4aced87be237c98248656bb01050b96bf3ee89220a8ddb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -835,7 +835,7 @@ dependencies = [ "futures-sink", "futures-util", "http", - "indexmap 2.0.0", + "indexmap 2.1.0", "slab", "tokio", "tokio-util", @@ -853,9 +853,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.0" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" dependencies = [ "ahash 0.8.6", "allocator-api2", @@ -867,7 +867,7 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" dependencies = [ - "hashbrown 0.14.0", + "hashbrown 0.14.3", ] [[package]] @@ -904,9 +904,9 @@ dependencies = [ [[package]] name = "http-body" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", "http", @@ -1026,12 +1026,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" dependencies = [ "equivalent", - "hashbrown 0.14.0", + "hashbrown 0.14.3", "serde", ] @@ -1086,9 +1086,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.9" +version = "1.0.10" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "jemalloc-sys" @@ -1136,9 +1136,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.150" +version = "0.2.151" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" [[package]] name = "libredox" @@ -1265,9 +1265,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.9" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dce281c5e46beae905d4de1870d8b1509a9142b62eedf18b443b011ca8343d0" +checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" dependencies = [ "libc", "wasi", @@ -1392,15 +1392,15 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.60" +version = "0.10.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" +checksum = "6b8419dc8cc6d866deb801274bba2e6f8f6108c1bb7fcc10ee5ab864931dbb45" dependencies = [ "bitflags 2.4.1", "cfg-if", @@ -1419,7 +1419,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -1430,18 +1430,18 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-src" -version = "300.1.6+3.1.4" +version = "300.2.0+3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439fac53e092cd7442a3660c85dde4643ab3b5bd39040912388dcdabf6b88085" +checksum = "b1ebed1d188c4cd64c2bcd73d6c1fe1092f3d98c111831923cc1b706c3859fca" dependencies = [ "cc", ] [[package]] name = "openssl-sys" -version = "0.9.96" +version = "0.9.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" +checksum = "c3eaad34cdd97d81de97964fc7f29e2d104f483840d906ef56daa1912338460b" dependencies = [ "cc", "libc", @@ -1452,9 +1452,9 @@ dependencies = [ [[package]] name = "ordered-float" -version = "4.1.1" +version = "4.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "536900a8093134cf9ccf00a27deb3532421099e958d9dd431135d0c7543ca1e8" +checksum = "a76df7075c7d4d01fdcb46c912dd17fba5b60c78ea480b475f2b6ab6f666584e" dependencies = [ "num-traits", "rand", @@ -1505,7 +1505,7 @@ dependencies = [ [[package]] name = "pathway" -version = "0.7.4" +version = "0.7.5" dependencies = [ "arc-swap", "arcstr", @@ -1554,7 +1554,7 @@ dependencies = [ "serde_json", "serde_with 3.4.0", "smallvec", - "syn 2.0.39", + "syn 2.0.40", "tempfile", "thiserror", "timely", @@ -1653,7 +1653,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49b6c5ef183cd3ab4ba005f1ca64c21e8bd97ce4699cfea9e8d9a2c4958ca520" dependencies = [ - "base64 0.21.3", + "base64 
0.21.5", "byteorder", "bytes", "fallible-iterator 0.2.0", @@ -1730,7 +1730,7 @@ checksum = "440f724eba9f6996b75d63681b0a92b06947f1457076d503a4d2e2c8f56442b8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -1803,7 +1803,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -1815,7 +1815,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -1964,7 +1964,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "046cd98826c46c2ac8ddecae268eb5c2e58628688a5fc7a2643704a73faba95b" dependencies = [ "async-compression", - "base64 0.21.3", + "base64 0.21.5", "bytes", "encoding_rs", "futures-core", @@ -2073,9 +2073,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.26" +version = "0.38.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9470c4bf8246c8daf25f9598dca807fb6510347b1e1cfa55749113850c79d88a" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" dependencies = [ "bitflags 2.4.1", "errno", @@ -2086,9 +2086,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.15" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" [[package]] name = "schannel" @@ -2166,7 +2166,7 @@ checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -2208,11 +2208,11 @@ version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64cd236ccc1b7a29e7e2739f27c0b2dd199804abc4290e32f59f3b68d6405c23" dependencies = [ - "base64 0.21.3", + "base64 0.21.5", "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.0.0", + "indexmap 2.1.0", "serde", "serde_json", "serde_with_macros 3.4.0", @@ -2240,7 +2240,7 @@ dependencies = [ "darling 0.20.3", "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -2331,9 +2331,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.39" +version = "2.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a" +checksum = "13fa70a4ee923979ffb522cacce59d34421ebdea5625e1073c4326ef9d2dd42e" dependencies = [ "proc-macro2", "quote", @@ -2409,7 +2409,7 @@ checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -2505,9 +2505,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.34.0" +version = "1.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9" +checksum = "841d45b238a16291a4e1584e61820b8ae57d696cc5015c459c229ccc6990cc1c" dependencies = [ "backtrace", "bytes", @@ -2580,7 +2580,7 @@ version = "0.19.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" dependencies = [ - "indexmap 2.0.0", + "indexmap 2.1.0", "toml_datetime", "winnow", ] @@ -2612,9 +2612,9 @@ dependencies = [ [[package]] name = "try-lock" -version = "0.2.4" +version = "0.2.5" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "typenum" @@ -2624,9 +2624,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "6f2528f27a9eb2b21e69c95319b30bd0efd85d09c379741b0f78ea1d86be2416" [[package]] name = "unicode-ident" @@ -2726,7 +2726,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", "wasm-bindgen-shared", ] @@ -2760,7 +2760,7 @@ checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -2956,9 +2956,9 @@ checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" [[package]] name = "winnow" -version = "0.5.19" +version = "0.5.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "829846f3e3db426d4cee4510841b71a8e58aa2a76b1132579487ae430ccd9c7b" +checksum = "b67b5f0a4e7a27a64c651977932b9dc5667ca7fc31ac44b03ed37a0cf42fdfff" dependencies = [ "memchr", ] @@ -2981,22 +2981,22 @@ checksum = "9828b178da53440fa9c766a3d2f73f7cf5d0ac1fe3980c1e5018d899fd19e07b" [[package]] name = "zerocopy" -version = "0.7.28" +version = "0.7.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d6f15f7ade05d2a4935e34a457b936c23dc70a05cc1d97133dc99e7a3fe0f0e" +checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.28" +version = "0.7.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbbad221e3f78500350ecbd7dfa4e63ef945c05f4c61cb7f4d3f84cd0bba649b" +checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index cc496574..4f8b9da5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pathway" -version = "0.7.4" +version = "0.7.5" edition = "2021" publish = false rust-version = "1.72.0" @@ -12,7 +12,7 @@ crate-type = ["cdylib", "lib"] [dev-dependencies] assert_matches = "1.5.0" -eyre = "0.6.9" +eyre = "0.6.10" [dependencies] arc-swap = "1.6.0" @@ -40,8 +40,8 @@ ndarray = { version = "0.15.6", features = ["serde"] } nix = { version = "0.27.1", features = ["fs", "user"] } num-integer = "0.1.45" numpy = "0.20.0" -once_cell = "1.18.0" -ordered-float = { version = "4.1.1", features = ["serde"] } +once_cell = "1.19.0" +ordered-float = { version = "4.2.0", features = ["serde"] } pipe = "0.4.0" postgres = { version = "0.19.7", features = ["with-chrono-0_4", "with-serde_json-1"] } prometheus-client = "0.22.0" @@ -58,11 +58,11 @@ serde = { version = "1.0.193", features = ["derive", "rc"] } serde_json = "1.0" serde_with = "3.4.0" smallvec = { version = "1.11.2", features = ["union", "const_generics"] } -syn = { version = "2.0.39", features = ["default", "full", "visit", "visit-mut"] } # Hack to keep features unified between normal and build deps +syn = { version = "2.0.40", features = ["default", "full", 
"visit", "visit-mut"] } # Hack to keep features unified between normal and build deps tempfile = "3.8.1" thiserror = "1.0.50" timely = { path = "./external/timely-dataflow/timely", features = ["bincode"] } -tokio = "1.34.0" +tokio = "1.35.0" xxhash-rust = { version = "0.8.7", features = ["xxh3"] } [target.'cfg(target_os = "linux")'.dependencies] diff --git a/integration_tests/kafka/test_backfilling.py b/integration_tests/kafka/test_backfilling.py index fa4dabeb..92240b32 100644 --- a/integration_tests/kafka/test_backfilling.py +++ b/integration_tests/kafka/test_backfilling.py @@ -6,7 +6,6 @@ import random import time -import pytest from utils import KafkaTestContext import pathway as pw @@ -31,7 +30,10 @@ def __init__(self, kafka_context, output_file_path, n_words, n_word_repetitions) self.n_word_repetitions = n_word_repetitions def __call__(self): - return self.topic_stats()["completed_words"] == self.n_words + try: + return self.topic_stats()["completed_words"] == self.n_words + except Exception: + return False def expected_word_counts(self): # workaround for some messages being lost in kafka @@ -116,7 +118,7 @@ def run_backfilling_program( checker = WordcountChecker( kafka_context, output_file_path, 1000, 50 * (run_seq_id + 1) ) - assert wait_result_with_checker( + wait_result_with_checker( checker=checker, timeout_sec=60, target=WordcountProgram( @@ -134,7 +136,6 @@ def run_backfilling_program( del os.environ["PATHWAY_THREADS"] -@pytest.mark.xdist_group(name="backfilling_tests") def test_backfilling_fs_storage( tmp_path: pathlib.Path, kafka_context: KafkaTestContext ): @@ -145,7 +146,6 @@ def test_backfilling_fs_storage( run_backfilling_program(fs_persistence_config, tmp_path, kafka_context) -@pytest.mark.xdist_group(name="backfilling_tests") def test_backfilling_s3_storage( tmp_path: pathlib.Path, kafka_context: KafkaTestContext ): diff --git a/integration_tests/kafka/test_simple.py b/integration_tests/kafka/test_simple.py index f798ab97..447189c1 100644 --- a/integration_tests/kafka/test_simple.py +++ b/integration_tests/kafka/test_simple.py @@ -22,7 +22,7 @@ def test_kafka_raw(tmp_path: pathlib.Path, kafka_context: KafkaTestContext): pw.io.csv.write(table, str(tmp_path / "output.csv")) - assert wait_result_with_checker( + wait_result_with_checker( expect_csv_checker( """ data @@ -57,7 +57,7 @@ def test_kafka_json(tmp_path: pathlib.Path, kafka_context: KafkaTestContext): pw.io.csv.write(table, str(tmp_path / "output.csv")) - assert wait_result_with_checker( + wait_result_with_checker( expect_csv_checker( """ k | v @@ -94,7 +94,7 @@ def test_kafka_csv(tmp_path: pathlib.Path, kafka_context: KafkaTestContext): pw.io.csv.write(table, str(tmp_path / "output.csv")) - assert wait_result_with_checker( + wait_result_with_checker( expect_csv_checker( """ k | v @@ -119,7 +119,7 @@ def test_kafka_simple_wrapper(tmp_path: pathlib.Path, kafka_context: KafkaTestCo ) pw.io.csv.write(table, str(tmp_path / "output.csv")) - assert wait_result_with_checker( + wait_result_with_checker( expect_csv_checker( """ data @@ -141,7 +141,7 @@ def test_kafka_simple_wrapper(tmp_path: pathlib.Path, kafka_context: KafkaTestCo ) pw.io.csv.write(table, str(tmp_path / "output.csv")) - assert wait_result_with_checker( + wait_result_with_checker( expect_csv_checker( """ data @@ -199,7 +199,7 @@ def test_kafka_recovery(tmp_path: pathlib.Path, kafka_context: KafkaTestContext) pw.io.csv.write(table, str(tmp_path / "output.csv")) - assert wait_result_with_checker( + wait_result_with_checker( expect_csv_checker( """ k | v @@ 
-240,7 +240,7 @@ def test_kafka_recovery(tmp_path: pathlib.Path, kafka_context: KafkaTestContext) ) pw.io.csv.write(table, str(tmp_path / "output_backfilled.csv")) - assert wait_result_with_checker( + wait_result_with_checker( expect_csv_checker( """ k | v diff --git a/integration_tests/webserver/test_rest_connector.py b/integration_tests/webserver/test_rest_connector.py index a36a3e37..f07e7be0 100644 --- a/integration_tests/webserver/test_rest_connector.py +++ b/integration_tests/webserver/test_rest_connector.py @@ -3,20 +3,18 @@ import threading import time -import pytest import requests import pathway as pw from pathway.tests.utils import ( CsvLinesNumberChecker, expect_csv_checker, + needs_multiprocessing_fork, wait_result_with_checker, - xfail_on_darwin, ) -@xfail_on_darwin(reason="running pw.run from separate process not supported") -@pytest.mark.xdist_group(name="streaming_tests") +@needs_multiprocessing_fork def test_server(tmp_path: pathlib.Path): port = int(os.environ.get("PATHWAY_MONITORING_HTTP_PORT", "20000")) + 10000 output_path = tmp_path / "output.csv" @@ -54,11 +52,10 @@ def target(): t = threading.Thread(target=target, daemon=True) t.start() - assert wait_result_with_checker(CsvLinesNumberChecker(output_path, 4), 30) + wait_result_with_checker(CsvLinesNumberChecker(output_path, 4), 30) -@xfail_on_darwin(reason="running pw.run from separate process not supported") -@pytest.mark.xdist_group(name="streaming_tests") +@needs_multiprocessing_fork def test_server_customization(tmp_path: pathlib.Path): port = int(os.environ.get("PATHWAY_MONITORING_HTTP_PORT", "20000")) + 10001 output_path = tmp_path / "output.csv" @@ -96,11 +93,10 @@ def target(): t = threading.Thread(target=target, daemon=True) t.start() - assert wait_result_with_checker(CsvLinesNumberChecker(output_path, 4), 30) + wait_result_with_checker(CsvLinesNumberChecker(output_path, 4), 30) -@xfail_on_darwin(reason="running pw.run from separate process not supported") -@pytest.mark.xdist_group(name="streaming_tests") +@needs_multiprocessing_fork def test_server_schema_customization(tmp_path: pathlib.Path): port = int(os.environ.get("PATHWAY_MONITORING_HTTP_PORT", "20000")) + 10002 output_path = tmp_path / "output.csv" @@ -134,11 +130,10 @@ def target(): t = threading.Thread(target=target, daemon=True) t.start() - assert wait_result_with_checker(CsvLinesNumberChecker(output_path, 4), 30) + wait_result_with_checker(CsvLinesNumberChecker(output_path, 4), 30) -@xfail_on_darwin(reason="running pw.run from separate process not supported") -@pytest.mark.xdist_group(name="streaming_tests") +@needs_multiprocessing_fork def test_server_keep_queries(tmp_path: pathlib.Path): port = int(os.environ.get("PATHWAY_MONITORING_HTTP_PORT", "20000")) + 10003 output_path = tmp_path / "output.csv" @@ -172,7 +167,7 @@ def target(): t = threading.Thread(target=target, daemon=True) t.start() - assert wait_result_with_checker( + wait_result_with_checker( expect_csv_checker( """ key | sum | diff diff --git a/pyproject.toml b/pyproject.toml index 6d373a6b..24b7d87b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,12 +35,14 @@ dependencies = [ "typing_extensions >= 4.8.0", "panel >= 1.3.1", "jupyter_bokeh >= 3.0.7", + "jmespath >= 1.0.1", ] [project.optional-dependencies] tests = [ "pytest >= 7.4.2, < 8.0.0", "pytest-xdist >= 3.3.1, < 4.0.0", + "pytest-rerunfailures >= 13.0, < 14.0", "networkx", "python-louvain", ] diff --git a/python/pathway/__init__.py b/python/pathway/__init__.py index 4bda0f44..eee06e7a 100644 --- 
a/python/pathway/__init__.py +++ b/python/pathway/__init__.py @@ -6,6 +6,7 @@ import os +from pathway.internals.custom_reducers import BaseCustomAccumulator from pathway.internals.dtype import DATE_TIME_NAIVE, DATE_TIME_UTC, DURATION # flake8: noqa: E402 @@ -21,7 +22,6 @@ import pathway.reducers as reducers from pathway import debug, demo, io from pathway.internals import ( - BaseCustomAccumulator, ClassArg, ColumnExpression, ColumnReference, diff --git a/python/pathway/debug/__init__.py b/python/pathway/debug/__init__.py index 5af2dde6..a01f0aaf 100644 --- a/python/pathway/debug/__init__.py +++ b/python/pathway/debug/__init__.py @@ -19,7 +19,7 @@ from pathway.internals.fingerprints import fingerprint from pathway.internals.graph_runner import GraphRunner from pathway.internals.monitoring import MonitoringLevel -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema, schema_from_pandas from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame @@ -27,7 +27,7 @@ from pathway.io.python import ConnectorSubject, read -@runtime_type_check +@check_arg_types def _compute_table(table: Table) -> api.CapturedStream: [captured] = GraphRunner( parse_graph.G, debug=True, monitoring_level=MonitoringLevel.NONE @@ -145,7 +145,7 @@ def _key(row: tuple[api.Pointer, tuple[api.Value, ...]]): print(formatted.rstrip()) -@runtime_type_check +@check_arg_types @trace_user_frame def compute_and_print( table: Table, @@ -171,7 +171,7 @@ def compute_and_print( ) -@runtime_type_check +@check_arg_types @trace_user_frame def compute_and_print_update_stream( table: Table, @@ -197,7 +197,7 @@ def compute_and_print_update_stream( ) -@runtime_type_check +@check_arg_types @trace_user_frame def table_to_pandas(table: Table, *, include_id: bool = True): keys, columns = table_to_dicts(table) @@ -231,7 +231,38 @@ def _validate_dataframe(df: pd.DataFrame) -> None: ) -@runtime_type_check +@check_arg_types +@trace_user_frame +def table_from_rows( + schema: type[Schema], + rows: list[tuple], + unsafe_trusted_ids: bool = False, + is_stream=False, +) -> Table: + """ + A function for creating a table from a list of tuples. Each tuple should describe + one row of the input data (or stream), matching provided schema. + + If ``is_stream`` is set to ``True``, each tuple representing a row should contain + two additional columns, the first indicating the time of arrival of particular row + and the second indicating whether the row should be inserted (1) or deleted (-1). 
+ + """ + + kwargs: dict[str, list] = {} + colnames = schema.column_names() + if is_stream: + colnames += ["__time__", "__diff__"] + for colname in colnames: + kwargs[colname] = [] + for row in rows: + for colname, entry in zip(colnames, list(row)): + kwargs[colname].append(entry) + df = pd.DataFrame.from_dict(kwargs) + return table_from_pandas(df, unsafe_trusted_ids=unsafe_trusted_ids, schema=schema) + + +@check_arg_types @trace_user_frame def table_from_pandas( df: pd.DataFrame, @@ -260,6 +291,9 @@ def table_from_pandas( _validate_dataframe(df) + if id_from is None and schema is not None: + id_from = schema.primary_key_columns() + if id_from is None: ids_df = pd.DataFrame({"id": df.index}) ids_df.index = df.index @@ -340,7 +374,7 @@ def parse_to_table(*args, **kwargs) -> Table: return table_from_markdown(*args, **kwargs) -@runtime_type_check +@check_arg_types def table_from_parquet( path: str | PathLike, id_from=None, @@ -354,7 +388,7 @@ def table_from_parquet( return table_from_pandas(df, id_from=None, unsafe_trusted_ids=False) -@runtime_type_check +@check_arg_types def table_to_parquet(table: Table, filename: str | PathLike): """ Converts a Pathway Table into a pandas DataFrame and then writes it to Parquet diff --git a/python/pathway/engine.pyi b/python/pathway/engine.pyi index 8f027081..428a5e8d 100644 --- a/python/pathway/engine.pyi +++ b/python/pathway/engine.pyi @@ -38,8 +38,7 @@ class PathwayType(Enum): class ConnectorMode(Enum): STATIC: ConnectorMode - SIMPLE_STREAMING: ConnectorMode - STREAMING_WITH_DELETIONS: ConnectorMode + STREAMING: ConnectorMode class ReadMethod(Enum): BY_LINE: ReadMethod @@ -412,9 +411,7 @@ class Scope: expressions: list[tuple[Expression, TableProperties]], ) -> Table: ... def table_properties(self, table: Table) -> TableProperties: ... - def columns_to_table( - self, universe: Universe, columns: list[tuple[Column, ColumnPath]] - ) -> Table: ... + def columns_to_table(self, universe: Universe, columns: list[Column]) -> Table: ... def table_column( self, universe: Universe, table: Table, column_path: ColumnPath ) -> Column: ... @@ -575,6 +572,7 @@ class Scope: column_paths: Iterable[ColumnPath], skip_persisted_batch: bool, on_change: Callable, + on_time_end: Callable, on_end: Callable, ): ... 
def output_table( diff --git a/python/pathway/internals/__init__.py b/python/pathway/internals/__init__.py index 3aba1081..1ef4833c 100644 --- a/python/pathway/internals/__init__.py +++ b/python/pathway/internals/__init__.py @@ -22,6 +22,7 @@ udf_async, unwrap, ) +from pathway.internals.custom_reducers import BaseCustomAccumulator from pathway.internals.datetime_types import DateTimeNaive, DateTimeUtc, Duration from pathway.internals.decorators import ( attribute, @@ -42,7 +43,6 @@ from pathway.internals.json import Json from pathway.internals.monitoring import MonitoringLevel from pathway.internals.operator import iterate_universe -from pathway.internals.reducers import BaseCustomAccumulator from pathway.internals.row_transformer import ClassArg from pathway.internals.run import run, run_all from pathway.internals.schema import ( diff --git a/python/pathway/internals/asynchronous.py b/python/pathway/internals/asynchronous.py index 175d6fb4..801141d0 100644 --- a/python/pathway/internals/asynchronous.py +++ b/python/pathway/internals/asynchronous.py @@ -15,10 +15,10 @@ import diskcache from pathway.internals import trace -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types -@runtime_type_check +@check_arg_types def with_capacity(func: Callable, capacity: int): """ Limits the number of simultaneous calls of the specified function. @@ -41,7 +41,7 @@ async def wrapper(*args, **kwargs): return wrapper -@runtime_type_check +@check_arg_types def with_retry_strategy(func: Callable, retry_strategy: AsyncRetryStrategy) -> Callable: """ Returns an asynchronous function with applied retry strategy. @@ -61,7 +61,7 @@ async def wrapper(*args, **kwargs): return wrapper -@runtime_type_check +@check_arg_types def with_cache_strategy(func, cache_strategy: CacheStrategy) -> Callable: func = coerce_async(func) @@ -90,7 +90,7 @@ def decorator(func): return decorator -@runtime_type_check +@check_arg_types def coerce_async(func: Callable) -> Callable: if asyncio.iscoroutinefunction(func): return func @@ -212,3 +212,6 @@ def _get_cache(self, func): cache_dir = Path(storage_root) / "runtime_calls" self._cache = diskcache.Cache(cache_dir / self._name) return self._cache + + +DefaultCache = DiskCache diff --git a/python/pathway/internals/column.py b/python/pathway/internals/column.py index 6b59f94f..92984fe8 100644 --- a/python/pathway/internals/column.py +++ b/python/pathway/internals/column.py @@ -779,7 +779,7 @@ def _get_flatten_column_dtype(self): if isinstance(dtype, dt.List): return dtype.wrapped if isinstance(dtype, dt.Tuple): - if dtype == dt.ANY_TUPLE: + if dtype in (dt.ANY_TUPLE, dt.Tuple()): return dt.ANY assert not isinstance(dtype.args, EllipsisType) return_dtype = dtype.args[0] @@ -788,8 +788,12 @@ def _get_flatten_column_dtype(self): return return_dtype elif dtype == dt.STR: return dt.STR - elif dtype in {dt.ARRAY, dt.ANY}: + elif dtype == dt.ANY: return dt.ANY + elif isinstance(dtype, dt.Array): + return dtype.strip_dimension() + elif dtype == dt.JSON: + return dt.JSON else: raise TypeError( f"Cannot flatten column {self.flatten_column.expression!r} of type {dtype}." 
diff --git a/python/pathway/internals/common.py b/python/pathway/internals/common.py index 85719314..dd6fc911 100644 --- a/python/pathway/internals/common.py +++ b/python/pathway/internals/common.py @@ -29,18 +29,19 @@ from pathway.internals.asynchronous import ( AsyncRetryStrategy, CacheStrategy, + DefaultCache, async_options, ) from pathway.internals.helpers import function_spec from pathway.internals.parse_graph import G -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.trace import trace_user_frame T = TypeVar("T") P = ParamSpec("P") -@runtime_type_check +@check_arg_types def iterate( func, iteration_limit: int | None = None, @@ -58,7 +59,7 @@ def iterate( ... if x == 1: ... return 1 ... elif x % 2 == 0: - ... return x / 2 + ... return x // 2 ... else: ... return 3 * x + 1 ... new_iterated = iterated.select(val=pw.apply(collatz_step, iterated.val)) @@ -93,7 +94,7 @@ def iterate( ) -@runtime_type_check +@check_arg_types @trace_user_frame def apply( fun: Callable, @@ -212,6 +213,9 @@ def udf_async( Bobdog """ + if cache_strategy is None: + cache_strategy = DefaultCache() + def apply_wrapper(fun, *args, **kwargs): fun = async_options( capacity=capacity, @@ -232,7 +236,7 @@ def decorator(fun: Callable) -> Callable: return decorator(fun) -@runtime_type_check +@check_arg_types @trace_user_frame def numba_apply( fun: Callable, @@ -315,7 +319,7 @@ def apply_with_type( return expr.ApplyExpression(fun, ret_type, *args, **kwargs) -@runtime_type_check +@check_arg_types @trace_user_frame def apply_async( fun: Callable, @@ -379,7 +383,6 @@ def declare_type( >>> t3.schema }> """ - return expr.DeclareTypeExpression(target_type, col) @@ -416,7 +419,7 @@ def cast(target_type: Any, col: expr.ColumnExpression | Value) -> expr.CastExpre return expr.CastExpression(target_type, col) -@runtime_type_check +@check_arg_types @trace_user_frame def coalesce(*args: expr.ColumnExpression | Value) -> expr.ColumnExpression: """For arguments list arg_1, arg_2, ..., arg_n returns first not-None value. @@ -441,7 +444,7 @@ def coalesce(*args: expr.ColumnExpression | Value) -> expr.ColumnExpression: return expr.CoalesceExpression(*args) -@runtime_type_check +@check_arg_types @trace_user_frame def require(val, *deps: expr.ColumnExpression | Value) -> expr.ColumnExpression: """Returns val iff every dep in deps is not-None. 
@@ -467,7 +470,7 @@ def require(val, *deps: expr.ColumnExpression | Value) -> expr.ColumnExpression: return expr.RequireExpression(val, *deps) -@runtime_type_check +@check_arg_types @trace_user_frame def if_else( if_clause: expr.ColumnExpression | Value, @@ -500,7 +503,7 @@ def if_else( return expr.IfElseExpression(if_clause, then_clause, else_clause) -@runtime_type_check +@check_arg_types @trace_user_frame def make_tuple(*args: expr.ColumnExpression | Value) -> expr.ColumnExpression: """ diff --git a/python/pathway/internals/custom_reducers.py b/python/pathway/internals/custom_reducers.py new file mode 100644 index 00000000..0e8d4fed --- /dev/null +++ b/python/pathway/internals/custom_reducers.py @@ -0,0 +1,287 @@ +import pickle +from abc import ABC, abstractmethod +from collections import Counter +from typing import ParamSpec, Protocol, TypeVar + +from typing_extensions import Self + +from pathway.internals import api, expression as expr +from pathway.internals.column import ColumnExpression +from pathway.internals.common import apply_with_type +from pathway.internals.reducers import StatefulManyReducer +from pathway.internals.shadows.inspect import signature + +P = ParamSpec("P") + + +S = TypeVar("S", bound=api.Value) +V1 = TypeVar("V1", bound=api.Value) +V2 = TypeVar("V2", bound=api.Value) + + +def mark_stub(fun): + fun.__pw_stub = True + return fun + + +class ReducerProtocol(Protocol): + def __call__( + self, *args: expr.ColumnExpression | api.Value + ) -> expr.ColumnExpression: + ... + + +def stateful_many( + combine_many: api.CombineMany[S], +) -> ReducerProtocol: + def wrapper(*args: expr.ColumnExpression | api.Value) -> expr.ColumnExpression: + return expr.ReducerExpression(StatefulManyReducer(combine_many), *args) + + return wrapper + + +class CombineSingle(Protocol[S, P]): + def __call__(self, state: S | None, /, *args: P.args, **kwargs: P.kwargs) -> S: + ... + + +def stateful_single(combine_single: CombineSingle[S, ...]) -> ReducerProtocol: + def wrapper(state: S | None, rows: list[tuple[list[api.Value], int]]) -> S: + for row, count in rows: + assert count > 0 + for _ in range(count): + state = combine_single(state, *row) + assert state is not None + return state + + return stateful_many(wrapper) + + +class BaseCustomAccumulator(ABC): + """Utility class for defining custom accumulators, used for custom reducers. + Custom accumulators should inherit from this class, and should implement ``from_row``, + ``update`` and ``compute_result``. Optionally ``neutral`` and ``retract`` can be provided + for more efficient processing on streams with changing data. + + >>> import pathway as pw + >>> class CustomAvgAccumulator(pw.BaseCustomAccumulator): + ... def __init__(self, sum, cnt): + ... self.sum = sum + ... self.cnt = cnt + ... + ... @classmethod + ... def from_row(self, row): + ... [val] = row + ... return CustomAvgAccumulator(val, 1) + ... + ... def update(self, other): + ... self.sum += other.sum + ... self.cnt += other.cnt + ... + ... def compute_result(self) -> float: + ... return self.sum / self.cnt + >>> import sys; sys.modules[__name__].CustomAvgAccumulator = CustomAvgAccumulator # NODOCS + >>> custom_avg = pw.reducers.udf_reducer(CustomAvgAccumulator) + >>> t1 = pw.debug.parse_to_table(''' + ... age | owner | pet | price + ... 10 | Alice | dog | 100 + ... 9 | Bob | cat | 80 + ... 8 | Alice | cat | 90 + ... 7 | Bob | dog | 70 + ... 
''') + >>> t2 = t1.groupby(t1.owner).reduce(t1.owner, avg_price=custom_avg(t1.price)) + >>> pw.debug.compute_and_print(t2, include_id=False) + owner | avg_price + Alice | 95.0 + Bob | 75.0 + """ + + @classmethod + @mark_stub + def neutral(cls) -> Self: + """Neutral element of the accumulator (aggregation of an empty list). + + This function is optional, and allows for more efficient processing on streams + with changing data.""" + raise NotImplementedError() + + @classmethod + @abstractmethod + def from_row(cls, row: list[api.Value]) -> Self: + """Construct the accumulator from a row of data. + Row will be passed as a list of values. + + This is a mandatory function.""" + raise NotImplementedError() + + @abstractmethod + def update(self, other: Self) -> None: + """Update the accumulator with another one. + Method does not need to return anything, the change should be in-place. + + This is a mandatory function.""" + raise NotImplementedError() + + @mark_stub + def retract(self, other: Self) -> None: + """Update the accumulator by removing the value of another one. + + This function is optional, and allows more efficient reductions on streams + with changing data. + """ + raise NotImplementedError() + + @abstractmethod + def compute_result(self) -> api.Value: + """Mandatory function to finalize computation. + Used to extract answer from final state of accumulator. + + Narrowing the type of this function helps better type the output of the reducer. + """ + raise NotImplementedError() + + +def udf_reducer( + reducer_cls: type[BaseCustomAccumulator], +): + """Decorator for defining custom reducers. Requires custom accumulator as an argument. + Custom accumulator should implement ``from_row``, ``update`` and ``compute_result``. + Optionally ``neutral`` and ``retract`` can be provided for more efficient processing on + streams with changing data. + + >>> import pathway as pw + >>> class CustomAvgAccumulator(pw.BaseCustomAccumulator): + ... def __init__(self, sum, cnt): + ... self.sum = sum + ... self.cnt = cnt + ... + ... @classmethod + ... def from_row(self, row): + ... [val] = row + ... return CustomAvgAccumulator(val, 1) + ... + ... def update(self, other): + ... self.sum += other.sum + ... self.cnt += other.cnt + ... + ... def compute_result(self) -> float: + ... return self.sum / self.cnt + >>> import sys; sys.modules[__name__].CustomAvgAccumulator = CustomAvgAccumulator # NODOCS + >>> custom_avg = pw.reducers.udf_reducer(CustomAvgAccumulator) + >>> t1 = pw.debug.parse_to_table(''' + ... age | owner | pet | price + ... 10 | Alice | dog | 100 + ... 9 | Bob | cat | 80 + ... 8 | Alice | cat | 90 + ... 7 | Bob | dog | 70 + ... 
''') + >>> t2 = t1.groupby(t1.owner).reduce(t1.owner, avg_price=custom_avg(t1.price)) + >>> pw.debug.compute_and_print(t2, include_id=False) + owner | avg_price + Alice | 95.0 + Bob | 75.0 + """ + neutral_available = _is_overridden(reducer_cls, "neutral") + retract_available = _is_overridden(reducer_cls, "retract") + + def wrapper(*args: expr.ColumnExpression | api.Value) -> ColumnExpression: + @stateful_many + def stateful_wrapper( + pickled_state: bytes | None, rows: list[tuple[list[api.Value], int]] + ) -> bytes | None: + if pickled_state is not None: + state = pickle.loads(pickled_state) + if not retract_available: + state._positive_updates = list(state._positive_updates) + else: + state = None + positive_updates: list[tuple[api.Value, ...]] = [] + negative_updates = [] + for row, count in rows: + if count > 0: + positive_updates.extend([tuple(row)] * count) + else: + negative_updates.extend([tuple(row)] * (-count)) + + if not retract_available and len(negative_updates) > 0: + if state is not None: + positive_updates.extend(state._positive_updates) + state._positive_updates = [] + state = None + acc = Counter(positive_updates) + acc.subtract(negative_updates) + assert all(x >= 0 for x in acc.values()) + positive_updates = list(acc.elements()) + negative_updates = [] + + if state is None: + if neutral_available: + state = reducer_cls.neutral() + if not retract_available: + state._positive_updates = [] + else: + state._cnt = 0 + elif len(positive_updates) == 0: + if len(negative_updates) == 0: + return None + else: + raise ValueError( + "Unable to process negative update with this custom reducer." + ) + else: + state = reducer_cls.from_row(list(positive_updates[0])) + if not retract_available: + state._positive_updates = positive_updates[0:1] + else: + state._cnt = 1 + positive_updates = positive_updates[1:] + + for row_up in positive_updates: + if not retract_available: + state._positive_updates.append(row_up) + else: + state._cnt += 1 + val = reducer_cls.from_row(list(row_up)) + state.update(val) + + for row_up in negative_updates: + if not retract_available: + raise ValueError( + "Unable to process negative update with this custom reducer." 
+ ) + else: + state._cnt -= 1 + val = reducer_cls.from_row(list(row_up)) + state.retract(val) + + if not retract_available: + state._positive_updates = tuple( + tuple(x) for x in state._positive_updates + ) + else: + if state._cnt == 0: + # this is fine in this setting, where we process values one by one + # if this ever becomes accumulated in a tree, we have to handle + # (A-B) updates, so we have to distinguish `0` from intermediate states + # accumulating weighted count (weighted by hash) should do fine here + return None + + return pickle.dumps(state) + + def extractor(x: bytes): + unpickled = pickle.loads(x) + assert isinstance(unpickled, reducer_cls) + return unpickled.compute_result() + + return apply_with_type( + extractor, + signature(reducer_cls.compute_result).return_annotation, + stateful_wrapper(*args), + ) + + return wrapper + + +def _is_overridden(cls: type[BaseCustomAccumulator], name: str) -> bool: + assert hasattr(BaseCustomAccumulator, name) + return not hasattr(getattr(cls, name), "__pw_stub") diff --git a/python/pathway/internals/datasink.py b/python/pathway/internals/datasink.py index d2e3882b..f1f34974 100644 --- a/python/pathway/internals/datasink.py +++ b/python/pathway/internals/datasink.py @@ -7,6 +7,8 @@ from dataclasses import dataclass from typing import Any +from pathway.internals.api import Pointer + class DataSink(ABC): pass @@ -20,6 +22,7 @@ class GenericDataSink(DataSink): @dataclass(frozen=True) class CallbackDataSink(DataSink): - on_change: Callable[[str, list[Any], int, int], Any] - on_end: Callable[[], Any] + on_change: Callable[[Pointer, list[Any], int, int], None] + on_time_end: Callable[[int], None] + on_end: Callable[[], None] skip_persisted_batch: bool diff --git a/python/pathway/internals/datasource.py b/python/pathway/internals/datasource.py index a8613474..472cdbc8 100644 --- a/python/pathway/internals/datasource.py +++ b/python/pathway/internals/datasource.py @@ -40,10 +40,19 @@ def connector_properties(self) -> api.ConnectorProperties: column_properties=columns, ) + def get_effective_schema(self) -> type[Schema]: + if self.is_append_only(): + return self.schema.update_properties(append_only=True) + return self.schema + @abstractmethod def is_bounded(self) -> bool: ... + @abstractmethod + def is_append_only(self) -> bool: + ... 
+ class StaticDataSource(DataSource, ABC): data: Any @@ -56,6 +65,11 @@ def is_bounded(self) -> bool: class PandasDataSource(StaticDataSource): data: pd.DataFrame + def is_append_only(self) -> bool: + return api.DIFF_PSEUDOCOLUMN not in self.data.columns or all( + self.data[api.DIFF_PSEUDOCOLUMN] == 1 + ) + @dataclass(frozen=True) class GenericDataSource(DataSource): @@ -65,12 +79,18 @@ class GenericDataSource(DataSource): def is_bounded(self) -> bool: return self.datastorage.mode == api.ConnectorMode.STATIC + def is_append_only(self) -> bool: + return self.datastorage.mode != api.ConnectorMode.STREAMING + @dataclass(frozen=True) class EmptyDataSource(DataSource): def is_bounded(self) -> bool: return True + def is_append_only(self) -> bool: + return True + def debug_datasource(debug_data) -> StaticDataSource | None: if debug_data is None: diff --git a/python/pathway/internals/decorators.py b/python/pathway/internals/decorators.py index a3625c58..ab10b27f 100644 --- a/python/pathway/internals/decorators.py +++ b/python/pathway/internals/decorators.py @@ -228,7 +228,7 @@ def method(func, **kwargs): def table_from_datasource( datasource, debug_datasource: StaticDataSource | None = None, -): +) -> Table: return G.add_operator( lambda id: op.InputOperator(datasource, id, debug_datasource), lambda operator: operator(), diff --git a/python/pathway/internals/dtype.py b/python/pathway/internals/dtype.py index 2f7cb9e4..b8537701 100644 --- a/python/pathway/internals/dtype.py +++ b/python/pathway/internals/dtype.py @@ -11,6 +11,7 @@ from types import EllipsisType, NoneType, UnionType import numpy as np +import numpy.typing as npt from pathway.internals import api, datetime_types, json as js @@ -35,8 +36,7 @@ def is_value_compatible(self, arg) -> bool: def _set_args(self, *args): ... - @classmethod - def _cached_new(cls, *args): + def __new__(cls, *args): key = (cls, args) if key not in DType._cache: ret = super().__new__(cls) @@ -48,7 +48,7 @@ def __class_getitem__(cls, args): if isinstance(args, tuple): return cls(*args) else: - return cls(args) # type: ignore[call-arg] + return cls(args) def equivalent_to(self, other: DType) -> bool: return dtype_equivalence(self, other) @@ -77,12 +77,17 @@ def _set_args(self, wrapped): self.wrapped = wrapped def __new__(cls, wrapped: type) -> _SimpleDType: - return cls._cached_new(wrapped) + return super().__new__(cls, wrapped) def is_value_compatible(self, arg): - if isinstance(arg, int) and self.wrapped == float: - return True - return isinstance(arg, self.wrapped) + if self.wrapped == float: + return np.issubdtype(type(arg), np.floating) or np.issubdtype( + type(arg), np.integer + ) + elif self.wrapped == int: + return np.issubdtype(type(arg), np.integer) + else: + return isinstance(arg, self.wrapped) def to_engine(self) -> api.PathwayType: return { @@ -111,7 +116,7 @@ def _set_args(self): pass def __new__(cls) -> _NoneDType: - return cls._cached_new() + return super().__new__(cls) def is_value_compatible(self, arg): return arg is None @@ -135,7 +140,7 @@ def to_engine(self) -> api.PathwayType: return api.PathwayType.ANY def __new__(cls) -> _AnyDType: - return cls._cached_new() + return super().__new__(cls) def is_value_compatible(self, arg): return True @@ -159,18 +164,18 @@ def __repr__(self): return f"Callable({self.arg_types}, {self.return_type})" def _set_args(self, arg_types, return_type): - if isinstance(arg_types, EllipsisType): - self.arg_types = ... 
- else: - self.arg_types = tuple(wrap(dtype) for dtype in arg_types) - self.return_type = wrap(return_type) + self.arg_types = arg_types + self.return_type = return_type def __new__( cls, arg_types: EllipsisType | tuple[DType | EllipsisType, ...], return_type: DType, ) -> Callable: - return cls._cached_new(arg_types, return_type) + if not isinstance(arg_types, EllipsisType): + arg_types = tuple(wrap(dtype) for dtype in arg_types) + return_type = wrap(return_type) + return super().__new__(cls, arg_types, return_type) def is_value_compatible(self, arg): return callable(arg) @@ -187,27 +192,52 @@ def typehint(self) -> typing.Any: class Array(DType): + n_dim: int | None + wrapped: DType + def __repr__(self): - return "Array" + return f"Array({self.n_dim}, {self.wrapped})" - def _set_args(self): - pass + def _set_args(self, n_dim, wrapped): + self.wrapped = wrapped + self.n_dim = n_dim def to_engine(self) -> api.PathwayType: return api.PathwayType.ARRAY - def __new__(cls) -> Array: - return cls._cached_new() + def __new__(cls, n_dim, wrapped) -> Array: + dtype = wrap(wrapped) + if isinstance(dtype, Array) and dtype.n_dim is not None: + return Array(n_dim=dtype.n_dim + n_dim, wrapped=dtype.wrapped) + else: + return super().__new__(cls, n_dim, dtype) def is_value_compatible(self, arg): - return isinstance(arg, np.ndarray) + if isinstance(arg, np.ndarray): + if self.n_dim is not None and self.n_dim != len(arg.shape): + return False + x: np.ndarray + for x in np.nditer(arg, flags=["zerosize_ok"]): # type: ignore[assignment] + if not self.wrapped.is_value_compatible(x[()]): + return False + return True + else: + if self.n_dim is not None: + return False + return self.wrapped.is_value_compatible(arg) @property def typehint(self) -> type[np.ndarray]: - return np.ndarray + return npt.NDArray[self.wrapped.typehint] # type: ignore[name-defined] + def strip_dimension(self) -> DType: + if self.n_dim is None: + return Array(n_dim=None, wrapped=self.wrapped) + elif self.n_dim > 1: + return Array(n_dim=self.n_dim - 1, wrapped=self.wrapped) + else: + return self.wrapped -ARRAY: DType = Array() T = typing.TypeVar("T") @@ -228,7 +258,7 @@ def to_engine(self) -> api.PathwayType: return api.PathwayType.POINTER def __new__(cls, wrapped: type[Schema] | None = None) -> Pointer: - return cls._cached_new(wrapped) + return super().__new__(cls, wrapped) def is_value_compatible(self, arg): return isinstance(arg, api.Pointer) @@ -260,7 +290,7 @@ def __new__(cls, arg: DType) -> DType: # type:ignore[misc] arg = wrap(arg) if arg == NONE or isinstance(arg, Optional) or arg == ANY: return arg - return cls._cached_new(arg) + return super().__new__(cls, arg) def is_value_compatible(self, arg): if arg is None: @@ -291,7 +321,7 @@ def __new__(cls, *args: DType | EllipsisType) -> Tuple | List: # type: ignore[m assert isinstance(arg, DType) return List(arg) else: - return cls._cached_new(tuple(wrap(arg) for arg in args)) + return super().__new__(cls, tuple(wrap(arg) for arg in args)) def is_value_compatible(self, arg): if not isinstance(arg, (tuple, list)): @@ -311,7 +341,7 @@ def typehint(self) -> type[tuple]: class Json(DType): def __new__(cls) -> Json: - return cls._cached_new() + return super().__new__(cls) def _set_args(self): pass @@ -340,7 +370,7 @@ def __repr__(self): return f"List({self.wrapped})" def __new__(cls, wrapped: DType) -> List: - return cls._cached_new(wrap(wrapped)) + return super().__new__(cls, wrap(wrapped)) def _set_args(self, wrapped): self.wrapped = wrapped @@ -355,11 +385,6 @@ def typehint(self) -> 
type[list]: return list[self.wrapped.typehint] # type: ignore[name-defined] -ANY_TUPLE: DType = List._cached_new( - ANY -) # List(ANY) but this requires `wrap()` to exist - - class _DateTimeNaive(DType): def __repr__(self): return "DATE_TIME_NAIVE" @@ -368,7 +393,7 @@ def _set_args(self): pass def __new__(cls) -> _DateTimeNaive: - return cls._cached_new() + return super().__new__(cls) def to_engine(self) -> api.PathwayType: return api.PathwayType.DATE_TIME_NAIVE @@ -392,7 +417,7 @@ def _set_args(self): pass def __new__(cls) -> _DateTimeUtc: - return cls._cached_new() + return super().__new__(cls) def to_engine(self) -> api.PathwayType: return api.PathwayType.DATE_TIME_UTC @@ -416,7 +441,7 @@ def _set_args(self): pass def __new__(cls) -> _Duration: - return cls._cached_new() + return super().__new__(cls) def to_engine(self) -> api.PathwayType: return api.PathwayType.DURATION @@ -443,6 +468,8 @@ def wrap(input_type) -> DType: assert input_type != Json if isinstance(input_type, DType): return input_type + if typing.get_origin(input_type) == np.dtype: + (input_type,) = typing.get_args(input_type) if input_type in (NoneType, None): return NONE elif input_type == typing.Any: @@ -500,7 +527,12 @@ def wrap(input_type) -> DType: else: return Tuple(*[wrap(arg) for arg in args]) elif input_type == np.ndarray: - return ARRAY + return ANY_ARRAY + elif typing.get_origin(input_type) == np.ndarray: + dims, wrapped = get_args(input_type) + if dims == typing.Any: + return Array(n_dim=None, wrapped=wrapped) + return Array(n_dim=len(typing.get_args(dims)), wrapped=wrapped) elif issubclass(input_type, Enum): return ANY elif input_type == datetime.datetime: @@ -518,12 +550,28 @@ def wrap(input_type) -> DType: datetime_types.Duration: DURATION, datetime_types.DateTimeNaive: DATE_TIME_NAIVE, datetime_types.DateTimeUtc: DATE_TIME_UTC, + np.int32: INT, + np.int64: INT, + np.float32: FLOAT, + np.float64: FLOAT, }.get(input_type, None) if dtype is None: raise TypeError(f"Unsupported type {input_type}.") return dtype +ANY_TUPLE: DType = List(ANY) +ANY_ARRAY: DType = Array(n_dim=None, wrapped=ANY) +ANY_ARRAY_1D: DType = Array(n_dim=1, wrapped=ANY) +ANY_ARRAY_2D: DType = Array(n_dim=2, wrapped=ANY) +INT_ARRAY: DType = Array(n_dim=None, wrapped=INT) +INT_ARRAY_1D: DType = Array(n_dim=1, wrapped=INT) +INT_ARRAY_2D: DType = Array(n_dim=2, wrapped=INT) +FLOAT_ARRAY: DType = Array(n_dim=None, wrapped=FLOAT) +FLOAT_ARRAY_1D: DType = Array(n_dim=1, wrapped=FLOAT) +FLOAT_ARRAY_2D: DType = Array(n_dim=2, wrapped=FLOAT) + + def dtype_equivalence(left: DType, right: DType) -> bool: return dtype_issubclass(left, right) and dtype_issubclass(right, left) @@ -555,6 +603,16 @@ def dtype_tuple_equivalence(left: Tuple | List, right: Tuple | List) -> bool: return all(dtype_equivalence(l_arg, r_arg) for l_arg, r_arg in zip(largs, rargs)) +def dtype_array_equivalence(left: Array, right: Array) -> bool: + dim_compatible = ( + left.n_dim is None or right.n_dim is None or left.n_dim == right.n_dim + ) + wrapped_compatible = ( + left.wrapped == ANY or right.wrapped == ANY or left.wrapped == right.wrapped + ) + return dim_compatible and wrapped_compatible + + def dtype_issubclass(left: DType, right: DType) -> bool: if right == ANY: # catch the case, when left=Optional[T] and right=Any return True @@ -569,6 +627,8 @@ def dtype_issubclass(left: DType, right: DType) -> bool: return dtype_issubclass(left, unoptionalize(right)) elif isinstance(left, (Tuple, List)) and isinstance(right, (Tuple, List)): return dtype_tuple_equivalence(left, right) + 
elif isinstance(left, Array) and isinstance(right, Array): + return dtype_array_equivalence(left, right) elif isinstance(left, Pointer) and isinstance(right, Pointer): return True # TODO elif isinstance(left, _SimpleDType) and isinstance(right, _SimpleDType): @@ -594,6 +654,20 @@ def types_lca(left: DType, right: DType) -> DType: return left else: return ANY_TUPLE + elif isinstance(left, Array) and isinstance(right, Array): + if left.n_dim is None or right.n_dim is None: + n_dim = None + elif left.n_dim == right.n_dim: + n_dim = left.n_dim + else: + n_dim = None + if left.wrapped == ANY or right.wrapped == ANY: + wrapped = ANY + elif left.wrapped == right.wrapped: + wrapped = left.wrapped + else: + wrapped = ANY + return Array(n_dim=n_dim, wrapped=wrapped) elif isinstance(left, Pointer) and isinstance(right, Pointer): l_schema = left.wrapped r_schema = right.wrapped @@ -626,11 +700,23 @@ def normalize_dtype(dtype: DType) -> DType: if isinstance(dtype, Pointer): return POINTER if isinstance(dtype, Array): - return ARRAY + return ANY_ARRAY return dtype -def unoptionalize_pair(left_dtype: DType, right_dtype) -> tuple[DType, DType]: +def coerce_arrays_pair(left: Array, right: Array) -> tuple[Array, Array]: + if left.wrapped == ANY and right.wrapped != ANY: + left = Array(n_dim=left.n_dim, wrapped=right.wrapped) + if right.wrapped == ANY and left.wrapped != ANY: + right = Array(n_dim=right.n_dim, wrapped=left.wrapped) + if left.n_dim is None and right.n_dim is not None: + right = Array(n_dim=None, wrapped=right.wrapped) + if right.n_dim is None and left.n_dim is not None: + left = Array(n_dim=None, wrapped=left.wrapped) + return left, right + + +def unoptionalize_pair(left_dtype: DType, right_dtype: DType) -> tuple[DType, DType]: """ Unpacks type out of typing.Optional and matches a second type with it if it is an EmptyType. 
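The dtype.py changes above replace the single ARRAY marker with a dimension- and element-aware Array(n_dim, wrapped), interned through DType.__new__ and reconciled by types_lca and coerce_arrays_pair. A minimal sketch of the expected behaviour, assuming pathway.internals.dtype (an internal module, not public API) is imported directly; the assertions follow only from the definitions in this hunk.

# Sketch under the assumptions stated above; not part of the patch.
from pathway.internals import dtype as dt

int_2d = dt.Array(n_dim=2, wrapped=dt.INT)
float_2d = dt.Array(n_dim=2, wrapped=dt.FLOAT)

# DType.__new__ interns instances by (class, args), so equal arguments
# yield the very same object as the module-level constant.
assert int_2d is dt.INT_ARRAY_2D

# types_lca keeps a matching dimension but widens the element type to ANY
# when the wrapped dtypes differ.
assert dt.types_lca(int_2d, float_2d) == dt.ANY_ARRAY_2D

# Nested arrays collapse: wrapping an Array adds the dimensions together.
assert dt.Array(n_dim=1, wrapped=int_2d) is dt.Array(n_dim=3, wrapped=dt.INT)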
diff --git a/python/pathway/internals/expression.py b/python/pathway/internals/expression.py index 745748f9..57dadf7b 100644 --- a/python/pathway/internals/expression.py +++ b/python/pathway/internals/expression.py @@ -252,8 +252,16 @@ def __rmatmul__(self, other: ColumnExpression | Value) -> ColumnBinaryOpExpressi def __neg__(self) -> ColumnUnaryOpExpression: return ColumnUnaryOpExpression(self, operator.neg) - def __invert__(self) -> ColumnUnaryOpExpression: - return ColumnUnaryOpExpression(self, operator.inv) + def __invert__(self) -> ColumnExpression: + match self: + case ColumnUnaryOpExpression(_operator=operator.inv, _expr=e): + return e + case IsNoneExpression(_expr=e): + return IsNotNoneExpression(e) + case IsNotNoneExpression(_expr=e): + return IsNoneExpression(e) + case _: + return ColumnUnaryOpExpression(self, operator.inv) def __hash__(self): return object.__hash__(self) diff --git a/python/pathway/internals/graph_runner/__init__.py b/python/pathway/internals/graph_runner/__init__.py index 5370d765..7964af5b 100644 --- a/python/pathway/internals/graph_runner/__init__.py +++ b/python/pathway/internals/graph_runner/__init__.py @@ -4,7 +4,7 @@ from collections.abc import Callable, Iterable -from pathway.internals import api, column, environ, parse_graph as graph, table, trace +from pathway.internals import api, environ, parse_graph as graph, table, trace from pathway.internals.column_path import ColumnPath from pathway.internals.graph_runner.async_utils import new_event_loop from pathway.internals.graph_runner.row_transformer_operator_handler import ( # noqa: registers handler for RowTransformerOperator @@ -29,6 +29,7 @@ class GraphRunner: _graph: graph.ParseGraph debug: bool ignore_asserts: bool + runtime_typechecking: bool def __init__( self, @@ -40,6 +41,7 @@ def __init__( with_http_server: bool = False, default_logging: bool = True, persistence_config: PersistenceConfig | None = None, + runtime_typechecking: bool | None = None, ) -> None: self._graph = input_graph self.debug = debug @@ -50,32 +52,46 @@ def __init__( self.with_http_server = with_http_server self.default_logging = default_logging self.persistence_config = persistence_config or environ.get_replay_config() + if runtime_typechecking is None: + self.runtime_typechecking = environ.runtime_typechecking + else: + self.runtime_typechecking = runtime_typechecking def run_tables( self, *tables: table.Table, - after_build: Callable[[ScopeState], None] | None = None, + after_build: Callable[[ScopeState, OperatorStorageGraph], None] | None = None, ) -> list[api.CapturedStream]: - nodes, columns = self.tree_shake_tables(self._graph.global_scope, tables) - context = ScopeContext(nodes, columns) + nodes = self.tree_shake_tables(self._graph.global_scope, tables) + context = ScopeContext(nodes, runtime_typechecking=self.runtime_typechecking) return self._run(context, output_tables=tables, after_build=after_build) def run_all( - self, after_build: Callable[[ScopeState], None] | None = None + self, + after_build: Callable[[ScopeState, OperatorStorageGraph], None] | None = None, ) -> list[api.CapturedStream]: - context = ScopeContext(nodes=self._graph.global_scope.nodes, run_all=True) + context = ScopeContext( + nodes=self._graph.global_scope.nodes, + run_all=True, + runtime_typechecking=self.runtime_typechecking, + ) return self._run(context, after_build=after_build) - def run_outputs(self, after_build: Callable[[ScopeState], None] | None = None): + def run_outputs( + self, + after_build: Callable[[ScopeState, 
OperatorStorageGraph], None] | None = None, + ): tables = (node.table for node in self._graph.global_scope.output_nodes) - nodes, columns = self._tree_shake( + nodes = self._tree_shake( self._graph.global_scope, self._graph.global_scope.output_nodes, tables ) - context = ScopeContext(nodes=nodes, columns=columns) + context = ScopeContext( + nodes=nodes, runtime_typechecking=self.runtime_typechecking + ) return self._run(context, after_build=after_build) def has_bounded_input(self, table: table.Table) -> bool: - nodes, _ = self.tree_shake_tables(self._graph.global_scope, [table]) + nodes = self.tree_shake_tables(self._graph.global_scope, [table]) for node in nodes: if isinstance(node, InputOperator) and not node.datasource.is_bounded(): @@ -87,7 +103,7 @@ def _run( self, context: ScopeContext, output_tables: Iterable[table.Table] = (), - after_build: Callable[[ScopeState], None] | None = None, + after_build: Callable[[ScopeState, OperatorStorageGraph], None] | None = None, ) -> list[api.CapturedStream]: storage_graph = OperatorStorageGraph.from_scope_context( context, self, output_tables @@ -96,13 +112,12 @@ def _run( def logic( scope: api.Scope, storage_graph: OperatorStorageGraph = storage_graph, - context: ScopeContext = context, ) -> list[tuple[api.Table, list[ColumnPath]]]: state = ScopeState(scope) storage_graph.build_scope(scope, state, self) if after_build is not None: - after_build(state) - return storage_graph.get_output_tables(output_tables, context, state) + after_build(state, storage_graph) + return storage_graph.get_output_tables(output_tables, state) node_names = [ (operator.id, operator.label()) @@ -146,7 +161,7 @@ def logic( def tree_shake_tables( self, graph_scope: graph.Scope, tables: Iterable[table.Table] - ) -> tuple[StableSet[Operator], StableSet[column.Column]]: + ) -> StableSet[Operator]: starting_nodes: Iterable[Operator] = ( table._source.operator for table in tables ) @@ -157,39 +172,12 @@ def _tree_shake( graph_scope: graph.Scope, starting_nodes: Iterable[Operator], tables: Iterable[table.Table], - ) -> tuple[StableSet[Operator], StableSet[column.Column]]: + ) -> StableSet[Operator]: if self.debug: starting_nodes = (*starting_nodes, *graph_scope.debug_nodes) tables = (*tables, *(node.table for node in graph_scope.debug_nodes)) nodes = StableSet(graph_scope.relevant_nodes(*starting_nodes)) - columns = self._relevant_columns(nodes, tables) - return nodes, columns - - def _relevant_columns( - self, nodes: Iterable[Operator], tables: Iterable[table.Table] - ) -> StableSet[column.Column]: - tables = StableSet.union( - tables, *(node.hard_table_dependencies() for node in nodes) - ) - - leaf_columns = (table._columns.values() for table in tables) - id_columns = ( - table._id_column for node in nodes for table in node.output_tables - ) - - stack: list[column.Column] = list(StableSet.union(id_columns, *leaf_columns)) - visited: StableSet[column.Column] = StableSet() - - while stack: - column = stack.pop() - if column in visited: - continue - visited.add(column) - for dependency in column.column_dependencies(): - if dependency not in visited: - stack.append(dependency) - - return visited + return nodes __all__ = [ diff --git a/python/pathway/internals/graph_runner/expression_evaluator.py b/python/pathway/internals/graph_runner/expression_evaluator.py index 71a0e03e..be827e7d 100644 --- a/python/pathway/internals/graph_runner/expression_evaluator.py +++ b/python/pathway/internals/graph_runner/expression_evaluator.py @@ -180,6 +180,14 @@ def eval_expression( # type: 
ignore[override] ) -> expr.ColumnExpression: expression = super().eval_expression(expression, **kwargs) + from pathway.internals.operator import RowTransformerOperator + + if isinstance(expression, expr.ColumnReference): + if isinstance( + expression._column.lineage.source.operator, RowTransformerOperator + ): + return expression + dtype = expression._dtype def test_type(val): @@ -189,6 +197,7 @@ def test_type(val): ret = apply_with_type(test_type, dtype, expression) ret._dtype = dtype + return ret @@ -200,6 +209,7 @@ def run( output_storage: Storage, *input_storages: Storage, old_path: ColumnPath | None = ColumnPath.EMPTY, + disable_runtime_typechecking: bool = False, ) -> api.Table: [input_storage] = input_storages engine_input_table = self.state.get_table(input_storage) @@ -229,6 +239,11 @@ def run( assert isinstance(column, clmn.ColumnWithExpression) expression = column.expression expression = self.context.expression_with_type(expression) + if ( + self.scope_context.runtime_typechecking + and not disable_runtime_typechecking + ): + expression = TypeVerifier().eval_expression(expression) properties = api.TableProperties.column(self.column_properties(column)) engine_expression = self.eval_expression(expression, eval_state=eval_state) @@ -1010,7 +1025,12 @@ def run(self, output_storage: Storage, *input_storages: Storage) -> api.Table: old_path = ColumnPath((1,)) else: old_path = None - return rowwise_evaluator.run(output_storage, joined_storage, old_path=old_path) + return rowwise_evaluator.run( + output_storage, + joined_storage, + old_path=old_path, + disable_runtime_typechecking=True, + ) class JoinRowwiseEvaluator(RowwiseEvaluator, context_type=clmn.JoinRowwiseContext): diff --git a/python/pathway/internals/graph_runner/operator_handler.py b/python/pathway/internals/graph_runner/operator_handler.py index 2884e54e..5d8301a4 100644 --- a/python/pathway/internals/graph_runner/operator_handler.py +++ b/python/pathway/internals/graph_runner/operator_handler.py @@ -174,6 +174,7 @@ def _run( table=engine_table, column_paths=column_paths, on_change=datasink.on_change, + on_time_end=datasink.on_time_end, on_end=datasink.on_end, skip_persisted_batch=datasink.skip_persisted_batch, ) diff --git a/python/pathway/internals/graph_runner/path_evaluator.py b/python/pathway/internals/graph_runner/path_evaluator.py index 55dcd2ef..545a0a20 100644 --- a/python/pathway/internals/graph_runner/path_evaluator.py +++ b/python/pathway/internals/graph_runner/path_evaluator.py @@ -12,7 +12,6 @@ import pathway.internals.operator as op from pathway.internals.column_path import ColumnPath from pathway.internals.graph_runner.path_storage import Storage -from pathway.internals.table import Table from pathway.internals.universe import Universe @@ -27,7 +26,7 @@ def compute_paths( case op.InputOperator(): evaluator = FlatStoragePathEvaluator(context) case op.RowTransformerOperator(): - evaluator = AddNewColumnsPathEvaluator(context) + evaluator = FlatStoragePathEvaluator(context) case op.ContextualizedIntermediateOperator(): evaluator = PathEvaluator.for_context(context)(context) case _: @@ -37,26 +36,6 @@ def compute_paths( return evaluator.compute(output_columns, input_storages) -def iterate( - output_columns: Iterable[clmn.Column], - input_storage: Storage, - output_table: Table, - input_table: Table, -) -> Storage: - column_names = {column: name for name, column in output_table._columns.items()} - paths = {} - for column in output_columns: - column_name = column_names.get(column) - if column_name is not None: - 
source_column = input_table._columns[column_name] - path = input_storage.get_path(source_column) - else: - path = input_storage.get_path(column) - paths[column] = path - storage = Storage(output_table._universe, paths) - return storage - - class PathEvaluator(ABC): context: clmn.Context diff --git a/python/pathway/internals/graph_runner/scope_context.py b/python/pathway/internals/graph_runner/scope_context.py index 69698988..ec572602 100644 --- a/python/pathway/internals/graph_runner/scope_context.py +++ b/python/pathway/internals/graph_runner/scope_context.py @@ -6,8 +6,7 @@ from dataclasses import dataclass, field, replace from typing import TYPE_CHECKING -from pathway.internals import column as clmn, operator -from pathway.internals.helpers import StableSet +from pathway.internals import operator if TYPE_CHECKING: from pathway.internals.graph_runner import GraphRunner @@ -16,23 +15,18 @@ @dataclass class ScopeContext: nodes: Iterable[operator.Operator] - columns: StableSet[clmn.Column] = field(default_factory=StableSet) run_all: bool = False subscopes: dict[operator.Operator, ScopeContext] = field(default_factory=dict) - - def skip_column(self, column: clmn.Column) -> bool: - if self.run_all: - return False - return column not in self.columns + runtime_typechecking: bool = False def iterate_subscope( self, operator: operator.IterateOperator, graph_builder: GraphRunner ) -> ScopeContext: if operator not in self.subscopes: - nodes, columns = graph_builder.tree_shake_tables( + nodes = graph_builder.tree_shake_tables( operator.scope, operator.result_iterated + operator.result_iterated_with_universe, ) - self.subscopes[operator] = replace(self, nodes=nodes, columns=columns) + self.subscopes[operator] = replace(self, nodes=nodes) return self.subscopes[operator] diff --git a/python/pathway/internals/graph_runner/state.py b/python/pathway/internals/graph_runner/state.py index 79c943e2..82ef3326 100644 --- a/python/pathway/internals/graph_runner/state.py +++ b/python/pathway/internals/graph_runner/state.py @@ -5,7 +5,6 @@ from collections.abc import Callable, Iterable from pathway.internals import api, column, table, universe -from pathway.internals.column_path import ColumnPath from pathway.internals.graph_runner.path_storage import Storage @@ -55,13 +54,11 @@ def extract_column(self, column: column.Column) -> api.Column: return engine_column def create_table(self, universe: universe.Universe, storage: Storage) -> None: - columns_with_paths: list[tuple[api.Column, ColumnPath]] = [] + columns: list[api.Column] = [] for col in storage.get_columns(): if not isinstance(col, column.ExternalMaterializedColumn): - columns_with_paths.append((self.get_column(col), storage.get_path(col))) - engine_table = self.scope.columns_to_table( - self.get_universe(universe), columns_with_paths - ) + columns.append(self.get_column(col)) + engine_table = self.scope.columns_to_table(self.get_universe(universe), columns) self.set_table(storage, engine_table) def set_column(self, key: column.Column, value: api.Column): diff --git a/python/pathway/internals/graph_runner/storage_graph.py b/python/pathway/internals/graph_runner/storage_graph.py index 73d6edae..1bd1d229 100644 --- a/python/pathway/internals/graph_runner/storage_graph.py +++ b/python/pathway/internals/graph_runner/storage_graph.py @@ -9,27 +9,14 @@ from typing import TYPE_CHECKING from pathway.internals import api -from pathway.internals.column import ( - Column, - ExternalMaterializedColumn, - IdColumn, - MaterializedColumn, -) +from 
pathway.internals.column import Column, IdColumn, MaterializedColumn from pathway.internals.column_path import ColumnPath -from pathway.internals.column_properties import ColumnProperties from pathway.internals.graph_runner import path_evaluator from pathway.internals.graph_runner.path_storage import Storage from pathway.internals.graph_runner.scope_context import ScopeContext from pathway.internals.graph_runner.state import ScopeState from pathway.internals.helpers import StableSet -from pathway.internals.operator import ( - DebugOperator, - InputOperator, - IterateOperator, - Operator, - OutputOperator, - RowTransformerOperator, -) +from pathway.internals.operator import InputOperator, IterateOperator, Operator from pathway.internals.table import Table from pathway.internals.universe import Universe @@ -40,21 +27,26 @@ @dataclass class OperatorStorageGraph: scope_context: ScopeContext - column_deps_changes: dict[Operator, dict[Table, StableSet[Column]]] - column_deps_at_output: dict[Operator, dict[Table, StableSet[Column]]] - column_deps_at_end: dict[Universe, StableSet[Column]] # for inner graph in iterate - initial_storages: dict[Universe, Storage] # for inner graph in iterate - output_storages: dict[Operator, dict[Table, Storage]] iterate_subgraphs: dict[Operator, OperatorStorageGraph] is_outer_graph: bool - iterate_subgraphs_external_column_mapping: dict[ - Operator, dict[Column, Column] - ] = field(default_factory=lambda: defaultdict(dict)) + column_deps_at_output: dict[Operator, dict[Table, StableSet[Column]]] = field( + default_factory=lambda: defaultdict(dict) + ) + initial_storages: dict[Universe, Storage] = field( + default_factory=dict + ) # for inner graph in iterate + output_storages: dict[Operator, dict[Table, Storage]] = field( + default_factory=lambda: defaultdict(dict) + ) final_storages: dict[Universe, Storage] | None = None + table_to_storage: dict[Table, Storage] = field(default_factory=dict) def get_iterate_subgraph(self, operator: Operator) -> OperatorStorageGraph: return self.iterate_subgraphs[operator] + def has_column(self, table: Table, column: Column) -> bool: + return self.table_to_storage[table].has_column(column) + @classmethod def from_scope_context( cls, @@ -62,10 +54,11 @@ def from_scope_context( graph_builder: GraphRunner, output_tables: Iterable[Table], ) -> OperatorStorageGraph: - graph = cls._create_storage_graph( - scope_context, graph_builder, output_tables=output_tables - ) - graph._compute_relevant_columns() + graph = cls._create_storage_graph(scope_context, graph_builder) + column_dependencies: dict[Universe, StableSet[Column]] = defaultdict(StableSet) + for table in output_tables: + column_dependencies[table._universe].update(table._columns.values()) + graph._compute_relevant_columns(column_dependencies) graph._compute_storage_paths() return graph @@ -75,108 +68,57 @@ def _create_storage_graph( cls, scope_context: ScopeContext, graph_builder: GraphRunner, - initial_deps: dict[Universe, StableSet[Column]] = {}, - output_tables: Iterable[Table] = [], is_outer_graph: bool = True, ) -> OperatorStorageGraph: - deps: dict[Universe, StableSet[Column]] = initial_deps.copy() - deps_changes: dict[Operator, dict[Table, StableSet[Column]]] = defaultdict(dict) - column_deps_at_output: dict[ - Operator, dict[Table, StableSet[Column]] - ] = defaultdict(dict) - output_storages: dict[Operator, dict[Table, Storage]] = defaultdict(dict) - initial_storages: dict[Universe, Storage] = {} iterate_subgraphs: dict[Operator, OperatorStorageGraph] = {} - for operator in 
scope_context.nodes: - for table in operator.intermediate_and_output_tables: - if table._universe in deps: - deps_changes[operator][table] = deps[table._universe] - deps[table._universe] = StableSet() - column_deps_at_output[operator][table] = deps[table._universe] - if isinstance(operator, IterateOperator): - inner_tables = ( - operator.iterated_copy - + operator.iterated_with_universe_copy - + operator.extra_copy - ) - inner_deps: dict[Universe, StableSet[Column]] = {} - for inner_table in inner_tables.values(): - assert isinstance(inner_table, Table) - if inner_table._universe not in inner_deps: - inner_deps[inner_table._universe] = StableSet() iterate_context = scope_context.iterate_subscope( operator, graph_builder ) iterate_subgraphs[operator] = cls._create_storage_graph( iterate_context, graph_builder, - initial_deps=inner_deps, is_outer_graph=False, ) - for table in output_tables: - deps[table._universe].update(table._columns.values()) graph = cls( scope_context=scope_context, - column_deps_changes=deps_changes, - column_deps_at_output=column_deps_at_output, - column_deps_at_end=deps, - initial_storages=initial_storages, - output_storages=output_storages, iterate_subgraphs=iterate_subgraphs, is_outer_graph=is_outer_graph, ) return graph def _compute_relevant_columns( - self, input_universes: StableSet[Universe] = StableSet() + self, + column_dependencies: dict[Universe, StableSet[Column]], + input_universes: StableSet[Universe] | None = None, ) -> None: - global_column_deps = self.column_deps_at_end.copy() operators_reversed = reversed(list(self.scope_context.nodes)) + for operator in operators_reversed: - if isinstance(operator, OutputOperator) or isinstance( - operator, DebugOperator - ): - column_dependencies = self._compute_column_dependencies_output(operator) - elif isinstance(operator, RowTransformerOperator): - column_dependencies = self._compute_column_dependencies_row_transformer( - operator, self.scope_context - ) - elif isinstance(operator, IterateOperator): - column_dependencies = self._compute_column_dependencies_iterate( - operator, self.scope_context - ) - else: - column_dependencies = self._compute_column_dependencies_ordinary( - operator, self.scope_context - ) + self._compute_column_dependencies_ordinary( + operator, self.scope_context, column_dependencies + ) - for table in reversed(list(operator.intermediate_and_output_tables)): - if table in self.column_deps_changes[operator]: - previous_deps = self.column_deps_changes[operator][table] - global_column_deps[table._universe] = previous_deps - else: - del global_column_deps[table._universe] - - for universe, columns in column_dependencies.items(): - if len(columns) > 0: - if universe in global_column_deps: - global_column_deps[universe].update(columns) - elif not self._can_skip_universe_with_cols( - universe, columns, input_universes - ): - raise RuntimeError( - f"Can't determine the source of columns {columns} from " - + f"universe: {universe} in operator: {operator}" - ) + if isinstance(operator, IterateOperator): + self._compute_column_dependencies_iterate(operator) + + for universe, columns in column_dependencies.items(): + if len(columns) > 0: + if not self._can_skip_universe_with_cols( + universe, columns, input_universes + ): + raise RuntimeError( + f"Can't determine the source of columns {columns} from " + + f"universe: {universe} in operator: {operator}" + ) def _can_skip_universe_with_cols( self, universe: Universe, columns: Iterable[Column], - input_universes: StableSet[Universe], + input_universes: 
StableSet[Universe] | None, ) -> bool: all_materialized_or_id = all( isinstance(column, MaterializedColumn) or isinstance(column, IdColumn) @@ -187,6 +129,7 @@ def _can_skip_universe_with_cols( # Operator in the inner graph. So they have to be skipped. return ( not self.is_outer_graph + and input_universes is not None and universe in input_universes and all_materialized_or_id ) @@ -195,8 +138,8 @@ def _compute_column_dependencies_ordinary( self, operator: Operator, scope_context: ScopeContext, - ) -> dict[Universe, StableSet[Column]]: - column_dependencies: dict[Universe, StableSet[Column]] = defaultdict(StableSet) + column_dependencies: dict[Universe, StableSet[Column]], + ) -> None: # reverse because we traverse the operators list backward # and want to process output tables before intermediate tables # because we want to propagate dependencies from output tables @@ -206,24 +149,18 @@ def _compute_column_dependencies_ordinary( ) for table in intermediate_and_output_tables_rev: - # add dependencies of downstream tables - column_dependencies[table._universe].update( - self.column_deps_at_output[operator][table] - ) - output_deps = self.column_deps_at_output[operator][table] - # output columns (not skipped) have to be in the storage - output_deps.update( - column - for column in table._columns.values() - if not scope_context.skip_column(column) - ) - # columns with a given universe must be in all intermediate storages - # in this universe (from creation to last use) + output_deps: StableSet[Column] = StableSet() + # set columns that have to be produced output_deps.update(column_dependencies.get(table._universe, [])) + # columns tree shaking if set to False + if scope_context.run_all: + output_deps.update(table._columns.values()) + self.column_deps_at_output[operator][table] = output_deps # add column dependencies for column in chain(table._columns.values(), [table._id_column]): - if not scope_context.skip_column(column): + # if the first condition is not met, the column is not needed (tree shaking) + if column in output_deps or isinstance(column, IdColumn): for dependency in column.column_dependencies(): if not isinstance(dependency, IdColumn): column_dependencies[dependency.universe].add(dependency) @@ -231,43 +168,13 @@ def _compute_column_dependencies_ordinary( # remove current columns (they are created in this operator) column_dependencies[table._universe] -= StableSet(table._columns.values()) - return column_dependencies - - def _compute_column_dependencies_output( - self, - operator: OutputOperator | DebugOperator, - ) -> dict[Universe, StableSet[Column]]: - column_dependencies: dict[Universe, StableSet[Column]] = defaultdict(StableSet) - # get columns needed in the output operators - for table_ in operator.input_tables: - column_dependencies[table_._universe].update(table_._columns.values()) - return column_dependencies - - def _compute_column_dependencies_row_transformer( - self, - operator: RowTransformerOperator, - scope_context: ScopeContext, - ) -> dict[Universe, StableSet[Column]]: - column_dependencies = self._compute_column_dependencies_ordinary( - operator, scope_context - ) - # propagate input tables columns as depndencies (they are hard deps) - for table_ in operator.input_tables: - column_dependencies[table_._universe].update(table_._columns.values()) + for table in operator.hard_table_dependencies(): + column_dependencies[table._universe].update(table._columns.values()) - return column_dependencies - - def _compute_column_dependencies_iterate( - self, - operator: 
IterateOperator, - scope_context: ScopeContext, - ) -> dict[Universe, StableSet[Column]]: - column_dependencies = self._compute_column_dependencies_ordinary( - operator, scope_context + def _compute_column_dependencies_iterate(self, operator: IterateOperator) -> None: + inner_column_dependencies: dict[Universe, StableSet[Column]] = defaultdict( + StableSet ) - # FIXME: remove it up when iterate inputs are not hard dependencies - - inner_graph = self.iterate_subgraphs[operator] all_columns: dict[Universe, StableSet[Column]] = defaultdict(StableSet) output_tables_columns: dict[Universe, StableSet[Column]] = defaultdict( StableSet @@ -275,33 +182,21 @@ def _compute_column_dependencies_iterate( # propagate columns existing in iterate for name, outer_handle in operator._outputs.items(): outer_table = outer_handle.value - if name not in operator.result_iterated: - continue - inner_table = operator.result_iterated[name] + if name in operator.result_iterated: + inner_table = operator.result_iterated[name] + else: + inner_table = operator.result_iterated_with_universe[name] assert isinstance(inner_table, Table) - inner_deps = inner_graph.column_deps_at_end[inner_table._universe] + inner_deps = inner_column_dependencies[inner_table._universe] for column_name, outer_column in outer_table._columns.items(): output_tables_columns[outer_table._universe].add(outer_column) inner_column = inner_table._columns[column_name] inner_deps.update([inner_column]) - all_columns[outer_table._universe].update( - self.column_deps_at_output[operator][outer_table] - ) - - columns_mapping = self.iterate_subgraphs_external_column_mapping[operator] - # propagate columns not existing in iterate but created before iterate and - # used after iterate and having the same universe as one of iterate outputs - for universe, columns in all_columns.items(): - columns -= output_tables_columns[universe] - inner_universe = operator._universe_mapping[universe] - inner_deps = inner_graph.column_deps_at_end[inner_universe] - for column in columns: - inner_column = ExternalMaterializedColumn( - inner_universe, ColumnProperties(dtype=column.dtype) + if name in operator.result_iterated: + all_columns[outer_table._universe].update( + self.column_deps_at_output[operator][outer_table] ) - inner_deps.update([inner_column]) - columns_mapping[column] = inner_column inner_tables = ( operator.iterated_copy @@ -315,13 +210,9 @@ def _compute_column_dependencies_iterate( assert isinstance(table, Table) input_universes.add(table._universe) - inner_graph._compute_relevant_columns(input_universes) - - # propagate input tables columns as depndencies (they are hard deps) - for table_ in operator.input_tables: - column_dependencies[table_._universe].update(table_._columns.values()) - - return column_dependencies + self.iterate_subgraphs[operator]._compute_relevant_columns( + inner_column_dependencies, input_universes + ) def _compute_storage_paths(self): storages: dict[Universe, Storage] = self.initial_storages.copy() @@ -332,6 +223,8 @@ def _compute_storage_paths(self): self._compute_storage_paths_iterate(operator, storages) else: self._compute_storage_paths_ordinary(operator, storages) + for table in operator.intermediate_and_output_tables: + self.table_to_storage[table] = self.output_storages[operator][table] self.final_storages = storages def _compute_storage_paths_ordinary( @@ -397,15 +290,9 @@ def _compute_storage_paths_iterate( path = storage.get_path(outer_column) inner_column_paths[inner_table._universe][inner_column] = path - # push paths for 
external columns that have to be propagated through iterate - columns_mapping = self.iterate_subgraphs_external_column_mapping[operator] - for outer_column, inner_column in columns_mapping.items(): - path = storages[outer_column.universe].get_path(outer_column) - inner_column_paths[inner_column.universe][inner_column] = path - for universe, paths in inner_column_paths.items(): - self.iterate_subgraphs[operator].initial_storages[universe] = Storage( - universe, paths + self.iterate_subgraphs[operator].initial_storages[universe] = Storage.flat( + universe, paths.keys() ) self.iterate_subgraphs[operator]._compute_storage_paths() @@ -416,12 +303,7 @@ def _compute_storage_paths_iterate( output_columns = self.column_deps_at_output[operator][table] input_table = operator.get_input(name).value assert isinstance(input_table, Table) - path_storage = path_evaluator.iterate( - output_columns, - storages[input_table._universe], - table, - input_table, - ) + path_storage = Storage.flat(table._universe, output_columns) self.output_storages[operator][table] = path_storage storages[table._universe] = path_storage @@ -448,7 +330,6 @@ def build_scope( def get_output_tables( self, output_tables: Iterable[Table], - scope_context: ScopeContext, state: ScopeState, ) -> list[tuple[api.Table, list[ColumnPath]]]: tables = [] @@ -456,10 +337,6 @@ def get_output_tables( assert self.final_storages is not None storage = self.final_storages[table._universe] engine_table = state.get_table(storage) - paths = [ - storage.get_path(column) - for column in table._columns.values() - if not scope_context.skip_column(column) - ] + paths = [storage.get_path(column) for column in table._columns.values()] tables.append((engine_table, paths)) return tables diff --git a/python/pathway/internals/operator.py b/python/pathway/internals/operator.py index 1f9b897c..dc885120 100644 --- a/python/pathway/internals/operator.py +++ b/python/pathway/internals/operator.py @@ -230,6 +230,9 @@ def __call__(self, table): def label(self): return f"debug: {self.name}" + def hard_table_dependencies(self) -> StableSet[pw.Table]: + return self.input_tables + class InputOperator(Operator): """Holds a definition of external datasource.""" @@ -247,8 +250,8 @@ def __init__( self.datasource = datasource self.debug_datasource = debug_datasource - def __call__(self): - result = pw.Table._from_schema(self.datasource.schema) + def __call__(self) -> pw.Table: + result = pw.Table._from_schema(self.datasource.get_effective_schema()) self._prepare_outputs(as_arg_tuple(result)) return result @@ -268,6 +271,9 @@ def __call__(self, table: pw.Table) -> OutputOperator: self.table = table return self + def hard_table_dependencies(self) -> StableSet[pw.Table]: + return self.input_tables + @dataclass class iterate_universe(OperatorInput): diff --git a/python/pathway/internals/operator_mapping.py b/python/pathway/internals/operator_mapping.py index 11b2d3f1..abaa26b7 100644 --- a/python/pathway/internals/operator_mapping.py +++ b/python/pathway/internals/operator_mapping.py @@ -152,16 +152,34 @@ def get_unary_expression(expr, op, expr_dtype: dt.DType, default=None): (operator.gt, dt.DURATION, dt.DURATION): dt.BOOL, (operator.ge, dt.DURATION, dt.DURATION): dt.BOOL, (operator.add, dt.DURATION, dt.DURATION): dt.DURATION, + (operator.sub, dt.DURATION, dt.DURATION): dt.DURATION, + (operator.floordiv, dt.DURATION, dt.DURATION): dt.INT, + (operator.truediv, dt.DURATION, dt.DURATION): dt.FLOAT, + (operator.mod, dt.DURATION, dt.DURATION): dt.DURATION, (operator.add, dt.DURATION, 
dt.DATE_TIME_NAIVE): dt.DATE_TIME_NAIVE, (operator.add, dt.DURATION, dt.DATE_TIME_UTC): dt.DATE_TIME_UTC, - (operator.sub, dt.DURATION, dt.DURATION): dt.DURATION, (operator.mul, dt.DURATION, dt.INT): dt.DURATION, (operator.mul, dt.INT, dt.DURATION): dt.DURATION, (operator.floordiv, dt.DURATION, dt.INT): dt.DURATION, - (operator.floordiv, dt.DURATION, dt.DURATION): dt.INT, - (operator.truediv, dt.DURATION, dt.DURATION): dt.FLOAT, - (operator.mod, dt.DURATION, dt.DURATION): dt.DURATION, - (operator.matmul, dt.ARRAY, dt.ARRAY): dt.ARRAY, + (operator.truediv, dt.DURATION, dt.INT): dt.DURATION, + (operator.mul, dt.DURATION, dt.FLOAT): dt.DURATION, + (operator.mul, dt.FLOAT, dt.DURATION): dt.DURATION, + (operator.truediv, dt.DURATION, dt.FLOAT): dt.DURATION, + (operator.matmul, dt.ANY_ARRAY_2D, dt.ANY_ARRAY_2D): dt.ANY_ARRAY_2D, + (operator.matmul, dt.INT_ARRAY_2D, dt.INT_ARRAY_2D): dt.INT_ARRAY_2D, + (operator.matmul, dt.FLOAT_ARRAY_2D, dt.FLOAT_ARRAY_2D): dt.FLOAT_ARRAY_2D, + (operator.matmul, dt.ANY_ARRAY_2D, dt.ANY_ARRAY_1D): dt.ANY_ARRAY_1D, + (operator.matmul, dt.INT_ARRAY_2D, dt.INT_ARRAY_1D): dt.INT_ARRAY_1D, + (operator.matmul, dt.FLOAT_ARRAY_2D, dt.FLOAT_ARRAY_1D): dt.FLOAT_ARRAY_1D, + (operator.matmul, dt.ANY_ARRAY_1D, dt.ANY_ARRAY_2D): dt.ANY_ARRAY_1D, + (operator.matmul, dt.INT_ARRAY_1D, dt.INT_ARRAY_2D): dt.INT_ARRAY_1D, + (operator.matmul, dt.FLOAT_ARRAY_1D, dt.FLOAT_ARRAY_2D): dt.FLOAT_ARRAY_1D, + (operator.matmul, dt.ANY_ARRAY_1D, dt.ANY_ARRAY_1D): dt.ANY, + (operator.matmul, dt.INT_ARRAY_1D, dt.INT_ARRAY_1D): dt.INT, + (operator.matmul, dt.FLOAT_ARRAY_1D, dt.FLOAT_ARRAY_1D): dt.FLOAT, + (operator.matmul, dt.ANY_ARRAY, dt.ANY_ARRAY): dt.ANY_ARRAY, + (operator.matmul, dt.INT_ARRAY, dt.INT_ARRAY): dt.INT_ARRAY, + (operator.matmul, dt.FLOAT_ARRAY, dt.FLOAT_ARRAY): dt.FLOAT_ARRAY, } tuple_handling_operators = { @@ -175,6 +193,8 @@ def get_unary_expression(expr, op, expr_dtype: dt.DType, default=None): def get_binary_operators_mapping(op, left, right, default=None): + if isinstance(left, dt.Array) and isinstance(right, dt.Array): + left, right = dt.coerce_arrays_pair(left, right) return _binary_operators_mapping.get( (op, dt.normalize_dtype(left), dt.normalize_dtype(right)), default ) diff --git a/python/pathway/internals/reducers.py b/python/pathway/internals/reducers.py index 6e1fa309..ebce419f 100644 --- a/python/pathway/internals/reducers.py +++ b/python/pathway/internals/reducers.py @@ -2,19 +2,14 @@ from __future__ import annotations -import pickle from abc import ABC, abstractmethod -from collections import Counter -from typing import ParamSpec, Protocol, TypeVar from warnings import warn import numpy as np -from typing_extensions import Self from pathway.internals import api, dtype as dt, expression as expr from pathway.internals.column import ColumnExpression, GroupedContext from pathway.internals.common import apply_with_type -from pathway.internals.shadows.inspect import signature class Reducer(ABC): @@ -97,7 +92,7 @@ def return_type_unary(self, arg_type: dt.DType) -> dt.DType: class SumReducer(UnaryReducer): def return_type_unary(self, arg_type: dt.DType) -> dt.DType: - for allowed_dtype in [dt.FLOAT, dt.ARRAY]: + for allowed_dtype in [dt.FLOAT, dt.ANY_ARRAY]: if dt.dtype_issubclass(arg_type, allowed_dtype): return arg_type raise TypeError( @@ -108,7 +103,7 @@ def return_type_unary(self, arg_type: dt.DType) -> dt.DType: def engine_reducer_unary(self, arg_type: dt.DType) -> api.Reducer: if arg_type == dt.INT: return api.Reducer.INT_SUM - elif arg_type == dt.ARRAY: + elif 
isinstance(arg_type, dt.Array): return api.Reducer.ARRAY_SUM else: return api.Reducer.FLOAT_SUM @@ -273,279 +268,3 @@ def ndarray(expression: expr.ColumnExpression, *, skip_nones: bool = False): return apply_with_type( np.array, np.ndarray, tuple_reducer(expression, skip_nones=skip_nones) ) - - -S = TypeVar("S", bound=api.Value) -V1 = TypeVar("V1", bound=api.Value) -V2 = TypeVar("V2", bound=api.Value) - - -class ReducerProtocol(Protocol): - def __call__( - self, *args: expr.ColumnExpression | api.Value - ) -> expr.ColumnExpression: - ... - - -def stateful_many( - combine_many: api.CombineMany[S], -) -> ReducerProtocol: - def wrapper(*args: expr.ColumnExpression | api.Value) -> expr.ColumnExpression: - return expr.ReducerExpression(StatefulManyReducer(combine_many), *args) - - return wrapper - - -P = ParamSpec("P") - - -class CombineSingle(Protocol[S, P]): - def __call__(self, state: S | None, /, *args: P.args, **kwargs: P.kwargs) -> S: - ... - - -def stateful_single(combine_single: CombineSingle[S, ...]) -> ReducerProtocol: - def wrapper(state: S | None, rows: list[tuple[list[api.Value], int]]) -> S: - for row, count in rows: - assert count > 0 - for _ in range(count): - state = combine_single(state, *row) - assert state is not None - return state - - return stateful_many(wrapper) - - -def udf_reducer( - reducer_cls: type[BaseCustomAccumulator], -): - """Decorator for defining custom reducers. Requires custom accumulator as an argument. - Custom accumulator should implement `from_row`, `update` and `compute_result`. - Optionally `neutral` and `retract` can be provided for more efficient processing on - streams with changing data. - - >>> import pathway as pw - >>> class CustomAvgAccumulator(pw.BaseCustomAccumulator): - ... def __init__(self, sum, cnt): - ... self.sum = sum - ... self.cnt = cnt - ... - ... @classmethod - ... def from_row(self, row): - ... [val] = row - ... return CustomAvgAccumulator(val, 1) - ... - ... def update(self, other): - ... self.sum += other.sum - ... self.cnt += other.cnt - ... - ... def compute_result(self) -> float: - ... return self.sum / self.cnt - >>> import sys; sys.modules[__name__].CustomAvgAccumulator = CustomAvgAccumulator # NOSHOW - >>> custom_avg = pw.reducers.udf_reducer(CustomAvgAccumulator) - >>> t1 = pw.debug.parse_to_table(''' - ... age | owner | pet | price - ... 10 | Alice | dog | 100 - ... 9 | Bob | cat | 80 - ... 8 | Alice | cat | 90 - ... 7 | Bob | dog | 70 - ... 
''') - >>> t2 = t1.groupby(t1.owner).reduce(t1.owner, avg_price=custom_avg(t1.price)) - >>> pw.debug.compute_and_print(t2, include_id=False) - owner | avg_price - Alice | 95.0 - Bob | 75.0 - """ - neutral_available = _is_overridden(reducer_cls, "neutral") - retract_available = _is_overridden(reducer_cls, "retract") - - def wrapper(*args: expr.ColumnExpression | api.Value) -> ColumnExpression: - @stateful_many - def stateful_wrapper( - pickled_state: bytes | None, rows: list[tuple[list[api.Value], int]] - ) -> bytes | None: - if pickled_state is not None: - state = pickle.loads(pickled_state) - if not retract_available: - state._positive_updates = list(state._positive_updates) - else: - state = None - positive_updates: list[tuple[api.Value, ...]] = [] - negative_updates = [] - for row, count in rows: - if count > 0: - positive_updates.extend([tuple(row)] * count) - else: - negative_updates.extend([tuple(row)] * (-count)) - - if not retract_available and len(negative_updates) > 0: - if state is not None: - positive_updates.extend(state._positive_updates) - state._positive_updates = [] - state = None - acc = Counter(positive_updates) - acc.subtract(negative_updates) - assert all(x >= 0 for x in acc.values()) - positive_updates = list(acc.elements()) - negative_updates = [] - - if state is None: - if neutral_available: - state = reducer_cls.neutral() - if not retract_available: - state._positive_updates = [] - else: - state._cnt = 0 - elif len(positive_updates) == 0: - if len(negative_updates) == 0: - return None - else: - raise ValueError( - "Unable to process negative update with this custom reducer." - ) - else: - state = reducer_cls.from_row(list(positive_updates[0])) - if not retract_available: - state._positive_updates = positive_updates[0:1] - else: - state._cnt = 1 - positive_updates = positive_updates[1:] - - for row_up in positive_updates: - if not retract_available: - state._positive_updates.append(row_up) - else: - state._cnt += 1 - val = reducer_cls.from_row(list(row_up)) - state.update(val) - - for row_up in negative_updates: - if not retract_available: - raise ValueError( - "Unable to process negative update with this custom reducer." - ) - else: - state._cnt -= 1 - val = reducer_cls.from_row(list(row_up)) - state.retract(val) - - if not retract_available: - state._positive_updates = tuple( - tuple(x) for x in state._positive_updates - ) - else: - if state._cnt == 0: - # this is fine in this setting, where we process values one by one - # if this ever becomes accumulated in a tree, we have to handle - # (A-B) updates, so we have to distinguish `0` from intermediate states - # accumulating weighted count (weighted by hash) should do fine here - return None - - return pickle.dumps(state) - - def extractor(x: bytes): - unpickled = pickle.loads(x) - assert isinstance(unpickled, reducer_cls) - return unpickled.compute_result() - - return apply_with_type( - extractor, - signature(reducer_cls.compute_result).return_annotation, - stateful_wrapper(*args), - ) - - return wrapper - - -def mark_stub(fun): - fun.__pw_stub = True - return fun - - -class BaseCustomAccumulator(ABC): - """Utility class for defining custom accumulators, used for custom reducers. - Custom accumulators should inherit from this class, and should implement `from_row`, - `update` and `compute_result`. Optionally `neutral` and `retract` can be provided - for more efficient processing on streams with changing data. - - >>> import pathway as pw - >>> class CustomAvgAccumulator(pw.BaseCustomAccumulator): - ... 
def __init__(self, sum, cnt): - ... self.sum = sum - ... self.cnt = cnt - ... - ... @classmethod - ... def from_row(self, row): - ... [val] = row - ... return CustomAvgAccumulator(val, 1) - ... - ... def update(self, other): - ... self.sum += other.sum - ... self.cnt += other.cnt - ... - ... def compute_result(self) -> float: - ... return self.sum / self.cnt - >>> import sys; sys.modules[__name__].CustomAvgAccumulator = CustomAvgAccumulator # NOSHOW - >>> custom_avg = pw.reducers.udf_reducer(CustomAvgAccumulator) - >>> t1 = pw.debug.parse_to_table(''' - ... age | owner | pet | price - ... 10 | Alice | dog | 100 - ... 9 | Bob | cat | 80 - ... 8 | Alice | cat | 90 - ... 7 | Bob | dog | 70 - ... ''') - >>> t2 = t1.groupby(t1.owner).reduce(t1.owner, avg_price=custom_avg(t1.price)) - >>> pw.debug.compute_and_print(t2, include_id=False) - owner | avg_price - Alice | 95.0 - Bob | 75.0 - """ - - @classmethod - @mark_stub - def neutral(cls) -> Self: - """Neutral element of the accumulator (aggregation of an empty list). - - This function is optional, and allows for more efficient processing on streams - with changing data.""" - raise NotImplementedError() - - @classmethod - @abstractmethod - def from_row(cls, row: list[api.Value]) -> Self: - """Construct the accumulator from a row of data. - Row will be passed as a list of values. - - This is a mandatory function.""" - raise NotImplementedError() - - @abstractmethod - def update(self, other: Self) -> None: - """Update the accumulator with another one. - Method does not need to return anything, the change should be in-place. - - This is a mandatory function.""" - raise NotImplementedError() - - @mark_stub - def retract(self, other: Self) -> None: - """Update the accumulator by removing the value of another one. - - This function is optional, and allows more efficient reductions on streams - with changing data. - """ - raise NotImplementedError() - - @abstractmethod - def compute_result(self) -> api.Value: - """Mandatory function to finalize computation. - Used to extract answer from final state of accumulator. - - Narrowing the type of this function helps better type the output of the reducer. 
- """ - raise NotImplementedError() - - -def _is_overridden(cls: type[BaseCustomAccumulator], name: str) -> bool: - assert hasattr(BaseCustomAccumulator, name) - return not hasattr(getattr(cls, name), "__pw_stub") diff --git a/python/pathway/internals/run.py b/python/pathway/internals/run.py index e3b0b763..e2e21a13 100644 --- a/python/pathway/internals/run.py +++ b/python/pathway/internals/run.py @@ -4,11 +4,11 @@ from pathway.internals import parse_graph from pathway.internals.graph_runner import GraphRunner from pathway.internals.monitoring import MonitoringLevel -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.persistence import Config as PersistenceConfig -@runtime_type_check +@check_arg_types def run( debug: bool = False, monitoring_level: MonitoringLevel = MonitoringLevel.AUTO, @@ -41,12 +41,13 @@ def run( ).run_outputs() -@runtime_type_check +@check_arg_types def run_all( debug: bool = False, monitoring_level: MonitoringLevel = MonitoringLevel.AUTO, with_http_server: bool = False, default_logging: bool = True, + runtime_typechecking: bool | None = None, ): GraphRunner( parse_graph.G, @@ -54,4 +55,5 @@ def run_all( monitoring_level=monitoring_level, with_http_server=with_http_server, default_logging=default_logging, + runtime_typechecking=runtime_typechecking, ).run_all() diff --git a/python/pathway/internals/runtime_type_check.py b/python/pathway/internals/runtime_type_check.py index cfc904d5..89b9d515 100644 --- a/python/pathway/internals/runtime_type_check.py +++ b/python/pathway/internals/runtime_type_check.py @@ -6,7 +6,7 @@ import beartype -def runtime_type_check(f): +def check_arg_types(f): """Decorator allowing validating types in runtime.""" @functools.wraps(f) diff --git a/python/pathway/internals/schema.py b/python/pathway/internals/schema.py index 750e8c5a..d7907cdc 100644 --- a/python/pathway/internals/schema.py +++ b/python/pathway/internals/schema.py @@ -17,10 +17,10 @@ import numpy as np import pandas as pd -from pathway.internals import api, dtype as dt, trace +from pathway.internals import dtype as dt, trace from pathway.internals.column_properties import ColumnProperties from pathway.internals.helpers import StableSet -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types if TYPE_CHECKING: from pathway.internals import column as clmn @@ -47,12 +47,34 @@ def schema_from_columns( def _type_converter(series: pd.Series) -> dt.DType: + if series.empty: + return dt.ANY if series.apply(lambda x: isinstance(x, (tuple, list))).all(): - return dt.ANY_TUPLE + proposed_len = len(series[0]) + if (series.apply(lambda x: len(x) == proposed_len)).all(): + dtypes = [ + _type_converter(series.apply(lambda x: x[i])) + for i in range(proposed_len) + ] + return dt.Tuple(*dtypes) + else: + exploded = pd.Series([x for element in series for x in element]) + to_wrap = _type_converter(exploded) + return dt.List(to_wrap) if (series.isna() | series.isnull()).all(): return dt.NONE if (series.apply(lambda x: isinstance(x, np.ndarray))).all(): - return dt.ARRAY + if series.apply(lambda x: np.issubdtype(x.dtype, np.integer)).all(): + wrapped = dt.INT + elif series.apply(lambda x: np.issubdtype(x.dtype, np.floating)).all(): + wrapped = dt.FLOAT + else: + wrapped = dt.ANY + n_dim: int | None = len(series[0].shape) + if not series.apply(lambda x: len(x.shape) == n_dim).all(): + n_dim = None + + return dt.Array(n_dim=n_dim, 
wrapped=wrapped) if pd.api.types.is_integer_dtype(series.dtype): ret_type: dt.DType = dt.INT elif pd.api.types.is_float_dtype(series.dtype): @@ -78,12 +100,6 @@ def _type_converter(series: pd.Series) -> dt.DType: return ret_type -def _is_dataframe_append_only(dframe: pd.DataFrame) -> bool: - return api.DIFF_PSEUDOCOLUMN not in dframe.columns or all( - dframe[api.DIFF_PSEUDOCOLUMN] == 1 - ) - - def schema_from_pandas( dframe: pd.DataFrame, *, @@ -102,13 +118,10 @@ def schema_from_pandas( } for name in id_from: columns[name] = dataclasses.replace(columns[name], primary_key=True) - append_only = _is_dataframe_append_only(dframe) - return schema_builder( - columns=columns, properties=SchemaProperties(append_only=append_only), name=name - ) + return schema_builder(columns=columns, name=name) -@runtime_type_check +@check_arg_types def schema_from_types( _name: str | None = None, **kwargs, @@ -309,6 +322,22 @@ def update_types(self, **kwargs) -> type[Schema]: return schema_builder(columns=columns) + def update_properties(self, **kwargs) -> type[Schema]: + columns: dict[str, ColumnDefinition] = { + col.name: dataclasses.replace(col.to_definition(), **kwargs) + for col in self.__columns__.values() + } + properties = SchemaProperties( + **{ + name: value + for name, value in kwargs.items() + if name in SchemaProperties.__annotations__ + } + ) + return schema_builder( + columns=columns, name=self.__name__, properties=properties + ) + def __getitem__(self, name) -> ColumnSchema: return self.__columns__[name] diff --git a/python/pathway/internals/sql.py b/python/pathway/internals/sql.py index 76dfffb0..5e3f5057 100644 --- a/python/pathway/internals/sql.py +++ b/python/pathway/internals/sql.py @@ -15,7 +15,7 @@ from pathway.internals import expression as expr, if_else, reducers, table, thisclass from pathway.internals.desugaring import TableSubstitutionDesugaring from pathway.internals.expression_visitor import IdentityTransform -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.shadows import operator _tmp_table_cnt = itertools.count() @@ -618,7 +618,7 @@ def _select( return result, orig_context -@runtime_type_check +@check_arg_types def sql(query: str, **kwargs: table.Table) -> table.Table: r''' Run a SQL query on Pathway tables. diff --git a/python/pathway/internals/table.py b/python/pathway/internals/table.py index 01186f8b..cbcc2399 100644 --- a/python/pathway/internals/table.py +++ b/python/pathway/internals/table.py @@ -33,7 +33,7 @@ from pathway.internals.operator import DebugOperator, OutputHandle from pathway.internals.operator_input import OperatorInput from pathway.internals.parse_graph import G -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema, schema_from_columns, schema_from_types from pathway.internals.table_like import TableLike from pathway.internals.table_slice import TableSlice @@ -256,7 +256,7 @@ def __getitem__( @trace_user_frame @staticmethod - @runtime_type_check + @check_arg_types def from_columns( *args: expr.ColumnReference, **kwargs: expr.ColumnReference ) -> Table: @@ -301,7 +301,7 @@ def from_columns( return table.select(*args, **kwargs) @trace_user_frame - @runtime_type_check + @check_arg_types def concat_reindex(self, *tables: Table) -> Table: """Concatenate contents of several tables. 
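The schema.py changes above move append-only detection out of schema_from_pandas (it now lives in PandasDataSource.is_append_only) and add Schema.update_properties, which rebuilds every column definition with the given overrides and forwards any keys that match SchemaProperties. A hedged usage sketch; InputSchema and its columns are illustrative, and it assumes append_only is a field accepted by the column definitions, as the dataclasses.replace call above implies.

# Illustrative schema; only update_properties itself comes from this patch.
import pathway as pw

class InputSchema(pw.Schema):
    owner: str
    price: float

# Every column definition receives the override, and append_only also
# becomes a schema-level property because it matches SchemaProperties.
AppendOnlyInput = InputSchema.update_properties(append_only=True)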
@@ -342,7 +342,7 @@ def concat_reindex(self, *tables: Table) -> Table: @trace_user_frame @staticmethod - @runtime_type_check + @check_arg_types def empty(**kwargs: dt.DType) -> Table: """Creates an empty table with a schema specified by kwargs. @@ -475,13 +475,13 @@ def slice(self) -> TableSlice: @trace_user_frame @desugar - @runtime_type_check + @check_arg_types def filter(self, filter_expression: expr.ColumnExpression) -> Table[TSchema]: - """Filter a table according to `filter` condition. + """Filter a table according to `filter_expression` condition. Args: - filter: `ColumnExpression` that specifies the filtering condition. + filter_expression: `ColumnExpression` that specifies the filtering condition. Returns: Table: Result has the same schema as `self` and its ids are subset of `self.id`. @@ -514,6 +514,48 @@ def filter(self, filter_expression: expr.ColumnExpression) -> Table[TSchema]: ret = ret.update_types(**{name: dt.unoptionalize(dtype)}) return ret + @trace_user_frame + @desugar + @check_arg_types + def split( + self, split_expression: expr.ColumnExpression + ) -> tuple[Table[TSchema], Table[TSchema]]: + """Split a table according to `split_expression` condition. + + + Args: + split_expression: `ColumnExpression` that specifies the split condition. + + Returns: + positive_table, negative_table: tuple of tables, + with the same schemas as `self` and with ids that are subsets of `self.id`, + and provably disjoint. + + + Example: + + >>> import pathway as pw + >>> vertices = pw.debug.table_from_markdown(''' + ... label outdegree + ... 1 3 + ... 7 0 + ... ''') + >>> positive, negative = vertices.split(vertices.outdegree == 0) + >>> pw.debug.compute_and_print(positive, include_id=False) + label | outdegree + 7 | 0 + >>> pw.debug.compute_and_print(negative, include_id=False) + label | outdegree + 1 | 3 + """ + positive = self.filter(split_expression) + negative = self.filter(~split_expression) + universes.promise_are_pairwise_disjoint(positive, negative) + universes.promise_are_equal( + self, Table.concat(positive, negative) + ) # TODO: add API method for this + return positive, negative + @contextualized_operator def _filter(self, filter_expression: expr.ColumnExpression) -> Table[TSchema]: self._validate_expression(filter_expression) @@ -526,7 +568,7 @@ def _filter(self, filter_expression: expr.ColumnExpression) -> Table[TSchema]: @trace_user_frame @desugar - @runtime_type_check + @check_arg_types def _gradual_broadcast( self, threshold_table, @@ -540,7 +582,7 @@ def _gradual_broadcast( @trace_user_frame @desugar - @runtime_type_check + @check_arg_types @contextualized_operator def __gradual_broadcast( self, @@ -560,7 +602,7 @@ def __gradual_broadcast( @trace_user_frame @desugar - @runtime_type_check + @check_arg_types @contextualized_operator def _forget( self, @@ -578,7 +620,7 @@ def _forget( @trace_user_frame @desugar - @runtime_type_check + @check_arg_types @contextualized_operator def _forget_immediately( self, @@ -588,7 +630,7 @@ def _forget_immediately( @trace_user_frame @desugar - @runtime_type_check + @check_arg_types @contextualized_operator def _filter_out_results_of_forgetting( self, @@ -601,7 +643,7 @@ def _filter_out_results_of_forgetting( @trace_user_frame @desugar - @runtime_type_check + @check_arg_types @contextualized_operator def _freeze( self, @@ -617,7 +659,7 @@ def _freeze( @trace_user_frame @desugar - @runtime_type_check + @check_arg_types @contextualized_operator def _buffer( self, @@ -632,7 +674,7 @@ def _buffer( return self._table_with_context(context) 
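Table.split, added above, packages a recurring pattern: filter on a condition, filter on its negation, and register the universe promises that make the two halves provably disjoint and jointly equal to the input. The sketch below spells out the equivalent hand-written code for comparison, assuming the public pw.universes module exposes the same promise helpers used internally above; the negated branch also benefits from the __invert__ simplification in expression.py, which unwraps double negation and flips is_none/is_not_none instead of stacking unary operators.

# Roughly what `positive, negative = t.split(cond)` expands to, per the
# implementation above; assumes pw.universes exposes the same helpers.
import pathway as pw

def manual_split(t: pw.Table, cond: pw.ColumnExpression):
    positive = t.filter(cond)
    negative = t.filter(~cond)
    pw.universes.promise_are_pairwise_disjoint(positive, negative)
    pw.universes.promise_are_equal(t, pw.Table.concat(positive, negative))
    return positive, negative

Using split instead of two separate filters lets later concat or update_rows calls typecheck without repeating these promises by hand.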
@contextualized_operator - @runtime_type_check + @check_arg_types def difference(self, other: Table) -> Table[TSchema]: r"""Restrict self universe to keys not appearing in the other table. @@ -670,7 +712,7 @@ def difference(self, other: Table) -> Table[TSchema]: return self._table_with_context(context) @contextualized_operator - @runtime_type_check + @check_arg_types def intersect(self, *tables: Table) -> Table[TSchema]: """Restrict self universe to keys appearing in all of the tables. @@ -721,7 +763,7 @@ def intersect(self, *tables: Table) -> Table[TSchema]: return self._table_with_context(context) @contextualized_operator - @runtime_type_check + @check_arg_types def restrict(self, other: TableLike) -> Table[TSchema]: """Restrict self universe to keys appearing in other. @@ -776,7 +818,7 @@ def restrict(self, other: TableLike) -> Table[TSchema]: ) @contextualized_operator - @runtime_type_check + @check_arg_types def copy(self) -> Table[TSchema]: """Returns a copy of a table. @@ -811,7 +853,7 @@ def copy(self) -> Table[TSchema]: @trace_user_frame @desugar @arg_handler(handler=groupby_handler) - @runtime_type_check + @check_arg_types def groupby( self, *args: expr.ColumnReference, @@ -1061,7 +1103,7 @@ def __lshift__(self, other: Table) -> Table: return self.update_cells(other) @trace_user_frame - @runtime_type_check + @check_arg_types def concat(self, *others: Table[TSchema]) -> Table[TSchema]: """Concats `self` with every `other` ∊ `others`. @@ -1145,7 +1187,7 @@ def _concat(self, *others: Table[TSchema]) -> Table[TSchema]: return self._table_with_context(context) @trace_user_frame - @runtime_type_check + @check_arg_types def update_cells(self, other: Table) -> Table: """Updates cells of `self`, breaking ties in favor of the values in `other`. @@ -1207,7 +1249,7 @@ def update_cells(self, other: Table) -> Table: @trace_user_frame @contextualized_operator - @runtime_type_check + @check_arg_types def _update_cells(self, other: Table) -> Table: if not other._universe.is_subset_of(self._universe): raise ValueError( @@ -1224,7 +1266,7 @@ def _update_cells(self, other: Table) -> Table: return self._table_with_context(context) @trace_user_frame - @runtime_type_check + @check_arg_types def update_rows(self, other: Table[TSchema]) -> Table[TSchema]: """Updates rows of `self`, breaking ties in favor for the rows in `other`. @@ -1290,7 +1332,7 @@ def update_rows(self, other: Table[TSchema]) -> Table[TSchema]: @trace_user_frame @contextualized_operator - @runtime_type_check + @check_arg_types def _update_rows(self, other: Table[TSchema]) -> Table[TSchema]: union_ids = (self._id_column, other._id_column) context = clmn.UpdateRowsContext( @@ -1334,7 +1376,7 @@ def with_columns(self, *args: expr.ColumnReference, **kwargs: Any) -> Table: @trace_user_frame @desugar - @runtime_type_check + @check_arg_types def with_id(self, new_index: expr.ColumnReference) -> Table: """Set new ids based on another column containing id-typed values. 
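(Editor's note, not part of the patch.) A short sketch contrasting `update_cells` and `update_rows` as described in the docstrings above: `update_cells` keeps `self`'s key set (the other table must live on a subset of its universe), while `update_rows` takes the union of both key sets, with `other` winning on ties; `t << other` is shorthand for `t.update_cells(other)`.

import pathway as pw

pets = pw.debug.table_from_markdown(
    '''
    owner | age
    Alice | 10
    Bob   | 9
    '''
)
# Derive the patch table from `pets` so its universe is a known subset.
bob_fix = pets.filter(pets.owner == "Bob").with_columns(age=pw.this.age + 1)
cells = pets.update_cells(bob_fix)  # keys of `pets`; Bob's age becomes 10
rows = pets.update_rows(bob_fix)    # union of keys; identical result here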
@@ -1375,7 +1417,7 @@ def with_id(self, new_index: expr.ColumnReference) -> Table: @trace_user_frame @desugar - @runtime_type_check + @check_arg_types def with_id_from( self, *args: expr.ColumnExpression | Value, @@ -1425,7 +1467,7 @@ def with_id_from( @trace_user_frame @contextualized_operator - @runtime_type_check + @check_arg_types def _with_new_index( self, new_index: expr.ColumnExpression, @@ -1446,7 +1488,7 @@ def _with_new_index( @trace_user_frame @desugar @contextualized_operator - @runtime_type_check + @check_arg_types def rename_columns(self, **kwargs: str | expr.ColumnReference) -> Table: """Rename columns according to kwargs. @@ -1499,7 +1541,7 @@ def rename_columns(self, **kwargs: str | expr.ColumnReference) -> Table: } return self._with_same_universe(*columns_wrapped.items()) - @runtime_type_check + @check_arg_types def rename_by_dict( self, names_mapping: dict[str | expr.ColumnReference, str] ) -> Table: @@ -1533,7 +1575,7 @@ def rename_by_dict( **{new_name: self[old_name] for old_name, new_name in names_mapping.items()} ) - @runtime_type_check + @check_arg_types def with_prefix(self, prefix: str) -> Table: """Rename columns by adding prefix to each name of column. @@ -1555,7 +1597,7 @@ def with_prefix(self, prefix: str) -> Table: """ return self.rename_by_dict({name: prefix + name for name in self.keys()}) - @runtime_type_check + @check_arg_types def with_suffix(self, suffix: str) -> Table: """Rename columns by adding suffix to each name of column. @@ -1578,7 +1620,7 @@ def with_suffix(self, suffix: str) -> Table: return self.rename_by_dict({name: name + suffix for name in self.keys()}) @trace_user_frame - @runtime_type_check + @check_arg_types def rename( self, names_mapping: dict[str | expr.ColumnReference, str] | None = None, @@ -1604,7 +1646,7 @@ def rename( @trace_user_frame @desugar @contextualized_operator - @runtime_type_check + @check_arg_types def without(self, *columns: str | expr.ColumnReference) -> Table: """Selects all columns without named column references. @@ -1645,7 +1687,7 @@ def without(self, *columns: str | expr.ColumnReference) -> Table: @trace_user_frame @desugar - @runtime_type_check + @check_arg_types def having(self, *indexers: expr.ColumnReference) -> Table[TSchema]: """Removes rows so that indexed.ix(indexer) is possible when some rows are missing, for each indexer in indexers""" @@ -1661,7 +1703,7 @@ def having(self, *indexers: expr.ColumnReference) -> Table[TSchema]: return rets[0].intersect(*rets[1:]) @trace_user_frame - @runtime_type_check + @check_arg_types def update_types(self, **kwargs: Any) -> Table: """Updates types in schema. Has no effect on the runtime.""" @@ -1676,7 +1718,7 @@ def update_types(self, **kwargs: Any) -> Table: **{key: declare_type(val, self[key]) for key, val in kwargs.items()} ) - @runtime_type_check + @check_arg_types def cast_to_types(self, **kwargs: Any) -> Table: """Casts columns to types.""" @@ -1692,7 +1734,7 @@ def cast_to_types(self, **kwargs: Any) -> Table: ) @contextualized_operator - @runtime_type_check + @check_arg_types def _having(self, indexer: expr.ColumnReference) -> Table[TSchema]: context = clmn.HavingContext( orig_id_column=self._id_column, key_column=indexer._column @@ -1700,7 +1742,7 @@ def _having(self, indexer: expr.ColumnReference) -> Table[TSchema]: return self._table_with_context(context) @trace_user_frame - @runtime_type_check + @check_arg_types def with_universe_of(self, other: TableLike) -> Table: """Returns a copy of self with exactly the same universe as others. 
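(Editor's note, not part of the patch.) The renaming helpers touched above only change column names; row ids and values stay the same. A minimal sketch:

import pathway as pw

t = pw.debug.table_from_markdown(
    '''
    owner | pet
    Alice | dog
    Bob   | cat
    '''
)
left = t.with_prefix("left_")                  # columns: left_owner, left_pet
right = t.with_suffix("_right")                # columns: owner_right, pet_right
renamed = t.rename_by_dict({"owner": "name"})  # columns: name, pet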
@@ -1735,10 +1777,10 @@ def with_universe_of(self, other: TableLike) -> Table: return self._unsafe_promise_universe(other) @trace_user_frame - @runtime_type_check + @check_arg_types def flatten(self, *args: expr.ColumnReference, **kwargs: Any) -> Table: """Performs a flatmap operation on a column or expression given as a first - argument. Datatype of this column or expression has to be iterable. + argument. Datatype of this column or expression has to be iterable or Json array. Other columns specified in the method arguments are duplicated as many times as the length of the iterable. @@ -1812,7 +1854,7 @@ def _flatten( @trace_user_frame @desugar @contextualized_operator - @runtime_type_check + @check_arg_types def sort( self, key: expr.ColumnExpression, diff --git a/python/pathway/internals/table_like.py b/python/pathway/internals/table_like.py index 23d89590..e081ef64 100644 --- a/python/pathway/internals/table_like.py +++ b/python/pathway/internals/table_like.py @@ -6,7 +6,7 @@ from pathway.internals import column as clmn, universes from pathway.internals.deprecation_meta import DeprecationSuperclass -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.universe import Universe SelfTableLike = TypeVar("SelfTableLike", bound="TableLike") @@ -45,7 +45,7 @@ def __init__(self, context: clmn.Context): self._universe = context.universe self._id_column = context.id_column - @runtime_type_check + @check_arg_types def promise_universes_are_disjoint( self: SelfTableLike, other: TableLike ) -> SelfTableLike: @@ -85,7 +85,7 @@ def promise_universes_are_disjoint( universes.promise_are_pairwise_disjoint(self, other) return self - @runtime_type_check + @check_arg_types def promise_universe_is_subset_of( self: SelfTableLike, other: TableLike ) -> SelfTableLike: @@ -122,7 +122,7 @@ def promise_universe_is_subset_of( universes.promise_is_subset_of(self, other) return self - @runtime_type_check + @check_arg_types def promise_universe_is_equal_to( self: SelfTableLike, other: TableLike ) -> SelfTableLike: diff --git a/python/pathway/internals/table_slice.py b/python/pathway/internals/table_slice.py index 3be049fd..36677e48 100644 --- a/python/pathway/internals/table_slice.py +++ b/python/pathway/internals/table_slice.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, overload from pathway.internals.expression import ColumnReference -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.thisclass import ThisMetaclass, this from pathway.internals.trace import trace_user_frame @@ -79,7 +79,7 @@ def __getattr__(self, name: str) -> ColumnReference: return self._mapping[name] @trace_user_frame - @runtime_type_check + @check_arg_types def without(self, *cols: str | ColumnReference) -> TableSlice: mapping = self._mapping.copy() for col in cols: @@ -90,7 +90,7 @@ def without(self, *cols: str | ColumnReference) -> TableSlice: return TableSlice(mapping, self._table) @trace_user_frame - @runtime_type_check + @check_arg_types def rename( self, rename_dict: dict[str | ColumnReference, str | ColumnReference], @@ -109,12 +109,12 @@ def rename( return TableSlice(mapping, self._table) @trace_user_frame - @runtime_type_check + @check_arg_types def with_prefix(self, prefix: str) -> TableSlice: return self.rename({name: prefix + name for name in self.keys()}) @trace_user_frame - @runtime_type_check + @check_arg_types def 
with_suffix(self, suffix: str) -> TableSlice: return self.rename({name: name + suffix for name in self.keys()}) diff --git a/python/pathway/internals/table_subscription.py b/python/pathway/internals/table_subscription.py index 8aa04704..61974598 100644 --- a/python/pathway/internals/table_subscription.py +++ b/python/pathway/internals/table_subscription.py @@ -13,7 +13,7 @@ class OnFinishCallback(Protocol): on each engine worker separately. """ - def __call__(self) -> Any: + def __call__(self) -> None: """ The callable part of the callback. It will be called without arguments and its return result won't be used by the engine. @@ -30,8 +30,12 @@ class OnChangeCallback(Protocol): """ def __call__( - self, key: Pointer, row: dict[str, Any], time: int, is_addition: bool - ) -> Any: + self, + key: Pointer, + row: dict[str, Any], + time: int, + is_addition: bool, + ) -> None: """ The callable part of the callback. @@ -47,8 +51,24 @@ def __call__( Returns: None + """ + ... + + +class OnTimeEndCallback(Protocol): + """ + The callback to be called on every time finished. It is required + to accept one parameter: time. + """ - The return result of this method will be ignored by the engine. + def __call__(self, time: int) -> None: + """ + The callable part of the callback. + + Args: + time: the time finished + Returns: + None """ ... @@ -58,6 +78,7 @@ def subscribe( *, skip_persisted_batch: bool, on_change: OnChangeCallback, + on_time_end: OnTimeEndCallback = lambda time: None, on_end: OnFinishCallback = lambda: None, ): """ @@ -76,13 +97,15 @@ def subscribe( of the change in milliseconds and the flag stating if the change had been an addition of the row. These parameters of the callback are expected to have names row, time and is_addition respectively. + on_time_end: the callback function to be called on each closed time of computation. on_end: the callback function to be called when the stream of changes ends. - It will be called on each engine worker separately. Returns: None """ - def wrapper(key, values, time, diff): + def on_change_wrapper( + key: Pointer, values: list[Any], time: int, diff: int + ) -> None: """ Wraps a change event from Pathway in a more human-friendly format. @@ -107,5 +130,8 @@ def wrapper(key, values, time, diff): return on_change(key=key, row=row, time=time, is_addition=(diff == 1)) return table_to_datasink( - table, datasink.CallbackDataSink(wrapper, on_end, skip_persisted_batch) + table, + datasink.CallbackDataSink( + on_change_wrapper, on_time_end, on_end, skip_persisted_batch + ), ) diff --git a/python/pathway/internals/type_interpreter.py b/python/pathway/internals/type_interpreter.py index a8377165..17146c86 100644 --- a/python/pathway/internals/type_interpreter.py +++ b/python/pathway/internals/type_interpreter.py @@ -364,19 +364,40 @@ def eval_ifelse( state: TypeInterpreterState | None = None, **kwargs, ) -> expr.IfElseExpression: - expression = super().eval_ifelse(expression, state=state, **kwargs) - if_dtype = expression._if._dtype + assert state is not None + if_ = self.eval_expression(expression._if, state=state) + if_dtype = if_._dtype if if_dtype != dt.BOOL: raise TypeError( f"First argument of pathway.if_else has to be bool, found {if_dtype.typehint}." 
) - then_dtype = expression._then._dtype - else_dtype = expression._else._dtype + + if isinstance(if_, expr.IsNotNoneExpression) and isinstance( + if_._expr, expr.ColumnReference + ): + then_ = self.eval_expression( + expression._then, state=state.with_new_col([if_._expr]) + ) + else: + then_ = self.eval_expression(expression._then, state=state) + + if isinstance(if_, expr.IsNoneExpression) and isinstance( + if_._expr, expr.ColumnReference + ): + else_ = self.eval_expression( + expression._else, state=state.with_new_col([if_._expr]) + ) + else: + else_ = self.eval_expression(expression._else, state=state) + + then_dtype = then_._dtype + else_dtype = else_._dtype lca = dt.types_lca(then_dtype, else_dtype) if lca is dt.ANY: raise TypeError( f"Cannot perform pathway.if_else on columns of types {then_dtype.typehint} and {else_dtype.typehint}." ) + expression = expr.IfElseExpression(if_, then_, else_) return _wrap(expression, lca) def eval_make_tuple( @@ -423,26 +444,18 @@ def eval_get( raise TypeError(f"Cannot get from {Json | None}.") else: # sequence - if not isinstance( - object_dtype, (dt.Tuple, dt.List) - ) and object_dtype not in [ - dt.ANY, - dt.ARRAY, - ]: + if ( + not isinstance(object_dtype, (dt.Array, dt.Tuple, dt.List)) + and object_dtype != dt.ANY + ): raise TypeError( f"Object in {expression!r} has to be a JSON or sequence." ) if index_dtype != dt.INT: raise TypeError(f"Index in {expression!r} has to be an int.") - if object_dtype == dt.ARRAY: - warnings.warn( - f"Object in {expression!r} is of type numpy.ndarray but its number of" - + " dimensions is not known. Pathway cannot determine the return type" - + " and will set Any as the return type. Please use " - + "pathway.declare_type to set the correct return type." - ) - return _wrap(expression, dt.ANY) + if isinstance(object_dtype, dt.Array): + return _wrap(expression, object_dtype.strip_dimension()) if object_dtype == dt.ANY: return _wrap(expression, dt.ANY) diff --git a/python/pathway/io/_subscribe.py b/python/pathway/io/_subscribe.py index 290a94a6..93e84dfb 100644 --- a/python/pathway/io/_subscribe.py +++ b/python/pathway/io/_subscribe.py @@ -5,12 +5,16 @@ from pathway.internals.table_subscription import ( OnChangeCallback, OnFinishCallback, + OnTimeEndCallback, subscribe as internal_subscribe, ) def subscribe( - table, on_change: OnChangeCallback, on_end: OnFinishCallback = lambda: None + table, + on_change: OnChangeCallback, + on_end: OnFinishCallback = lambda: None, + on_time_end: OnTimeEndCallback = lambda time: None, ): """ Calls a callback function on_change on every change happening in table. @@ -23,7 +27,7 @@ def subscribe( addition of the row. These parameters of the callback are expected to have names row, time and is_addition respectively. on_end: the callback to be called when the stream of changes ends. - It will be called on each engine worker separately. + on_time_end: the callback function to be called on each closed time of computation. 
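(Editor's note, not part of the patch.) A minimal sketch of wiring the new `on_time_end` hook described above into `pw.io.subscribe`, next to the existing `on_change` and `on_end` callbacks; the callback parameter names follow the protocol definitions in this patch.

import pathway as pw

def on_change(key, row, time, is_addition):
    print(f"{'+' if is_addition else '-'} {row} at time {time}")

def on_time_end(time):
    # Called once per closed time of the computation.
    print(f"time {time} closed")

table = pw.debug.table_from_markdown(
    '''
    value
    1
    2
    '''
)
pw.io.subscribe(table, on_change=on_change, on_time_end=on_time_end)
pw.run()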
Returns: None @@ -60,5 +64,6 @@ def subscribe( table, skip_persisted_batch=True, on_change=on_change, + on_time_end=on_time_end, on_end=on_end, ) diff --git a/python/pathway/io/_utils.py b/python/pathway/io/_utils.py index 6fe5d582..78fd21de 100644 --- a/python/pathway/io/_utils.py +++ b/python/pathway/io/_utils.py @@ -12,14 +12,14 @@ STATIC_MODE_NAME = "static" STREAMING_MODE_NAME = "streaming" -SNAPSHOT_MODE_NAME = "streaming_with_deletions" +SNAPSHOT_MODE_NAME = "streaming_with_deletions" # deprecated METADATA_COLUMN_NAME = "_metadata" _INPUT_MODES_MAPPING = { STATIC_MODE_NAME: ConnectorMode.STATIC, - STREAMING_MODE_NAME: ConnectorMode.SIMPLE_STREAMING, - SNAPSHOT_MODE_NAME: ConnectorMode.STREAMING_WITH_DELETIONS, + STREAMING_MODE_NAME: ConnectorMode.STREAMING, + SNAPSHOT_MODE_NAME: ConnectorMode.STREAMING, } _DATA_FORMAT_MAPPING = { @@ -41,7 +41,7 @@ PathwayType.DATE_TIME_NAIVE: dt.DATE_TIME_NAIVE, PathwayType.DATE_TIME_UTC: dt.DATE_TIME_UTC, PathwayType.DURATION: dt.DURATION, - PathwayType.ARRAY: dt.ARRAY, + PathwayType.ARRAY: dt.ANY_ARRAY, PathwayType.JSON: dt.JSON, } diff --git a/python/pathway/io/csv/__init__.py b/python/pathway/io/csv/__init__.py index b145cbe0..f9634a8b 100644 --- a/python/pathway/io/csv/__init__.py +++ b/python/pathway/io/csv/__init__.py @@ -7,13 +7,13 @@ import pathway as pw from pathway.internals.api import PathwayType -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame from pathway.io._utils import CsvParserSettings, check_deprecated_kwargs -@runtime_type_check +@check_arg_types @trace_user_frame def read( path: str | PathLike, @@ -47,14 +47,13 @@ def read( a subset of its columns, the set of columns should be specified in this field. Otherwise, the primary key will be generated randomly. [will be deprecated soon] csv_settings: Settings for the CSV parser. - mode: denotes how the engine polls the new data from the source. Currently \ -"streaming", "static", and "streaming_with_deletions" are supported. If set to \ -"streaming" the engine will wait for the new input files in the directory. On the other \ -hand, "streaming_with_deletions" mode also tracks file deletions and modifications and \ -reflects them in the state. For example, if a file was deleted, "streaming_with_deletions"\ -mode will also remove rows obtained by reading this file from the table. Finally, the \ -"static" mode will only consider the available data and ingest all of it in one commit. \ -The default value is "streaming". + mode: Denotes how the engine polls the new data from the source. Currently \ +"streaming" and "static" are supported. If set to "streaming" the engine will wait for \ +the updates in the specified directory. It will track file additions, deletions, and \ +modifications and reflect these events in the state. For example, if a file was deleted,\ +"streaming" mode will also remove rows obtained by reading this file from the table. On \ +the other hand, the "static" mode will only consider the available data and ingest all \ +of it in one commit. The default value is "streaming". object_pattern: Unix shell style pattern for filtering only certain files in the \ directory. Ignored in case a path to a single file is specified. 
with_metadata: When set to true, the connector will add an additional column \ @@ -179,7 +178,7 @@ def read( ) -@runtime_type_check +@check_arg_types @trace_user_frame def write(table: Table, filename: str | PathLike) -> None: """Writes `table`'s stream of updates to a file in delimiter-separated values format. diff --git a/python/pathway/io/debezium/__init__.py b/python/pathway/io/debezium/__init__.py index a798ff1a..c89524ae 100644 --- a/python/pathway/io/debezium/__init__.py +++ b/python/pathway/io/debezium/__init__.py @@ -8,14 +8,14 @@ from pathway.internals import api, datasource from pathway.internals.api import PathwayType from pathway.internals.decorators import table_from_datasource -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame from pathway.io._utils import read_schema -@runtime_type_check +@check_arg_types @trace_user_frame def read( rdkafka_settings: dict, diff --git a/python/pathway/io/elasticsearch/__init__.py b/python/pathway/io/elasticsearch/__init__.py index cc84afdd..ca6341a1 100644 --- a/python/pathway/io/elasticsearch/__init__.py +++ b/python/pathway/io/elasticsearch/__init__.py @@ -4,7 +4,7 @@ from pathway.internals import api, datasink from pathway.internals._io_helpers import _format_output_value_fields -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame @@ -47,7 +47,7 @@ def engine_es_auth(self) -> api.ElasticSearchAuth: return self._engine_es_auth -@runtime_type_check +@check_arg_types @trace_user_frame def write(table: Table, host: str, auth: ElasticSearchAuth, index_name: str) -> None: """Write a table to a given index in ElasticSearch. diff --git a/python/pathway/io/fs/__init__.py b/python/pathway/io/fs/__init__.py index 6238610e..35eda7f0 100644 --- a/python/pathway/io/fs/__init__.py +++ b/python/pathway/io/fs/__init__.py @@ -9,7 +9,7 @@ from pathway.internals._io_helpers import _format_output_value_fields from pathway.internals.api import PathwayType from pathway.internals.decorators import table_from_datasource -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame from pathway.io._utils import ( @@ -25,7 +25,7 @@ } -@runtime_type_check +@check_arg_types @trace_user_frame def read( path: str | PathLike, @@ -63,14 +63,13 @@ def read( and one row will correspond to one file. In case the "binary" format is specified, \ the data is read as raw bytes without UTF-8 parsing. schema: Schema of the resulting table. - mode: denotes how the engine polls the new data from the source. Currently \ -"streaming", "static", and "streaming_with_deletions" are supported. If set to \ -"streaming" the engine will wait for the new input files in the directory. On the other \ -hand, "streaming_with_deletions" mode also tracks file deletions and modifications and \ -reflects them in the state. For example, if a file was deleted, "streaming_with_deletions"\ -mode will also remove rows obtained by reading this file from the table. 
Finally, the \ -"static" mode will only consider the available data and ingest all of it in one commit. \ -The default value is "streaming". + mode: Denotes how the engine polls the new data from the source. Currently \ +"streaming" and "static" are supported. If set to "streaming" the engine will wait for \ +the updates in the specified directory. It will track file additions, deletions, and \ +modifications and reflect these events in the state. For example, if a file was deleted,\ +"streaming" mode will also remove rows obtained by reading this file from the table. On \ +the other hand, the "static" mode will only consider the available data and ingest all \ +of it in one commit. The default value is "streaming". csv_settings: Settings for the CSV parser. This parameter is used only in case the specified format is "csv". json_field_paths: If the format is "json", this field allows to map field names @@ -263,7 +262,7 @@ def read( ) -@runtime_type_check +@check_arg_types @trace_user_frame def write(table: Table, filename: str | PathLike, format: str) -> None: """Writes ``table``'s stream of updates to a file in the given format. diff --git a/python/pathway/io/http/__init__.py b/python/pathway/io/http/__init__.py index e19ba119..2d38eb0d 100644 --- a/python/pathway/io/http/__init__.py +++ b/python/pathway/io/http/__init__.py @@ -6,7 +6,7 @@ from typing import Any from pathway.internals.api import PathwayType, Pointer -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame @@ -18,7 +18,7 @@ from ._streaming import HttpStreamingSubject -@runtime_type_check +@check_arg_types @trace_user_frame def read( url: str, @@ -147,7 +147,7 @@ def read( ) -@runtime_type_check +@check_arg_types @trace_user_frame def write( table: Table, diff --git a/python/pathway/io/http/_server.py b/python/pathway/io/http/_server.py index 5c092a5c..532c8bbe 100644 --- a/python/pathway/io/http/_server.py +++ b/python/pathway/io/http/_server.py @@ -78,7 +78,7 @@ async def handle(self, request: web.Request): response = await self._fetch_response(id, event) if self._delete_completed_queries: self._remove(id, data) - return web.json_response(status=200, data=response) + return web.json_response(status=200, data=response, dumps=pw.Json.dumps) async def _fetch_response(self, id, event) -> Any: await event.wait() @@ -91,6 +91,10 @@ def _verify_payload(self, payload: dict): if column not in payload and column not in defaults: raise web.HTTPBadRequest(reason=f"`{column}` is required") + @property + def _deletions_enabled(self) -> bool: + return self._delete_completed_queries + def rest_connector( host: str, diff --git a/python/pathway/io/jsonlines/__init__.py b/python/pathway/io/jsonlines/__init__.py index cdfd1d44..6df693c1 100644 --- a/python/pathway/io/jsonlines/__init__.py +++ b/python/pathway/io/jsonlines/__init__.py @@ -7,13 +7,13 @@ import pathway as pw from pathway.internals.api import PathwayType -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame -@runtime_type_check +@check_arg_types @trace_user_frame def read( path: str | PathLike, @@ -41,14 +41,13 @@ def read( Args: path: Path to the 
file or to the folder with files. schema: Schema of the resulting table. - mode: denotes how the engine polls the new data from the source. Currently \ -"streaming", "static", and "streaming_with_deletions" are supported. If set to \ -"streaming" the engine will wait for the new input files in the directory. On the other \ -hand, "streaming_with_deletions" mode also tracks file deletions and modifications and \ -reflects them in the state. For example, if a file was deleted, "streaming_with_deletions"\ -mode will also remove rows obtained by reading this file from the table. Finally, the \ -"static" mode will only consider the available data and ingest all of it in one commit. \ -The default value is "streaming". + mode: Denotes how the engine polls the new data from the source. Currently \ +"streaming" and "static" are supported. If set to "streaming" the engine will wait for \ +the updates in the specified directory. It will track file additions, deletions, and \ +modifications and reflect these events in the state. For example, if a file was deleted,\ +"streaming" mode will also remove rows obtained by reading this file from the table. On \ +the other hand, the "static" mode will only consider the available data and ingest all \ +of it in one commit. The default value is "streaming". json_field_paths: This field allows to map field names into path in the field. For the field which require such mapping, it should be given in the format ``: ``, where the path to be mapped needs to be a @@ -183,7 +182,7 @@ def read( ) -@runtime_type_check +@check_arg_types @trace_user_frame def write(table: Table, filename: str | PathLike) -> None: """Writes ``table``'s stream of updates to a file in jsonlines format. diff --git a/python/pathway/io/kafka/__init__.py b/python/pathway/io/kafka/__init__.py index 1068cd0c..5b205a94 100644 --- a/python/pathway/io/kafka/__init__.py +++ b/python/pathway/io/kafka/__init__.py @@ -10,7 +10,7 @@ from pathway.internals._io_helpers import _format_output_value_fields from pathway.internals.api import PathwayType from pathway.internals.decorators import table_from_datasource -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame @@ -23,7 +23,7 @@ } -@runtime_type_check +@check_arg_types @trace_user_frame def read( rdkafka_settings: dict, @@ -240,6 +240,7 @@ def read( topic=topic, parallel_readers=parallel_readers, persistent_id=persistent_id, + mode=api.ConnectorMode.STREAMING, ) schema, data_format = construct_schema_and_data_format( format, @@ -265,7 +266,7 @@ def read( ) -@runtime_type_check +@check_arg_types @trace_user_frame def simple_read( server: str, @@ -353,7 +354,7 @@ def simple_read( ) -@runtime_type_check +@check_arg_types @trace_user_frame def read_from_upstash( endpoint: str, @@ -457,7 +458,7 @@ def read_from_upstash( ) -@runtime_type_check +@check_arg_types @trace_user_frame def write( table: Table, diff --git a/python/pathway/io/logstash/__init__.py b/python/pathway/io/logstash/__init__.py index ab74362b..099c4e60 100644 --- a/python/pathway/io/logstash/__init__.py +++ b/python/pathway/io/logstash/__init__.py @@ -2,14 +2,14 @@ from __future__ import annotations -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.table import Table from 
pathway.internals.trace import trace_user_frame from ..http import RetryPolicy, write as http_write -@runtime_type_check +@check_arg_types @trace_user_frame def write( table: Table, diff --git a/python/pathway/io/minio/__init__.py b/python/pathway/io/minio/__init__.py index ff99fdfe..f6ba262b 100644 --- a/python/pathway/io/minio/__init__.py +++ b/python/pathway/io/minio/__init__.py @@ -4,7 +4,7 @@ from typing import Any -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame @@ -54,7 +54,7 @@ def create_aws_settings(self): ) -@runtime_type_check +@check_arg_types @trace_user_frame def read( path: str, diff --git a/python/pathway/io/null/__init__.py b/python/pathway/io/null/__init__.py index e196fe32..1e738750 100644 --- a/python/pathway/io/null/__init__.py +++ b/python/pathway/io/null/__init__.py @@ -3,12 +3,12 @@ from __future__ import annotations from pathway.internals import api, datasink -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame -@runtime_type_check +@check_arg_types @trace_user_frame def write(table: Table) -> None: """Writes ``table``'s stream of updates to the empty sink. diff --git a/python/pathway/io/plaintext/__init__.py b/python/pathway/io/plaintext/__init__.py index fd403f76..6ba485df 100644 --- a/python/pathway/io/plaintext/__init__.py +++ b/python/pathway/io/plaintext/__init__.py @@ -5,12 +5,12 @@ from os import PathLike import pathway as pw -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame -@runtime_type_check +@check_arg_types @trace_user_frame def read( path: str | PathLike, @@ -32,14 +32,13 @@ def read( Args: path: Path to a file or to a folder. - mode: denotes how the engine polls the new data from the source. Currently \ -"streaming", "static", and "streaming_with_deletions" are supported. If set to \ -"streaming" the engine will wait for the new input files in the directory. On the other \ -hand, "streaming_with_deletions" mode also tracks file deletions and modifications and \ -reflects them in the state. For example, if a file was deleted, "streaming_with_deletions"\ -mode will also remove rows obtained by reading this file from the table. Finally, the \ -"static" mode will only consider the available data and ingest all of it in one commit. \ -The default value is "streaming". + mode: Denotes how the engine polls the new data from the source. Currently \ +"streaming" and "static" are supported. If set to "streaming" the engine will wait for \ +the updates in the specified directory. It will track file additions, deletions, and \ +modifications and reflect these events in the state. For example, if a file was deleted,\ +"streaming" mode will also remove rows obtained by reading this file from the table. On \ +the other hand, the "static" mode will only consider the available data and ingest all \ +of it in one commit. The default value is "streaming". object_pattern: Unix shell style pattern for filtering only certain files in the \ directory. 
Ignored in case a path to a single file is specified. with_metadata: When set to true, the connector will add an additional column \ diff --git a/python/pathway/io/postgres/__init__.py b/python/pathway/io/postgres/__init__.py index d05e4530..175ef148 100644 --- a/python/pathway/io/postgres/__init__.py +++ b/python/pathway/io/postgres/__init__.py @@ -4,7 +4,7 @@ from pathway.internals import api, datasink from pathway.internals._io_helpers import _format_output_value_fields -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame @@ -13,7 +13,7 @@ def _connection_string_from_settings(settings: dict): return " ".join(k + "=" + v for (k, v) in settings.items()) -@runtime_type_check +@check_arg_types @trace_user_frame def write( table: Table, diff --git a/python/pathway/io/python/__init__.py b/python/pathway/io/python/__init__.py index 61b2c127..1b534017 100644 --- a/python/pathway/io/python/__init__.py +++ b/python/pathway/io/python/__init__.py @@ -14,7 +14,7 @@ from pathway.internals import Table, api, datasource from pathway.internals.api import DataEventType, PathwayType, Pointer, SessionType from pathway.internals.decorators import table_from_datasource -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema from pathway.internals.trace import trace_user_frame from pathway.io._utils import ( @@ -42,12 +42,50 @@ class ConnectorSubject(ABC): In order to send a message one of the methods :py:meth:`next_json`, :py:meth:`next_str`, :py:meth:`next_bytes` can be used. + + If the subject won't delete records, set the class property ``deletions_enabled`` + to ``False`` as it may help to improve the performance. + + Example: + + >>> import pathway as pw + >>> from pathway.io.python import ConnectorSubject + >>> + >>> class MySchema(pw.Schema): + ... a: int + ... b: str + ... + >>> + >>> class MySubject(ConnectorSubject): + ... def run(self) -> None: + ... for i in range(4): + ... self.next_json({"a": i, "b": f"x{i}"}) + ... @property + ... def _deletions_enabled(self) -> bool: + ... return False + ... + >>> + >>> s = MySubject() + >>> + >>> table = pw.io.python.read(s, schema=MySchema) + >>> pw.debug.compute_and_print(table, include_id=False) + a | b + 0 | x0 + 1 | x1 + 2 | x2 + 3 | x3 """ _buffer: Queue + _thread: threading.Thread | None + _exception: BaseException | None + _already_used: bool def __init__(self) -> None: self._buffer = Queue() + self._thread = None + self._exception = None + self._already_used = False @abstractmethod def run(self) -> None: @@ -83,14 +121,14 @@ def next_bytes(self, message: bytes) -> None: def commit(self) -> None: """Sends a commit message.""" - self.next_bytes(b"*COMMIT*") + self._buffer.put((DataEventType.INSERT, None, b"*COMMIT*", None)) def close(self) -> None: """Sends a sentinel message. Should be called to indicate that no new messages will be sent. """ - self.next_bytes(b"*FINISH*") + self._buffer.put((DataEventType.INSERT, None, b"*FINISH*", None)) def start(self) -> None: """Runs a separate thread with function feeding data into buffer. 
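(Editor's note, not part of the patch.) A sketch of a custom `ConnectorSubject` that emits two explicit batches using the `commit()` sentinel shown above; it assumes that passing `autocommit_duration_ms=None` (now allowed by the relaxed annotation) disables time-based autocommits so that only the explicit commits close batches.

import pathway as pw
from pathway.io.python import ConnectorSubject

class EventsSchema(pw.Schema):
    event: str

class EventsSubject(ConnectorSubject):
    def run(self) -> None:
        self.next_json({"event": "start"})
        self.commit()                      # closes the first batch
        self.next_json({"event": "stop"})
        # close() is sent automatically when run() returns

events = pw.io.python.read(
    EventsSubject(), schema=EventsSchema, autocommit_duration_ms=None
)
pw.debug.compute_and_print(events, include_id=False)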
@@ -101,11 +139,24 @@ def start(self) -> None: def target(): try: self.run() + except BaseException as e: + self._exception = e finally: self.on_stop() self.close() - threading.Thread(target=target).start() + self._thread = threading.Thread(target=target) + self._thread.start() + + def end(self) -> None: + """Joins a thread running :py:meth:`run`. + + Should not be called directly. + """ + assert self._thread is not None + self._thread.join() + if self._exception is not None: + raise self._exception def _add( self, key: Pointer | None, message: bytes, metadata: bytes | None = None @@ -113,6 +164,10 @@ def _add( if self._session_type == SessionType.NATIVE: self._buffer.put((DataEventType.INSERT, key, message, metadata)) elif self._session_type == SessionType.UPSERT: + if not self._deletions_enabled: + raise ValueError( + f"Trying to upsert a row in {type(self)} but deletions_enabled is set to False." + ) self._buffer.put((DataEventType.UPSERT, key, message, metadata)) else: raise NotImplementedError(f"session type {self._session_type} not handled") @@ -120,6 +175,10 @@ def _add( def _remove( self, key: Pointer, message: bytes, metadata: bytes | None = None ) -> None: + if not self._deletions_enabled: + raise ValueError( + f"Trying to delete a row in {type(self)} but deletions_enabled is set to False." + ) self._buffer.put((DataEventType.DELETE, key, message, metadata)) def _read(self) -> Any: @@ -149,15 +208,19 @@ def _with_metadata(self) -> bool: def _session_type(self) -> SessionType: return SessionType.NATIVE + @property + def _deletions_enabled(self) -> bool: + return True + -@runtime_type_check +@check_arg_types @trace_user_frame def read( subject: ConnectorSubject, *, schema: type[Schema] | None = None, format: str = "json", - autocommit_duration_ms: int = 1500, + autocommit_duration_ms: int | None = 1500, debug_data=None, value_columns: list[str] | None = None, primary_key: list[str] | None = None, @@ -196,6 +259,13 @@ def read( Table: The table read. """ + if subject._already_used: + raise ValueError( + "You can't use the same ConnectorSubject object in more than one Python connector." + + "If you want to use the same ConnectorSubject twice, create two separate objects of this class." 
+ ) + subject._already_used = True + data_format_type = get_data_format_type(format, SUPPORTED_INPUT_FORMATS) if data_format_type == "identity": @@ -221,13 +291,23 @@ def read( session_type=subject._session_type, parse_utf8=(format != "binary"), ) + mode = ( + api.ConnectorMode.STREAMING + if subject._deletions_enabled + else api.ConnectorMode.STATIC + ) data_storage = api.DataStorage( storage_type="python", python_subject=api.PythonSubject( - start=subject.start, read=subject._read, is_internal=subject._is_internal() + start=subject.start, + read=subject._read, + end=subject.end, + is_internal=subject._is_internal(), + deletions_enabled=subject._deletions_enabled, ), read_method=internal_read_method(format), persistent_id=persistent_id, + mode=mode, ) data_source_options = datasource.DataSourceOptions( commit_duration_ms=autocommit_duration_ms diff --git a/python/pathway/io/redpanda/__init__.py b/python/pathway/io/redpanda/__init__.py index d8c0868e..98ec5b21 100644 --- a/python/pathway/io/redpanda/__init__.py +++ b/python/pathway/io/redpanda/__init__.py @@ -5,14 +5,14 @@ from typing import Any from pathway.internals.api import PathwayType -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame from pathway.io import kafka -@runtime_type_check +@check_arg_types @trace_user_frame def read( rdkafka_settings: dict, @@ -220,7 +220,7 @@ def read( ) -@runtime_type_check +@check_arg_types @trace_user_frame def write( table: Table, diff --git a/python/pathway/io/s3/__init__.py b/python/pathway/io/s3/__init__.py index 7f4f5487..30c6d155 100644 --- a/python/pathway/io/s3/__init__.py +++ b/python/pathway/io/s3/__init__.py @@ -7,7 +7,7 @@ from pathway.internals import api, datasource from pathway.internals._io_helpers import AwsS3Settings from pathway.internals.decorators import table_from_datasource -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame @@ -77,7 +77,7 @@ def __init__( ) -@runtime_type_check +@check_arg_types @trace_user_frame def read( path: str, @@ -168,11 +168,6 @@ def read( ... ) """ internal_mode = internal_connector_mode(mode) - if internal_mode == api.ConnectorMode.STREAMING_WITH_DELETIONS: - raise NotImplementedError( - "Snapshot mode is currently unsupported in S3-like connectors" - ) - if aws_s3_settings: prepared_aws_settings = aws_s3_settings else: @@ -207,7 +202,7 @@ def read( ) -@runtime_type_check +@check_arg_types @trace_user_frame def read_from_digital_ocean( path: str, @@ -281,11 +276,6 @@ def read_from_digital_ocean( ... ) """ internal_mode = internal_connector_mode(mode) - if internal_mode == api.ConnectorMode.STREAMING_WITH_DELETIONS: - raise NotImplementedError( - "Snapshot mode is currently unsupported in S3-like connectors" - ) - data_storage = construct_s3_data_storage( path=path, rust_engine_s3_settings=do_s3_settings.settings, @@ -315,7 +305,7 @@ def read_from_digital_ocean( ) -@runtime_type_check +@check_arg_types @trace_user_frame def read_from_wasabi( path: str, @@ -388,11 +378,6 @@ def read_from_wasabi( ... 
) """ internal_mode = internal_connector_mode(mode) - if internal_mode == api.ConnectorMode.STREAMING_WITH_DELETIONS: - raise NotImplementedError( - "Snapshot mode is currently unsupported in S3-like connectors" - ) - data_storage = construct_s3_data_storage( path=path, rust_engine_s3_settings=wasabi_s3_settings.settings, diff --git a/python/pathway/io/s3_csv/__init__.py b/python/pathway/io/s3_csv/__init__.py index fbb79320..707814f4 100644 --- a/python/pathway/io/s3_csv/__init__.py +++ b/python/pathway/io/s3_csv/__init__.py @@ -5,7 +5,7 @@ from typing import Any from pathway.internals.api import PathwayType -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame @@ -13,7 +13,7 @@ from pathway.io.s3 import AwsS3Settings, read as s3_read -@runtime_type_check +@check_arg_types @trace_user_frame def read( path: str, diff --git a/python/pathway/io/sqlite/__init__.py b/python/pathway/io/sqlite/__init__.py index 04e330fa..3868107f 100644 --- a/python/pathway/io/sqlite/__init__.py +++ b/python/pathway/io/sqlite/__init__.py @@ -7,14 +7,14 @@ from pathway.internals import api, datasource from pathway.internals.decorators import table_from_datasource -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.schema import Schema from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame from pathway.io._utils import read_schema -@runtime_type_check +@check_arg_types @trace_user_frame def read( path: PathLike | str, @@ -50,6 +50,7 @@ def read( path=fspath(path), table_name=table_name, column_names=schema.column_names(), + mode=api.ConnectorMode.STREAMING, ) data_format = api.DataFormat( format_type="transparent", diff --git a/python/pathway/reducers.py b/python/pathway/reducers.py index c462732e..b5d71973 100644 --- a/python/pathway/reducers.py +++ b/python/pathway/reducers.py @@ -1,5 +1,10 @@ # Copyright © 2023 Pathway +from pathway.internals.custom_reducers import ( + stateful_many, + stateful_single, + udf_reducer, +) from pathway.internals.reducers import ( any, argmax, @@ -12,11 +17,8 @@ ndarray, npsum, sorted_tuple, - stateful_many, - stateful_single, sum, tuple_reducer as tuple, - udf_reducer, unique, ) diff --git a/python/pathway/stdlib/graphs/bellman_ford/impl.py b/python/pathway/stdlib/graphs/bellman_ford/impl.py index 49ce355e..3a6c8c26 100644 --- a/python/pathway/stdlib/graphs/bellman_ford/impl.py +++ b/python/pathway/stdlib/graphs/bellman_ford/impl.py @@ -5,7 +5,7 @@ import math import pathway.internals as pw -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.trace import trace_user_frame from ..common import Edge @@ -37,7 +37,7 @@ def _bellman_ford_step( return vertices_dist -@runtime_type_check +@check_arg_types @trace_user_frame def bellman_ford(vertices: pw.Table[Vertex], edges: pw.Table[Edge | Dist]): vertices_dist: pw.Table[DistFromSource] = vertices.select( diff --git a/python/pathway/stdlib/graphs/louvain_communities/impl.py b/python/pathway/stdlib/graphs/louvain_communities/impl.py index ac6129c4..7bf714bf 100644 --- a/python/pathway/stdlib/graphs/louvain_communities/impl.py +++ 
b/python/pathway/stdlib/graphs/louvain_communities/impl.py @@ -6,7 +6,7 @@ import pathway.internals as pw from pathway.internals.fingerprints import fingerprint -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.trace import trace_user_frame from pathway.stdlib.graphs.common import Clustering, Edge, Weight from pathway.stdlib.graphs.graph import WeightedGraph @@ -218,7 +218,7 @@ def rand(x) -> int: return clustering.update_rows(delta).with_universe_of(clustering) -@runtime_type_check +@check_arg_types @trace_user_frame def _louvain_level(G: WeightedGraph) -> pw.Table[Clustering]: r""" @@ -377,7 +377,5 @@ def cluster_modularity(internal: float, degree: float, total: float) -> float: ) return score.reduce( - modularity=pw.declare_type( - float, pw.apply(round, pw.reducers.sum(score.modularity), round_digits) - ) + modularity=pw.reducers.sum(score.modularity).num.round(round_digits) ) diff --git a/python/pathway/stdlib/graphs/pagerank/impl.py b/python/pathway/stdlib/graphs/pagerank/impl.py index 8d0df785..ca9ce76a 100644 --- a/python/pathway/stdlib/graphs/pagerank/impl.py +++ b/python/pathway/stdlib/graphs/pagerank/impl.py @@ -3,7 +3,7 @@ from __future__ import annotations import pathway.internals as pw -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.trace import trace_user_frame from ..common import Edge @@ -13,7 +13,7 @@ class Result(pw.Schema): rank: int -@runtime_type_check +@check_arg_types @trace_user_frame def pagerank(edges: pw.Table[Edge], steps: int = 5) -> pw.Table[Result]: in_vertices: pw.Table = edges.groupby(id=edges.v).reduce(degree=0) diff --git a/python/pathway/stdlib/indexing/sorting.py b/python/pathway/stdlib/indexing/sorting.py index dd9eb57c..43911a28 100644 --- a/python/pathway/stdlib/indexing/sorting.py +++ b/python/pathway/stdlib/indexing/sorting.py @@ -9,7 +9,7 @@ import pathway.internals as pw from pathway.internals.arg_tuple import wrap_arg_tuple from pathway.internals.fingerprints import fingerprint -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.trace import trace_user_frame @@ -142,7 +142,7 @@ def build_sorted_index(nodes: pw.Table[Key | Instance]) -> SortedIndex: return dict(index=result, oracle=root) -@runtime_type_check +@check_arg_types @trace_user_frame def sort_from_index( index: pw.Table[LeftRight | Parent], oracle=None @@ -208,7 +208,7 @@ class ComparisonRet(pw.Schema): comparison_ret: int -@runtime_type_check +@check_arg_types @trace_user_frame def filter_cmp_helper(filter_val, index, oracle=None) -> pw.Table[ComparisonRet]: return _filter_cmp_helper(filter_val=filter_val, index=index).index # type: ignore @@ -258,7 +258,7 @@ class PrefixSumOracle(pw.Schema): prefix_sum_upperbound_key: Callable[..., float] -@runtime_type_check +@check_arg_types @trace_user_frame def prefix_sum_oracle(oracle, index) -> pw.Table[PrefixSumOracle]: return _prefix_sum_oracle(oracle=oracle, index=index).oracle # type: ignore @@ -340,7 +340,7 @@ class BinsearchOracle(pw.Schema): upperbound: Callable[..., pw.Pointer | None] -@runtime_type_check +@check_arg_types @trace_user_frame def binsearch_oracle(oracle, index) -> pw.Table[BinsearchOracle]: return _binsearch_oracle(oracle=oracle, index=index).oracle # type: ignore @@ -398,7 +398,7 @@ def upperbound(self, 
value) -> pw.Pointer | None: # This has O(k) complexity. TODO: write version that has O(log n) complexity. -@runtime_type_check +@check_arg_types @trace_user_frame def filter_smallest_k( column: pw.ColumnReference, instance: pw.ColumnReference, ks: pw.Table @@ -413,7 +413,7 @@ def filter_smallest_k( oracle_restricted = oracle.restrict(ks) # root is pked with instance, ks also res = ks.select(res=oracle_restricted.prefix_sum_upperbound(ks.k)) - validres = res.filter(res.res.is_not_none()).update_types(res=pw.Pointer) + validres = res.filter(res.res.is_not_none()) validres = validres.select(res=getattr(table.ix(validres.res), colname)) res <<= res.filter(res.res.is_none()).select(res=math.inf) res <<= validres @@ -451,7 +451,7 @@ def next_value(self): return self.transformer.ordered_table[self.next].next_value -@runtime_type_check +@check_arg_types @trace_user_frame def retrieve_prev_next_values( ordered_table: pw.Table, value: pw.ColumnReference | None = None diff --git a/python/pathway/stdlib/ml/classifiers/_knn_lsh.py b/python/pathway/stdlib/ml/classifiers/_knn_lsh.py index f08e5976..ee244742 100644 --- a/python/pathway/stdlib/ml/classifiers/_knn_lsh.py +++ b/python/pathway/stdlib/ml/classifiers/_knn_lsh.py @@ -26,6 +26,7 @@ from statistics import mode +import jmespath import numpy as np # TODO change to `import pathway as pw` when it is not imported as part of stdlib, OR move the whole file to stdlib @@ -44,7 +45,16 @@ def _euclidean_distance(data_table: np.ndarray, query_table: np.ndarray): return np.sum((data_table - query_table) ** 2, axis=1) -def knn_lsh_classifier_train(data: pw.Table[DataPoint], L, type="euclidean", **kwargs): +class MetaDataSchema: + metadata: dict + + +def knn_lsh_classifier_train( + data: pw.Table[DataPoint], + L, + type="euclidean", + **kwargs, +): """ Build the LSH index over data. L the number of repetitions of the LSH scheme. 
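(Editor's note, not part of the patch.) The LSH classifier below gains JMESPath-based metadata filtering: a candidate is kept only when the filter expression evaluates to True against its metadata dictionary. A rough illustration of those filter semantics using the `jmespath` package imported above:

import jmespath

metadata = {"foo": 3, "path": "docs/a.md"}
jmespath.search("foo >= `3`", metadata)              # True  -> candidate kept
jmespath.search("contains(path, 'docs')", metadata)  # True  -> candidate kept
jmespath.search("foo > `5`", metadata)               # False -> candidate dropped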
@@ -83,6 +93,9 @@ def make_band_col_name(i): buckets_list = data.select(buckets=pw.apply(list, data.buckets)) data += unpack_col(buckets_list.buckets, *band_col_names) + if "metadata" not in data._columns: + data += data.select(metadata=None) + def lsh_perform_query(queries: pw.Table, k: int | None = None) -> pw.Table: queries += queries.select(buckets=pw.apply(lsh_projection, queries.data)) if k is not None: @@ -114,24 +127,30 @@ def lsh_perform_query(queries: pw.Table, k: int | None = None) -> pw.Table: def merge_buckets(*tuples: list[tuple]) -> tuple: return tuple(StableSet(sum(tuples, ()))) + if "metadata_filter" not in result._columns: + result += result.select(metadata_filter=None) + flattened = result.select( result.data, query_id=result.id, ids=pw.apply(merge_buckets, *[result[f"items_{i}"] for i in range(L)]), k=result.k, - ).filter(pw.apply_with_type(lambda x: x != (), bool, pw.this.ids)) + metadata_filter=result.metadata_filter, + ).filter(pw.this.ids != ()) # step 3: find knns in unioned buckets @pw.transformer class compute_knns_transformer: class training_data(pw.ClassArg): data = pw.input_attribute() + metadata = pw.input_attribute() class flattened(pw.ClassArg): data = pw.input_attribute() query_id = pw.input_attribute() ids = pw.input_attribute() k = pw.input_attribute() + metadata_filter = pw.input_attribute() @pw.output_attribute def knns_ids(self) -> np.ndarray: @@ -139,22 +158,34 @@ def knns_ids(self) -> np.ndarray: for id_candidate in self.ids: try: self.transformer.training_data[id_candidate].data + self.transformer.training_data[id_candidate].metadata except BaseException: pass - data_candidates = np.array( - [ - self.transformer.training_data[id_candidate].data - for id_candidate in self.ids - ] - ) - + candidates = [ + ( + id_candidate, + self.transformer.training_data[id_candidate].data, + ) + for id_candidate in self.ids + if self.metadata_filter is None + or jmespath.search( + self.metadata_filter, + self.transformer.training_data[id_candidate].metadata.value, + ) + is True + ] + if len(candidates) == 0: + return np.array([]) + ids_filtered, data_candidates_filtered = zip(*candidates) + data_candidates = np.array(data_candidates_filtered) neighs = min(self.k, len(data_candidates)) knn_ids = np.argpartition( - distance_function(data_candidates, querypoint), neighs - 1 + distance_function(data_candidates, querypoint), + neighs - 1, )[ :neighs ] # neighs - 1 in argpartition, because of 0-based indexing - ret = np.array(self.ids)[knn_ids] + ret = np.array(ids_filtered)[knn_ids] return ret knn_result: pw.Table = compute_knns_transformer( # type: ignore @@ -169,7 +200,7 @@ def knns_ids(self) -> np.ndarray: knn_result_with_empty_results = knn_result_with_empty_results.with_columns( knns_ids=pw.coalesce(pw.this.knns_ids, np.array(())) - ).update_types(knns_ids=knn_result.typehints()["knns_ids"]) + ) # return knn_result return knn_result_with_empty_results diff --git a/python/pathway/stdlib/ml/index.py b/python/pathway/stdlib/ml/index.py index 597181fe..c3950f05 100644 --- a/python/pathway/stdlib/ml/index.py +++ b/python/pathway/stdlib/ml/index.py @@ -24,6 +24,7 @@ class KNNIndex: n_and (int): number of ANDs bucket_length (float): bucket length (after projecting on a line) distance_type (str): euclidean metric is supported. + metadata (pw.ColumnExpression): optional column expression representing dict of the metadata. 
""" def __init__( @@ -35,11 +36,12 @@ def __init__( n_and: int = 10, bucket_length: float = 10.0, distance_type: str = "euclidean", + metadata: pw.ColumnExpression | None = None, ): self.data = data self.packed_data = data.select(row=pw.make_tuple(*self.data)) - embeddings = data.select(data=data_embedding) + embeddings = data.select(data=data_embedding, metadata=metadata) self._query = knn_lsh_classifier_train( embeddings, L=n_or, @@ -54,6 +56,7 @@ def get_nearest_items( query_embedding: pw.ColumnReference, k: pw.ColumnExpression | int = 3, collapse_rows: bool = True, + metadata_filter: pw.ColumnExpression | None = None, ): """ This method queries the index with given queries and returns 'k' most relevant documents @@ -71,6 +74,9 @@ def get_nearest_items( multiple rows corresponding to a single query will be collapsed into a single row, with each column containing a tuple of values from the original rows. If set to False, the output will retain the multi-row format for each query. Defaults to True. + metadata_filter (pw.ColumnExpression): optional column expression containing evaluating to the text + representing the metadata filtering query in the JMESPath format. The search will happen + only for documents satisfying this filtering. Can be constant for all queries or set per query. Returns: pw.Table @@ -104,12 +110,17 @@ def get_nearest_items( >>> import pathway as pw >>> from pathway.stdlib.ml.index import KNNIndex >>> import pandas as pd + >>> class InputSchema(pw.Schema): + ... document: str + ... embeddings: list[float] + ... metadata: dict >>> documents = pw.debug.table_from_pandas( ... pd.DataFrame.from_records([ - ... {"document": "document 1", "embeddings":[1,-1, 0]}, - ... {"document": "document 2", "embeddings":[1, 1, 0]}, - ... {"document": "document 3", "embeddings":[0, 0, 1]}, - ... ]) + ... {"document": "document 1", "embeddings":[1,-1, 0], "metadata":{"foo": 1}}, + ... {"document": "document 2", "embeddings":[1, 1, 0], "metadata":{"foo": 2}}, + ... {"document": "document 3", "embeddings":[0, 0, 1], "metadata":{"foo": 3}}, + ... ]), + ... schema=InputSchema ... ) >>> index = KNNIndex(documents.embeddings, documents, n_dimensions=3) >>> queries = pw.debug.table_from_pandas( @@ -118,11 +129,17 @@ def get_nearest_items( ... {"query": "What is doc -5 about?", "embeddings":[-1, 10, -10]}, ... ]) ... ) - >>> relevant_docs = index.get_nearest_items(queries.embeddings, k=2) + >>> relevant_docs = index.get_nearest_items(queries.embeddings, k=2).without(pw.this.metadata) >>> pw.debug.compute_and_print(relevant_docs) | document | embeddings ^YYY4HAB... | () | () ^X1MXHYY... | ('document 2', 'document 3') | ((1, 1, 0), (0, 0, 1)) + >>> index = KNNIndex(documents.embeddings, documents, n_dimensions=3, metadata=documents.metadata) + >>> relevant_docs_meta = index.get_nearest_items(queries.embeddings, k=2, metadata_filter="foo >= `3`") + >>> pw.debug.compute_and_print(relevant_docs_meta) + | document | embeddings | metadata + ^YYY4HAB... | () | () | () + ^X1MXHYY... | ('document 3',) | ((0, 0, 1),) | (pw.Json({'foo': 3}),) >>> data = pw.debug.table_from_markdown( ... ''' ... 
x | y | __time__ @@ -152,8 +169,9 @@ def get_nearest_items( (-3, 1) | ((0, 0), (2, 2)) | 10 | -1 (-3, 1) | ((-3, 3), (0, 0)) | 10 | 1 """ - - queries = query_embedding.table.select(data=query_embedding, k=k) + queries = query_embedding.table.select( + data=query_embedding, k=k, metadata_filter=metadata_filter + ) knns_ids = ( self._query(queries) .flatten(pw.this.knns_ids, pw.this.query_id) @@ -169,6 +187,7 @@ def get_nearest_items_asof_now( query_embedding: pw.ColumnReference, k: pw.ColumnExpression | int = 3, collapse_rows: bool = True, + metadata_filter: pw.ColumnExpression | None = None, ): """ This method queries the index with given queries and returns 'k' most relevant documents @@ -184,6 +203,9 @@ def get_nearest_items_asof_now( multiple rows corresponding to a single query will be collapsed into a single row, with each column containing a tuple of values from the original rows. If set to False, the output will retain the multi-row format for each query. Defaults to True. + metadata_filter (pw.ColumnExpression): optional column expression containing evaluating to the text + representing the metadata filtering query in the JMESPath format. The search will happen + only for documents satisfying this filtering. Can be constant for all queries or set per query. Example: @@ -216,11 +238,14 @@ def get_nearest_items_asof_now( """ return _predict_asof_now( - lambda query, k: self.get_nearest_items( - query, k=k, collapse_rows=collapse_rows + lambda query, k, metadata_filter: self.get_nearest_items( + query, k=k, collapse_rows=collapse_rows, metadata_filter=metadata_filter ), query_embedding, query_embedding.table.select(k=k).k, + query_embedding.table.select( + metadata_filter=metadata_filter + ).metadata_filter, with_queries_universe=collapse_rows, ) diff --git a/python/pathway/stdlib/ordered/diff.py b/python/pathway/stdlib/ordered/diff.py index 738dcf08..4bbf04f5 100644 --- a/python/pathway/stdlib/ordered/diff.py +++ b/python/pathway/stdlib/ordered/diff.py @@ -1,11 +1,11 @@ # Copyright © 2023 Pathway import pathway as pw -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.trace import trace_user_frame -@runtime_type_check +@check_arg_types @trace_user_frame def diff( self: pw.Table, diff --git a/python/pathway/stdlib/temporal/_asof_join.py b/python/pathway/stdlib/temporal/_asof_join.py index 177e1310..80bd7271 100644 --- a/python/pathway/stdlib/temporal/_asof_join.py +++ b/python/pathway/stdlib/temporal/_asof_join.py @@ -20,7 +20,7 @@ desugar, ) from pathway.internals.join import validate_join_condition -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.trace import trace_user_frame from .utils import TimeEventType, check_joint_types @@ -434,7 +434,7 @@ def _asof_join( @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=True, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def asof_join( self: pw.Table, @@ -530,7 +530,7 @@ def asof_join( @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=False, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def asof_join_left( self: pw.Table, @@ -625,7 +625,7 @@ def asof_join_left( @desugar(substitution={pw.left: "self", pw.right: "other"}) 
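These hunks swap the runtime_type_check decorator for its new name, check_arg_types, across the temporal operators; only the import path and name change. A minimal sketch of what the decorator enforces, modeled on the updated test in test_error_messages.py later in this patch (the bounds function below is hypothetical, not part of the library):

from pathway.internals.runtime_type_check import check_arg_types

@check_arg_types
def bounds(lower: int, upper: int) -> tuple[int, int]:
    # arguments are checked against the annotations when the function is called
    return (lower, upper)

bounds(1, 2)      # accepted: both arguments match the annotations
# bounds(1, "2")  # rejected at call time by the runtime argument type check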
@arg_handler(handler=join_kwargs_handler(allow_how=False, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def asof_join_right( self: pw.Table, @@ -720,7 +720,7 @@ def asof_join_right( @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=False, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def asof_join_outer( self: pw.Table, diff --git a/python/pathway/stdlib/temporal/_asof_now_join.py b/python/pathway/stdlib/temporal/_asof_now_join.py index 70a23f8f..fcac1841 100644 --- a/python/pathway/stdlib/temporal/_asof_now_join.py +++ b/python/pathway/stdlib/temporal/_asof_now_join.py @@ -15,7 +15,7 @@ desugar, ) from pathway.internals.join import validate_join_condition -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.thisclass import ThisMetaclass from pathway.internals.trace import trace_user_frame @@ -169,7 +169,7 @@ def select( @trace_user_frame @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=True, allow_id=True)) -@runtime_type_check +@check_arg_types def asof_now_join( self: pw.Table, other: pw.Table, @@ -249,7 +249,7 @@ def asof_now_join( @trace_user_frame @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=True, allow_id=True)) -@runtime_type_check +@check_arg_types def asof_now_join_inner( self: pw.Table, other: pw.Table, @@ -325,7 +325,7 @@ def asof_now_join_inner( @trace_user_frame @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=True, allow_id=True)) -@runtime_type_check +@check_arg_types def asof_now_join_left( self: pw.Table, other: pw.Table, diff --git a/python/pathway/stdlib/temporal/_interval_join.py b/python/pathway/stdlib/temporal/_interval_join.py index 94b19bb3..415a403c 100644 --- a/python/pathway/stdlib/temporal/_interval_join.py +++ b/python/pathway/stdlib/temporal/_interval_join.py @@ -20,7 +20,7 @@ combine_args_kwargs, desugar, ) -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.thisclass import ThisMetaclass from pathway.internals.trace import trace_user_frame @@ -425,7 +425,7 @@ def _get_unmatched_rows( @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=True, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def interval_join( self: pw.Table, @@ -628,7 +628,7 @@ def interval_join( @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=False, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def interval_join_inner( self: pw.Table, @@ -827,7 +827,7 @@ def interval_join_inner( @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=False, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def interval_join_left( self: pw.Table, @@ -1044,7 +1044,7 @@ def interval_join_left( @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=False, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def interval_join_right( self: pw.Table, @@ -1250,7 +1250,7 @@ def 
interval_join_right( @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=False, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def interval_join_outer( self: pw.Table, diff --git a/python/pathway/stdlib/temporal/_window.py b/python/pathway/stdlib/temporal/_window.py index 1f90be63..5ecac704 100644 --- a/python/pathway/stdlib/temporal/_window.py +++ b/python/pathway/stdlib/temporal/_window.py @@ -17,7 +17,7 @@ ) from pathway.internals.desugaring import desugar from pathway.internals.join import validate_join_condition -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.trace import trace_user_frame from pathway.internals.type_interpreter import eval_type @@ -112,7 +112,7 @@ def merge_ccs(data): return pw.iterate(merge_ccs, data=target).data - @runtime_type_check + @check_arg_types def _apply( self, table: pw.Table, @@ -148,7 +148,7 @@ def _apply( return gb - @runtime_type_check + @check_arg_types def _join( self, left: pw.Table, @@ -326,7 +326,7 @@ def _assign_windows( and (self.offset is None or start >= self.offset) ] - @runtime_type_check + @check_arg_types def _apply( self, table: pw.Table, @@ -425,7 +425,7 @@ def _apply( return target - @runtime_type_check + @check_arg_types def _join( self, left: pw.Table, @@ -518,7 +518,7 @@ class _IntervalsOverWindow(Window): upper_bound: int | float | datetime.timedelta is_outer: bool - @runtime_type_check + @check_arg_types def _apply( self, table: pw.Table, @@ -567,7 +567,7 @@ def _apply( ) ) - @runtime_type_check + @check_arg_types def _join( self, left: pw.Table, @@ -584,7 +584,7 @@ def _join( ) -@runtime_type_check +@check_arg_types @trace_user_frame def session( *, @@ -648,7 +648,7 @@ def session( return _SessionWindow(predicate=predicate, max_gap=max_gap) -@runtime_type_check +@check_arg_types @trace_user_frame def sliding( hop: int | float | datetime.timedelta, @@ -724,7 +724,7 @@ def sliding( ) -@runtime_type_check +@check_arg_types @trace_user_frame def tumbling( duration: int | float | datetime.timedelta, @@ -782,7 +782,7 @@ def tumbling( ) -@runtime_type_check +@check_arg_types @trace_user_frame def intervals_over( *, @@ -853,7 +853,7 @@ def intervals_over( @desugar @arg_handler(handler=shard_deprecation) @arg_handler(handler=windowby_handler) -@runtime_type_check +@check_arg_types def windowby( self: pw.Table, time_expr: pw.ColumnExpression, diff --git a/python/pathway/stdlib/temporal/_window_join.py b/python/pathway/stdlib/temporal/_window_join.py index dcd1b61c..eaa83aba 100644 --- a/python/pathway/stdlib/temporal/_window_join.py +++ b/python/pathway/stdlib/temporal/_window_join.py @@ -15,7 +15,7 @@ TableSubstitutionDesugaring, desugar, ) -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.thisclass import ThisMetaclass from pathway.internals.trace import trace_user_frame from pathway.stdlib import temporal @@ -151,7 +151,7 @@ def select(self, *args: pw.ColumnReference, **kwargs: Any) -> pw.Table: @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=True, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def window_join( self: pw.Table, @@ -355,7 +355,7 @@ def window_join( @desugar(substitution={pw.left: "self", pw.right: "other"}) 
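The _window.py hunks above apply the same decorator rename to the public windowing helpers (session, sliding, tumbling, intervals_over, windowby). For orientation, a minimal usage sketch of that API, modeled on the windowby tests later in this patch; the input values are illustrative:

import pathway as pw

t = pw.debug.table_from_markdown(
    """
    t
    12
    14
    16
    18
    """
)

# group rows into sliding windows of length 10 that start every 3 time units
result = t.windowby(
    t.t,
    window=pw.temporal.sliding(duration=10, hop=3),
).reduce(
    pw.this._pw_window_start,
    pw.this._pw_window_end,
    count=pw.reducers.count(),
)

pw.debug.compute_and_print(result)

The new tests additionally pass behavior=pw.temporal.common_behavior(delay=..., cutoff=..., keep_results=...) to windowby to exercise the buffer-flushing changes made elsewhere in this release.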
@arg_handler(handler=join_kwargs_handler(allow_how=False, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def window_join_inner( self: pw.Table, @@ -556,7 +556,7 @@ def window_join_inner( @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=False, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def window_join_left( self: pw.Table, @@ -776,7 +776,7 @@ def window_join_left( @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=False, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def window_join_right( self: pw.Table, @@ -991,7 +991,7 @@ def window_join_right( @desugar(substitution={pw.left: "self", pw.right: "other"}) @arg_handler(handler=join_kwargs_handler(allow_how=False, allow_id=False)) -@runtime_type_check +@check_arg_types @trace_user_frame def window_join_outer( self: pw.Table, diff --git a/python/pathway/stdlib/utils/col.py b/python/pathway/stdlib/utils/col.py index 25372ab8..c1b5d352 100644 --- a/python/pathway/stdlib/utils/col.py +++ b/python/pathway/stdlib/utils/col.py @@ -7,11 +7,11 @@ from typing import Type, overload import pathway.internals as pw -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.trace import trace_user_frame -@runtime_type_check +@check_arg_types @trace_user_frame def flatten_column( column: pw.ColumnReference, @@ -75,7 +75,7 @@ def unpack_col( ... -@runtime_type_check +@check_arg_types @trace_user_frame def unpack_col( column: pw.ColumnReference, @@ -144,7 +144,7 @@ def unpack_col( # TODO: generalize to apply on groupby: https://github.com/navalgo/IoT-Pathway/issues/1919 -@runtime_type_check +@check_arg_types @trace_user_frame def multiapply_all_rows( *cols: pw.ColumnReference, @@ -209,7 +209,7 @@ def fun_wrapped(ids_and_cols): return result.with_universe_of(table) -@runtime_type_check +@check_arg_types @trace_user_frame def apply_all_rows( *cols: pw.ColumnReference, @@ -259,7 +259,7 @@ def fun_wrapped(*cols): ) -@runtime_type_check +@check_arg_types @trace_user_frame def groupby_reduce_majority( column_group: pw.ColumnReference, column_val: pw.ColumnReference diff --git a/python/pathway/stdlib/viz/plotting.py b/python/pathway/stdlib/viz/plotting.py index 221d2771..a7087b52 100644 --- a/python/pathway/stdlib/viz/plotting.py +++ b/python/pathway/stdlib/viz/plotting.py @@ -1,6 +1,7 @@ # Copyright © 2023 Pathway from collections.abc import Callable +from typing import Any import pandas as pd import panel as pn @@ -10,7 +11,7 @@ from pathway.internals import api, parse_graph from pathway.internals.graph_runner import GraphRunner from pathway.internals.monitoring import MonitoringLevel -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.table_subscription import subscribe as internal_subscribe from pathway.internals.trace import trace_user_frame @@ -29,7 +30,7 @@ def _in_notebook(): return True -@runtime_type_check +@check_arg_types @trace_user_frame def plot( self: pw.Table, @@ -43,7 +44,8 @@ def plot( Args: self (pw.Table): a table serving as a source of data - plotting_function (Callable[[ColumnDataSource], Plot]): _description_ + plotting_function (Callable[[ColumnDataSource], Plot]): function for creating plot + from ColumnDataSource Returns: pn.Column: 
visualization which can be displayed immediately or passed as a dashboard widget @@ -94,15 +96,11 @@ def plot( } source.stream(dict_data, rollover=len(output_data)) # type: ignore else: - integrated = {} + integrated: dict[api.Pointer, Any] = {} in_notebook = _in_notebook() - def _update(key, row, time, is_addition): - if is_addition: - integrated[key] = row - else: - del integrated[key] + def stream_updates(): df = pd.DataFrame.from_dict(integrated, orient="index", columns=col_names) if sorting_col: df = df.sort_values(sorting_col) @@ -110,20 +108,26 @@ def _update(key, row, time, is_addition): df = df.sort_index() df = df.reset_index(drop=True) + source.stream( + df.to_dict("list"), rollover=len(df) # type:ignore[arg-type] + ) + if in_notebook: - source.stream( - df.to_dict("list"), rollover=len(df) # type:ignore[arg-type] - ) pn.io.push_notebook(viz) + + def _update(key, row, time, is_addition): + if is_addition: + integrated[key] = row else: - if plot.document is not None: - plot.document.add_next_tick_callback( - lambda: source.stream( - df.to_dict("list"), # type:ignore[arg-type] - rollover=len(df), - ) - ) + del integrated[key] + + if plot.document is not None: + if plot.document.session_context: + plot.document.add_next_tick_callback(stream_updates) + else: + stream_updates() internal_subscribe(self, on_change=_update, skip_persisted_batch=True) + pn.state.on_session_created(lambda _: stream_updates()) return viz diff --git a/python/pathway/stdlib/viz/table_viz.py b/python/pathway/stdlib/viz/table_viz.py index bc127b68..e8c17596 100644 --- a/python/pathway/stdlib/viz/table_viz.py +++ b/python/pathway/stdlib/viz/table_viz.py @@ -9,7 +9,7 @@ from pathway.internals import api, parse_graph from pathway.internals.graph_runner import GraphRunner from pathway.internals.monitoring import MonitoringLevel -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.internals.table_subscription import subscribe as internal_subscribe from pathway.internals.trace import trace_user_frame @@ -21,7 +21,7 @@ def _repr_mimebundle_(self: pw.Table, include, exclude): return self.show(snapshot=True)._repr_mimebundle_(include, exclude) -@runtime_type_check +@check_arg_types @trace_user_frame def show( self: pw.Table, *, snapshot=True, include_id=True, short_pointers=True, sorters=None @@ -33,7 +33,8 @@ def show( Args: self (pw.Table): a table to be displayed - snapshot (bool, optional): _description_. Defaults to True. + snapshot (bool, optional): whether only current snapshot or all changes to the table should be displayed. + Defaults to True. include_id (bool, optional): whether to show ids of rows. Defaults to True. short_pointers (bool, optional): whether to shorten printed ids. Defaults to True. @@ -56,12 +57,17 @@ def show( col_names.append("time") col_names.append("diff") - def _format_ptr(x): + def _format_types(x): if isinstance(x, api.Pointer): s = str(x) if len(s) > 8 and short_pointers: s = s[:8] + "..." return s + if isinstance(x, pw.Json): + s = str(x) + if len(s) > 64: + s = s[:64] + " ..." 
+ return s return x gr = GraphRunner(parse_graph.G, debug=False, monitoring_level=MonitoringLevel.NONE) @@ -102,7 +108,7 @@ def color_negative_red(row): output_data = api.squash_updates(captured) keys = list(output_data.keys()) dict_data = { - name: [_format_ptr(output_data[key][index]) for key in keys] + name: [_format_types(output_data[key][index]) for key in keys] for index, name in enumerate(self._columns.keys()) } dynamic_table.value = pd.DataFrame(dict_data) @@ -114,7 +120,7 @@ def update(key, row, time, is_addition): if not snapshot: row["time"] = time row["diff"] = is_addition * 2 - 1 - row = {k: _format_ptr(v) for k, v in row.items()} + row = {k: _format_types(v) for k, v in row.items()} dynamic_table.stream(row, follow=False) else: if is_addition: @@ -129,7 +135,7 @@ def update(key, row, time, is_addition): .reset_index(drop=True) ) df = df[col_names] - df = df.applymap(_format_ptr) + df = df.applymap(_format_types) # TODO Replace with df.map for Pandas 2 dynamic_table.value = df # todo: use async transformer to throttle updates # dynamic_table.stream( diff --git a/python/pathway/tests/__init__.py b/python/pathway/tests/__init__.py index 315ea30e..7dbf6711 100644 --- a/python/pathway/tests/__init__.py +++ b/python/pathway/tests/__init__.py @@ -2,6 +2,6 @@ from __future__ import annotations -from pathway.tests import utils +import pytest -__all__ = ["utils"] +pytest.register_assert_rewrite("pathway.tests.utils") diff --git a/python/pathway/tests/examples/realtime-log-slack.py b/python/pathway/tests/examples/realtime-log-slack.py new file mode 100644 index 00000000..76b45a52 --- /dev/null +++ b/python/pathway/tests/examples/realtime-log-slack.py @@ -0,0 +1,66 @@ +# Copyright © 2023 Pathway + +import pathway as pw + + +# DO NOT MODIFY WITHOUT MODIFYING THE EXAMPLE AT: +# public/pathway-examples/examples/realtime-log-monitoring/filebeat-pathway-slack/pathway-src/alerts.py +def test(): + alert_threshold = 5 + sliding_window_duration = 1_000_000_000 + + inputSchema = pw.schema_builder( + columns={ + "@timestamp": pw.column_definition(dtype=str), + "message": pw.column_definition(dtype=str), + } + ) + + log_table = pw.debug.table_from_markdown( + """ + @timestamp | message + 2023-12-04T07:00:01.000000Z | "1" + 2023-12-04T07:00:02.000000Z | "2" + 2023-12-04T07:00:03.000000Z | "3" + 2023-12-04T07:00:04.000000Z | "4" + 2023-12-04T07:00:05.000000Z | "5" + 2023-12-04T07:00:05.100000Z | "6" + 2023-12-04T07:00:05.200000Z | "7" + 2023-12-04T07:00:05.300000Z | "8" + 2023-12-04T07:00:05.400000Z | "9" + 2023-12-04T07:00:05.500000Z | "10" +""", + schema=inputSchema, + ) + + log_table = log_table.select(timestamp=pw.this["@timestamp"], log=pw.this.message) + log_table = log_table.select( + pw.this.log, + timestamp=pw.this.timestamp.dt.strptime("%Y-%m-%dT%H:%M:%S.%fZ").dt.timestamp(), + ) + + t_latest_log = log_table.reduce(last_log=pw.reducers.max(pw.this.timestamp)) + + t_sliding_window = log_table.filter( + pw.this.timestamp >= t_latest_log.ix_ref().last_log - sliding_window_duration + ) + t_alert = t_sliding_window.reduce(count=pw.reducers.count()) + t_alert = t_alert.select( + alert=pw.this.count >= alert_threshold, + latest_update=t_latest_log.ix_ref().last_log, + ) + t_alert = t_alert.select(pw.this.alert) + + results = [] + + def on_alert_event(key, row, time, is_addition): + alert_message = "Alert '{}' changed state to {}".format( + row["alert"], + "ACTIVE" if is_addition else "INACTIVE", + ) + results.append(alert_message) + + pw.io.subscribe(t_alert, on_alert_event) + pw.run() + + assert 
results == ["Alert 'True' changed state to ACTIVE"] diff --git a/python/pathway/tests/ml/test_index.py b/python/pathway/tests/ml/test_index.py index 207f74e9..bb7f6392 100644 --- a/python/pathway/tests/ml/test_index.py +++ b/python/pathway/tests/ml/test_index.py @@ -13,7 +13,7 @@ class PointSchema(pw.Schema): is_query: bool -def sort_arrays(arrays: list[np.ndarray]) -> list[tuple[int, ...]]: +def sort_arrays(arrays: list[np.ndarray]) -> list[tuple[int, int]]: return sorted([tuple(array) for array in arrays]) @@ -34,7 +34,7 @@ def get_points() -> list[tuple[tuple[int, ...], bool]]: def nn_as_table( - to_table: list[tuple[tuple[int, ...], tuple[tuple[int, ...]]]] + to_table: list[tuple[tuple[int, int], tuple[tuple[int, int], ...]]] ) -> pw.Table: return pw.debug.table_from_pandas( pd.DataFrame( @@ -43,7 +43,7 @@ def nn_as_table( "nn": [point[1] for point in to_table], } ) - ).update_types(nn=list[tuple[int, ...]]) + ) def test_all_at_once(): @@ -72,6 +72,48 @@ def test_all_at_once(): assert_table_equality_wo_index(result, expected) +def test_all_at_once_metadata_filter(): + data = get_points() + + class InputSchema(pw.Schema): + coords: tuple[int, int] + is_query: bool + metadata: pw.Json + + df = pd.DataFrame( + { + "coords": [point[0] for point in data], + "is_query": [point[1] for point in data], + "metadata": [{"foo": i} for i, _ in enumerate(data)], + } + ) + table = pw.debug.table_from_pandas(df, schema=InputSchema) + points = table.filter(~pw.this.is_query).without(pw.this.is_query) + queries = table.filter(pw.this.is_query).without(pw.this.is_query, pw.this.metadata) + index = KNNIndex( + points.coords, + points, + n_dimensions=2, + n_and=5, + metadata=points.metadata, + ) + queries += queries.select(metadata_filter="foo > `4`") + result = queries.without(pw.this.metadata_filter) + index.get_nearest_items( + queries.coords, k=2, metadata_filter=queries.metadata_filter + ).select( + nn=pw.apply(sort_arrays, pw.this.coords), + ) + expected = nn_as_table( + [ + ((0, 0), ((-3, 1), (1, 2))), + ((2, -2), ((1, -4), (1, 2))), + ((-1, 1), ((-3, 1), (1, 2))), + ((-2, -3), ((-3, 1), (1, -4))), + ] + ) + assert_table_equality_wo_index(result, expected) + + def stream_points(with_k: bool = False) -> tuple[pw.Table, pw.Table]: points = T( """ diff --git a/python/pathway/tests/temporal/test_asof_joins.py b/python/pathway/tests/temporal/test_asof_joins.py index 392b5c94..fe2e4d87 100644 --- a/python/pathway/tests/temporal/test_asof_joins.py +++ b/python/pathway/tests/temporal/test_asof_joins.py @@ -353,10 +353,9 @@ def test_with_timestamps(): """ ).with_columns( lt=pw.this.lt.dt.strptime(fmt), - rt=pw.if_else( - pw.this.rt.is_not_none(), - pw.declare_type(str, pw.this.rt).dt.strptime(fmt), - None, + rt=pw.require( + pw.this.rt.dt.strptime(fmt), + pw.this.rt, ), ) @@ -374,19 +373,9 @@ def test_with_timestamps(): ], ) def test_incorrect_args(left_type, right_type): - t1 = T( - """ - | t - 1 | 1 - """ - ).select(t=pw.declare_type(left_type, pw.this.t)) + t1 = pw.Table.empty(t=left_type) - t2 = T( - """ - | t - 23 | -15 - """ - ).select(t=pw.declare_type(right_type, pw.this.t)) + t2 = pw.Table.empty(t=right_type) with pytest.raises( TypeError, match=r"Arguments \(t_left, t_right\) have to be of types .* but are of types .*", diff --git a/python/pathway/tests/temporal/test_interval_joins.py b/python/pathway/tests/temporal/test_interval_joins.py index bd7ef407..f4e12149 100644 --- a/python/pathway/tests/temporal/test_interval_joins.py +++ b/python/pathway/tests/temporal/test_interval_joins.py @@ -1109,22 
+1109,9 @@ def test_with_timestamps() -> None: ], ) def test_incorrect_args(join_mode, left_type, right_type, lower_bound, upper_bound): - t1 = T( - """ - | a | t - 0 | 1 | -1 - """ - ) + t1 = pw.Table.empty(a=int, t=left_type) - t2 = T( - """ - | b | t - 0 | 1 | 2 - """ - ) - - t1 = t1.with_columns(t=pw.declare_type(left_type, pw.this.t)) - t2 = t2.with_columns(t=pw.declare_type(right_type, pw.this.t)) + t2 = pw.Table.empty(b=int, t=right_type) with pytest.raises( TypeError, @@ -1144,22 +1131,9 @@ def test_incorrect_args(join_mode, left_type, right_type, lower_bound, upper_bou def test_incorrect_args_specific(): - t1 = T( - """ - | a | t - 0 | 1 | -1 - """ - ) + t1 = pw.Table.empty(a=int, t=DATE_TIME_NAIVE) - t2 = T( - """ - | b | t - 0 | 1 | 2 - """ - ) - - t1 = t1.with_columns(t=pw.declare_type(DATE_TIME_NAIVE, pw.this.t)) - t2 = t2.with_columns(t=pw.declare_type(int, pw.this.t)) + t2 = pw.Table.empty(b=int, t=int) expected_error_message = """Arguments (self_time_expression, other_time_expression, lower_bound, upper_bound) have to be of types (INT, INT, INT, INT) or (FLOAT, FLOAT, @@ -1199,3 +1173,31 @@ def test_errors_on_equal_tables(): match=r"Cannot join table with itself. Use .copy\(\) as one of the arguments of the join.", # noqa ): t1.interval_join(t1, t1.t, t1.t, pw.temporal.interval(-2, 0)) + + +def test_consolidate_for_cutoff(): + t = T( + """ + a | t + 1 | 2 + 2 | 2 + 3 | 2 + 4 | 2 + 5 | 10 + 6 | 2 + 7 | 2 + 8 | 2 + 9 | 2 + """ + ) + t = t._freeze(threshold_column=pw.this.t + 1, time_column=pw.this.t) + + assert_table_equality_wo_index( + t, + T( + """ + a | t + 5 | 10 + """ + ), + ) diff --git a/python/pathway/tests/temporal/test_window_joins.py b/python/pathway/tests/temporal/test_window_joins.py index 77c6aa19..1375ecc6 100644 --- a/python/pathway/tests/temporal/test_window_joins.py +++ b/python/pathway/tests/temporal/test_window_joins.py @@ -237,13 +237,16 @@ def test_window_join_sharded_with_smart_cols(join_type: pw.JoinMode) -> None: w = pw.temporal.tumbling(2) - res = { + join_function = { pw.JoinMode.INNER: t1.window_join_inner, pw.JoinMode.LEFT: t1.window_join_left, pw.JoinMode.RIGHT: t1.window_join_right, pw.JoinMode.OUTER: t1.window_join_outer, - }[join_type](t2, pw.left.t, pw.right.t, w, t1.k == pw.right.k).select( - pw.left.a, pw.right.b, k=pw.declare_type(int, pw.this.k) + }[join_type] + res = ( + join_function(t2, pw.left.t, pw.right.t, w, t1.k == pw.right.k) + .select(pw.left.a, pw.right.b, pw.this.k) + .update_types(k=int) ) assert_table_equality_wo_index(res, expected) @@ -929,22 +932,9 @@ def test_window_join_float(w: pw.temporal.Window) -> None: ], ) def test_incorrect_args(join_mode, left_type, right_type, window, error_str): - t1 = T( - """ - | a | t - 0 | 1 | -1 - """ - ) - - t2 = T( - """ - | b | t - 0 | 1 | 2 - """ - ) + t1 = pw.Table.empty(a=int, t=left_type) - t1 = t1.with_columns(t=pw.declare_type(left_type, pw.this.t)) - t2 = t2.with_columns(t=pw.declare_type(right_type, pw.this.t)) + t2 = pw.Table.empty(b=int, t=right_type) with pytest.raises( TypeError, diff --git a/python/pathway/tests/temporal/test_windows.py b/python/pathway/tests/temporal/test_windows.py index e06f0e31..ce0a4055 100644 --- a/python/pathway/tests/temporal/test_windows.py +++ b/python/pathway/tests/temporal/test_windows.py @@ -288,7 +288,7 @@ def test_sliding(): # in the batch mode, we can test close to nothing; -# basically chcecks whether syntax is not broken +# basically checks whether syntax is not broken # for more tests see test_windows_stream.py def 
test_sliding_compacting(): t = T( @@ -335,7 +335,7 @@ def test_sliding_compacting(): # in the batch mode, we can test close to nothing; -# basically chcecks whether syntax is not broken +# basically checks whether syntax is not broken # for more tests see test_windows_stream.py def test_sliding_compacting_2(): t = T( @@ -379,7 +379,95 @@ def test_sliding_compacting_2(): 1 | 9 | 19 | 10 | 11 | 2 """ ) + assert_table_equality_wo_index(result, res) + + +def test_flush_buffer_long_chain_of_operators(): + t = T( + """ + t + 12 + 14 + 16 + 18 + 20 + 22 + 24 + 26 + """ + ) + + expected = T( + """ + t + 12 + 14 + 16 + 18 + 20 + 22 + 24 + 26 + """ + ) + for i in range(5): + gb = t.windowby( + t.t, + window=pw.temporal.sliding(duration=2, hop=2, offset=1), + behavior=pw.temporal.common_behavior( + delay=8, cutoff=100, keep_results=False + ), + ) + + t = gb.reduce( + t=pw.reducers.any(pw.this.t), + ) + assert_table_equality_wo_index(t, expected) + + +def test_sliding_compacting_flush_buffer(): + t = T( + """ + | instance | t + 1 | 0 | 12 + 2 | 0 | 13 + 3 | 0 | 14 + 4 | 0 | 15 + 5 | 0 | 16 + 6 | 0 | 17 + 7 | 1 | 10 + 8 | 1 | 11 + """ + ) + + gb = t.windowby( + t.t, + window=pw.temporal.sliding(duration=10, hop=3), + behavior=pw.temporal.common_behavior(delay=8, cutoff=10, keep_results=False), + instance=t.instance, + ) + + result = gb.reduce( + pw.this._pw_shard, + pw.this._pw_window_start, + pw.this._pw_window_end, + min_t=pw.reducers.min(pw.this.t), + max_t=pw.reducers.max(pw.this.t), + count=pw.reducers.count(), + ) + + res = T( + """ + _pw_instance | _pw_window_start | _pw_window_end | min_t | max_t | count + 0 | 6 | 16 | 12 | 15 | 4 + 0 | 9 | 19 | 12 | 17 | 6 + 0 | 12 | 22 | 12 | 17 | 6 + 0 | 15 | 25 | 15 | 17 | 3 + 1 | 6 | 16 | 10 | 11 | 2 + 1 | 9 | 19 | 10 | 11 | 2 + """ + ) assert_table_equality_wo_index(result, res) @@ -789,14 +877,7 @@ def test_windows_with_datetimes(w): ], ) def test_incorrect_args(dtype, window, error_str): - t1 = T( - """ - | a | t - 0 | 1 | -1 - """ - ) - - t1 = t1.with_columns(t=pw.declare_type(dtype, pw.this.t)) + t1 = pw.Table.empty(a=int, t=dtype) with pytest.raises( TypeError, diff --git a/python/pathway/tests/temporal/test_windows_stream.py b/python/pathway/tests/temporal/test_windows_stream.py index 6a4841ee..6047fd63 100644 --- a/python/pathway/tests/temporal/test_windows_stream.py +++ b/python/pathway/tests/temporal/test_windows_stream.py @@ -62,13 +62,20 @@ def generate_buffer_output( key = (window, entry["value"]) buffer[key] = entry - for window, value in buffer.keys(): + bufkeys = list(buffer.keys()) + for window, value in bufkeys: entry = buffer[(window, value)] threshold = window[1] + delay if last_time != now and threshold <= now and threshold > last_time: to_process.append((window, entry)) + buffer.pop((window, value)) output.extend(to_process) + # flush buffer + bufkeys = list(buffer.keys()) + for window, value in bufkeys: + entry = buffer.pop((window, value)) + output.append((window, entry)) return output @@ -304,13 +311,7 @@ def test_non_zero_delay_non_zero_buffer_remove_results(): parametrized_test(5, 3, 1, 1, False) -def test_exactly_once(): - duration = 5 - hop = 3 - delay = 6 - cutoff = 1 - keep_results = True - result = create_windowby_scenario(duration, hop, delay, cutoff, keep_results) +def create_expected_for_exactly_once(result, duration): expected = [] for i, window_end in enumerate([2, 5, 8, 11, 14]): pk_row: dict = { @@ -327,6 +328,44 @@ def test_exactly_once(): } expected.append(DiffEntry.create(result, pk_row, i, True, row)) + + # flush 
buffer + row: dict = { + "_pw_window_end": 17, + "max_time": 16, + "max_value": 67, + } + pk_row: dict = { + "_pw_window": (None, 12, 17), + "_pw_window_start": 12, + "_pw_window_end": 17, + "_pw_instance": None, + } + expected.append(DiffEntry.create(result, pk_row, 17, True, row)) + + row: dict = { + "_pw_window_end": 20, + "max_time": 16, + "max_value": 67, + } + pk_row: dict = { + "_pw_window": (None, 15, 20), + "_pw_window_start": 15, + "_pw_window_end": 20, + "_pw_instance": None, + } + expected.append(DiffEntry.create(result, pk_row, 20, True, row)) + return expected + + +def test_exactly_once(): + duration = 5 + hop = 3 + delay = 6 + cutoff = 1 + keep_results = True + result = create_windowby_scenario(duration, hop, delay, cutoff, keep_results) + expected = create_expected_for_exactly_once(result, duration) assert_stream_equal(expected, result) run() @@ -361,23 +400,7 @@ def test_exactly_once_from_behavior(): max_time=pw.reducers.max(pw.this.time), max_value=pw.reducers.max(pw.this.value), ) - - expected = [] - for i, window_end in enumerate([2, 5, 8, 11, 14]): - pk_row: dict = { - "_pw_window": (None, window_end - duration, window_end), - "_pw_window_start": window_end - duration, - "_pw_window_end": window_end, - "_pw_instance": None, - } - - row: dict = { - "_pw_window_end": window_end, - "max_time": window_end - 1, - "max_value": 2 * window_end - 1, - } - - expected.append(DiffEntry.create(result, pk_row, i, True, row)) + expected = create_expected_for_exactly_once(result, duration) assert_stream_equal(expected, result) run() diff --git a/python/pathway/tests/test_api.py b/python/pathway/tests/test_api.py index 2975833a..cdf58e9d 100644 --- a/python/pathway/tests/test_api.py +++ b/python/pathway/tests/test_api.py @@ -69,13 +69,7 @@ def static_table_from_pandas(scope, df, ptr_columns=(), legacy=True): def convert_table(scope, table): if isinstance(table, api.LegacyTable): - new_table = scope.columns_to_table( - table.universe, - [ - (column, column_path.ColumnPath((i,))) - for (i, column) in enumerate(table.columns) - ], - ) + new_table = scope.columns_to_table(table.universe, table.columns) return ( new_table, [column_path.ColumnPath((i,)) for i in range(len(table.columns))], diff --git a/python/pathway/tests/test_build_and_run.py b/python/pathway/tests/test_build_and_run.py index 1a8731f0..57710926 100644 --- a/python/pathway/tests/test_build_and_run.py +++ b/python/pathway/tests/test_build_and_run.py @@ -19,6 +19,7 @@ from pathway.internals import column, datasink, datasource, graph_runner from pathway.internals.decorators import table_from_datasource from pathway.internals.graph_runner.state import ScopeState +from pathway.internals.graph_runner.storage_graph import OperatorStorageGraph from pathway.internals.monitoring import MonitoringLevel from pathway.internals.parse_graph import G from pathway.internals.schema import Schema, schema_from_pandas @@ -32,7 +33,7 @@ def test_process_only_relevant_nodes(): input2 = Table.empty() output = input2.select() - def validate(state: ScopeState) -> None: + def validate(state: ScopeState, storage_graph: OperatorStorageGraph) -> None: assert not state.has_legacy_table(input1) assert state.has_legacy_table(input2) assert state.has_legacy_table(output) @@ -48,7 +49,7 @@ def test_process_relevant_nodes_and_debug_nodes(): input2.debug("input2") input3 = Table.empty() - def validate(state: ScopeState) -> None: + def validate(state: ScopeState, storage_graph: OperatorStorageGraph) -> None: assert state.has_legacy_table(input1) assert 
state.has_legacy_table(input2) assert not state.has_legacy_table(input3) @@ -66,7 +67,7 @@ def test_process_output_nodes(tmp_path: pathlib.Path): file_path = tmp_path / "test_output.csv" csv.write(input2, file_path) - def validate(state: ScopeState) -> None: + def validate(state: ScopeState, storage_graph: OperatorStorageGraph) -> None: assert not state.has_legacy_table(input1) assert state.has_legacy_table(input2) @@ -85,7 +86,7 @@ def test_process_output_nodes_and_debug_nodes(tmp_path: pathlib.Path): file_path = tmp_path / "test_output.csv" csv.write(input2, file_path) - def validate(state) -> None: + def validate(state: ScopeState, storage_graph: OperatorStorageGraph) -> None: assert state.has_legacy_table(input1) assert state.has_legacy_table(input2) assert not state.has_legacy_table(input3) @@ -100,7 +101,7 @@ def test_process_all_nodes(): input1 = Table.empty() input2 = Table.empty() - def validate(state: ScopeState) -> None: + def validate(state: ScopeState, storage_graph: OperatorStorageGraph) -> None: assert state.has_legacy_table(input1) assert state.has_legacy_table(input2) @@ -158,9 +159,6 @@ def test_debug_datasource_schema_mismatch(): ).run_tables(input) -@pytest.mark.xfail( - reason="Columns are not used everywhere. Add similar test for columns in storages." -) def test_process_only_relevant_columns(): input1 = T( """ @@ -175,20 +173,17 @@ def test_process_only_relevant_columns(): filtered = input1.filter(this.foo <= 42) result = filtered.select(this.bar) - def validate(state: ScopeState) -> None: - assert state.has_column(filtered._get_column("bar")) - assert state.has_column(result._get_column("bar")) - assert not state.has_column(filtered._get_column("foo")) - assert not state.has_column(filtered._get_column("baz")) + def validate(state: ScopeState, storage_graph: OperatorStorageGraph) -> None: + assert storage_graph.has_column(filtered, filtered._get_column("bar")) + assert storage_graph.has_column(result, result._get_column("bar")) + assert not storage_graph.has_column(filtered, filtered._get_column("foo")) + assert not storage_graph.has_column(filtered, filtered._get_column("baz")) graph_runner.GraphRunner(G, monitoring_level=MonitoringLevel.NONE).run_tables( result, after_build=validate ) -@pytest.mark.xfail( - reason="Columns are not used everywhere. Add similar test for columns in storages." -) def test_process_columns_of_debug_nodes(): input = T( """ @@ -199,18 +194,15 @@ def test_process_columns_of_debug_nodes(): result = input.select(input.foo) result.debug(name="result") - def validate(state: ScopeState): - assert state.has_column(input.foo._column) - assert state.has_column(result.foo._column) + def validate(state: ScopeState, storage_graph: OperatorStorageGraph): + assert storage_graph.has_column(input, input.foo._column) + assert storage_graph.has_column(result, result.foo._column) graph_runner.GraphRunner( G, debug=True, monitoring_level=MonitoringLevel.NONE ).run_outputs(after_build=validate) -@pytest.mark.xfail( - reason="Columns are not used everywhere. Add similar test for columns in storages." 
-) def test_process_row_transformer_columns_if_needed(): @transformer class foo_transformer: @@ -234,19 +226,19 @@ def ret(self) -> int: result1 = foo_transformer(input).table - def validate(state: ScopeState) -> None: - assert state.has_column(input._get_column("arg")) - assert state.has_column(input._get_column("foo")) - assert state.has_column(result1._get_column("ret")) + def validate(state: ScopeState, storage_graph: OperatorStorageGraph) -> None: + assert storage_graph.has_column(input, input._get_column("arg")) + assert storage_graph.has_column(input, input._get_column("foo")) + assert storage_graph.has_column(result1, result1._get_column("ret")) builder.run_tables(result1, after_build=validate) result2 = input.select(this.arg) - def validate(state: ScopeState) -> None: - assert state.has_column(input._get_column("arg")) - assert not state.has_column(input._get_column("foo")) - assert state.has_column(result2._get_column("arg")) + def validate(state: ScopeState, storage_graph: OperatorStorageGraph) -> None: + assert storage_graph.has_column(input, input._get_column("arg")) + assert not storage_graph.has_column(input, input._get_column("foo")) + assert storage_graph.has_column(result2, result2._get_column("arg")) builder.run_tables(result2, after_build=validate) @@ -271,7 +263,7 @@ def test_groupby_cache(): assert g1 == g2 - def validate(state: ScopeState): + def validate(state: ScopeState, storage_graph: OperatorStorageGraph): groupby_contexts = list( ctx for ctx in state.evaluators.keys() @@ -304,7 +296,7 @@ def test_groupby_cache_multiple_cols(): assert g1 == g2 - def validate(state: ScopeState): + def validate(state: ScopeState, storage_graph: OperatorStorageGraph): groupby_contexts = list( ctx for ctx in state.evaluators.keys() @@ -338,7 +330,7 @@ def test_groupby_cache_similar_tables(): g1.reduce(min=reducers.max(table.age)) g2.reduce(min=reducers.max(copy.age)) - def validate(state: ScopeState): + def validate(state: ScopeState, storage_graph: OperatorStorageGraph): groupby_contexts = list( ctx for ctx in state.evaluators.keys() diff --git a/python/pathway/tests/test_column_properties.py b/python/pathway/tests/test_column_properties.py index cb0573a4..978e5040 100644 --- a/python/pathway/tests/test_column_properties.py +++ b/python/pathway/tests/test_column_properties.py @@ -1,9 +1,11 @@ import pytest import pathway.internals as pw +from pathway import io from pathway.internals import dtype as dt from pathway.internals.column_properties import ColumnProperties -from pathway.internals.decorators import empty_from_schema +from pathway.internals.datasource import DataSource +from pathway.internals.decorators import empty_from_schema, table_from_datasource from pathway.tests.utils import T @@ -26,14 +28,12 @@ def test_preserve_dependency_properties(): ) input3 = T( """ - | c - 1 | 42 - """, - schema=pw.schema_builder( - {"c": pw.column_definition(dtype=int)}, - properties=pw.SchemaProperties(append_only=False), - ), - ) + | c | __diff__ + 1 | 42 | 1 + 1 | 42 | -1 + 1 | 43 | 1 + """ + ).with_universe_of(input1) result = input1.select(a=input1.a, b=input1.a + input2.b, c=input1.a + input3.c) @@ -57,14 +57,12 @@ def test_preserve_context_dependency_properties(): ) input3 = T( """ - | c - 1 | 42 - """, - schema=pw.schema_builder( - {"c": pw.column_definition(dtype=int)}, - properties=pw.SchemaProperties(append_only=False), - ), - ) + | c | __diff__ + 1 | 42 | 1 + 1 | 42 | -1 + 1 | 43 | 1 + """ + ).with_universe_of(input1) res1 = input1.filter(pw.this.a == input2.b) res2 = 
input1.filter(pw.this.a == input3.c) @@ -73,12 +71,20 @@ def test_preserve_context_dependency_properties(): assert_col_props(res2.a, ColumnProperties(dtype=dt.INT, append_only=False)) +class MockDataSource(DataSource): + def is_bounded(self) -> bool: + raise NotImplementedError() + + def is_append_only(self) -> bool: + return False + + @pytest.mark.parametrize("append_only", [True, False]) def test_const_column_properties(append_only): class Schema(pw.Schema, append_only=append_only): a: int = pw.column_definition(primary_key=True) - table = empty_from_schema(Schema) + table = table_from_datasource(MockDataSource(schema=Schema)) result = table.select(ret=42) @@ -91,7 +97,7 @@ def test_universe_properties(append_only): class Schema(pw.Schema, append_only=append_only): a: int = pw.column_definition(primary_key=True) - table = empty_from_schema(Schema) + table = table_from_datasource(MockDataSource(schema=Schema)) result = table.select() assert table._id_column.properties.append_only == append_only @@ -141,3 +147,55 @@ def test_table_from_markdown_append_only(): """ ) assert not input3._id_column.properties.append_only + + +def test_python_connector_append_only(): + class TestSubject1(io.python.ConnectorSubject): + def run(): + pass + + class TestSubject2(io.python.ConnectorSubject): + def run(): + pass + + @property + def _deletions_enabled(self) -> bool: + return False + + class TestSchema(pw.Schema): + a: int + + input1 = io.python.read(TestSubject1(), schema=TestSchema) + assert not input1._id_column.properties.append_only + + input2 = io.python.read(TestSubject2(), schema=TestSchema) + assert input2._id_column.properties.append_only + + +def test_append_only_no_columns(): + class MockDataSource(DataSource): + def is_bounded(self) -> bool: + raise NotImplementedError() + + def is_append_only(self) -> bool: + return True + + class Schema(pw.Schema, append_only=True): + pass + + table = table_from_datasource(MockDataSource(schema=Schema)) + assert table._id_column.properties.append_only + + +@pytest.mark.parametrize("delete_completed_queries", [False, True]) +def test_rest_connector(delete_completed_queries: bool): + class TestSchema(pw.Schema): + a: int + + table, response_writer = io.http.rest_connector( + host="127.0.0.1", + port=30000, # server is not started, port number does not matter + schema=TestSchema, + delete_completed_queries=delete_completed_queries, + ) + assert table._id_column.properties.append_only != delete_completed_queries diff --git a/python/pathway/tests/test_common.py b/python/pathway/tests/test_common.py index 50b9f635..e4bb1bfc 100644 --- a/python/pathway/tests/test_common.py +++ b/python/pathway/tests/test_common.py @@ -1008,10 +1008,7 @@ def test_flatten(dtype: Any): "other": [-1, -1, -3, -3, -4, -4, -4, -5, -5, -5, -5, -5], } ) - new_dtype = list[int] if dtype == np.int64 else list[float] - t1 = table_from_pandas(df).with_columns( - array=pw.declare_type(new_dtype, pw.this.array) - ) + t1 = table_from_pandas(df) t1 = t1.flatten(t1.array, t1.other) expected = table_from_pandas(expected_df) assert_table_equality_wo_index(t1, expected) @@ -1079,11 +1076,7 @@ def test_flatten_explode(mul: int, dtype: Any): }, dtype=dtype, ) - t1 = table_from_pandas(df).with_columns( - array=pw.declare_type( - {np.int64: list[int], np.float64: list[float]}[dtype], pw.this.array - ) - ) + t1 = table_from_pandas(df) t1 = t1.flatten( t1.array, other=mul * pw.cast({np.int64: int, np.float64: float}[dtype], t1.other), @@ -1630,6 +1623,33 @@ def f(t: pw.Table): ) +def 
test_iterate_with_diverging_columns(): + t = T( + """ + a + 1 + """ + ) + + t = t.select(pw.this.a, b=pw.this.a) + + def f(t: pw.Table): + t = t.select(pw.this.a, b=pw.this.b * 2) + return dict(t=t) + + t = pw.iterate(f, iteration_limit=2, t=t).t + + assert_table_equality( + t, + T( + """ + a | b + 1 | 4 + """ + ), + ) + + def test_apply(): a = T( """ @@ -4586,7 +4606,7 @@ def test_lazy_coalesce(): 3 """ ) - ret = tab.select(col=pw.declare_type(int, pw.coalesce(tab.col, tab.col / 0))) + ret = tab.select(col=pw.coalesce(tab.col, tab.col // 0)) assert_table_equality(ret, tab) @@ -5219,10 +5239,10 @@ def test_sequence_get_unchecked_variable_length_untyped(): """ ) - t2 = t1.select(tup=pw.declare_type(Any, pw.apply(_create_tuple, pw.this.a))) + t2 = t1.select(tup=pw.apply(_create_tuple, pw.this.a)) t3 = t2.select(x=pw.this.tup[2], y=pw.this.tup[-3]) - assert_table_equality_wo_types(t3, expected) + assert_table_equality(t3, expected) def test_sequence_get_checked_variable_length(): @@ -5241,10 +5261,10 @@ def test_sequence_get_checked_variable_length(): 1 | 1 2 | 1 """ - ) + ).update_types(y=int | None) t2 = t1.select(tup=pw.apply(_create_tuple, pw.this.a)) - t3 = t2.select(x=pw.this.tup.get(1), y=pw.declare_type(int, pw.this.tup.get(-1))) + t3 = t2.select(x=pw.this.tup.get(1), y=pw.this.tup.get(-1)) assert_table_equality(t3, expected) @@ -5338,7 +5358,7 @@ def test_sequence_get_from_1d_ndarray(dtype, index, checked): "index_neg": [-2, -1, -1], } ) - ).update_columns(a=pw.declare_type(np.ndarray, pw.this.a)) + ) expected = T( """ a @@ -5346,12 +5366,12 @@ def test_sequence_get_from_1d_ndarray(dtype, index, checked): 5 0 """ - ) + ).update_types(a=dtype) if checked: result = t.select(a=pw.this.a.get(index)) else: result = t.select(a=pw.this.a[index]) - assert_table_equality_wo_index_types(result, expected) + assert_table_equality_wo_index(result, expected) @pytest.mark.parametrize("dtype", [int, float]) @@ -5368,7 +5388,7 @@ def test_sequence_get_from_2d_ndarray(dtype, index, checked): ] } ) - ).select(a=pw.declare_type(np.ndarray, pw.this.a)) + ) expected = pw.debug.table_from_pandas( pd.DataFrame( { @@ -5379,15 +5399,13 @@ def test_sequence_get_from_2d_ndarray(dtype, index, checked): ] } ) - ).select(a=pw.declare_type(np.ndarray, pw.this.a)) + ) if checked: result = t.select(a=pw.this.a.get(index)) else: result = t.select(a=pw.this.a[index]) - result = result.select(a=pw.declare_type(np.ndarray, pw.this.a)) - assert_table_equality_wo_index(result, expected) @@ -5407,10 +5425,14 @@ def test_sequence_get_from_1d_ndarray_default(dtype, index, expected): "index": index, } ) - ).update_columns(a=pw.declare_type(np.ndarray, pw.this.a)) - expected = pw.debug.table_from_pandas(pd.DataFrame({"a": expected})) + ) + expected = pw.debug.table_from_pandas( + pd.DataFrame({"a": expected}).astype( + dtype={"a": {int: "int", float: "float"}[dtype]} + ) + ) result = t.select(a=pw.this.a.get(pw.this.index, default=-1)) - assert_table_equality_wo_index_types(result, expected) + assert_table_equality_wo_index(result, expected) @pytest.mark.parametrize("dtype", [int, float]) @@ -5427,7 +5449,7 @@ def test_sequence_get_from_1d_ndarray_out_of_bounds(dtype, index): "index": index, } ) - ).update_columns(a=pw.declare_type(np.ndarray, pw.this.a)) + ) t.select(a=pw.this.a[pw.this.index]) with pytest.raises(IndexError): run_all() diff --git a/python/pathway/tests/test_dtypes.py b/python/pathway/tests/test_dtypes.py new file mode 100644 index 00000000..d60b4de2 --- /dev/null +++ b/python/pathway/tests/test_dtypes.py @@ 
-0,0 +1,15 @@ +# Copyright © 2023 Pathway + +import pathway.internals.dtype as dt + + +def test_identities(): + assert dt.Optional(dt.INT) is dt.Optional(dt.INT) + assert dt.Pointer() is dt.Pointer() + assert dt.Tuple(dt.INT, dt.Optional(dt.POINTER)) is dt.Tuple( + dt.INT, dt.Optional(dt.POINTER) + ) + assert dt.Tuple(dt.INT, ...) is dt.List(dt.INT) + assert dt.Optional(dt.ANY) is dt.ANY + assert dt.Optional(dt.Optional(dt.INT)) is dt.Optional(dt.INT) + assert dt.Array(2, dt.Array(2, dt.INT)) is dt.Array(4, dt.INT) diff --git a/python/pathway/tests/test_error_messages.py b/python/pathway/tests/test_error_messages.py index eaaff934..19158400 100644 --- a/python/pathway/tests/test_error_messages.py +++ b/python/pathway/tests/test_error_messages.py @@ -8,7 +8,7 @@ import pytest import pathway as pw -from pathway.internals.runtime_type_check import runtime_type_check +from pathway.internals.runtime_type_check import check_arg_types from pathway.tests.utils import ( T, assert_table_equality, @@ -203,7 +203,7 @@ def test_session_simple(): def test_runtime_type_check_decorator(): - @runtime_type_check + @check_arg_types def foo(x: int): pass @@ -252,13 +252,12 @@ def test_traceback_rust_expression(): 2 | b 3 | c """ - ) + ).update_types(bar=int) - input = input.with_columns(bar=pw.declare_type(int, pw.this.bar)) input.select(r=pw.this.foo + pw.this.bar) # cause with _assert_error_trace(TypeError): - run_all() + run_all(runtime_typechecking=False) @pytest.mark.xfail @@ -296,12 +295,12 @@ def func(x: int) -> int: foo bar """ - ).with_columns(val=pw.declare_type(int, pw.this.val)) + ).update_types(val=int) pw.iterate(iterate, iterated=input) with _assert_error_trace(TypeError): - run_all() + run_all(runtime_typechecking=False) def test_traceback_transformers_1(): diff --git a/python/pathway/tests/test_flatten.py b/python/pathway/tests/test_flatten.py index 0f31e6df..173bc8c3 100644 --- a/python/pathway/tests/test_flatten.py +++ b/python/pathway/tests/test_flatten.py @@ -12,9 +12,7 @@ def test_flatten_simple(): - tab = T( - pd.DataFrame.from_dict({"col": [[1, 2, 3, 4]]}), format="pandas" - ).with_columns(col=pw.declare_type(list[int], pw.this.col)) + tab = T(pd.DataFrame.from_dict({"col": [[1, 2, 3, 4]]}), format="pandas") assert_table_equality_wo_index( tab.flatten(this.col, origin_id=this.id), @@ -31,9 +29,7 @@ def test_flatten_simple(): def test_flatten_no_origin(): - tab = T( - pd.DataFrame.from_dict({"col": [[1, 2, 3, 4]]}), format="pandas" - ).with_columns(col=pw.declare_type(list[int], pw.this.col)) + tab = T(pd.DataFrame.from_dict({"col": [[1, 2, 3, 4]]}), format="pandas") assert_table_equality_wo_index( tab.flatten(this.col), @@ -50,9 +46,7 @@ def test_flatten_no_origin(): def test_flatten_inner_repeats(): - tab = T( - pd.DataFrame.from_dict({"col": [[1, 1, 1, 3]]}), format="pandas" - ).with_columns(col=pw.declare_type(list[int], pw.this.col)) + tab = T(pd.DataFrame.from_dict({"col": [[1, 1, 1, 3]]}), format="pandas") assert_table_equality_wo_index( tab.flatten(this.col, origin_id=this.id), @@ -69,9 +63,7 @@ def test_flatten_inner_repeats(): def test_flatten_more_repeats(): - tab = T( - pd.DataFrame.from_dict({"col": [[1, 1, 1, 3], [1]]}), format="pandas" - ).with_columns(col=pw.declare_type(list[int], pw.this.col)) + tab = T(pd.DataFrame.from_dict({"col": [[1, 1, 1, 3], [1]]}), format="pandas") assert_table_equality_wo_index( tab.flatten(this.col, origin_id=this.id), diff --git a/python/pathway/tests/test_io.py b/python/pathway/tests/test_io.py index 4970272f..00c9b047 100644 --- 
a/python/pathway/tests/test_io.py +++ b/python/pathway/tests/test_io.py @@ -5,6 +5,7 @@ import os import pathlib import random +import re import sqlite3 import sys import threading @@ -29,11 +30,13 @@ assert_table_equality_wo_index, assert_table_equality_wo_index_types, assert_table_equality_wo_types, + needs_multiprocessing_fork, + run, run_all, wait_result_with_checker, write_csv, write_lines, - xfail_on_darwin, + xfail_on_multiple_threads, ) @@ -240,6 +243,159 @@ class InputSchema(pw.Schema): assert result.equals(expected) +def test_python_connector_remove(): + class TestSubject(pw.io.python.ConnectorSubject): + def run(self): + self._add( + api.ref_scalar(1), + json.dumps({"key": 1, "genus": "upupa", "epithet": "epops"}).encode(), + ) + self._remove( + api.ref_scalar(1), + json.dumps({"key": 1, "genus": "upupa", "epithet": "epops"}).encode(), + ) + self._add( + api.ref_scalar(3), + json.dumps( + {"key": 3, "genus": "bubo", "epithet": "scandiacus"} + ).encode(), + ) + + class InputSchema(pw.Schema): + key: int + genus: str + epithet: str + + table = pw.io.python.read( + TestSubject(), + schema=InputSchema, + ) + + assert_table_equality_wo_index( + table, + T( + """ + key | genus | epithet + 3 | bubo | scandiacus + """, + ), + ) + + +def test_python_connector_deletions_disabled(): + class TestSubject(pw.io.python.ConnectorSubject): + def run(self): + self._add( + api.ref_scalar(1), + json.dumps({"key": 1, "genus": "upupa", "epithet": "epops"}).encode(), + ) + self._add( + api.ref_scalar(3), + json.dumps( + {"key": 3, "genus": "bubo", "epithet": "scandiacus"} + ).encode(), + ) + + @property + def _deletions_enabled(self) -> bool: + return False + + class InputSchema(pw.Schema): + key: int + genus: str + epithet: str + + table = pw.io.python.read( + TestSubject(), + schema=InputSchema, + ) + + assert_table_equality_wo_index( + table, + T( + """ + key | genus | epithet + 1 | upupa | epops + 3 | bubo | scandiacus + """, + ), + ) + + +def test_python_connector_deletions_disabled_logs_error_on_delete(caplog): + class TestSubject(pw.io.python.ConnectorSubject): + def run(self): + self._add( + api.ref_scalar(1), + json.dumps({"key": 1, "genus": "upupa", "epithet": "epops"}).encode(), + ) + self._remove( + api.ref_scalar(1), + json.dumps({"key": 1, "genus": "upupa", "epithet": "epops"}).encode(), + ) + self._add( + api.ref_scalar(3), + json.dumps( + {"key": 3, "genus": "bubo", "epithet": "scandiacus"} + ).encode(), + ) + + @property + def _deletions_enabled(self) -> bool: + return False + + class InputSchema(pw.Schema): + key: int + genus: str + epithet: str + + pw.io.python.read( + TestSubject(), + schema=InputSchema, + ) + + run_all() + + assert re.search( + r"Trying to delete a row in .* but deletions_enabled is set to False", + caplog.text, + ) + + +def test_python_connector_deletions_disabled_logs_error_on_upsert(caplog): + class TestSubject(pw.io.python.ConnectorSubject): + def run(self): + self._add( + api.ref_scalar(1), + json.dumps({"key": 1, "genus": "upupa", "epithet": "epops"}).encode(), + ) + + @property + def _deletions_enabled(self) -> bool: + return False + + @property + def _session_type(self) -> SessionType: + return SessionType.UPSERT + + class InputSchema(pw.Schema): + key: int + genus: str + epithet: str + + pw.io.python.read( + TestSubject(), + schema=InputSchema, + ) + + run_all() + + assert re.search( + r"Trying to upsert a row in .* but deletions_enabled is set to False", + caplog.text, + ) + + def test_csv_static_read_write(tmp_path: pathlib.Path): data = """ k | v @@ 
-501,13 +657,12 @@ def run(self): run_all() - threads_num = int(os.environ.get("PATHWAY_THREADS", "1")) root.assert_has_calls( [ mock.call.on_change( key=mock.ANY, row={"data": "foo"}, time=mock.ANY, is_addition=True ), - *[mock.call.on_end()] * threads_num, + mock.call.on_end(), ] ) @@ -585,16 +740,16 @@ async def invoke(self, value: int) -> dict[str, Any]: persistence_mode=api.PersistenceMode.UDF_CACHING, ) - pw.run( - monitoring_level=pw.MonitoringLevel.NONE, + run( persistence_config=persistence_config, ) -@pytest.mark.xfail(reason="stil fails randomly") +@pytest.mark.flaky(reruns=2) +@xfail_on_multiple_threads +@needs_multiprocessing_fork def test_async_transformer_idempotency(monkeypatch): - if os.environ.get("PATHWAY_PERSISTENT_STORAGE"): - monkeypatch.delenv("PATHWAY_PERSISTENT_STORAGE") + monkeypatch.delenv("PATHWAY_PERSISTENT_STORAGE", raising=False) class OutputSchema(pw.Schema): ret: int @@ -768,8 +923,8 @@ def test_fs_raw(tmp_path: pathlib.Path): pw.debug.compute_and_print(table) -@pytest.mark.xfail(reason="randomly fails for yet unknown reason") -@pytest.mark.xdist_group(name="streaming_tests") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_csv_directory(tmp_path: pathlib.Path): inputs_path = tmp_path / "inputs/" os.mkdir(inputs_path) @@ -800,11 +955,11 @@ class InputSchema(pw.Schema): output_path = tmp_path / "output.csv" pw.io.csv.write(table, output_path) - assert wait_result_with_checker(CsvLinesNumberChecker(output_path, 2), 30) + wait_result_with_checker(CsvLinesNumberChecker(output_path, 2), 30) -@xfail_on_darwin(reason="running pw.run from separate process not supported") -@pytest.mark.xdist_group(name="streaming_tests") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_csv_streaming(tmp_path: pathlib.Path): inputs_path = tmp_path / "inputs/" start_streaming_inputs(inputs_path, 5, 1.0, "csv") @@ -823,11 +978,11 @@ class InputSchema(pw.Schema): output_path = tmp_path / "output.csv" pw.io.csv.write(table, str(output_path)) - assert wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) + wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) -@xfail_on_darwin(reason="running pw.run from separate process not supported") -@pytest.mark.xdist_group(name="streaming_tests") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_json_streaming(tmp_path: pathlib.Path): inputs_path = tmp_path / "inputs/" start_streaming_inputs(inputs_path, 5, 1.0, "json") @@ -846,11 +1001,11 @@ class InputSchema(pw.Schema): output_path = tmp_path / "output.csv" pw.io.csv.write(table, str(output_path)) - assert wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) + wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) -@xfail_on_darwin(reason="running pw.run from separate process not supported") -@pytest.mark.xdist_group(name="streaming_tests") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_plaintext_streaming(tmp_path: pathlib.Path): inputs_path = tmp_path / "inputs/" start_streaming_inputs(inputs_path, 5, 1.0, "plaintext") @@ -864,11 +1019,11 @@ def test_plaintext_streaming(tmp_path: pathlib.Path): output_path = tmp_path / "output.csv" pw.io.csv.write(table, str(output_path)) - assert wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) + wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) -@xfail_on_darwin(reason="running pw.run from separate process not supported") -@pytest.mark.xdist_group(name="streaming_tests") 
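The hunk above that rewrites the assert_has_calls expectation drops the per-worker multiplication of on_end calls, so subscribe's on_end callback is now expected to fire once per run rather than once per worker thread. A minimal sketch of the subscribe callbacks used throughout these tests (the table contents and callback bodies are illustrative):

import pathway as pw

t = pw.debug.table_from_markdown(
    """
    data
    foo
    bar
    """
)

def on_change(key, row, time, is_addition):
    # called once per row change; is_addition is False when a row is retracted
    print(row["data"], is_addition)

def on_end(*args, **kwargs):
    # with this release, expected to fire once per run, not once per worker
    print("done")

pw.io.subscribe(t, on_change=on_change, on_end=on_end)
pw.run()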
+@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_csv_streaming_fs_alias(tmp_path: pathlib.Path): inputs_path = tmp_path / "inputs/" start_streaming_inputs(inputs_path, 5, 1.0, "csv") @@ -888,11 +1043,11 @@ class InputSchema(pw.Schema): output_path = tmp_path / "output.csv" pw.io.csv.write(table, str(output_path)) - assert wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) + wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) -@xfail_on_darwin(reason="running pw.run from separate process not supported") -@pytest.mark.xdist_group(name="streaming_tests") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_json_streaming_fs_alias(tmp_path: pathlib.Path): inputs_path = tmp_path / "inputs/" start_streaming_inputs(inputs_path, 5, 1.0, "json") @@ -912,11 +1067,11 @@ class InputSchema(pw.Schema): output_path = tmp_path / "output.csv" pw.io.csv.write(table, str(output_path)) - assert wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) + wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) -@xfail_on_darwin(reason="running pw.run from separate process not supported") -@pytest.mark.xdist_group(name="streaming_tests") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_plaintext_streaming_fs_alias(tmp_path: pathlib.Path): inputs_path = tmp_path / "inputs/" start_streaming_inputs(inputs_path, 5, 1.0, "plaintext") @@ -931,7 +1086,7 @@ def test_plaintext_streaming_fs_alias(tmp_path: pathlib.Path): output_path = tmp_path / "output.csv" pw.io.csv.write(table, str(output_path)) - assert wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) + wait_result_with_checker(CsvLinesNumberChecker(output_path, 5), 30) def test_pathway_type_mapping(): @@ -1099,7 +1254,7 @@ def run_computation(py_connector_input, fs_connector_input): table_py.data ) pw.io.csv.write(table_joined, output_path) - pw.run( + run( persistence_config=pw.persistence.Config.simple_config( pw.persistence.Backend.filesystem(persistent_storage_path), ) @@ -1147,7 +1302,7 @@ def test_no_pstorage(tmp_path: pathlib.Path): api.EngineError, match="persistent metadata backend failed: target object should be a directory", ): - pw.run( + run( persistence_config=pw.persistence.Config.simple_config( pw.persistence.Backend.filesystem(path), ) @@ -1163,7 +1318,7 @@ def test_persistent_id_not_assigned_autogenerate(tmp_path: pathlib.Path): table = pw.io.plaintext.read(input_path, mode="static") pw.io.csv.write(table, tmp_path / "output.txt") - pw.run( + run( persistence_config=pw.persistence.Config.simple_config( pw.persistence.Backend.filesystem(pstorage_path) ) @@ -1180,7 +1335,7 @@ def test_no_persistent_storage(tmp_path: pathlib.Path): ValueError, match="persistent id 1 is assigned, but no persistent storage is configured", ): - pw.run() + run() def test_duplicated_persistent_id(tmp_path: pathlib.Path): @@ -1202,7 +1357,7 @@ def test_duplicated_persistent_id(tmp_path: pathlib.Path): ValueError, match="Persistent ID 'one' used more than once", ): - pw.run( + run( persistence_config=pw.persistence.Config.simple_config( pw.persistence.Backend.filesystem(pstorage_path) ) @@ -1256,7 +1411,7 @@ class InputSchema(pw.Schema): inputs_path, format=input_format, schema=InputSchema, - mode="streaming_with_deletions", + mode="streaming", autocommit_duration_ms=1, with_metadata=True, ) @@ -1267,7 +1422,7 @@ class InputSchema(pw.Schema): inputs_thread = threading.Thread(target=streaming_target, daemon=True) inputs_thread.start() - assert 
wait_result_with_checker( + wait_result_with_checker( FileLinesNumberChecker(output_path, expected_output_lines), 30 ) @@ -1301,7 +1456,8 @@ class InputSchema(pw.Schema): time_removed[file_name] = timestamp -@xfail_on_darwin(reason="running pw.run from separate process not supported") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_simple_replacement_with_removal(tmp_path: pathlib.Path, monkeypatch): def stream_inputs(): time.sleep(1) @@ -1322,7 +1478,8 @@ def stream_inputs(): ) -@xfail_on_darwin(reason="running pw.run from separate process not supported") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_simple_insert_consolidation(tmp_path: pathlib.Path, monkeypatch): def stream_inputs(): time.sleep(1) @@ -1345,7 +1502,8 @@ def stream_inputs(): ) -@xfail_on_darwin(reason="running pw.run from separate process not supported") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_simple_replacement_on_file(tmp_path: pathlib.Path, monkeypatch): def stream_inputs(): time.sleep(1) @@ -1370,7 +1528,8 @@ def stream_inputs(): ) -@xfail_on_darwin(reason="running pw.run from separate process not supported") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_simple_replacement(tmp_path: pathlib.Path, monkeypatch): def stream_inputs(): time.sleep(1) @@ -1393,7 +1552,8 @@ def stream_inputs(): ) -@xfail_on_darwin(reason="running pw.run from separate process not supported") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_last_file_replacement_json(tmp_path: pathlib.Path, monkeypatch): def stream_inputs(): time.sleep(1) @@ -1416,7 +1576,8 @@ def stream_inputs(): ) -@xfail_on_darwin(reason="running pw.run from separate process not supported") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_last_file_replacement_csv(tmp_path: pathlib.Path, monkeypatch): def stream_inputs(): time.sleep(1) @@ -1448,7 +1609,8 @@ def stream_inputs(): ) -@xfail_on_darwin(reason="running pw.run from separate process not supported") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_file_removal_autogenerated_key(tmp_path: pathlib.Path, monkeypatch): def stream_inputs(): time.sleep(1) @@ -1469,7 +1631,8 @@ def stream_inputs(): ) -@xfail_on_darwin(reason="running pw.run from separate process not supported") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_simple_replacement_autogenerated_key(tmp_path: pathlib.Path, monkeypatch): def stream_inputs(): time.sleep(1) @@ -1505,7 +1668,7 @@ def test_bytes_read(tmp_path: pathlib.Path): autocommit_duration_ms=1000, ) pw.io.jsonlines.write(table, output_path) - pw.run() + run() with open(output_path) as f: result = json.load(f) @@ -1533,7 +1696,7 @@ def on_end(*args, **kwargs): pass pw.io.subscribe(table, on_change=on_change, on_end=on_end) - pw.run() + run() assert rows == [ { @@ -1570,7 +1733,7 @@ class InputSchema(pw.Schema): schema=InputSchema, ) pw.io.csv.write(table, output_path) - pw.run() + run() result = pd.read_csv( output_path, usecols=["key", "value"], index_col=["key"] ).sort_index() @@ -1590,7 +1753,7 @@ def test_text_file_read_in_full(tmp_path: pathlib.Path): autocommit_duration_ms=1000, ) pw.io.jsonlines.write(table, output_path) - pw.run() + run() with open(output_path) as f: result = json.load(f) @@ -1616,7 +1779,7 @@ def test_text_files_directory_read_in_full(tmp_path: pathlib.Path): autocommit_duration_ms=1000, ) pw.io.jsonlines.write(table, output_path) - pw.run() + run() output_lines = [] with open(output_path, "r") as f: @@ 
-1649,16 +1812,13 @@ def test_persistent_subscribe(tmp_path): ) root = mock.Mock() - on_change, on_end = mock.Mock(), mock.Mock() - root.on_change, root.on_end = on_change, on_end - pw.io.subscribe(table, on_change=on_change, on_end=on_end) + pw.io.subscribe(table, on_change=root.on_change, on_end=root.on_end) pw.run( persistence_config=pw.persistence.Config.simple_config( pw.persistence.Backend.filesystem(pstorage_dir), ), ) - threads_num = int(os.environ.get("PATHWAY_THREADS", "1")) root.assert_has_calls( [ mock.call.on_change( @@ -1667,10 +1827,11 @@ def test_persistent_subscribe(tmp_path): time=mock.ANY, is_addition=True, ), - *[mock.call.on_end()] * threads_num, + mock.call.on_end(), ] ) - assert on_change.call_count == 1 + assert root.on_change.call_count == 1 + assert root.on_end.call_count == 1 G.clear() @@ -1691,7 +1852,7 @@ def test_persistent_subscribe(tmp_path): root = mock.Mock() pw.io.subscribe(table, on_change=root.on_change, on_end=root.on_end) - pw.run( + run( persistence_config=pw.persistence.Config.simple_config( pw.persistence.Backend.filesystem(pstorage_dir), ), @@ -1704,10 +1865,11 @@ def test_persistent_subscribe(tmp_path): time=mock.ANY, is_addition=True, ), - *[mock.call.on_end()] * threads_num, + mock.call.on_end(), ] ) - assert on_change.call_count == 1 + assert root.on_change.call_count == 1 + assert root.on_end.call_count == 1 def test_objects_pattern(tmp_path: pathlib.Path): @@ -1727,7 +1889,7 @@ def test_objects_pattern(tmp_path: pathlib.Path): object_pattern="*.txt", ) pw.io.csv.write(table, output_path) - pw.run() + run() result = pd.read_csv(output_path).sort_index() assert set(result["data"]) == {"a", "b", "c"} @@ -1739,7 +1901,7 @@ def test_objects_pattern(tmp_path: pathlib.Path): object_pattern="*.dat", ) pw.io.csv.write(table, output_path) - pw.run() + run() result = pd.read_csv(output_path).sort_index() assert set(result["data"]) == {"d", "e", "f", "g"} @@ -1792,7 +1954,7 @@ def run_graph( pw.io.subscribe(t, callback, callback.on_end) - pw.run( + run( persistence_config=pw.persistence.Config.simple_config( pw.persistence.Backend.filesystem(replay_dir), persistence_mode=persistence_mode, @@ -1890,7 +2052,7 @@ def run_graph( pw.io.subscribe(t, callback, callback.on_end) - pw.run( + run( persistence_config=pw.persistence.Config.simple_config( pw.persistence.Backend.filesystem(replay_dir), persistence_mode=persistence_mode, @@ -1938,7 +2100,7 @@ def test_metadata_column_identity(tmp_path: pathlib.Path): autocommit_duration_ms=1000, ) pw.io.jsonlines.write(table, output_path) - pw.run() + run() metadata_file_names = [] with open(output_path, "r") as f: @@ -1975,7 +2137,7 @@ class InputSchema(pw.Schema): autocommit_duration_ms=1000, ) pw.io.jsonlines.write(table, output_path) - pw.run() + run() metadata_file_names = [] with open(output_path, "r") as f: @@ -2016,7 +2178,7 @@ class InputSchema(pw.Schema): on_change = mock.Mock() pw.io.subscribe(t, on_change=on_change) - pw.run( + run( persistence_config=pw.persistence.Config.simple_config( pw.persistence.Backend.mock(events), persistence_mode=api.PersistenceMode.SPEEDRUN_REPLAY, @@ -2071,7 +2233,7 @@ class InputSchema(pw.Schema): on_change = mock.Mock() pw.io.subscribe(t, on_change=on_change) - pw.run(persistence_config=stream_generator.persistence_config()) + run(persistence_config=stream_generator.persistence_config()) timestamps = set([call.kwargs["time"] for call in on_change.mock_calls]) assert len(timestamps) == 2 @@ -2130,7 +2292,7 @@ class InputSchema(pw.Schema): on_change = mock.Mock() 
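The pattern applied throughout this file replaces the Darwin-specific xfails and the `assert wait_result_with_checker(...)` calls. A hedged sketch of the resulting test shape (a hypothetical test, not part of the patch):

# Hypothetical test illustrating the updated idiom, not part of the patch.
import pathlib

import pytest

import pathway as pw
from pathway.tests.utils import (
    CsvLinesNumberChecker,
    needs_multiprocessing_fork,
    wait_result_with_checker,
)


@pytest.mark.flaky(reruns=2)
@needs_multiprocessing_fork
def test_some_streaming_pipeline(tmp_path: pathlib.Path):
    class InputSchema(pw.Schema):
        k: int
        v: int

    table = pw.io.csv.read(
        str(tmp_path / "inputs"), schema=InputSchema, mode="streaming"
    )
    pw.io.csv.write(table, str(tmp_path / "output.csv"))
    # No `assert` is needed anymore: the helper raises AssertionError itself
    # when the checker does not succeed before the timeout.
    wait_result_with_checker(CsvLinesNumberChecker(tmp_path / "output.csv", 5), 30)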
pw.io.subscribe(t, on_change=on_change) - pw.run(persistence_config=stream_generator.persistence_config()) + run(persistence_config=stream_generator.persistence_config()) timestamps = set([call.kwargs["time"] for call in on_change.mock_calls]) assert len(timestamps) == 2 @@ -2187,7 +2349,7 @@ def test_stream_generator_from_markdown(): on_change = mock.Mock() pw.io.subscribe(t, on_change=on_change) - pw.run(persistence_config=stream_generator.persistence_config()) + run(persistence_config=stream_generator.persistence_config()) on_change.assert_has_calls( [ @@ -2279,7 +2441,7 @@ class InputSchema(pw.Schema): on_change = mock.Mock() pw.io.subscribe(t3, on_change=on_change) - pw.run(persistence_config=stream_generator.persistence_config()) + run(persistence_config=stream_generator.persistence_config()) on_change.assert_has_calls( [ @@ -2323,7 +2485,7 @@ def run(self): table = pw.io.python.read(TestSubject(), format="raw", autocommit_duration_ms=10) pw.io.csv.write(table, tmp_path / "output.csv") - pw.run() + run() result = pd.read_csv(tmp_path / "output.csv") return len(result) == 5 @@ -2343,7 +2505,7 @@ def run(self): table = pw.io.python.read(TestSubject(), format="raw", autocommit_duration_ms=10) pw.io.csv.write(table, tmp_path / "output.csv") - pw.run() + run() result = pd.read_csv(tmp_path / "output.csv") return len(result) == 2 @@ -2380,7 +2542,7 @@ class InputSchema(pw.Schema): TestSubject(), format="json", schema=InputSchema, autocommit_duration_ms=10 ) pw.io.csv.write(table, tmp_path / "output.csv") - pw.run() + run() result = pd.read_csv(tmp_path / "output.csv") return len(result) == 5 @@ -2441,7 +2603,8 @@ def test_parse_to_table_deprecation(): assert_table_equality(t, expected) -@xfail_on_darwin(reason="running pw.run from separate process not supported") +@pytest.mark.flaky(reruns=2) +@needs_multiprocessing_fork def test_sqlite(tmp_path: pathlib.Path): database_name = tmp_path / "test.db" output_path = tmp_path / "output.csv" @@ -2462,9 +2625,7 @@ def test_sqlite(tmp_path: pathlib.Path): connection.commit() def stream_target(): - assert wait_result_with_checker( - FileLinesNumberChecker(output_path, 2), 5, 0.1, target=None - ) + wait_result_with_checker(FileLinesNumberChecker(output_path, 2), 5, target=None) connection = sqlite3.connect(database_name) cursor = connection.cursor() cursor.execute( @@ -2473,16 +2634,12 @@ def stream_target(): ) connection.commit() - assert wait_result_with_checker( - FileLinesNumberChecker(output_path, 3), 2, 0.1, target=None - ) + wait_result_with_checker(FileLinesNumberChecker(output_path, 3), 2, target=None) cursor = connection.cursor() cursor.execute("UPDATE users SET name = 'Bob Smith' WHERE id = 2") connection.commit() - assert wait_result_with_checker( - FileLinesNumberChecker(output_path, 5), 2, 0.1, target=None - ) + wait_result_with_checker(FileLinesNumberChecker(output_path, 5), 2, target=None) cursor = connection.cursor() cursor.execute("DELETE FROM users WHERE id = 3") connection.commit() @@ -2500,7 +2657,7 @@ class InputSchema(pw.Schema): inputs_thread = threading.Thread(target=stream_target, daemon=True) inputs_thread.start() - assert wait_result_with_checker(FileLinesNumberChecker(output_path, 6), 30) + wait_result_with_checker(FileLinesNumberChecker(output_path, 6), 30) events = [] with open(output_path, "r") as f: diff --git a/python/pathway/tests/test_json.py b/python/pathway/tests/test_json.py index b8b016d4..1d91c78f 100644 --- a/python/pathway/tests/test_json.py +++ b/python/pathway/tests/test_json.py @@ -409,6 +409,43 @@ def 
map(a: pw.Json) -> int: ) +def test_json_flatten(): + input = _json_table( + data=[[1, 2], [3], [4, 5]], + ) + + result = input.flatten(pw.this.data).select(data=pw.this.data.as_int()) + + assert_table_equality_wo_index( + T( + """ + | data + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + """ + ).update_types(data=Optional[int]), + result, + ) + + +@pytest.mark.parametrize( + "value", + [1, 0, 1.6, "1", "0", "true", {"value": [1]}, None], +) +def test_json_flatten_wrong_values(value): + input = _json_table( + data=[value], + ) + + input.flatten(pw.this.data) + + with pytest.raises(ValueError, match=r"Pathway can't flatten this Json.*"): + run_all() + + def test_json_type(): table = _json_table( a=[{"value": 1}], b=[2], c=[1.5], d=[True], e="foo", f=[[1, 2, 3]] diff --git a/python/pathway/tests/test_operators.py b/python/pathway/tests/test_operators.py index 218cd9f3..d4dca221 100644 --- a/python/pathway/tests/test_operators.py +++ b/python/pathway/tests/test_operators.py @@ -89,14 +89,6 @@ def test_unary(op_fun: Callable, data: list[Any]) -> None: assert_table_equality(table_pw, table_pd) -def _change_dtypes(table: pw.Table, dtypes: Mapping[str, type]): - for key, dtype in dtypes.items(): - table = table.select( - *pw.this.without(key), **{key: pw.declare_type(dtype, pw.this[key])} - ) - return table - - def _check_pandas_pathway_return_the_same( df: pd.DataFrame, op_fun: Any, @@ -104,12 +96,12 @@ def _check_pandas_pathway_return_the_same( res_dtype: type | None = None, ): table = table_from_pandas(copy.deepcopy(df)) - table = _change_dtypes(table, dtypes) + table = table.update_types(**dtypes) table_pw = table.select(pw.this.a, pw.this.b, c=op_fun(pw.this.a, pw.this.b)) df["c"] = op_fun(df["a"], df["b"]) table_pd = table_from_pandas(df) if res_dtype: - table_pd = _change_dtypes(table_pd, {"c": res_dtype}) + table_pd = table_pd.update_types(c=res_dtype) assert_table_equality(table_pw, table_pd) @@ -904,13 +896,16 @@ def test_date_time_and_duration(op_fun: Any, is_naive: bool) -> None: @pytest.mark.parametrize( - "op_fun", + "op_fun,dtype", [ - operator.mul, - operator.floordiv, + (operator.mul, int), + (operator.floordiv, int), + (operator.truediv, int), + (operator.mul, float), + (operator.truediv, float), ], ) -def test_duration_and_int(op_fun: Any) -> None: +def test_duration_and_int(op_fun: Any, dtype: Any) -> None: pairs = [ [pd.Timedelta(0), 0], [pd.Timedelta(1), 0], @@ -919,6 +914,14 @@ def test_duration_and_int(op_fun: Any) -> None: [pd.Timedelta(2), 0], [pd.Timedelta(2), -1], [pd.Timedelta(-2), -2], + [pd.Timedelta(10), 3], + [pd.Timedelta(10), -3], + [pd.Timedelta(-10), 3], + [pd.Timedelta(-10), -3], + [pd.Timedelta(11), 3], + [pd.Timedelta(11), -3], + [pd.Timedelta(-11), 3], + [pd.Timedelta(-11), -3], [pd.Timedelta(-331399), -227463], [pd.Timedelta(253173), -207184], [pd.Timedelta(-741012), -856821], @@ -943,19 +946,22 @@ def test_duration_and_int(op_fun: Any) -> None: [pd.Timedelta(weeks=1), 10], [pd.Timedelta(weeks=-2), -45], ] - if op_fun == operator.floordiv: + if op_fun in {operator.floordiv, operator.truediv}: pairs = [[a, b if b != 0 else 1] for a, b in pairs] - expected = table_from_pandas(pd.DataFrame({"c": [op_fun(a, b) for a, b in pairs]})) + expected = table_from_pandas( + pd.DataFrame({"c": [op_fun(a, dtype(b)) for a, b in pairs]}) + ) # computing manually because in pandas when operating on columns # (pd.Timedelta(days=-2) // 28).value == -6171428571428, but should be # -6171428571429. 
Suprisingly, working on single values but not on columns pairs_T = list(zip(*pairs)) df = pd.DataFrame({"a": pairs_T[0], "b": pairs_T[1]}) + df["b"] = df["b"].astype(dtype) table = table_from_pandas(df) result = table.select(c=op_fun(table.a, table.b)) assert_table_equality(result, expected) if op_fun == operator.mul: - result_2 = table.select(c=op_fun(table.a, table.b)) + result_2 = table.select(c=op_fun(table.b, table.a)) assert_table_equality(result_2, expected) @@ -1043,9 +1049,7 @@ def run_matrix_multiplcation( pairs_T = list(zip(*pairs)) a = [a_i.astype(dtype) for a_i in pairs_T[0]] b = [b_i.astype(dtype) for b_i in pairs_T[1]] - t = table_from_pandas( - pd.DataFrame({"a": a, "b": b, "i": list(range(len(a)))}) - ).update_types(a=np.ndarray, b=np.ndarray) + t = table_from_pandas(pd.DataFrame({"a": a, "b": b, "i": list(range(len(a)))})) res = t.select(pw.this.i, c=t.a @ t.b) res_pd = table_to_pandas(res).sort_values(by="i")["c"] expected = [a_i @ b_i for a_i, b_i in zip(a, b)] @@ -1155,9 +1159,7 @@ def test_matrix_multiplication_multidimensional(dtype: type) -> None: ], ) def test_matrix_multiplication_errors_on_shapes_mismatch(a, b) -> None: - t = table_from_pandas(pd.DataFrame({"a": [a], "b": [b]})).update_types( - a=np.ndarray, b=np.ndarray - ) + t = table_from_pandas(pd.DataFrame({"a": [a], "b": [b]})) t.select(c=t.a @ t.b) with pytest.raises(ValueError): run_all() diff --git a/python/pathway/tests/test_utils.py b/python/pathway/tests/test_utils.py index 488f424a..af1d0adf 100644 --- a/python/pathway/tests/test_utils.py +++ b/python/pathway/tests/test_utils.py @@ -2,8 +2,6 @@ from __future__ import annotations -from typing import Any - import pandas as pd import pytest @@ -69,9 +67,8 @@ def test_unpack_col(): } ) data = T(data, format="pandas") - data = data.select(data=pw.declare_type(Any, pw.this.data)) result = unpack_col(data.data, "coord1", "coord2", "coord3") - assert_table_equality_wo_types( + assert_table_equality( result, T( """ @@ -581,3 +578,51 @@ def test_argmax_rows_02(): assert_table_equality_wo_index( argmax_rows(input, *[input.foo], what=input.bar), expected ) + + +def test_table_from_rows_stream(): + class TestSchema(pw.Schema): + foo: int = pw.column_definition(primary_key=True) + bar: int + + rows = [ + (1, 2, 1, 1), + (1, 2, 2, -1), + (1, 3, 2, 1), + (4, 2, 2, 1), + ] + expected = T( + """ + foo | bar + 1 | 3 + 4 | 2 + """ + ).with_id_from(pw.this.foo) + + table = pw.debug.table_from_rows(schema=TestSchema, rows=rows, is_stream=True) + assert_table_equality(table, expected) + + +def test_table_from_rows(): + class TestSchema(pw.Schema): + foo: int = pw.column_definition(primary_key=True) + bar: int + + rows = [ + (1, 2), + (2, 2), + (3, 3), + (4, 2), + ] + expected = T( + """ + foo | bar + 1 | 2 + 2 | 2 + 3 | 3 + 4 | 2 + """ + ).with_id_from(pw.this.foo) + + table = pw.debug.table_from_rows(schema=TestSchema, rows=rows, is_stream=False) + assert_table_equality(table, expected) diff --git a/python/pathway/tests/utils.py b/python/pathway/tests/utils.py index 998579cd..3b8d6750 100644 --- a/python/pathway/tests/utils.py +++ b/python/pathway/tests/utils.py @@ -6,7 +6,6 @@ import multiprocessing import os import pathlib -import platform import re import sys import time @@ -33,12 +32,16 @@ except ImportError: _numba_missing = True -xfail_on_darwin = pytest.mark.xfail( - platform.system() == "Darwin", - reason="can't do pw.run() from custom process on Darwin", +xfail_no_numba = pytest.mark.xfail(_numba_missing, reason="unable to import numba") + 
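The two tests above pin down the new `pw.debug.table_from_rows` helper. A hedged usage sketch follows; the row layout for `is_stream=True` (column values followed by a time and a diff) is inferred from the test data, not from documentation.

# Illustrative sketch, not part of the patch.
import pathway as pw


class KV(pw.Schema):
    foo: int = pw.column_definition(primary_key=True)
    bar: int


# Static variant: one tuple of column values per row.
static_table = pw.debug.table_from_rows(schema=KV, rows=[(1, 2), (4, 2)])

# Stream variant: judging by the tests, each tuple carries
# (column values..., time, diff); below the row keyed by foo=1 is first
# inserted with bar=2 and later replaced by bar=3.
streamed_table = pw.debug.table_from_rows(
    schema=KV,
    rows=[(1, 2, 1, 1), (1, 2, 2, -1), (1, 3, 2, 1), (4, 2, 2, 1)],
    is_stream=True,
)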
+needs_multiprocessing_fork = pytest.mark.xfail( + sys.platform != "linux", + reason="multiprocessing needs to use fork() for pw.run() to work", ) -xfail_no_numba = pytest.mark.xfail(_numba_missing, reason="unable to import numba") +xfail_on_multiple_threads = pytest.mark.xfail( + os.getenv("PATHWAY_THREADS", "1") != "1", reason="multiple threads" +) def skip_on_multiple_workers() -> None: @@ -225,7 +228,10 @@ def __init__(self, path, n_lines): self.n_lines = n_lines def __call__(self): - result = pd.read_csv(self.path).sort_index() + try: + result = pd.read_csv(self.path).sort_index() + except Exception: + return False print( f"Actual (expected) lines number: {len(result)} ({self.n_lines})", file=sys.stderr, @@ -233,8 +239,10 @@ def __call__(self): return len(result) == self.n_lines def provide_information_on_failure(self): + if not self.path.exists(): + return f"{self.path} does not exist" with open(self.path, "r") as f: - print(f"Final output contents:\n{f.read()}", file=sys.stderr) + return f"Final output contents:\n{f.read()}" class FileLinesNumberChecker: @@ -243,6 +251,8 @@ def __init__(self, path, n_lines): self.n_lines = n_lines def __call__(self): + if not self.path.exists(): + return False n_lines_actual = 0 with open(self.path, "r") as f: for row in f: @@ -254,8 +264,10 @@ def __call__(self): return n_lines_actual == self.n_lines def provide_information_on_failure(self): + if not self.path.exists(): + return f"{self.path} does not exist" with open(self.path, "r") as f: - print(f"Final output contents:\n{f.read()}", file=sys.stderr) + return f"Final output contents:\n{f.read()}" def expect_csv_checker(expected, output_path, usecols=("k", "v"), index_col=("k")): @@ -266,12 +278,15 @@ def expect_csv_checker(expected, output_path, usecols=("k", "v"), index_col=("k" ) def checker(): - result = ( - pd.read_csv(output_path, usecols=[*usecols, *index_col]) - .convert_dtypes() - .set_index(index_col, drop=False) - .sort_index() - ) + try: + result = ( + pd.read_csv(output_path, usecols=[*usecols, *index_col]) + .convert_dtypes() + .set_index(index_col, drop=False) + .sort_index() + ) + except Exception: + return False return expected.equals(result) return checker @@ -284,6 +299,9 @@ class TestDataSource(datasource.DataSource): def is_bounded(self) -> bool: raise NotImplementedError() + def is_append_only(self) -> bool: + return False + def apply_defaults_for_run_kwargs(kwargs): kwargs.setdefault("debug", True) @@ -385,43 +403,48 @@ def run_all(**kwargs): def wait_result_with_checker( checker, timeout_sec, - step=1.0, + *, + step=0.1, target=run, args=(), kwargs={}, ): - if target is not None: - p = multiprocessing.Process(target=target, args=args, kwargs=kwargs) - p.start() - started_at = time.time() - - succeeded = False - for _ in range(int(timeout_sec / step) + 1): - time.sleep(step) - try: + try: + if target is not None: + assert ( + multiprocessing.get_start_method() == "fork" + ), "multiprocessing does not use fork(), pw.run() will not work" + p = multiprocessing.Process(target=target, args=args, kwargs=kwargs) + p.start() + + succeeded = False + start_time = time.monotonic() + while True: + time.sleep(step) + + elapsed = time.monotonic() - start_time + if elapsed >= timeout_sec: + break + succeeded = checker() if succeeded: print( - "Correct result obtained after {} seconds".format( - time.time() - started_at - ), + f"Correct result obtained after {elapsed:.1f} seconds", file=sys.stderr, ) break - except Exception: - pass - - if not succeeded: - 
checker.provide_information_on_failure() - - if "persistence_config" in kwargs: - time.sleep(5.0) # allow a little gap to persist state - - if target is not None: - p.terminate() - p.join() - return succeeded + if not succeeded: + details = checker.provide_information_on_failure() + print(f"Checker failed: {details}", file=sys.stderr) + raise AssertionError(details) + finally: + if target is not None: + if "persistence_config" in kwargs: + time.sleep(5.0) # allow a little gap to persist state + + p.terminate() + p.join() def write_csv(path: str | pathlib.Path, table_def: str, **kwargs): diff --git a/python/pathway/xpacks/__init__.py b/python/pathway/xpacks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/pathway/xpacks/llm/__init__.py b/python/pathway/xpacks/llm/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/pathway/xpacks/llm/parser.py b/python/pathway/xpacks/llm/parser.py new file mode 100644 index 00000000..8b8239ee --- /dev/null +++ b/python/pathway/xpacks/llm/parser.py @@ -0,0 +1,116 @@ +""" +A library for document parsers: functions that take raw bytes and return a list of text +chunks along with their metadata. +""" + +from io import BytesIO +from typing import Any, Callable, Optional + + +class ParseUtf8: + def __init__( + self, + ): + pass + + def __call__(self, contents: bytes) -> list[tuple[str, dict]]: + docs: list[tuple[str, dict]] = [(contents.decode("utf-8"), {})] + return docs + + +# Based on: +# https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/document_loaders/unstructured.py#L134 +class ParseUnstructured: + """ + Parse document using https://unstructured.io/. + + Args: + - mode: single, elements or paged. + When single, each document is parsed as one long text string. + When elements, each document is split into unstructured's elements. + When paged, each pages's text is separately extracted. + - post_processors: list of callables that will be applied ot all extracted texts. + - **unstructured_kwargs: extra kwargs to be passed to unstructured.io's `partition` function + """ + + def __init__( + self, + mode: str = "single", + post_processors: Optional[list[Callable]] = None, + **unstructured_kwargs: Any, + ): + # lazy load to prevent unstructured from being a dependency on whole pathway + try: + import unstructured.partition.auto # noqa + except ImportError: + raise ValueError( + "Please install unstructured with all documents support: `pip install unstructured[all-docs]`" + ) + + _valid_modes = {"single", "elements", "paged"} + if mode not in _valid_modes: + raise ValueError( + f"Got {mode} for `mode`, but should be one of `{_valid_modes}`" + ) + self.mode = mode + self.post_processors = post_processors or [] + self.unstructured_kwargs = unstructured_kwargs + + def __call__(self, contents: bytes) -> list[tuple[str, dict]]: + """ + Parse the given document: + + Args: + - contents: document contents + + Returns: + a list of pairs: text chunk and metadata + """ + import unstructured.partition.auto + + elements = unstructured.partition.auto.partition( + file=BytesIO(contents), **self.unstructured_kwargs + ) + + for element in elements: + for post_processor in self.post_processors: + element.apply(post_processor) + + metadata = {} + if self.mode == "elements": + docs: list[tuple[str, dict]] = list() + for element in elements: + # NOTE(MthwRobinson) - the attribute check is for backward compatibility + # with unstructured<0.4.9. The metadata attributed was added in 0.4.9. 
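The reworked `wait_result_with_checker` above no longer returns a boolean: it polls the checker, raises `AssertionError` with the checker's diagnostics on timeout, and insists on a fork-based `multiprocessing` start method when a `target` is given. A hedged sketch of a minimal custom checker compatible with it (the class name and file path are hypothetical):

# Illustrative sketch, not part of the patch.
import pathlib

from pathway.tests.utils import wait_result_with_checker


class FileExistsChecker:
    def __init__(self, path: pathlib.Path):
        self.path = path

    def __call__(self) -> bool:
        # Polled every `step` seconds until it returns True or the timeout passes.
        return self.path.exists()

    def provide_information_on_failure(self) -> str:
        # Now expected to return the diagnostic message rather than print it.
        return f"{self.path} was never created"


# `target=run` (the default) executes the pipeline in a forked process;
# `step` became keyword-only and defaults to 0.1 s.
wait_result_with_checker(FileExistsChecker(pathlib.Path("output.csv")), 30)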
+                if hasattr(element, "metadata"):
+                    metadata.update(element.metadata.to_dict())
+                if hasattr(element, "category"):
+                    metadata["category"] = element.category
+                docs.append((str(element), metadata))
+        elif self.mode == "paged":
+            text_dict: dict[int, str] = {}
+            meta_dict: dict[int, dict] = {}
+
+            for idx, element in enumerate(elements):
+                if hasattr(element, "metadata"):
+                    metadata.update(element.metadata.to_dict())
+                page_number = metadata.get("page_number", 1)
+
+                # Check if this page_number already exists in text_dict
+                if page_number not in text_dict:
+                    # If not, create new entry with initial text and metadata
+                    text_dict[page_number] = str(element) + "\n\n"
+                    meta_dict[page_number] = metadata
+                else:
+                    # If exists, append to text and update the metadata
+                    text_dict[page_number] += str(element) + "\n\n"
+                    meta_dict[page_number].update(metadata)
+
+            # Convert the dicts to a list of (text, metadata) pairs representing documents
+            docs = [(text_dict[key], meta_dict[key]) for key in text_dict.keys()]
+        elif self.mode == "single":
+            text = "\n\n".join([str(el) for el in elements])
+            docs = [(text, metadata)]
+        else:
+            raise ValueError(f"mode of {self.mode} not supported.")
+        return docs
diff --git a/python/pathway/xpacks/llm/splitter.py b/python/pathway/xpacks/llm/splitter.py
new file mode 100644
index 00000000..983ded63
--- /dev/null
+++ b/python/pathway/xpacks/llm/splitter.py
@@ -0,0 +1,19 @@
+"""
+A library of text splitters - routines which split a long text into smaller chunks.
+"""
+
+from typing import Dict, List, Tuple
+
+
+def null_splitter(txt: str) -> List[Tuple[str, Dict]]:
+    """A splitter which returns its argument as one long text with null metadata.
+
+    Args:
+        txt: text to be split
+
+    Returns:
+        list of pairs: chunk text and metadata.
+
+    The null splitter always returns a list of length one containing the full text and empty metadata.
+    """
+    return [(txt, {})]
diff --git a/python/pathway/xpacks/llm/vector_store.py b/python/pathway/xpacks/llm/vector_store.py
new file mode 100644
index 00000000..05713da1
--- /dev/null
+++ b/python/pathway/xpacks/llm/vector_store.py
@@ -0,0 +1,254 @@
+"""
+Pathway vector search server and client.
+
+The server reads source documents and builds a vector index over them, then starts serving
+HTTP requests.
+
+The client queries the server and returns matching documents.
+"""
+
+
+import json
+import threading
+from typing import Callable, Optional
+
+import numpy as np
+import requests
+
+import pathway as pw
+import pathway.xpacks.llm.parser
+import pathway.xpacks.llm.splitter
+from pathway.stdlib.ml import index
+
+
+class QueryInputSchema(pw.Schema):
+    query: str
+    k: int
+    metadata_filter: str | None = pw.column_definition(default_value=None)
+    stats: bool
+
+
+class VectorStoreServer:
+    """
+    Builds a document indexing pipeline and starts an HTTP REST server for nearest neighbors queries.
+
+    Args:
+        - docs: pathway tables typically coming out of connectors which contain source documents.
+ - embedder: callable that embeds a single document + - parser: callable that parses file contents into a list of documents + - splitter: callable that splits long documents + """ + + def __init__( + self, + *docs: pw.Table, + embedder: Callable[[str], list[float]], + parser: Optional[Callable[[bytes], list[tuple[str, dict]]]] = None, + splitter: Optional[Callable[[str], list[tuple[str, dict]]]] = None, + ): + self.docs = docs + self.parser: Callable[[bytes], list[tuple[str, dict]]] = ( + parser + if parser is not None + else pathway.xpacks.llm.parser.ParseUtf8() # type:ignore + ) + self.splitter = ( + splitter + if splitter is not None + else pathway.xpacks.llm.splitter.null_splitter + ) + self.embedder = embedder + + # detect the dimensionality of the embeddings + self.embedding_dimension = len(embedder(".")) + + def _build_graph( + self, + host, + port, + ): + """ + Builds the pathway computation graph for indexing documents and serving queries. + """ + docs_s = self.docs + if not docs_s: + raise ValueError( + """Please provide at least one data source, e.g. read files from disk: + +pw.io.fs.read('./sample_docs', format='binary', mode='static', with_metadata=True) +""" + ) + if len(docs_s) == 1: + (docs,) = docs_s + else: + docs: pw.Table = docs_s[0].concat_reindex(*docs_s[1:]) # type: ignore + + @pw.udf + def parse_doc(data: bytes, metadata) -> list[pw.Json]: + rets = self.parser(data) + metadata = metadata.value + return [dict(text=ret[0], metadata={**metadata, **ret[1]}) for ret in rets] # type: ignore + + parsed_docs = docs.select(data=parse_doc(docs.data, docs._metadata)).flatten( + pw.this.data + ) + + @pw.udf + def split_doc(data_json: pw.Json) -> list[pw.Json]: + data: dict = data_json.value # type:ignore + text = data["text"] + metadata = data["metadata"] + rets = self.splitter(text) + return [ + dict(text=ret[0], metadata={**metadata, **ret[1]}) # type:ignore + for ret in rets + ] + + chunked_docs = parsed_docs.select(data=split_doc(pw.this.data)).flatten( + pw.this.data + ) + + @pw.udf_async + def embedder(txt): + return np.asarray(self.embedder(txt)) + + chunked_docs += chunked_docs.select( + embedding=embedder(pw.this.data["text"].as_str()) + ) + + knn_index = index.KNNIndex( + chunked_docs.embedding, + chunked_docs, + n_dimensions=self.embedding_dimension, + metadata=chunked_docs.data["metadata"], + ) + + post_query, response_writer = pw.io.http.rest_connector( + host=host, + port=port, + route="/", + schema=QueryInputSchema, + autocommit_duration_ms=50, + delete_completed_queries=True, + ) + + stats_query, query = post_query.split(pw.this.stats) + + # VectorStore statistics computation + @pw.udf + def format_stats(counts, last_modified): + if counts is not None: + response = {"file_count": counts, "last_modified": last_modified.value} + else: + response = {"file_count": 0, "last_modified": None} + return json.dumps(response) + + parsed_docs += parsed_docs.select( + modified=pw.this.data["metadata"]["modified_at"] + ) + stats = parsed_docs.reduce( + count=pw.reducers.count(), last_modified=pw.reducers.max(pw.this.modified) + ) + + stats_results = stats_query.join_left(stats, id=stats_query.id).select( + result=format_stats(stats.count, stats.last_modified) + ) + + # Relevant document search + query += query.select( + embedding=embedder(pw.this.query), + ) + + query_results = query + knn_index.get_nearest_items( + query.embedding, + k=pw.this.k, + collapse_rows=True, + metadata_filter=query.metadata_filter, + ).select(result=pw.this.data) + + query_results = 
query_results.select(pw.this.result) + + results = query_results.concat(stats_results) + response_writer(results) + + def run_server( + self, + host, + port, + threaded: bool = False, + with_cache: bool = True, + cache_backend: pw.persistence.Backend + | None = pw.persistence.Backend.filesystem("./Cache"), + ): + """ + Builds the document processing pipeline and runs it. + + Args: + - host: host to bind the HTTP listener + - port: to bind the HTTP listener + - threaded: if True, run in a thread. Else block computation + - with_cache: if True, embedding requests for the same contents are cached + - cache_backend: the backend to use for caching if it is enabled. The + default is the disk cache, hosted locally in the folder ``./Cache``. You + can use ``Backend`` class of the + [`persistence API`](/developers/api-docs/persistence-api/#pathway.persistence.Backend) + to override it. + + Returns: + If threaded, return the Thread object. Else, does not return. + """ + self._build_graph(host=host, port=port) + + def run(): + if with_cache: + if cache_backend is None: + raise ValueError( + "Cache usage was requested but the backend is unspecified" + ) + persistence_config = pw.persistence.Config.simple_config( + cache_backend, + persistence_mode=pw.PersistenceMode.UDF_CACHING, + ) + else: + persistence_config = None + + pw.run( + monitoring_level=pw.internals.monitoring.MonitoringLevel.NONE, + persistence_config=persistence_config, + ) + + if threaded: + t = threading.Thread(target=run) + t.start() + return t + else: + run() + + +class VectorStoreClient: + def __init__(self, host, port): + self.host = host + self.port = port + + def __call__(self, query, k=3, metadata_filter=None) -> list[dict]: + """Perform a query to the vector store and fetch results.""" + + data = {"query": query, "k": k, "stats": False} + if metadata_filter is not None: + data["metadata_filter"] = metadata_filter + url = f"http://{self.host}:{self.port}" + response = requests.post( + url, data=json.dumps(data), headers={"Content-Type": "application/json"} + ) + responses = response.json() + return responses + + def get_vectorstore_statistics(self): + """Fetch basic statistics about the vector store.""" + data = {"query": "", "k": 0, "stats": True} + url = f"http://{self.host}:{self.port}" + response = requests.post( + url, data=json.dumps(data), headers={"Content-Type": "application/json"} + ) + responses = response.json() + return responses diff --git a/src/connectors/data_storage.rs b/src/connectors/data_storage.rs index 477db72e..b084eead 100644 --- a/src/connectors/data_storage.rs +++ b/src/connectors/data_storage.rs @@ -1,3 +1,4 @@ +use pyo3::exceptions::PyValueError; use rand::Rng; use rdkafka::util::Timeout; use s3::error::S3Error; @@ -1122,34 +1123,21 @@ impl FilesystemScanner { #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum ConnectorMode { Static, - SimpleStreaming, - StreamingWithDeletions, + Streaming, } impl ConnectorMode { - pub fn new_from_flags(poll_new_objects: bool, allow_deletions: bool) -> Self { - if poll_new_objects { - if allow_deletions { - ConnectorMode::StreamingWithDeletions - } else { - ConnectorMode::SimpleStreaming - } - } else { - ConnectorMode::Static - } - } - pub fn is_polling_enabled(&self) -> bool { match self { ConnectorMode::Static => false, - ConnectorMode::SimpleStreaming | ConnectorMode::StreamingWithDeletions => true, + ConnectorMode::Streaming => true, } } pub fn are_deletions_enabled(&self) -> bool { match self { - ConnectorMode::Static | ConnectorMode::SimpleStreaming => false, 
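Putting together the xpack pieces introduced above, here is a hedged end-to-end sketch (not part of the patch): the embedder is a stand-in stub, `paragraph_splitter` is a hypothetical splitter following the `null_splitter` contract, and the host, port, and input directory are illustrative choices.

# Illustrative sketch, not part of the patch.
import pathway as pw
from pathway.xpacks.llm.parser import ParseUtf8
from pathway.xpacks.llm.vector_store import VectorStoreClient, VectorStoreServer


def fake_embedder(text: str) -> list[float]:
    # Stand-in embedder: a real setup would call an embedding model here.
    return [float(len(text)), float(text.count(" "))]


def paragraph_splitter(txt: str) -> list[tuple[str, dict]]:
    # Any callable with the null_splitter signature can serve as a splitter.
    parts = [p.strip() for p in txt.split("\n\n") if p.strip()]
    return [(p, {"chunk_id": i}) for i, p in enumerate(parts)]


docs = pw.io.fs.read(
    "./sample_docs", format="binary", mode="streaming", with_metadata=True
)
server = VectorStoreServer(
    docs,
    embedder=fake_embedder,
    parser=ParseUtf8(),
    splitter=paragraph_splitter,
)
server.run_server(host="127.0.0.1", port=8754, threaded=True, with_cache=False)

client = VectorStoreClient(host="127.0.0.1", port=8754)
print(client("what do the documents say about upupa epops?", k=2))
print(client.get_vectorstore_statistics())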
- ConnectorMode::StreamingWithDeletions => true, + ConnectorMode::Static => false, + ConnectorMode::Streaming => true, } } } @@ -1339,6 +1327,7 @@ pub struct PythonReader { persistent_id: Option, total_entries_read: u64, is_initialized: bool, + is_finished: bool, #[allow(unused)] python_thread_state: PythonThreadState, @@ -1367,6 +1356,7 @@ impl ReaderBuilder for PythonReaderBuilder { python_thread_state, total_entries_read: 0, is_initialized: false, + is_finished: false, })) } @@ -1407,6 +1397,9 @@ impl Reader for PythonReader { with_gil_and_pool(|py| self.subject.borrow(py).start.call0(py))?; self.is_initialized = true; } + if self.is_finished { + return Ok(ReadResult::Finished); + } with_gil_and_pool(|py| { let (event, key, values, metadata): ( @@ -1422,7 +1415,15 @@ impl Reader for PythonReader { .extract(py) .map_err(ReadError::Py)?; + if event != DataEventType::Insert && !self.subject.borrow(py).deletions_enabled { + return Err(ReadError::Py(PyValueError::new_err( + "Trying to delete a row in the Python connector but deletions_enabled is set to False.", + ))); + } + if values == "*FINISH*".as_bytes() { + self.is_finished = true; + self.subject.borrow(py).end.call0(py)?; Ok(ReadResult::Finished) } else { // We use simple sequential offset because Python connector is single threaded, as diff --git a/src/engine/dataflow.rs b/src/engine/dataflow.rs index 8ef810e0..a92ff261 100644 --- a/src/engine/dataflow.rs +++ b/src/engine/dataflow.rs @@ -63,10 +63,9 @@ use ndarray::ArrayD; use once_cell::unsync::{Lazy, OnceCell}; use pyo3::PyObject; use serde::{Deserialize, Serialize}; -use timely::dataflow::channels::pact::Pipeline; use timely::dataflow::operators::probe::Handle as ProbeHandle; use timely::dataflow::operators::ToStream as _; -use timely::dataflow::operators::{Exchange, Filter, Inspect, Operator, Probe}; +use timely::dataflow::operators::{Filter, Inspect, Probe}; use timely::dataflow::scopes::Child; use timely::order::{Product, TotalOrder}; use timely::progress::timestamp::Refines; @@ -76,16 +75,16 @@ use xxhash_rust::xxh3::Xxh3 as Hasher; use self::complex_columns::complex_columns; use self::maybe_total::{MaybeTotalScope, MaybeTotalTimestamp, NotTotal, Total}; +use self::operators::output::{ConsolidateForOutput, OutputBatch}; use self::operators::prev_next::add_prev_next_pointers; use self::operators::stateful_reduce::StatefulReduce; -use self::operators::time_column::TimeColumnBuffer; +use self::operators::time_column::{MaxTimestamp, SelfCompactionTime, TimeColumnBuffer}; use self::operators::{ArrangeWithTypes, MapWrapped}; -use self::operators::{ConsolidateForOutput, Reshard}; -use self::operators::{ConsolidateForOutputMap, MaybeTotal}; +use self::operators::{MaybeTotal, Reshard}; use self::shard::Shard; use super::error::{DynError, DynResult, Trace}; use super::expression::AnyExpression; -use super::graph::DataRow; +use super::graph::{DataRow, SubscribeCallbacks}; use super::http_server::maybe_run_http_server_thread; use super::progress_reporter::{maybe_run_reporter, MonitoringLevel}; use super::reduce::{ @@ -1140,63 +1139,29 @@ impl DataflowGraphInner { fn columns_to_table_properties( &mut self, - columns: Vec<(ColumnHandle, ColumnPath)>, + columns: Vec, ) -> Result { let properties: Result> = columns .into_iter() - .map(|(column_handle, path)| { + .map(|column_handle| { let properties = self .columns .get(column_handle) .ok_or(Error::InvalidColumnHandle)? 
.properties .clone(); - Ok((path, properties.as_ref().clone())) + Ok(properties.as_ref().clone()) }) .collect(); - TableProperties::from_paths(properties?) + Ok(TableProperties::Table(properties?.as_slice().into())) } fn columns_to_table( &mut self, universe_handle: UniverseHandle, - columns: Vec<(ColumnHandle, ColumnPath)>, + column_handles: Vec, ) -> Result { - fn produce_nested_tuple( - paths: &[(usize, Vec)], - depth: usize, - data: &Arc<[Value]>, - ) -> Value { - if !paths.is_empty() && paths.first().unwrap().1.len() == depth { - let id = paths.first().unwrap().0; - for (next_id, _path) in &paths[1..] { - assert_eq!(data[id], data[*next_id]); - } - return data[id].clone(); - } - let mut path_prefix = 0; - let mut i = 0; - let mut j = 0; - let mut result = Vec::new(); - while i < paths.len() { - while i < paths.len() && path_prefix == paths[i].1[depth] { - i += 1; - } - path_prefix += 1; - if i == j { - // XXX: remove after iterate is properly implemented - // emulate unused cols - result.push(Value::Tuple(Arc::from([]))); - continue; - } - assert!(j < i); //there is at least one entry - result.push(produce_nested_tuple(&paths[j..i], depth + 1, data)); - j = i; - } - Value::from(result.as_slice()) - } - let column_handles: Vec = columns.iter().map(|(handle, _)| *handle).collect(); let tuples_collection = self.tuples(universe_handle, &column_handles)?; let tuples: Collection)> = match tuples_collection { TupleCollection::Zero(c) => { @@ -1210,23 +1175,10 @@ impl DataflowGraphInner { }), TupleCollection::More(c) => c, }; - let properties = self.columns_to_table_properties(columns.clone())?; - - let mut paths: Vec<(usize, Vec)> = columns - .into_iter() - .enumerate() - .map(|(i, (_, path))| match path { - ColumnPath::ValuePath(path) => Ok((i, path)), - ColumnPath::Key => Err(Error::ValueError( - "It is not allowed to use ids in column to table".into(), - )), - }) - .collect::>()?; - - paths.sort_by(|(_, path_i), (_, path_j)| path_i.partial_cmp(path_j).unwrap()); + let properties = self.columns_to_table_properties(column_handles)?; let table_values = tuples.map_named("columns_to_table:pack", move |(key, values)| { - (key, produce_nested_tuple(&paths, 0, &values)) + (key, Value::from(values.as_ref())) }); Ok(self @@ -1406,21 +1358,8 @@ impl DataflowGraphInner { //TODO: report errors let _error_reporter = self.error_reporter.clone(); - let gathered = table.values().inner.exchange(|_| 0).as_collection(); - #[allow(clippy::disallowed_methods)] - let consolidated = - as differential_dataflow::operators::arrange::Arrange< - S, - Key, - Value, - isize, - >>::arrange_core::< - Pipeline, - OrdValSpine::MaybeTotalTimestamp, isize>, - >(&gathered, Pipeline, "consolidate_without_shard") - .consolidate_for_output_map(|k, v| (*k, v.clone())); - - let (on_time, _late) = consolidated.freeze( + + let (on_time, _late) = table.values().freeze( move |val| threshold_time_column_path.extract_from_value(val).unwrap(), move |val| current_time_column_path.extract_from_value(val).unwrap(), ); @@ -1440,7 +1379,10 @@ impl DataflowGraphInner { where ::MaybeTotalTimestamp: Timestamp::MaybeTotalTimestamp> + PathSummary<::MaybeTotalTimestamp> - + Epsilon, + + Epsilon + + MaxTimestamp<::MaybeTotalTimestamp>, + SelfCompactionTime<::MaybeTotalTimestamp>: + MaxTimestamp::MaybeTotalTimestamp>>, { let table = self .tables @@ -1450,10 +1392,11 @@ impl DataflowGraphInner { //TODO: report errors let _error_reporter = self.error_reporter.clone(); - let new_table = table.values().consolidate_for_output().postpone( + let new_table = 
table.values().postpone( table.values().scope(), move |val| threshold_time_column_path.extract_from_value(val).unwrap(), move |val| current_time_column_path.extract_from_value(val).unwrap(), + true, ); Ok(self @@ -1688,6 +1631,16 @@ impl DataflowGraphInner { .chars() .map(|c| Value::from(ArcStr::from(c.to_string()))) .collect()), + Value::Json(json) => { + if let serde_json::Value::Array(array) = (*json).clone() { + Ok(array.into_iter().map(Value::from).collect()) + } else { + let repr = json.to_string(); + Err(Error::ValueError(format!( + "Pathway can't flatten this Json: {repr}" + ))) + } + } value => Err(Error::ValueError(format!( "Pathway can't flatten this value {value:?}" ))), @@ -2491,7 +2444,7 @@ where #[derive(Debug, Clone)] enum OutputEvent { Commit(Option), - Batch(Vec<((Key, Tuple), u64, isize)>), + Batch(OutputBatch), } #[allow(clippy::unnecessary_wraps)] // we want to always return Result for symmetry @@ -2681,7 +2634,7 @@ impl> DataflowGraphInner { let error_reporter_1 = self.error_reporter.clone(); let error_reporter_2 = self.error_reporter.clone(); - let new_table = table.values().consolidate_for_output().forget( + let new_table = table.values().forget( move |val| { threshold_time_column_path .extract_from_value(val) @@ -2749,13 +2702,14 @@ impl> DataflowGraphInner { fn output_batch( stats: &mut OutputConnectorStats, - batch: Vec<((Key, Tuple), u64, isize)>, + batch: OutputBatch, data_sink: &mut Box, data_formatter: &mut Box, global_persistent_storage: &GlobalPersistentStorage, ) -> Result<(), DynError> { stats.on_batch_started(); - for ((key, values), time, diff) in batch { + let time = batch.time; + for ((key, values), diff) in batch.data { if time == ARTIFICIAL_TIME_ON_REWIND_START && global_persistent_storage.is_some() { // Ignore entries, which had been written before continue; @@ -2804,14 +2758,7 @@ impl> DataflowGraphInner { .as_collection(); let single_threaded = data_sink.single_threaded(); - let output = output_columns.consolidate_for_output().inner; - let inspect_output = { - if single_threaded { - output.exchange(|_| 0) - } else { - output - } - }; + let output = output_columns.consolidate_for_output(single_threaded); let worker_index = self.scope.index(); let sender = if !single_threaded || worker_index == 0 { @@ -2873,7 +2820,7 @@ impl> DataflowGraphInner { None }; - inspect_output + output .inspect_core(move |event| match sender { None => { // There is no connector thread for this worker. 
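The `Value::Json` arm added to the flatten operator above is what backs the new `test_json_flatten` behaviour on the Python side. A hedged sketch of the user-facing effect follows; it assumes that a UDF annotated as returning `pw.Json` wraps plain Python values, mirroring how `parse_doc` and `split_doc` in the vector store return dicts as `pw.Json`.

# Illustrative sketch, not part of the patch.
import json

import pandas as pd

import pathway as pw


@pw.udf
def to_json(payload: str) -> pw.Json:
    # Relies on the declared pw.Json return type to wrap the parsed value.
    return json.loads(payload)


raw = pw.debug.table_from_pandas(pd.DataFrame({"payload": ["[1, 2]", "[3]"]}))
docs = raw.select(data=to_json(pw.this.payload))

# Each element of a JSON array becomes its own row; flattening a non-array
# JSON value fails at runtime with "Pathway can't flatten this Json ...".
flat = docs.flatten(pw.this.data).select(value=pw.this.data.as_int())
pw.debug.compute_and_print(flat)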
@@ -2885,16 +2832,21 @@ impl> DataflowGraphInner { } } Some(ref sender) => { - let event = match event { - Ok((_time, data)) => OutputEvent::Batch(data.to_vec()), + match event { + Ok((_time, batches)) => { + for batch in batches { + sender + .send(OutputEvent::Batch(batch.clone())) + .expect("sending output batch should not fail"); + } + } Err(frontier) => { assert!(frontier.len() <= 1); - OutputEvent::Commit(frontier.first().copied()) + sender + .send(OutputEvent::Commit(frontier.first().copied())) + .expect("sending output commit should not fail"); } }; - sender - .send(event) - .expect("sending output event should not fail"); } }) .probe_with(&mut self.output_probe); @@ -2904,16 +2856,11 @@ impl> DataflowGraphInner { fn subscribe_table( &mut self, - wrapper: BatchWrapper, - mut callback: Box DynResult<()>>, - mut on_end: Box DynResult<()>>, table_handle: TableHandle, column_paths: Vec, + callbacks: SubscribeCallbacks, skip_persisted_batch: bool, ) -> Result<()> { - let mut vector = Vec::new(); - - let error_reporter = self.error_reporter.clone(); let worker_index = self.scope.index(); let sink_id = self @@ -2922,25 +2869,54 @@ impl> DataflowGraphInner { .map(|m| m.lock().unwrap().register_sink()); let global_persistent_storage = self.global_persistent_storage.clone(); let skip_initial_time = skip_persisted_batch && global_persistent_storage.is_some(); + + let error_reporter = self.error_reporter.clone(); + let error_reporter_2 = self.error_reporter.clone(); + + let SubscribeCallbacks { + wrapper, + mut on_data, + mut on_time_end, + mut on_end, + } = callbacks; + let wrapper_2 = wrapper.clone(); + self.extract_columns(table_handle, column_paths)? .as_collection() - .consolidate_for_output() - .inner - .probe_with(&mut self.output_probe) - .sink(Pipeline, "SubscribeColumn", move |input| { - wrapper.run(|| { - while let Some((_time, data)) = input.next() { - data.swap(&mut vector); - for ((key, values), time, diff) in vector.drain(..) { - if time == ARTIFICIAL_TIME_ON_REWIND_START && skip_initial_time { - continue; + .consolidate_for_output(true) + .inspect(move |batch| { + if batch.time == ARTIFICIAL_TIME_ON_REWIND_START && skip_initial_time { + return; + } + wrapper + .run(|| -> DynResult<()> { + if let Some(on_data) = on_data.as_mut() { + for ((key, values), diff) in &batch.data { + on_data(*key, values, batch.time, *diff)?; } - callback(key, &values, time, diff) - .unwrap_with_reporter(&error_reporter); + } + if let Some(on_time_end) = on_time_end.as_mut() { + on_time_end(batch.time)?; + } + Ok(()) + }) + .unwrap_with_reporter(&error_reporter); + }) + .inspect_core(move |event| { + // Another inspect, so we are looking at the first inspect's output fronitier, + // i.e., we are called after every worker has finished processing callbacks from + // the first inspect for this frontier. 
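On the Python side, the `SubscribeCallbacks` refactor above is what `pw.io.subscribe` feeds; with consolidation now done per output batch, `on_end` fires once per run instead of once per worker thread. A hedged sketch of the callback contract as the tests in this patch use it (the table content is illustrative):

# Illustrative sketch, not part of the patch.
import pathway as pw

table = pw.debug.table_from_markdown(
    """
    a | b
    1 | foo
    """
)


def on_change(key, row, time, is_addition):
    # Called once per consolidated change; row is a dict of column values.
    print(key, row, time, is_addition)


def on_end():
    # Now invoked a single time when the whole computation finishes.
    print("done")


pw.io.subscribe(table, on_change=on_change, on_end=on_end)
pw.run()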
+ if let Err(frontier) = event { + if worker_index == 0 && frontier.is_empty() { + if let Some(on_end) = on_end.as_mut() { + wrapper_2 + .run(on_end) + .unwrap_with_reporter(&error_reporter_2); } } - let time_processed = input.frontier().frontier().first().copied(); + assert!(frontier.len() <= 1); + let time_processed = frontier.first().copied(); if let Some(global_persistent_storage) = &global_persistent_storage { global_persistent_storage .lock() @@ -2951,12 +2927,9 @@ impl> DataflowGraphInner { time_processed, ); } - - if time_processed.is_none() { - on_end().unwrap_with_reporter(&error_reporter); - } - }); - }); + } + }) + .probe_with(&mut self.output_probe); Ok(()) } @@ -3599,7 +3572,7 @@ where fn columns_to_table( &self, universe_handle: UniverseHandle, - columns: Vec<(ColumnHandle, ColumnPath)>, + columns: Vec, ) -> Result { self.0 .borrow_mut() @@ -3946,11 +3919,9 @@ where fn subscribe_table( &self, - _wrapper: BatchWrapper, - _callback: Box DynResult<()>>, - _on_end: Box DynResult<()>>, _table_handle: TableHandle, _column_paths: Vec, + _callbacks: SubscribeCallbacks, _skip_persisted_batch: bool, ) -> Result<()> { Err(Error::IoNotPossible) @@ -4088,7 +4059,7 @@ impl> Graph for OuterDataflowGraph fn columns_to_table( &self, universe_handle: UniverseHandle, - columns: Vec<(ColumnHandle, ColumnPath)>, + columns: Vec, ) -> Result { self.0 .borrow_mut() @@ -4143,19 +4114,15 @@ impl> Graph for OuterDataflowGraph fn subscribe_table( &self, - wrapper: BatchWrapper, - callback: Box DynResult<()>>, - on_end: Box DynResult<()>>, table_handle: TableHandle, column_paths: Vec, + callbacks: SubscribeCallbacks, skip_persisted_batch: bool, ) -> Result<()> { self.0.borrow_mut().subscribe_table( - wrapper, - callback, - on_end, table_handle, column_paths, + callbacks, skip_persisted_batch, ) } diff --git a/src/engine/dataflow/operators.rs b/src/engine/dataflow/operators.rs index 6f114c1f..f0b1b0de 100644 --- a/src/engine/dataflow/operators.rs +++ b/src/engine/dataflow/operators.rs @@ -1,16 +1,16 @@ pub mod gradual_broadcast; +pub mod output; pub mod prev_next; pub mod stateful_reduce; pub mod time_column; mod utils; use std::any::type_name; -use std::collections::BTreeMap; use std::panic::Location; use differential_dataflow::difference::Semigroup; use differential_dataflow::operators::arrange::{Arranged, TraceAgent}; -use differential_dataflow::trace::{Batch, BatchReader, Cursor, Trace, TraceReader}; +use differential_dataflow::trace::{Batch, Trace, TraceReader}; use differential_dataflow::{AsCollection, Collection, Data, ExchangeData}; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -25,135 +25,90 @@ use super::maybe_total::{MaybeTotalScope, MaybeTotalSwitch}; use super::shard::Shard; use super::ArrangedBySelf; -pub trait ConsolidateForOutput -where - S: MaybeTotalScope, - R: Semigroup, -{ - fn consolidate_for_output_named(&self, name: &str) -> Collection; - - #[track_caller] - fn consolidate_for_output(&self) -> Collection { - self.consolidate_for_output_named("ConsolidateForOutput") - } -} - -impl ConsolidateForOutput for Collection +pub trait ArrangeWithTypes where S: MaybeTotalScope, - D: ExchangeData + Shard, + K: ExchangeData, + V: ExchangeData, R: Semigroup + ExchangeData, { #[track_caller] - fn consolidate_for_output_named(&self, name: &str) -> Self { - let arranged: ArrangedBySelf = self.arrange_named(&format!("Arrange: {name}")); - arranged.consolidate_for_output_map_named(name, |k, ()| k.clone()) + fn arrange(&self) -> Arranged> + where + Tr: Trace + TraceReader + 
'static, + Tr::Batch: Batch, + { + self.arrange_named("Arrange") } + + fn arrange_named(&self, name: &str) -> Arranged> + where + Tr: Trace + TraceReader + 'static, + Tr::Batch: Batch; } -pub trait ConsolidateForOutputMap +pub trait ArrangeWithTypesSharded where S: MaybeTotalScope, - R: Semigroup, + K: ExchangeData, + V: ExchangeData, + R: Semigroup + ExchangeData, { - fn consolidate_for_output_map_named( - &self, - name: &str, - logic: impl FnMut(&K, &V) -> D + 'static, - ) -> Collection; - #[track_caller] - fn consolidate_for_output_map( + fn arrange_sharded( &self, - logic: impl FnMut(&K, &V) -> D + 'static, - ) -> Collection { - self.consolidate_for_output_map_named("ConsolidateForOutput", logic) + sharding: impl FnMut(&K) -> u64 + 'static, + ) -> Arranged> + where + Tr: Trace + TraceReader + 'static, + Tr::Batch: Batch, + { + self.arrange_sharded_named("Arrange", sharding) } -} -impl ConsolidateForOutputMap for Arranged -where - S: MaybeTotalScope, - Tr: TraceReader( &self, name: &str, - mut logic: impl FnMut(&Tr::Key, &Tr::Val) -> D + 'static, - ) -> Collection { - let caller = Location::caller(); - let name = format!("{name} at {caller}"); - self.stream - .unary(Pipeline, &name, move |_cap, _info| { - move |input, output| { - input.for_each(|cap, data| { - let mut time_diffs = BTreeMap::new(); - for batch in data.iter() { - let mut cursor = batch.cursor(); - while let Some(key) = cursor.get_key(batch) { - while let Some(val) = cursor.get_val(batch) { - cursor.map_times(batch, |time, diff| { - let data = logic(key, val); - time_diffs - .entry((time.clone(), diff.clone())) - .or_insert_with(Vec::new) - .push((data, time.clone(), diff.clone())); - }); - cursor.step_val(batch); - } - cursor.step_key(batch); - } - } - for ((time, _diff), mut vec) in time_diffs { - output.session(&cap.delayed(&time)).give_vec(&mut vec); - } - }); - } - }) - .as_collection() - } + sharding: impl FnMut(&K) -> u64 + 'static, + ) -> Arranged> + where + Tr: Trace + TraceReader + 'static, + Tr::Batch: Batch; } -pub trait ArrangeWithTypes +impl ArrangeWithTypes for T where + T: differential_dataflow::operators::arrange::arrangement::Arrange, S: MaybeTotalScope, - K: ExchangeData, + K: ExchangeData + Shard, V: ExchangeData, R: Semigroup + ExchangeData, { #[track_caller] - fn arrange(&self) -> Arranged> + fn arrange_named(&self, name: &str) -> Arranged> where Tr: Trace + TraceReader + 'static, Tr::Batch: Batch, { - self.arrange_named("Arrange") + self.arrange_sharded_named(name, Shard::shard) } - - fn arrange_named(&self, name: &str) -> Arranged> - where - Tr: Trace + TraceReader + 'static, - Tr::Batch: Batch; } -impl ArrangeWithTypes for T +impl ArrangeWithTypesSharded for T where T: differential_dataflow::operators::arrange::arrangement::Arrange, S: MaybeTotalScope, - K: ExchangeData + Shard, + K: ExchangeData, V: ExchangeData, R: Semigroup + ExchangeData, { #[track_caller] - fn arrange_named(&self, name: &str) -> Arranged> + fn arrange_sharded_named( + &self, + name: &str, + mut sharding: impl FnMut(&K) -> u64 + 'static, + ) -> Arranged> where - K: ExchangeData + Shard, - V: ExchangeData, - R: ExchangeData, Tr: Trace + TraceReader + 'static, Tr::Batch: Batch, { @@ -163,7 +118,8 @@ where key = type_name::(), value = type_name::() ); - let exchange = Exchange::new(|((key, _value), _time, _diff): &((K, V), _, _)| key.shard()); + let exchange = + Exchange::new(move |((key, _value), _time, _diff): &((K, V), _, _)| sharding(key)); #[allow(clippy::disallowed_methods)] 
differential_dataflow::operators::arrange::arrangement::Arrange::arrange_core( self, exchange, &name, diff --git a/src/engine/dataflow/operators/output.rs b/src/engine/dataflow/operators/output.rs new file mode 100644 index 00000000..43f1cab0 --- /dev/null +++ b/src/engine/dataflow/operators/output.rs @@ -0,0 +1,118 @@ +use std::panic::Location; + +use differential_dataflow::difference::{Monoid, Semigroup}; +use differential_dataflow::operators::arrange::Arranged; +use differential_dataflow::trace::TraceReader; +use differential_dataflow::{Collection, Data, ExchangeData}; +use itertools::partition; +use timely::dataflow::channels::pact::Pipeline; +use timely::dataflow::operators::Operator; +use timely::dataflow::Stream; + +use crate::engine::dataflow::maybe_total::MaybeTotalScope; +use crate::engine::dataflow::shard::Shard; +use crate::engine::dataflow::ArrangedBySelf; + +use super::utils::batch_by_time; +use super::{ArrangeWithTypes, ArrangeWithTypesSharded}; + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct OutputBatch { + pub time: T, + pub data: Vec<(D, R)>, +} + +pub trait ConsolidateForOutput +where + S: MaybeTotalScope, + R: Monoid, +{ + fn consolidate_for_output_named( + &self, + name: &str, + single_threaded: bool, + ) -> Stream>; + + #[track_caller] + fn consolidate_for_output( + &self, + single_threaded: bool, + ) -> Stream> { + self.consolidate_for_output_named("ConsolidateForOutput", single_threaded) + } +} + +impl ConsolidateForOutput for Collection +where + S: MaybeTotalScope, + D: ExchangeData + Shard, + R: Monoid + ExchangeData, +{ + #[track_caller] + fn consolidate_for_output_named( + &self, + name: &str, + single_threaded: bool, + ) -> Stream> { + let arranged: ArrangedBySelf = if single_threaded { + self.arrange_sharded_named(&format!("Arrange [single-threaded]: {name}"), |_| 0) + } else { + self.arrange_named(&format!("Arrange: {name}")) + }; + arranged.consolidate_for_output_map_named(name, |k, ()| k.clone()) + } +} + +pub trait ConsolidateForOutputMap +where + S: MaybeTotalScope, + R: Semigroup, +{ + fn consolidate_for_output_map_named( + &self, + name: &str, + logic: impl FnMut(&K, &V) -> D + 'static, + ) -> Stream>; + + #[track_caller] + fn consolidate_for_output_map( + &self, + logic: impl FnMut(&K, &V) -> D + 'static, + ) -> Stream> { + self.consolidate_for_output_map_named("ConsolidateForOutput", logic) + } +} + +impl ConsolidateForOutputMap for Arranged +where + S: MaybeTotalScope, + Tr: TraceReader