From 307037799eb12ecee070b547230e672a890c61e1 Mon Sep 17 00:00:00 2001 From: Adam <41971533+jcadam14@users.noreply.github.com> Date: Thu, 9 May 2024 15:00:44 -0400 Subject: [PATCH 1/2] Pulled in ujson and chunked up the dataframe processing for json --- poetry.lock | 80 ++++++++++++++++++- pyproject.toml | 1 + src/regtech_data_validator/data_formatters.py | 26 +++++- 3 files changed, 99 insertions(+), 8 deletions(-) diff --git a/poetry.lock b/poetry.lock index a58bf3bd..99c0da71 100644 --- a/poetry.lock +++ b/poetry.lock @@ -382,7 +382,6 @@ files = [ {file = "lxml-5.2.1-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c38d7b9a690b090de999835f0443d8aa93ce5f2064035dfc48f27f02b4afc3d0"}, {file = "lxml-5.2.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5670fb70a828663cc37552a2a85bf2ac38475572b0e9b91283dc09efb52c41d1"}, {file = "lxml-5.2.1-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:958244ad566c3ffc385f47dddde4145088a0ab893504b54b52c041987a8c1863"}, - {file = "lxml-5.2.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b6241d4eee5f89453307c2f2bfa03b50362052ca0af1efecf9fef9a41a22bb4f"}, {file = "lxml-5.2.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:2a66bf12fbd4666dd023b6f51223aed3d9f3b40fef06ce404cb75bafd3d89536"}, {file = "lxml-5.2.1-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:9123716666e25b7b71c4e1789ec829ed18663152008b58544d95b008ed9e21e9"}, {file = "lxml-5.2.1-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:0c3f67e2aeda739d1cc0b1102c9a9129f7dc83901226cc24dd72ba275ced4218"}, @@ -407,6 +406,7 @@ files = [ {file = "lxml-5.2.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:9e2addd2d1866fe112bc6f80117bcc6bc25191c5ed1bfbcf9f1386a884252ae8"}, {file = "lxml-5.2.1-cp37-cp37m-win32.whl", hash = "sha256:f51969bac61441fd31f028d7b3b45962f3ecebf691a510495e5d2cd8c8092dbd"}, {file = "lxml-5.2.1-cp37-cp37m-win_amd64.whl", hash = "sha256:b0b58fbfa1bf7367dde8a557994e3b1637294be6cf2169810375caf8571a085c"}, + {file = "lxml-5.2.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3e183c6e3298a2ed5af9d7a356ea823bccaab4ec2349dc9ed83999fd289d14d5"}, {file = "lxml-5.2.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:804f74efe22b6a227306dd890eecc4f8c59ff25ca35f1f14e7482bbce96ef10b"}, {file = "lxml-5.2.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:08802f0c56ed150cc6885ae0788a321b73505d2263ee56dad84d200cab11c07a"}, {file = "lxml-5.2.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f8c09ed18ecb4ebf23e02b8e7a22a05d6411911e6fabef3a36e4f371f4f2585"}, @@ -613,7 +613,6 @@ optional = false python-versions = ">=3.9" files = [ {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, - {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, @@ 
-634,7 +633,6 @@ files = [ {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, - {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, @@ -1123,6 +1121,80 @@ files = [ {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, ] +[[package]] +name = "ujson" +version = "5.9.0" +description = "Ultra fast JSON encoder and decoder for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "ujson-5.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ab71bf27b002eaf7d047c54a68e60230fbd5cd9da60de7ca0aa87d0bccead8fa"}, + {file = "ujson-5.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a365eac66f5aa7a7fdf57e5066ada6226700884fc7dce2ba5483538bc16c8c5"}, + {file = "ujson-5.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e015122b337858dba5a3dc3533af2a8fc0410ee9e2374092f6a5b88b182e9fcc"}, + {file = "ujson-5.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:779a2a88c53039bebfbccca934430dabb5c62cc179e09a9c27a322023f363e0d"}, + {file = "ujson-5.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10ca3c41e80509fd9805f7c149068fa8dbee18872bbdc03d7cca928926a358d5"}, + {file = "ujson-5.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4a566e465cb2fcfdf040c2447b7dd9718799d0d90134b37a20dff1e27c0e9096"}, + {file = "ujson-5.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f833c529e922577226a05bc25b6a8b3eb6c4fb155b72dd88d33de99d53113124"}, + {file = "ujson-5.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b68a0caab33f359b4cbbc10065c88e3758c9f73a11a65a91f024b2e7a1257106"}, + {file = "ujson-5.9.0-cp310-cp310-win32.whl", hash = "sha256:7cc7e605d2aa6ae6b7321c3ae250d2e050f06082e71ab1a4200b4ae64d25863c"}, + {file = "ujson-5.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:a6d3f10eb8ccba4316a6b5465b705ed70a06011c6f82418b59278fbc919bef6f"}, + {file = "ujson-5.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b23bbb46334ce51ddb5dded60c662fbf7bb74a37b8f87221c5b0fec1ec6454b"}, + {file = "ujson-5.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6974b3a7c17bbf829e6c3bfdc5823c67922e44ff169851a755eab79a3dd31ec0"}, + {file = "ujson-5.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5964ea916edfe24af1f4cc68488448fbb1ec27a3ddcddc2b236da575c12c8ae"}, + {file = "ujson-5.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ba7cac47dd65ff88571eceeff48bf30ed5eb9c67b34b88cb22869b7aa19600d"}, + {file = 
"ujson-5.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6bbd91a151a8f3358c29355a491e915eb203f607267a25e6ab10531b3b157c5e"}, + {file = "ujson-5.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:829a69d451a49c0de14a9fecb2a2d544a9b2c884c2b542adb243b683a6f15908"}, + {file = "ujson-5.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:a807ae73c46ad5db161a7e883eec0fbe1bebc6a54890152ccc63072c4884823b"}, + {file = "ujson-5.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8fc2aa18b13d97b3c8ccecdf1a3c405f411a6e96adeee94233058c44ff92617d"}, + {file = "ujson-5.9.0-cp311-cp311-win32.whl", hash = "sha256:70e06849dfeb2548be48fdd3ceb53300640bc8100c379d6e19d78045e9c26120"}, + {file = "ujson-5.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:7309d063cd392811acc49b5016728a5e1b46ab9907d321ebbe1c2156bc3c0b99"}, + {file = "ujson-5.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:20509a8c9f775b3a511e308bbe0b72897ba6b800767a7c90c5cca59d20d7c42c"}, + {file = "ujson-5.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b28407cfe315bd1b34f1ebe65d3bd735d6b36d409b334100be8cdffae2177b2f"}, + {file = "ujson-5.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d302bd17989b6bd90d49bade66943c78f9e3670407dbc53ebcf61271cadc399"}, + {file = "ujson-5.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f21315f51e0db8ee245e33a649dd2d9dce0594522de6f278d62f15f998e050e"}, + {file = "ujson-5.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5635b78b636a54a86fdbf6f027e461aa6c6b948363bdf8d4fbb56a42b7388320"}, + {file = "ujson-5.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:82b5a56609f1235d72835ee109163c7041b30920d70fe7dac9176c64df87c164"}, + {file = "ujson-5.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:5ca35f484622fd208f55041b042d9d94f3b2c9c5add4e9af5ee9946d2d30db01"}, + {file = "ujson-5.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:829b824953ebad76d46e4ae709e940bb229e8999e40881338b3cc94c771b876c"}, + {file = "ujson-5.9.0-cp312-cp312-win32.whl", hash = "sha256:25fa46e4ff0a2deecbcf7100af3a5d70090b461906f2299506485ff31d9ec437"}, + {file = "ujson-5.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:60718f1720a61560618eff3b56fd517d107518d3c0160ca7a5a66ac949c6cf1c"}, + {file = "ujson-5.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d581db9db9e41d8ea0b2705c90518ba623cbdc74f8d644d7eb0d107be0d85d9c"}, + {file = "ujson-5.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ff741a5b4be2d08fceaab681c9d4bc89abf3c9db600ab435e20b9b6d4dfef12e"}, + {file = "ujson-5.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdcb02cabcb1e44381221840a7af04433c1dc3297af76fde924a50c3054c708c"}, + {file = "ujson-5.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e208d3bf02c6963e6ef7324dadf1d73239fb7008491fdf523208f60be6437402"}, + {file = "ujson-5.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4b3917296630a075e04d3d07601ce2a176479c23af838b6cf90a2d6b39b0d95"}, + {file = "ujson-5.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0c4d6adb2c7bb9eb7c71ad6f6f612e13b264942e841f8cc3314a21a289a76c4e"}, + {file = "ujson-5.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0b159efece9ab5c01f70b9d10bbb77241ce111a45bc8d21a44c219a2aec8ddfd"}, + {file = 
"ujson-5.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f0cb4a7814940ddd6619bdce6be637a4b37a8c4760de9373bac54bb7b229698b"}, + {file = "ujson-5.9.0-cp38-cp38-win32.whl", hash = "sha256:dc80f0f5abf33bd7099f7ac94ab1206730a3c0a2d17549911ed2cb6b7aa36d2d"}, + {file = "ujson-5.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:506a45e5fcbb2d46f1a51fead991c39529fc3737c0f5d47c9b4a1d762578fc30"}, + {file = "ujson-5.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d0fd2eba664a22447102062814bd13e63c6130540222c0aa620701dd01f4be81"}, + {file = "ujson-5.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bdf7fc21a03bafe4ba208dafa84ae38e04e5d36c0e1c746726edf5392e9f9f36"}, + {file = "ujson-5.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2f909bc08ce01f122fd9c24bc6f9876aa087188dfaf3c4116fe6e4daf7e194f"}, + {file = "ujson-5.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd4ea86c2afd41429751d22a3ccd03311c067bd6aeee2d054f83f97e41e11d8f"}, + {file = "ujson-5.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:63fb2e6599d96fdffdb553af0ed3f76b85fda63281063f1cb5b1141a6fcd0617"}, + {file = "ujson-5.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:32bba5870c8fa2a97f4a68f6401038d3f1922e66c34280d710af00b14a3ca562"}, + {file = "ujson-5.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:37ef92e42535a81bf72179d0e252c9af42a4ed966dc6be6967ebfb929a87bc60"}, + {file = "ujson-5.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f69f16b8f1c69da00e38dc5f2d08a86b0e781d0ad3e4cc6a13ea033a439c4844"}, + {file = "ujson-5.9.0-cp39-cp39-win32.whl", hash = "sha256:3382a3ce0ccc0558b1c1668950008cece9bf463ebb17463ebf6a8bfc060dae34"}, + {file = "ujson-5.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:6adef377ed583477cf005b58c3025051b5faa6b8cc25876e594afbb772578f21"}, + {file = "ujson-5.9.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ffdfebd819f492e48e4f31c97cb593b9c1a8251933d8f8972e81697f00326ff1"}, + {file = "ujson-5.9.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4eec2ddc046360d087cf35659c7ba0cbd101f32035e19047013162274e71fcf"}, + {file = "ujson-5.9.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fbb90aa5c23cb3d4b803c12aa220d26778c31b6e4b7a13a1f49971f6c7d088e"}, + {file = "ujson-5.9.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba0823cb70866f0d6a4ad48d998dd338dce7314598721bc1b7986d054d782dfd"}, + {file = "ujson-5.9.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4e35d7885ed612feb6b3dd1b7de28e89baaba4011ecdf995e88be9ac614765e9"}, + {file = "ujson-5.9.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b048aa93eace8571eedbd67b3766623e7f0acbf08ee291bef7d8106210432427"}, + {file = "ujson-5.9.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323279e68c195110ef85cbe5edce885219e3d4a48705448720ad925d88c9f851"}, + {file = "ujson-5.9.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ac92d86ff34296f881e12aa955f7014d276895e0e4e868ba7fddebbde38e378"}, + {file = "ujson-5.9.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6eecbd09b316cea1fd929b1e25f70382917542ab11b692cb46ec9b0a26c7427f"}, + {file = "ujson-5.9.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:473fb8dff1d58f49912323d7cb0859df5585cfc932e4b9c053bf8cf7f2d7c5c4"}, + {file 
= "ujson-5.9.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f91719c6abafe429c1a144cfe27883eace9fb1c09a9c5ef1bcb3ae80a3076a4e"}, + {file = "ujson-5.9.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b1c0991c4fe256f5fdb19758f7eac7f47caac29a6c57d0de16a19048eb86bad"}, + {file = "ujson-5.9.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a8ea0f55a1396708e564595aaa6696c0d8af532340f477162ff6927ecc46e21"}, + {file = "ujson-5.9.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:07e0cfdde5fd91f54cd2d7ffb3482c8ff1bf558abf32a8b953a5d169575ae1cd"}, + {file = "ujson-5.9.0.tar.gz", hash = "sha256:89cc92e73d5501b8a7f48575eeb14ad27156ad092c2e9fc7e3cf949f07e75532"}, +] + [[package]] name = "urllib3" version = "2.2.1" @@ -1222,4 +1294,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.12,<4" -content-hash = "8d89ad88a7d7c868a07d5e6a5b15e8938ad4c52c89b8ccf61eaeff3a1fde4acc" +content-hash = "be3833de385526bd2d8a3727afd614bf76b1c7ae8425caca2854d02fa1973cdf" diff --git a/pyproject.toml b/pyproject.toml index 184a5d70..c7e87962 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ pandas = "^2.2.2" pandera = "^0.19.0" requests = "^2.31.0" tabulate = "^0.9.0" +ujson = "^5.9.0" [tool.poetry.group.dev.dependencies] pytest = "8.2.0" diff --git a/src/regtech_data_validator/data_formatters.py b/src/regtech_data_validator/data_formatters.py index 96393379..2caf6971 100644 --- a/src/regtech_data_validator/data_formatters.py +++ b/src/regtech_data_validator/data_formatters.py @@ -1,5 +1,5 @@ import csv -import json +import ujson import pandas as pd from tabulate import tabulate @@ -104,11 +104,28 @@ def df_to_table(df: pd.DataFrame) -> str: def df_to_json(df: pd.DataFrame) -> str: + chunk_size = 500000 # maybe at some point make this configurable? 
+    json_results = []
+    # group by the validation_id so we don't lose data while chunking,
+    # then chunk the group into smaller pieces
+    grouped_df = df.groupby('validation_id')
+
+    for group_name, group_data in grouped_df:
+        start_index = 0
+        while start_index < len(group_data):
+            end_index = min(start_index + chunk_size, len(group_data))
+            chunk = group_data.iloc[start_index:end_index]
+            json_results.extend(process_chunk(chunk))
+            start_index = end_index
+    return ujson.dumps(json_results, indent=4, escape_forward_slashes=False)
+
+
+def process_chunk(df: pd.DataFrame) -> [dict]:
     output_json = []
-
     if not df.empty:
 
+        df.reset_index(drop=True, inplace=True)
-        findings_json = json.loads(df.to_json(orient='columns'))
+        findings_json = ujson.loads(df.to_json(orient='columns'))
 
         grouped_data = {}
         for i in range(len(findings_json['record_no'])):
@@ -153,4 +170,5 @@ def df_to_json(df: pd.DataFrame) -> str:
             output_json.append(validation_info)
 
     output_json = sorted(output_json, key=lambda x: x['validation']['id'])
-    return json.dumps(output_json, indent=4)
+
+    return output_json

From 74803242ca56394f33da2352c2087e75a04f0542 Mon Sep 17 00:00:00 2001
From: Adam <41971533+jcadam14@users.noreply.github.com>
Date: Mon, 13 May 2024 10:56:02 -0400
Subject: [PATCH 2/2] Kept ujson, moved df.concat outside the loop, grouped to_json by validation id

---
 src/regtech_data_validator/create_schemas.py  |  14 +--
 src/regtech_data_validator/data_formatters.py | 107 +++++++-----------
 2 files changed, 49 insertions(+), 72 deletions(-)

diff --git a/src/regtech_data_validator/create_schemas.py b/src/regtech_data_validator/create_schemas.py
index 9fdec7db..20f174d3 100644
--- a/src/regtech_data_validator/create_schemas.py
+++ b/src/regtech_data_validator/create_schemas.py
@@ -126,10 +126,11 @@ def validate(schema: DataFrameSchema, submission_df: pd.DataFrame) -> tuple[bool
         schema(submission_df, lazy=True)
     except SchemaErrors as err:
         is_valid = False
-
+        check_findings = []
         # NOTE: `type: ignore` because SchemaErrors.schema_errors is supposed to be
         # `list[dict[str,Any]]`, but it's actually of type `SchemaError`
         schema_error: SchemaError
+
         for schema_error in err.schema_errors:  # type: ignore
             check = schema_error.check
             column_name = schema_error.schema.name
@@ -145,9 +146,7 @@ def validate(schema: DataFrameSchema, submission_df: pd.DataFrame) -> tuple[bool
                 raise RuntimeError(
                     f'Check {check} type on {column_name} column not supported. Must be of type {SBLCheck}'
                 ) from schema_error
-
             fields = _get_check_fields(check, column_name)
-
             check_output: pd.Series | None = schema_error.check_output
 
             if check_output is not None:
@@ -155,17 +154,13 @@ def validate(schema: DataFrameSchema, submission_df: pd.DataFrame) -> tuple[bool
                 failed_records_df = _filter_valid_records(submission_df, check_output, fields)
                 failed_records_df.index += next_finding_no
                 next_finding_no = failed_records_df.tail(1).index + 1  # type: ignore
-
                 failed_record_fields_df = _records_to_fields(failed_records_df)
-
-                check_findings_df = _add_validation_metadata(failed_record_fields_df, check)
-
-                findings_df = pd.concat([findings_df, check_findings_df])
+                check_findings.append(_add_validation_metadata(failed_record_fields_df, check))
             else:
                 # The above exception handling _should_ prevent this from ever happening, but...just in case.
                 raise RuntimeError(f'No check output for "{check.name}" check. Pandera SchemaError: {schema_error}')
-
+        findings_df = pd.concat(check_findings)
         updated_df = add_uid(findings_df, submission_df)
-
     return is_valid, updated_df
 
@@ -182,6 +177,7 @@ def add_uid(results_df: pd.DataFrame, submission_df: pd.DataFrame) -> pd.DataFra
 
 
 def validate_phases(df: pd.DataFrame, context: dict[str, str] | None = None) -> tuple[bool, pd.DataFrame]:
+
     p1_is_valid, p1_findings = validate(get_phase_1_schema_for_lei(context), df)
 
     if not p1_is_valid:
diff --git a/src/regtech_data_validator/data_formatters.py b/src/regtech_data_validator/data_formatters.py
index 2caf6971..c4ce2120 100644
--- a/src/regtech_data_validator/data_formatters.py
+++ b/src/regtech_data_validator/data_formatters.py
@@ -4,8 +4,6 @@
 from tabulate import tabulate
 
 
-more_than_2_fields = ["E2014", "E2015", "W2035", "W2036", "W2037", "W2038", "W2039"]
-
 
 def df_to_download(df: pd.DataFrame) -> str:
     if df.empty:
@@ -104,71 +102,54 @@ def df_to_table(df: pd.DataFrame) -> str:
 
 
 def df_to_json(df: pd.DataFrame) -> str:
-    chunk_size = 500000  # maybe at some point make this configurable?
+    # Grouping by validation_id and processing each group keeps this from crashing on
+    # really large error dataframes (millions of errors). Chunking by row count could
+    # split one validation's data across chunks, which would then need extra processing
+    # to tie those objects back together. Grouping adds a little processing time for
+    # smaller datasets but keeps really large ones from crashing.
     json_results = []
-    # group by the validation_id so we don't lose data while chunking,
-    # then chunk the group into smaller pieces
    grouped_df = df.groupby('validation_id')
-
     for group_name, group_data in grouped_df:
-        start_index = 0
-        while start_index < len(group_data):
-            end_index = min(start_index + chunk_size, len(group_data))
-            chunk = group_data.iloc[start_index:end_index]
-            json_results.extend(process_chunk(chunk))
-            start_index = end_index
+        json_results.append(process_chunk(group_data, group_name))
+    json_results = sorted(json_results, key=lambda x: x['validation']['id'])
     return ujson.dumps(json_results, indent=4, escape_forward_slashes=False)
 
 
-def process_chunk(df: pd.DataFrame) -> [dict]:
-    output_json = []
-    if not df.empty:
-
-        df.reset_index(drop=True, inplace=True)
-        findings_json = ujson.loads(df.to_json(orient='columns'))
-
-        grouped_data = {}
-        for i in range(len(findings_json['record_no'])):
-            validation_id = findings_json['validation_id'][str(i)]
-            if validation_id not in grouped_data:
-                grouped_data[validation_id] = []
-            grouped_data[validation_id].append(
-                {
-                    'record_no': findings_json['record_no'][str(i)],
-                    'uid': findings_json['uid'][str(i)],
-                    'field_name': findings_json['field_name'][str(i)],
-                    'field_value': findings_json['field_value'][str(i)],
-                }
-            )
-
-        for validation_id, records in grouped_data.items():
-            for key, value in findings_json['validation_id'].items():
-                if validation_id == value:
-                    validation_key = key
-                    break
-            validation_info = {
-                'validation': {
-                    'id': validation_id,
-                    'name': findings_json['validation_name'][validation_key],
-                    'description': findings_json['validation_desc'][validation_key],
-                    'severity': findings_json['validation_severity'][validation_key],
-                    'scope': findings_json['scope'][validation_key],
-                    'fig_link': findings_json['fig_link'][validation_key],
-                },
-                'records': [],
+def process_chunk(df: pd.DataFrame, validation_id: str) -> dict:
+    df.reset_index(drop=True, inplace=True)
+    findings_json = ujson.loads(df.to_json(orient='columns'))
+    grouped_data = []
+    for i in range(len(findings_json['record_no'])):
+        grouped_data.append(
+            {
+                'record_no': findings_json['record_no'][str(i)],
+                'uid': findings_json['uid'][str(i)],
+                'field_name': findings_json['field_name'][str(i)],
+                'field_value': findings_json['field_value'][str(i)],
+            }
+        )
+
+    validation_info = {
+        'validation': {
+            'id': validation_id,
+            'name': findings_json['validation_name']['0'],
+            'description': findings_json['validation_desc']['0'],
+            'severity': findings_json['validation_severity']['0'],
+            'scope': findings_json['scope']['0'],
+            'fig_link': findings_json['fig_link']['0'],
+        },
+        'records': [],
+    }
+    records_dict = {}
+    for record in grouped_data:
+        record_no = record['record_no']
+        if record_no not in records_dict:
+            records_dict[record_no] = {'record_no': record['record_no'], 'uid': record['uid'], 'fields': []}
+        records_dict[record_no]['fields'].append({'name': record['field_name'], 'value': record['field_value']})
+    validation_info['records'] = list(records_dict.values())
+
+    for record in validation_info['records']:
+        if len(record['fields']) == 2:
+            record['fields'][0], record['fields'][1] = record['fields'][1], record['fields'][0]
+
+    return validation_info
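
Usage sketch (illustrative only, not part of either patch): a minimal example of the grouped serialization these patches move to. It builds a tiny findings frame with the column names data_formatters.py expects, groups it by validation_id, and serializes each group with ujson, mirroring df_to_json/process_chunk in simplified form (it does not merge multiple fields per record). The sample rows and the fig_link URL are invented.

# Illustrative sketch, assuming the column layout used by data_formatters.py.
import pandas as pd
import ujson

findings = pd.DataFrame(
    {
        'validation_id': ['E3000', 'E0001', 'E0001'],
        'validation_name': ['uid.duplicates_in_dataset', 'uid.invalid_text_length', 'uid.invalid_text_length'],
        'validation_desc': ['desc'] * 3,
        'validation_severity': ['Error'] * 3,
        'scope': ['register', 'single-field', 'single-field'],
        'fig_link': ['https://example.test/fig'] * 3,
        'record_no': [2, 1, 2],
        'uid': ['UID2', 'UID1', 'UID2'],
        'field_name': ['uid'] * 3,
        'field_value': ['BAD2', 'BAD1', 'BAD2'],
    }
)

results = []
for validation_id, group in findings.groupby('validation_id'):
    # One JSON object per validation id, so one validation's findings are never split.
    columns = ujson.loads(group.reset_index(drop=True).to_json(orient='columns'))
    results.append(
        {
            'validation': {'id': validation_id, 'name': columns['validation_name']['0']},
            'records': [
                {
                    'record_no': columns['record_no'][k],
                    'uid': columns['uid'][k],
                    'fields': [{'name': columns['field_name'][k], 'value': columns['field_value'][k]}],
                }
                for k in columns['record_no']
            ],
        }
    )

results = sorted(results, key=lambda x: x['validation']['id'])
print(ujson.dumps(results, indent=4, escape_forward_slashes=False))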