diff --git a/.gitmodules b/.gitmodules
index e4d63a341183..ed61ddb96ba1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -253,9 +253,6 @@
[submodule "contrib/qpl"]
path = contrib/qpl
url = https://github.com/intel/qpl
-[submodule "contrib/idxd-config"]
- path = contrib/idxd-config
- url = https://github.com/intel/idxd-config
[submodule "contrib/wyhash"]
path = contrib/wyhash
url = https://github.com/wangyi-fudan/wyhash
@@ -296,6 +293,9 @@
[submodule "contrib/libdivide"]
path = contrib/libdivide
url = https://github.com/ridiculousfish/libdivide
+[submodule "contrib/libbcrypt"]
+ path = contrib/libbcrypt
+ url = https://github.com/rg3/libbcrypt.git
[submodule "contrib/ulid-c"]
path = contrib/ulid-c
url = https://github.com/ClickHouse/ulid-c.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e2505856d0c8..1ccd4f9846d9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@
# 2023 Changelog
-### ClickHouse release 23.4 LTS, 2023-04-26
+### ClickHouse release 23.4, 2023-04-26
#### Backward Incompatible Change
* Formatter '%M' in function formatDateTime() now prints the month name instead of the minutes. This makes the behavior consistent with MySQL. The previous behavior can be restored using setting "formatdatetime_parsedatetime_m_is_month_name = 0". [#47246](https://github.com/ClickHouse/ClickHouse/pull/47246) ([Robert Schulze](https://github.com/rschu1ze)).
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0554403cce51..263b202049b3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -170,12 +170,6 @@ else ()
set(NO_WHOLE_ARCHIVE --no-whole-archive)
endif ()
-option(ENABLE_CURL_BUILD "Enable curl, azure, sentry build on by default except MacOS." ON)
-if (OS_DARWIN)
- # Disable the curl, azure, senry build on MacOS
- set (ENABLE_CURL_BUILD OFF)
-endif ()
-
if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE")
# Can be lld or ld-lld or lld-13 or /path/to/lld.
if (LINKER_NAME MATCHES "lld")
@@ -393,9 +387,9 @@ else()
endif ()
option (ENABLE_GWP_ASAN "Enable Gwp-Asan" ON)
-# We use mmap for allocations more heavily in debug builds,
-# but GWP-ASan also wants to use mmap frequently,
-# and due to a large number of memory mappings,
+# We use mmap for allocations more heavily in debug builds,
+# but GWP-ASan also wants to use mmap frequently,
+# and due to a large number of memory mappings,
# it does not work together well.
if ((NOT OS_LINUX AND NOT OS_ANDROID) OR (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG"))
set(ENABLE_GWP_ASAN OFF)
diff --git a/SECURITY.md b/SECURITY.md
index 44a122956b45..75c1a9d7d6aa 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -22,13 +22,7 @@ The following versions of ClickHouse server are currently being supported with s
| 22.10 | ❌ |
| 22.9 | ❌ |
| 22.8 | ✔️ |
-| 22.7 | ❌ |
-| 22.6 | ❌ |
-| 22.5 | ❌ |
-| 22.4 | ❌ |
-| 22.3 | ❌ |
-| 22.2 | ❌ |
-| 22.1 | ❌ |
+| 22.* | ❌ |
| 21.* | ❌ |
| 20.* | ❌ |
| 19.* | ❌ |
diff --git a/base/harmful/harmful.c b/base/harmful/harmful.c
index 6112f9a339c0..78796ca0c054 100644
--- a/base/harmful/harmful.c
+++ b/base/harmful/harmful.c
@@ -31,7 +31,8 @@ TRAP(argp_state_help)
TRAP(argp_usage)
TRAP(asctime)
TRAP(clearenv)
-TRAP(crypt)
+// Redefined at contrib/libbcrypt/crypt_blowfish/wrapper.c:186
+// TRAP(crypt)
TRAP(ctime)
TRAP(cuserid)
TRAP(drand48)
diff --git a/cmake/fuzzer.cmake b/cmake/fuzzer.cmake
index 578a97572701..52f301ab8ad4 100644
--- a/cmake/fuzzer.cmake
+++ b/cmake/fuzzer.cmake
@@ -7,10 +7,6 @@ if (FUZZER)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link")
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=fuzzer-no-link")
- endif()
-
# NOTE: oss-fuzz can change LIB_FUZZING_ENGINE variable
if (NOT LIB_FUZZING_ENGINE)
set (LIB_FUZZING_ENGINE "-fsanitize=fuzzer")
diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake
index fc9793d8f356..bf5eddf09f5d 100644
--- a/cmake/sanitize.cmake
+++ b/cmake/sanitize.cmake
@@ -16,49 +16,24 @@ if (SANITIZE)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}")
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_FLAGS}")
- endif()
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libasan")
- endif ()
-
elseif (SANITIZE STREQUAL "memory")
# MemorySanitizer flags are set according to the official documentation:
# https://clang.llvm.org/docs/MemorySanitizer.html#usage
- #
- # For now, it compiles with `cmake -DSANITIZE=memory -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_CXX_FLAGS_ADD="-O1" -DCMAKE_C_FLAGS_ADD="-O1"`
- # Compiling with -DCMAKE_BUILD_TYPE=Debug leads to ld.lld failures because
- # of large files (was not tested with ld.gold). This is why we compile with
- # RelWithDebInfo, and downgrade optimizations to -O1 but not to -Og, to
- # keep the binary size down.
- # TODO: try compiling with -Og and with ld.gold.
+
+ # Linking can fail due to relocation overflows (see #49145), caused by object files / libraries that are too large.
+ # Work around this with position-independent builds (-fPIC and -fpie); this is slightly slower than non-PIC/PIE but that's okay.
set (MSAN_FLAGS "-fsanitize=memory -fsanitize-memory-use-after-dtor -fsanitize-memory-track-origins -fno-optimize-sibling-calls -fPIC -fpie -fsanitize-blacklist=${CMAKE_SOURCE_DIR}/tests/msan_suppressions.txt")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}")
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=memory")
- endif()
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libmsan")
- endif ()
-
elseif (SANITIZE STREQUAL "thread")
set (TSAN_FLAGS "-fsanitize=thread")
if (COMPILER_CLANG)
set (TSAN_FLAGS "${TSAN_FLAGS} -fsanitize-blacklist=${CMAKE_SOURCE_DIR}/tests/tsan_suppressions.txt")
endif()
-
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${TSAN_FLAGS}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${TSAN_FLAGS}")
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=thread")
- endif()
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libtsan")
- endif ()
elseif (SANITIZE STREQUAL "undefined")
set (UBSAN_FLAGS "-fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=float-divide-by-zero")
@@ -77,12 +52,6 @@ if (SANITIZE)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${UBSAN_FLAGS}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${UBSAN_FLAGS}")
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined")
- endif()
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libubsan")
- endif ()
# llvm-tblgen, that is used during LLVM build, doesn't work with UBSan.
set (ENABLE_EMBEDDED_COMPILER 0 CACHE BOOL "")
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index 0ff8b550a982..0c92ff17f115 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -141,20 +141,19 @@ add_contrib (libuv-cmake libuv)
add_contrib (liburing-cmake liburing)
add_contrib (amqpcpp-cmake AMQP-CPP) # requires: libuv
add_contrib (cassandra-cmake cassandra) # requires: libuv
-
-if (ENABLE_CURL_BUILD)
+if (NOT OS_DARWIN)
add_contrib (curl-cmake curl)
add_contrib (azure-cmake azure)
add_contrib (sentry-native-cmake sentry-native) # requires: curl
endif()
-
add_contrib (fmtlib-cmake fmtlib)
add_contrib (krb5-cmake krb5)
add_contrib (cyrus-sasl-cmake cyrus-sasl) # for krb5
add_contrib (libgsasl-cmake libgsasl) # requires krb5
add_contrib (librdkafka-cmake librdkafka) # requires: libgsasl
add_contrib (nats-io-cmake nats-io)
-add_contrib (libhdfs3-cmake libhdfs3) # requires: protobuf, krb5
+add_contrib (isa-l-cmake isa-l)
+add_contrib (libhdfs3-cmake libhdfs3) # requires: protobuf, krb5, isa-l
add_contrib (hive-metastore-cmake hive-metastore) # requires: thrift/avro/arrow/libhdfs3
add_contrib (cppkafka-cmake cppkafka)
add_contrib (libpqxx-cmake libpqxx)
@@ -178,21 +177,17 @@ add_contrib (s2geometry-cmake s2geometry)
add_contrib (c-ares-cmake c-ares)
add_contrib (qpl-cmake qpl)
add_contrib (morton-nd-cmake morton-nd)
-
if (ARCH_S390X)
add_contrib(crc32-s390x-cmake crc32-s390x)
endif()
-
add_contrib (annoy-cmake annoy)
-
add_contrib (xxHash-cmake xxHash)
-add_contrib (google-benchmark-cmake google-benchmark)
+add_contrib (libbcrypt-cmake libbcrypt)
+add_contrib (google-benchmark-cmake google-benchmark)
add_contrib (ulid-c-cmake ulid-c)
-add_contrib (isa-l-cmake isa-l)
-
# Put all targets defined here and in subdirectories under "contrib/" folders in GUI-based IDEs.
# Some of third-party projects may override CMAKE_FOLDER or FOLDER property of their targets, so they would not appear
# in "contrib/..." as originally planned, so we workaround this by fixing FOLDER properties of all targets manually,
diff --git a/contrib/curl b/contrib/curl
index c12fb3ddaf48..b0edf0b7dae4 160000
--- a/contrib/curl
+++ b/contrib/curl
@@ -1 +1 @@
-Subproject commit c12fb3ddaf48e709a7a4deaa55ec485e4df163ee
+Subproject commit b0edf0b7dae44d9e66f270a257cf654b35d5263d
diff --git a/contrib/curl-cmake/CMakeLists.txt b/contrib/curl-cmake/CMakeLists.txt
index 8a570bd267c7..70d9c2816dc1 100644
--- a/contrib/curl-cmake/CMakeLists.txt
+++ b/contrib/curl-cmake/CMakeLists.txt
@@ -12,6 +12,9 @@ set (SRCS
"${LIBRARY_DIR}/lib/noproxy.c"
"${LIBRARY_DIR}/lib/idn.c"
"${LIBRARY_DIR}/lib/cfilters.c"
+ "${LIBRARY_DIR}/lib/cf-socket.c"
+ "${LIBRARY_DIR}/lib/cf-haproxy.c"
+ "${LIBRARY_DIR}/lib/cf-https-connect.c"
"${LIBRARY_DIR}/lib/file.c"
"${LIBRARY_DIR}/lib/timeval.c"
"${LIBRARY_DIR}/lib/base64.c"
@@ -37,8 +40,8 @@ set (SRCS
"${LIBRARY_DIR}/lib/strcase.c"
"${LIBRARY_DIR}/lib/easy.c"
"${LIBRARY_DIR}/lib/curl_fnmatch.c"
+ "${LIBRARY_DIR}/lib/curl_log.c"
"${LIBRARY_DIR}/lib/fileinfo.c"
- "${LIBRARY_DIR}/lib/wildcard.c"
"${LIBRARY_DIR}/lib/krb5.c"
"${LIBRARY_DIR}/lib/memdebug.c"
"${LIBRARY_DIR}/lib/http_chunks.c"
@@ -96,6 +99,7 @@ set (SRCS
"${LIBRARY_DIR}/lib/rand.c"
"${LIBRARY_DIR}/lib/curl_multibyte.c"
"${LIBRARY_DIR}/lib/conncache.c"
+ "${LIBRARY_DIR}/lib/cf-h1-proxy.c"
"${LIBRARY_DIR}/lib/http2.c"
"${LIBRARY_DIR}/lib/smb.c"
"${LIBRARY_DIR}/lib/curl_endian.c"
@@ -113,12 +117,13 @@ set (SRCS
"${LIBRARY_DIR}/lib/altsvc.c"
"${LIBRARY_DIR}/lib/socketpair.c"
"${LIBRARY_DIR}/lib/bufref.c"
+ "${LIBRARY_DIR}/lib/bufq.c"
"${LIBRARY_DIR}/lib/dynbuf.c"
+ "${LIBRARY_DIR}/lib/dynhds.c"
"${LIBRARY_DIR}/lib/hsts.c"
"${LIBRARY_DIR}/lib/http_aws_sigv4.c"
"${LIBRARY_DIR}/lib/mqtt.c"
"${LIBRARY_DIR}/lib/rename.c"
- "${LIBRARY_DIR}/lib/h2h3.c"
"${LIBRARY_DIR}/lib/headers.c"
"${LIBRARY_DIR}/lib/timediff.c"
"${LIBRARY_DIR}/lib/vauth/vauth.c"
@@ -133,6 +138,7 @@ set (SRCS
"${LIBRARY_DIR}/lib/vauth/oauth2.c"
"${LIBRARY_DIR}/lib/vauth/spnego_gssapi.c"
"${LIBRARY_DIR}/lib/vauth/spnego_sspi.c"
+ "${LIBRARY_DIR}/lib/vquic/vquic.c"
"${LIBRARY_DIR}/lib/vtls/openssl.c"
"${LIBRARY_DIR}/lib/vtls/gtls.c"
"${LIBRARY_DIR}/lib/vtls/vtls.c"
@@ -147,9 +153,6 @@ set (SRCS
"${LIBRARY_DIR}/lib/vtls/keylog.c"
"${LIBRARY_DIR}/lib/vtls/x509asn1.c"
"${LIBRARY_DIR}/lib/vtls/hostcheck.c"
- "${LIBRARY_DIR}/lib/vquic/ngtcp2.c"
- "${LIBRARY_DIR}/lib/vquic/quiche.c"
- "${LIBRARY_DIR}/lib/vquic/msh3.c"
"${LIBRARY_DIR}/lib/vssh/libssh2.c"
"${LIBRARY_DIR}/lib/vssh/libssh.c"
)
diff --git a/contrib/idxd-config b/contrib/idxd-config
deleted file mode 160000
index f6605c41a735..000000000000
--- a/contrib/idxd-config
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit f6605c41a735e3fdfef2d2d18655a33af6490b99
diff --git a/contrib/isa-l-cmake/CMakeLists.txt b/contrib/isa-l-cmake/CMakeLists.txt
index fd0218a7b801..d4d6d648268b 100644
--- a/contrib/isa-l-cmake/CMakeLists.txt
+++ b/contrib/isa-l-cmake/CMakeLists.txt
@@ -1,6 +1,23 @@
+option(ENABLE_ISAL_LIBRARY "Enable ISA-L library" ${ENABLE_LIBRARIES})
+if (ARCH_AARCH64)
+ # Disable the ISA-L library on aarch64.
+ set (ENABLE_ISAL_LIBRARY OFF)
+endif ()
+
+if (NOT ENABLE_ISAL_LIBRARY)
+ message(STATUS "Not using isa-l")
+ return()
+endif()
+
set(ISAL_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/isa-l")
-# check nasm compiler
+# The YASM and NASM assemblers are somewhat mutually compatible. ISA-L specifically needs NASM. If only YASM is installed, then check_language(ASM_NASM)
+# below happily finds YASM, leading to weird errors at build time. Therefore, do an explicit check for NASM here.
+find_program(NASM_PATH NAMES nasm)
+if (NOT NASM_PATH)
+ message(FATAL_ERROR "Please install NASM from 'https://www.nasm.us/' because the NASM compiler cannot be found!")
+endif ()
+
include(CheckLanguage)
check_language(ASM_NASM)
if(NOT CMAKE_ASM_NASM_COMPILER)
diff --git a/contrib/libbcrypt b/contrib/libbcrypt
new file mode 160000
index 000000000000..8aa32ad94ebe
--- /dev/null
+++ b/contrib/libbcrypt
@@ -0,0 +1 @@
+Subproject commit 8aa32ad94ebe06b76853b0767c910c9fbf7ccef4
diff --git a/contrib/libbcrypt-cmake/CMakeLists.txt b/contrib/libbcrypt-cmake/CMakeLists.txt
new file mode 100644
index 000000000000..d40d7f9195ee
--- /dev/null
+++ b/contrib/libbcrypt-cmake/CMakeLists.txt
@@ -0,0 +1,19 @@
+option(ENABLE_BCRYPT "Enable bcrypt" ${ENABLE_LIBRARIES})
+
+if (NOT ENABLE_BCRYPT)
+ message(STATUS "Not using bcrypt")
+ return()
+endif()
+
+set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/libbcrypt")
+
+set(SRCS
+ "${LIBRARY_DIR}/bcrypt.c"
+ "${LIBRARY_DIR}/crypt_blowfish/crypt_blowfish.c"
+ "${LIBRARY_DIR}/crypt_blowfish/crypt_gensalt.c"
+ "${LIBRARY_DIR}/crypt_blowfish/wrapper.c"
+)
+
+add_library(_bcrypt ${SRCS})
+target_include_directories(_bcrypt SYSTEM PUBLIC "${LIBRARY_DIR}")
+add_library(ch_contrib::bcrypt ALIAS _bcrypt)
diff --git a/contrib/libhdfs3-cmake/CMakeLists.txt b/contrib/libhdfs3-cmake/CMakeLists.txt
index d9f7009c1bd4..fd9ed7dc182c 100644
--- a/contrib/libhdfs3-cmake/CMakeLists.txt
+++ b/contrib/libhdfs3-cmake/CMakeLists.txt
@@ -172,8 +172,10 @@ if (TARGET OpenSSL::SSL)
target_link_libraries(_hdfs3 PRIVATE OpenSSL::Crypto OpenSSL::SSL)
endif()
-target_link_libraries(_hdfs3 PRIVATE ch_contrib::isal)
-add_definitions(-DHADOOP_ISAL_LIBRARY)
+if (TARGET ch_contrib::isal)
+ target_link_libraries(_hdfs3 PRIVATE ch_contrib::isal)
+ add_definitions(-DHADOOP_ISAL_LIBRARY)
+endif()
add_library(ch_contrib::hdfs ALIAS _hdfs3)
diff --git a/docker/images.json b/docker/images.json
index 9150abe1f1cf..b4f3e755bd1f 100644
--- a/docker/images.json
+++ b/docker/images.json
@@ -123,7 +123,8 @@
"docker/test/stateless",
"docker/test/integration/base",
"docker/test/fuzzer",
- "docker/test/keeper-jepsen"
+ "docker/test/keeper-jepsen",
+ "docker/test/server-jepsen"
]
},
"docker/test/integration/kerberized_hadoop": {
@@ -139,6 +140,10 @@
"name": "clickhouse/keeper-jepsen-test",
"dependent": []
},
+ "docker/test/server-jepsen": {
+ "name": "clickhouse/server-jepsen-test",
+ "dependent": []
+ },
"docker/test/install/deb": {
"name": "clickhouse/install-deb-test",
"dependent": []
diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile
index 59e8d2ed3d87..73da4515ff4b 100644
--- a/docker/keeper/Dockerfile
+++ b/docker/keeper/Dockerfile
@@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \
esac
ARG REPOSITORY="https://s3.amazonaws.com/clickhouse-builds/22.4/31c367d3cd3aefd316778601ff6565119fe36682/package_release"
-ARG VERSION="23.4.1.1943"
+ARG VERSION="23.4.2.11"
ARG PACKAGES="clickhouse-keeper"
# user/group precreated explicitly with fixed uid/gid on purpose.
diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine
index d59a08c28052..1a5d2071f6b7 100644
--- a/docker/server/Dockerfile.alpine
+++ b/docker/server/Dockerfile.alpine
@@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \
# lts / testing / prestable / etc
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
-ARG VERSION="23.4.1.1943"
+ARG VERSION="23.4.2.11"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
# user/group precreated explicitly with fixed uid/gid on purpose.
diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu
index 390f347d549c..8792d419a165 100644
--- a/docker/server/Dockerfile.ubuntu
+++ b/docker/server/Dockerfile.ubuntu
@@ -22,7 +22,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
-ARG VERSION="23.4.1.1943"
+ARG VERSION="23.4.2.11"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
# set non-empty deb_location_url url to create a docker image
diff --git a/docker/test/server-jepsen/Dockerfile b/docker/test/server-jepsen/Dockerfile
index 958dbfa066af..125b187aa5b9 100644
--- a/docker/test/server-jepsen/Dockerfile
+++ b/docker/test/server-jepsen/Dockerfile
@@ -16,6 +16,8 @@ ENV TESTS_TO_RUN="8"
ENV TIME_LIMIT="30"
ENV KEEPER_NODE=""
+ENV NEMESIS=""
+ENV WORKLOAD=""
# volumes
diff --git a/docker/test/server-jepsen/run.sh b/docker/test/server-jepsen/run.sh
index 4a966d50f74b..4e90a74e7055 100644
--- a/docker/test/server-jepsen/run.sh
+++ b/docker/test/server-jepsen/run.sh
@@ -15,8 +15,38 @@ if [ -z "$CLICKHOUSE_REPO_PATH" ]; then
ls -lath ||:
fi
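+
+# Use the locally mounted ClickHouse binary when WITH_LOCAL_BINARY is set;
+# otherwise run against the package referenced by CLICKHOUSE_PACKAGE.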
+clickhouse_source="--clickhouse-source \'$CLICKHOUSE_PACKAGE\'"
+if [ -n "$WITH_LOCAL_BINARY" ]; then
+ clickhouse_source="--clickhouse-source /clickhouse"
+fi
+
+tests_count="--test-count \"$TESTS_TO_RUN\""
+tests_to_run="test-all"
+workload=""
+if [ -n "$WORKLOAD" ]; then
+ tests_to_run="test"
+ workload="--workload $WORKLOAD"
+ tests_count=""
+fi
+
+nemesis=""
+if [ -n "$NEMESIS" ]; then
+ nemesis="--nemesis $NEMESIS"
+fi
+
+rate=""
+if [ -n "$RATE" ]; then
+ rate="--rate $RATE"
+fi
+
+concurrency=""
+if [ -n "$CONCURRENCY" ]; then
+ concurrency="--concurrency $CONCURRENCY"
+fi
+
+
cd "$CLICKHOUSE_REPO_PATH/tests/jepsen.clickhouse"
-(lein run server test-all --keeper "$KEEPER_NODE" --nodes-file "$NODES_FILE_PATH" --username "$NODES_USERNAME" --logging-json --password "$NODES_PASSWORD" --time-limit "$TIME_LIMIT" --concurrency 50 -r 50 --clickhouse-source "$CLICKHOUSE_PACKAGE" --test-count "$TESTS_TO_RUN" || true) | tee "$TEST_OUTPUT/jepsen_run_all_tests.log"
+(lein run server $tests_to_run $workload --keeper "$KEEPER_NODE" $concurrency $nemesis $rate --nodes-file "$NODES_FILE_PATH" --username "$NODES_USERNAME" --logging-json --password "$NODES_PASSWORD" --time-limit "$TIME_LIMIT" --concurrency 50 $clickhouse_source $tests_count --reuse-binary || true) | tee "$TEST_OUTPUT/jepsen_run_all_tests.log"
mv store "$TEST_OUTPUT/"
diff --git a/docker/test/util/process_functional_tests_result.py b/docker/test/util/process_functional_tests_result.py
index 3c1c6e2a795a..470eb61b3fad 100755
--- a/docker/test/util/process_functional_tests_result.py
+++ b/docker/test/util/process_functional_tests_result.py
@@ -80,11 +80,9 @@ def process_test_log(log_path, broken_tests):
test_results.append(
(
test_name,
- "FAIL",
+ "SKIPPED",
test_time,
- [
- "Test is expected to fail! Please, update broken_tests.txt!\n"
- ],
+ ["This test passed. Update broken_tests.txt.\n"],
)
)
else:
diff --git a/docs/changelogs/v23.4.2.11-stable.md b/docs/changelogs/v23.4.2.11-stable.md
new file mode 100644
index 000000000000..3c572b9c1cb2
--- /dev/null
+++ b/docs/changelogs/v23.4.2.11-stable.md
@@ -0,0 +1,20 @@
+---
+sidebar_position: 1
+sidebar_label: 2023
+---
+
+# 2023 Changelog
+
+### ClickHouse release v23.4.2.11-stable (b6442320f9d) FIXME as compared to v23.4.1.1943-stable (3920eb987f7)
+
+#### Bug Fix (user-visible misbehavior in an official stable release)
+
+* Revert "Fix GCS native copy ([#48981](https://github.com/ClickHouse/ClickHouse/issues/48981))" [#49194](https://github.com/ClickHouse/ClickHouse/pull/49194) ([Raúl Marín](https://github.com/Algunenano)).
+* Fix race on Outdated parts loading [#49223](https://github.com/ClickHouse/ClickHouse/pull/49223) ([Alexander Tokmakov](https://github.com/tavplubix)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Implement status comment [#48468](https://github.com/ClickHouse/ClickHouse/pull/48468) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
+* Update curl to 8.0.1 (for CVEs) [#48765](https://github.com/ClickHouse/ClickHouse/pull/48765) ([Boris Kuschel](https://github.com/bkuschel)).
+* Fallback auth gh api [#49314](https://github.com/ClickHouse/ClickHouse/pull/49314) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
+
diff --git a/docs/en/development/build.md b/docs/en/development/build.md
index e3a63da6a3e1..a55d44bdf939 100644
--- a/docs/en/development/build.md
+++ b/docs/en/development/build.md
@@ -22,7 +22,7 @@ The minimum recommended Ubuntu version for development is 22.04 LTS.
### Install Prerequisites {#install-prerequisites}
``` bash
-sudo apt-get install git cmake ccache python3 ninja-build yasm gawk
+sudo apt-get install git cmake ccache python3 ninja-build nasm yasm gawk
```
### Install and Use the Clang compiler
@@ -72,7 +72,7 @@ cmake -S . -B build
cmake --build build # or: `cd build; ninja`
```
-To create an executable, run `cmake --build --target clickhouse` (or: `cd build; ninja clickhouse`).
+To create an executable, run `cmake --build build --target clickhouse` (or: `cd build; ninja clickhouse`).
This will create executable `build/programs/clickhouse` which can be used with `client` or `server` arguments.
## Building on Any Linux {#how-to-build-clickhouse-on-any-linux}
@@ -92,7 +92,7 @@ If all the components are installed, you may build in the same way as the steps
Example for OpenSUSE Tumbleweed:
``` bash
-sudo zypper install git cmake ninja clang-c++ python lld yasm gawk
+sudo zypper install git cmake ninja clang-c++ python lld nasm yasm gawk
git clone --recursive https://github.com/ClickHouse/ClickHouse.git
mkdir build
cmake -S . -B build
@@ -103,7 +103,7 @@ Example for Fedora Rawhide:
``` bash
sudo yum update
-sudo yum --nogpg install git cmake make clang python3 ccache yasm gawk
+sudo yum --nogpg install git cmake make clang python3 ccache nasm yasm gawk
git clone --recursive https://github.com/ClickHouse/ClickHouse.git
mkdir build
cmake -S . -B build
diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md
index d5189d4b9d99..7780dee41360 100644
--- a/docs/en/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md
@@ -439,6 +439,50 @@ Syntax: `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions,
- `number_of_hash_functions` — The number of hash functions used in the Bloom filter.
- `random_seed` — The seed for Bloom filter hash functions.
+Users can create [UDFs](/docs/en/sql-reference/statements/create/function.md) to estimate the parameter set of `ngrambf_v1`. The query statements are as follows:
+
+```sql
+CREATE FUNCTION bfEstimateFunctions [ON CLUSTER cluster]
+AS
+(total_number_of_all_grams, size_of_bloom_filter_in_bits) -> round((size_of_bloom_filter_in_bits / total_number_of_all_grams) * log(2));
+
+CREATE FUNCTION bfEstimateBmSize [ON CLUSTER cluster]
+AS
+(total_number_of_all_grams, probability_of_false_positives) -> ceil((total_number_of_all_grams * log(probability_of_false_positives)) / log(1 / pow(2, log(2))));
+
+CREATE FUNCTION bfEstimateFalsePositive [ON CLUSTER cluster]
+AS
+(total_number_of_all_grams, number_of_hash_functions, size_of_bloom_filter_in_bytes) -> pow(1 - exp(-number_of_hash_functions / (size_of_bloom_filter_in_bytes / total_number_of_all_grams)), number_of_hash_functions);
+
+CREATE FUNCTION bfEstimateGramNumber [ON CLUSTER cluster]
+AS
+(number_of_hash_functions, probability_of_false_positives, size_of_bloom_filter_in_bytes) -> ceil(size_of_bloom_filter_in_bytes / (-number_of_hash_functions / log(1 - exp(log(probability_of_false_positives) / number_of_hash_functions))));
+
+```
+To use those functions, we need to specify at least two parameters.
+For example, if there are 4300 ngrams in the granule and we expect false positives to be less than 0.0001, the other parameters can be estimated by executing the following queries:
+
+
+```sql
+--- estimate number of bits in the filter
+SELECT bfEstimateBmSize(4300, 0.0001) / 8 as size_of_bloom_filter_in_bytes;
+
+┌─size_of_bloom_filter_in_bytes─┐
+│ 10304 │
+└───────────────────────────────┘
+
+--- estimate number of hash functions
+SELECT bfEstimateFunctions(4300, bfEstimateBmSize(4300, 0.0001)) as number_of_hash_functions
+
+┌─number_of_hash_functions─┐
+│ 13 │
+└──────────────────────────┘
+
+```
+You can also use those functions to estimate the parameters under other conditions.
+The formulas follow the Bloom filter calculator [here](https://hur.st/bloomfilter).
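+
+For example, to sanity-check the chosen parameters, the expected false-positive rate can be recomputed from them. Note that, as the formulas above are written, the last argument of `bfEstimateFalsePositive` takes the filter size in bits, as returned by `bfEstimateBmSize`:
+
+```sql
+--- estimate the false-positive rate for 4300 grams with 13 hash functions and the filter size estimated above
+SELECT bfEstimateFalsePositive(4300, 13, bfEstimateBmSize(4300, 0.0001)) AS false_positive;
+```
+
+The result should be close to the target probability of 0.0001.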
+
+
#### Token Bloom Filter
The same as `ngrambf_v1`, but stores tokens instead of ngrams. Tokens are sequences separated by non-alphanumeric characters.
@@ -731,7 +775,13 @@ The names given to the described entities can be found in the system tables, [sy
### Configuration {#table_engine-mergetree-multiple-volumes_configure}
-Disks, volumes and storage policies should be declared inside the `<storage_configuration>` tag either in the main file `config.xml` or in a distinct file in the `config.d` directory.
+Disks, volumes and storage policies should be declared inside the `<storage_configuration>` tag in a file in the `config.d` directory.
+
+:::tip
+Disks can also be declared in the `SETTINGS` section of a query. This is useful
+for ad-hoc analysis to temporarily attach a disk that is, for example, hosted at a URL.
+See [dynamic storage](#dynamic-storage) for more details.
+:::
Configuration structure:
@@ -876,6 +926,87 @@ You could change storage policy after table creation with [ALTER TABLE ... MODIF
The number of threads performing background moves of data parts can be changed by [background_move_pool_size](/docs/en/operations/server-configuration-parameters/settings.md/#background_move_pool_size) setting.
+### Dynamic Storage
+
+This example query shows how to attach a table stored at a URL and configure the
+remote storage within the query. The web storage is not configured in the ClickHouse
+configuration files; all the settings are in the CREATE/ATTACH query.
+
+:::note
+The example uses `type=web`, but any disk type can be configured as dynamic, even a local disk. Local disks require the path argument to be located inside the server config parameter `custom_local_disks_base_directory`, which has no default, so set that too when using a local disk.
+:::
+
+```sql
+ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
+(
+ price UInt32,
+ date Date,
+ postcode1 LowCardinality(String),
+ postcode2 LowCardinality(String),
+ type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
+ is_new UInt8,
+ duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
+ addr1 String,
+ addr2 String,
+ street LowCardinality(String),
+ locality LowCardinality(String),
+ town LowCardinality(String),
+ district LowCardinality(String),
+ county LowCardinality(String)
+)
+ENGINE = MergeTree
+ORDER BY (postcode1, postcode2, addr1, addr2)
+ # highlight-start
+ SETTINGS disk = disk(
+ type=web,
+ endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
+ );
+ # highlight-end
+```
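+
+Once attached, the table can be queried like any other table; for example (the first query may be slow while data is fetched from the web disk):
+
+```sql
+SELECT count(), max(price) FROM uk_price_paid;
+```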
+
+### Nested Dynamic Storage
+
+This example query builds on the above dynamic disk configuration and shows how to
+use a local disk to cache data from a table stored at a URL. Neither the cache disk
+nor the web storage is configured in the ClickHouse configuration files; both are
+configured in the CREATE/ATTACH query settings.
+
+In the settings highlighted below notice that the disk of `type=web` is nested within
+the disk of `type=cache`.
+
+```sql
+ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
+(
+ price UInt32,
+ date Date,
+ postcode1 LowCardinality(String),
+ postcode2 LowCardinality(String),
+ type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
+ is_new UInt8,
+ duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
+ addr1 String,
+ addr2 String,
+ street LowCardinality(String),
+ locality LowCardinality(String),
+ town LowCardinality(String),
+ district LowCardinality(String),
+ county LowCardinality(String)
+)
+ENGINE = MergeTree
+ORDER BY (postcode1, postcode2, addr1, addr2)
+ # highlight-start
+ SETTINGS disk = disk(
+ type=cache,
+ max_size='1Gi',
+ path='/var/lib/clickhouse/custom_disk_cache/',
+ disk=disk(
+ type=web,
+ endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
+ )
+ );
+ # highlight-end
+```
+
### Details {#details}
In the case of `MergeTree` tables, data is getting to disk in different ways:
diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md
index 02145a2fb6c9..113e42499fe7 100644
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@@ -1324,7 +1324,7 @@ The trailing slash is mandatory.
 <path>/var/lib/clickhouse/</path>
```
-## prometheus {#server_configuration_parameters-prometheus}
+## Prometheus {#server_configuration_parameters-prometheus}
Exposing metrics data for scraping from [Prometheus](https://prometheus.io).
@@ -1339,13 +1339,25 @@ Settings:
**Example**
``` xml
-<prometheus>
-    <endpoint>/metrics</endpoint>
-    <port>9363</port>
-    <metrics>true</metrics>
-    <events>true</events>
-    <asynchronous_metrics>true</asynchronous_metrics>
-</prometheus>
+<clickhouse>
+    <listen_host>0.0.0.0</listen_host>
+    <http_port>8123</http_port>
+    <tcp_port>9000</tcp_port>
+
+    <prometheus>
+        <endpoint>/metrics</endpoint>
+        <port>9363</port>
+        <metrics>true</metrics>
+        <events>true</events>
+        <asynchronous_metrics>true</asynchronous_metrics>
+    </prometheus>
+
+</clickhouse>
+```
+
+Check (replace `127.0.0.1` with the IP address or hostname of your ClickHouse server):
+```bash
+curl 127.0.0.1:9363/metrics
```
## query_log {#server_configuration_parameters-query-log}
diff --git a/docs/en/operations/system-tables/clusters.md b/docs/en/operations/system-tables/clusters.md
index 4b1e75c25a1c..deb9a0aaeb37 100644
--- a/docs/en/operations/system-tables/clusters.md
+++ b/docs/en/operations/system-tables/clusters.md
@@ -20,6 +20,9 @@ Columns:
- `errors_count` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of times this host failed to reach replica.
- `slowdowns_count` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of slowdowns that led to changing replica when establishing a connection with hedged requests.
- `estimated_recovery_time` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Seconds remaining until the replica error count is zeroed and it is considered to be back to normal.
+- `database_shard_name` ([String](../../sql-reference/data-types/string.md)) — The name of the `Replicated` database shard (for clusters that belong to a `Replicated` database).
+- `database_replica_name` ([String](../../sql-reference/data-types/string.md)) — The name of the `Replicated` database replica (for clusters that belong to a `Replicated` database).
+- `is_active` ([Nullable(UInt8)](../../sql-reference/data-types/int-uint.md)) — The status of the `Replicated` database replica (for clusters that belong to a `Replicated` database): 1 means "replica is online", 0 means "replica is offline", `NULL` means "unknown".
**Example**
@@ -47,6 +50,9 @@ default_database:
errors_count: 0
slowdowns_count: 0
estimated_recovery_time: 0
+database_shard_name:
+database_replica_name:
+is_active: NULL
Row 2:
──────
@@ -63,6 +69,9 @@ default_database:
errors_count: 0
slowdowns_count: 0
estimated_recovery_time: 0
+database_shard_name:
+database_replica_name:
+is_active: NULL
```
**See Also**
diff --git a/docs/en/operations/system-tables/users.md b/docs/en/operations/system-tables/users.md
index a90fa01a45db..58cdb82d31f8 100644
--- a/docs/en/operations/system-tables/users.md
+++ b/docs/en/operations/system-tables/users.md
@@ -12,7 +12,7 @@ Columns:
- `storage` ([String](../../sql-reference/data-types/string.md)) — Path to the storage of users. Configured in the `access_control_path` parameter.
-- `auth_type` ([Enum8](../../sql-reference/data-types/enum.md)('no_password' = 0,'plaintext_password' = 1, 'sha256_password' = 2, 'double_sha1_password' = 3, 'ldap' = 4, 'kerberos' = 5, 'ssl_certificate' = 6)) — Shows the authentication type. There are multiple ways of user identification: with no password, with plain text password, with [SHA256](https://ru.wikipedia.org/wiki/SHA-2)-encoded password or with [double SHA-1](https://ru.wikipedia.org/wiki/SHA-1)-encoded password.
+- `auth_type` ([Enum8](../../sql-reference/data-types/enum.md)('no_password' = 0, 'plaintext_password' = 1, 'sha256_password' = 2, 'double_sha1_password' = 3, 'ldap' = 4, 'kerberos' = 5, 'ssl_certificate' = 6, 'bcrypt_password' = 7)) — Shows the authentication type. There are multiple ways of user identification: with no password, with plain text password, with [SHA256](https://en.wikipedia.org/wiki/SHA-2)-encoded password, with [double SHA-1](https://en.wikipedia.org/wiki/SHA-1)-encoded password or with [bcrypt](https://en.wikipedia.org/wiki/Bcrypt)-encoded password.
- `auth_params` ([String](../../sql-reference/data-types/string.md)) — Authentication parameters in the JSON format depending on the `auth_type`.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest.md b/docs/en/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest.md
new file mode 100644
index 000000000000..3da9645181ee
--- /dev/null
+++ b/docs/en/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest.md
@@ -0,0 +1,118 @@
+---
+slug: /en/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest
+sidebar_position: 300
+sidebar_label: kolmogorovSmirnovTest
+---
+
+# kolmogorovSmirnovTest
+
+Applies the Kolmogorov-Smirnov test to samples from two populations.
+
+**Syntax**
+
+``` sql
+kolmogorovSmirnovTest([alternative, computation_method])(sample_data, sample_index)
+```
+
+Values of both samples are in the `sample_data` column. If `sample_index` equals 0, the value in that row belongs to the sample from the first population. Otherwise it belongs to the sample from the second population.
+Samples must belong to continuous, one-dimensional probability distributions.
+
+**Arguments**
+
+- `sample_data` — Sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md).
+- `sample_index` — Sample index. [Integer](../../../sql-reference/data-types/int-uint.md).
+
+**Parameters**
+
+- `alternative` — alternative hypothesis. (Optional, default: `'two-sided'`.) [String](../../../sql-reference/data-types/string.md).
+    Let F(x) and G(x) be the CDFs of the first and second distributions respectively.
+    - `'two-sided'`
+        The null hypothesis is that samples come from the same distribution, i.e. F(x) = G(x) for all x,
+        and the alternative is that the distributions are not identical.
+    - `'greater'`
+        The null hypothesis is that values in the first sample are *stochastically smaller* than those in the second one,
+        i.e. the CDF of the first distribution lies above, and hence to the left of, that of the second one.
+        This means that F(x) >= G(x) for all x, and the alternative in this case is that F(x) < G(x) for at least one x.
+    - `'less'`
+        The null hypothesis is that values in the first sample are *stochastically greater* than those in the second one,
+        i.e. the CDF of the first distribution lies below, and hence to the right of, that of the second one.
+        This means that F(x) <= G(x) for all x, and the alternative in this case is that F(x) > G(x) for at least one x.
+- `computation_method` — the method used to compute the p-value. (Optional, default: `'auto'`.) [String](../../../sql-reference/data-types/string.md).
+    - `'exact'` - calculation is performed using the exact probability distribution of the test statistic. Computationally intensive and wasteful except for small samples.
+    - `'asymp'` (`'asymptotic'`) - calculation is performed using an approximation. For large sample sizes, the exact and asymptotic p-values are very similar.
+    - `'auto'` - the `'exact'` method is used when the maximum number of samples is less than 10'000.
+
+
+**Returned values**
+
+[Tuple](../../../sql-reference/data-types/tuple.md) with two elements:
+
+- calculated statistic. [Float64](../../../sql-reference/data-types/float.md).
+- calculated p-value. [Float64](../../../sql-reference/data-types/float.md).
+
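+For reference, the statistic for the `'two-sided'` alternative is the classical Kolmogorov-Smirnov distance between the two empirical CDFs:
+
+``` text
+D = sup_x |F_n(x) - G_m(x)|
+```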
+
+**Example**
+
+Query:
+
+``` sql
+SELECT kolmogorovSmirnovTest('less', 'exact')(value, num)
+FROM
+(
+ SELECT
+ randNormal(0, 10) AS value,
+ 0 AS num
+ FROM numbers(10000)
+ UNION ALL
+ SELECT
+ randNormal(0, 10) AS value,
+ 1 AS num
+ FROM numbers(10000)
+)
+```
+
+Result:
+
+``` text
+┌─kolmogorovSmirnovTest('less', 'exact')(value, num)─┐
+│ (0.009899999999999996,0.37528595205132287) │
+└────────────────────────────────────────────────────┘
+```
+
+Note:
+The p-value is greater than 0.05 (for a confidence level of 95%), so the null hypothesis is not rejected.
+
+
+Query:
+
+``` sql
+SELECT kolmogorovSmirnovTest('two-sided', 'exact')(value, num)
+FROM
+(
+ SELECT
+ randStudentT(10) AS value,
+ 0 AS num
+ FROM numbers(100)
+ UNION ALL
+ SELECT
+ randNormal(0, 10) AS value,
+ 1 AS num
+ FROM numbers(100)
+)
+```
+
+Result:
+
+``` text
+┌─kolmogorovSmirnovTest('two-sided', 'exact')(value, num)─┐
+│ (0.4100000000000002,6.61735760482795e-8) │
+└─────────────────────────────────────────────────────────┘
+```
+
+Note:
+The p-value is less than 0.05 (for a confidence level of 95%), so the null hypothesis is rejected.
+
+
+**See Also**
+
+- [Kolmogorov-Smirnov test](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)
diff --git a/docs/en/sql-reference/data-types/index.md b/docs/en/sql-reference/data-types/index.md
index c61a3069db66..2ad8ac4bb239 100644
--- a/docs/en/sql-reference/data-types/index.md
+++ b/docs/en/sql-reference/data-types/index.md
@@ -27,7 +27,7 @@ ClickHouse data types include:
- **Aggregation function types**: use [`SimpleAggregateFunction`](./simpleaggregatefunction.md) and [`AggregateFunction`](./aggregatefunction.md) for storing the intermediate status of aggregate function results
- **Nested data structures**: A [`Nested` data structure](./nested-data-structures/index.md) is like a table inside a cell
- **Tuples**: A [`Tuple` of elements](./tuple.md), each having an individual type.
-- **Nullable**: [`Nullbale`](./nullable.md) allows you to store a value as `NULL` when a value is "missing" (instead of the column gettings its default value for the data type)
+- **Nullable**: [`Nullable`](./nullable.md) allows you to store a value as `NULL` when a value is "missing" (instead of the column getting its default value for the data type)
- **IP addresses**: use [`IPv4`](./domains/ipv4.md) and [`IPv6`](./domains/ipv6.md) to efficiently store IP addresses
- **Geo types**: for[ geographical data](./geo.md), including `Point`, `Ring`, `Polygon` and `MultiPolygon`
- **Special data types**: including [`Expression`](./special-data-types/expression.md), [`Set`](./special-data-types/set.md), [`Nothing`](./special-data-types/nothing.md) and [`Interval`](./special-data-types/interval.md)
\ No newline at end of file
diff --git a/docs/en/sql-reference/data-types/nullable.md b/docs/en/sql-reference/data-types/nullable.md
index 230b4af7960b..28180f7f9919 100644
--- a/docs/en/sql-reference/data-types/nullable.md
+++ b/docs/en/sql-reference/data-types/nullable.md
@@ -8,7 +8,7 @@ sidebar_label: Nullable
Allows to store special marker ([NULL](../../sql-reference/syntax.md)) that denotes “missing value” alongside normal values allowed by `TypeName`. For example, a `Nullable(Int8)` type column can store `Int8` type values, and the rows that do not have a value will store `NULL`.
-For a `TypeName`, you can’t use composite data types [Array](../../sql-reference/data-types/array.md) and [Tuple](../../sql-reference/data-types/tuple.md). Composite data types can contain `Nullable` type values, such as `Array(Nullable(Int8))`.
+For a `TypeName`, you can’t use composite data types [Array](../../sql-reference/data-types/array.md), [Map](../../sql-reference/data-types/map.md) and [Tuple](../../sql-reference/data-types/tuple.md). Composite data types can contain `Nullable` type values, such as `Array(Nullable(Int8))`.
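+
+For example (a minimal illustration of the rule above; the second statement would be rejected):
+
+``` sql
+CREATE TABLE nullable_ok (arr Array(Nullable(Int8))) ENGINE = Memory;   -- allowed
+CREATE TABLE nullable_bad (arr Nullable(Array(Int8))) ENGINE = Memory;  -- error: Array cannot be inside Nullable
+```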
A `Nullable` type field can’t be included in table indexes.
diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md
index 48a8ce45d332..189673cdae75 100644
--- a/docs/en/sql-reference/dictionaries/index.md
+++ b/docs/en/sql-reference/dictionaries/index.md
@@ -1658,6 +1658,7 @@ Example of settings:
     <db>test</db>
     <collection>dictionary_source</collection>
+    <options>ssl=true</options>
```
@@ -1672,6 +1673,7 @@ SOURCE(MONGODB(
password ''
db 'test'
collection 'dictionary_source'
+ options 'ssl=true'
))
```
@@ -1683,6 +1685,8 @@ Setting fields:
- `password` – Password of the MongoDB user.
- `db` – Name of the database.
- `collection` – Name of the collection.
+- `options` - MongoDB connection string options (optional parameter).
+
### Redis
diff --git a/docs/en/sql-reference/statements/create/user.md b/docs/en/sql-reference/statements/create/user.md
index 3548ef7cc071..d168be63c36c 100644
--- a/docs/en/sql-reference/statements/create/user.md
+++ b/docs/en/sql-reference/statements/create/user.md
@@ -32,9 +32,12 @@ There are multiple ways of user identification:
- `IDENTIFIED WITH sha256_hash BY 'hash'` or `IDENTIFIED WITH sha256_hash BY 'hash' SALT 'salt'`
- `IDENTIFIED WITH double_sha1_password BY 'qwerty'`
- `IDENTIFIED WITH double_sha1_hash BY 'hash'`
+- `IDENTIFIED WITH bcrypt_password BY 'qwerty'`
+- `IDENTIFIED WITH bcrypt_hash BY 'hash'`
- `IDENTIFIED WITH ldap SERVER 'server_name'`
- `IDENTIFIED WITH kerberos` or `IDENTIFIED WITH kerberos REALM 'realm'`
- `IDENTIFIED WITH ssl_certificate CN 'mysite.com:user'`
+- `IDENTIFIED BY 'qwerty'`
## Examples
@@ -54,21 +57,12 @@ There are multiple ways of user identification:
The password is stored in a SQL text file in `/var/lib/clickhouse/access`, so it's not a good idea to use `plaintext_password`. Try `sha256_password` instead, as demonstrated next...
:::
-3. The best option is to use a password that is hashed using SHA-256. ClickHouse will hash the password for you when you specify `IDENTIFIED WITH sha256_password`. For example:
+3. The most common option is to use a password that is hashed using SHA-256. ClickHouse will hash the password for you when you specify `IDENTIFIED WITH sha256_password`. For example:
```sql
CREATE USER name3 IDENTIFIED WITH sha256_password BY 'my_password'
```
- Notice ClickHouse generates and runs the following command for you:
-
- ```response
- CREATE USER name3
- IDENTIFIED WITH sha256_hash
- BY '8B3404953FCAA509540617F082DB13B3E0734F90FF6365C19300CC6A6EA818D6'
- SALT 'D6489D8B5692D82FF944EA6415785A8A8A1AF33825456AFC554487725A74A609'
- ```
-
 The `name3` user can now log in using `my_password`, but the password is stored as the hashed value above. The following SQL file was created in `/var/lib/clickhouse/access` and gets executed at server startup:
```bash
@@ -92,6 +86,34 @@ There are multiple ways of user identification:
CREATE USER name4 IDENTIFIED WITH double_sha1_hash BY 'CCD3A959D6A004B9C3807B728BC2E55B67E10518'
```
+5. The `bcrypt_password` is the most secure option for storing passwords. It uses the [bcrypt](https://en.wikipedia.org/wiki/Bcrypt) algorithm, which is resilient against brute force attacks even if the password hash is compromised.
+
+ ```sql
+ CREATE USER name5 IDENTIFIED WITH bcrypt_password BY 'my_password'
+ ```
+
+ The length of the password is limited to 72 characters with this method. The bcrypt work factor parameter, which defines the amount of computations and time needed to compute the hash and verify the password, can be modified in the server configuration:
+
+ ```xml
+ <bcrypt_workfactor>12</bcrypt_workfactor>
+ ```
+
+ The work factor must be between 4 and 31, with a default value of 12.
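+
+ A pre-computed hash can be supplied instead with `bcrypt_hash` (the value below is a placeholder, not a real hash):
+
+ ```sql
+ CREATE USER name5 IDENTIFIED WITH bcrypt_hash BY '<bcrypt hash>'
+ ```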
+
+6. The type of the password can also be omitted:
+
+ ```sql
+ CREATE USER name6 IDENTIFIED BY 'my_password'
+ ```
+
+ In this case, ClickHouse will use the default password type specified in the server configuration:
+
+ ```xml
+ <default_password_type>sha256_password</default_password_type>
+ ```
+
+ The available password types are: `plaintext_password`, `sha256_password`, `double_sha1_password`.
+
## User Host
User host is a host from which a connection to ClickHouse server could be established. The host can be specified in the `HOST` query section in the following ways:
diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md
index 5a5a771f2393..c5596b7ba5f6 100644
--- a/docs/en/sql-reference/statements/system.md
+++ b/docs/en/sql-reference/statements/system.md
@@ -76,7 +76,7 @@ Resets the mark cache.
## DROP REPLICA
-Dead replicas can be dropped using following syntax:
+Dead replicas of `ReplicatedMergeTree` tables can be dropped using the following syntax:
``` sql
SYSTEM DROP REPLICA 'replica_name' FROM TABLE database.table;
@@ -85,13 +85,25 @@ SYSTEM DROP REPLICA 'replica_name';
SYSTEM DROP REPLICA 'replica_name' FROM ZKPATH '/path/to/table/in/zk';
```
-Queries will remove the replica path in ZooKeeper. It is useful when the replica is dead and its metadata cannot be removed from ZooKeeper by `DROP TABLE` because there is no such table anymore. It will only drop the inactive/stale replica, and it cannot drop local replica, please use `DROP TABLE` for that. `DROP REPLICA` does not drop any tables and does not remove any data or metadata from disk.
+Queries will remove the `ReplicatedMergeTree` replica path in ZooKeeper. This is useful when the replica is dead and its metadata cannot be removed from ZooKeeper by `DROP TABLE` because there is no such table anymore. It will only drop the inactive/stale replica; it cannot drop the local replica (use `DROP TABLE` for that). `DROP REPLICA` does not drop any tables and does not remove any data or metadata from disk.
The first one removes metadata of `'replica_name'` replica of `database.table` table.
The second one does the same for all replicated tables in the database.
The third one does the same for all replicated tables on the local server.
The fourth one is useful to remove metadata of dead replica when all other replicas of a table were dropped. It requires the table path to be specified explicitly. It must be the same path as was passed to the first argument of `ReplicatedMergeTree` engine on table creation.
+## DROP DATABASE REPLICA
+
+Dead replicas of `Replicated` databases can be dropped using the following syntax:
+
+``` sql
+SYSTEM DROP DATABASE REPLICA 'replica_name' [FROM SHARD 'shard_name'] FROM DATABASE database;
+SYSTEM DROP DATABASE REPLICA 'replica_name' [FROM SHARD 'shard_name'];
+SYSTEM DROP DATABASE REPLICA 'replica_name' [FROM SHARD 'shard_name'] FROM ZKPATH '/path/to/table/in/zk';
+```
+
+Similar to `SYSTEM DROP REPLICA`, but removes the `Replicated` database replica path from ZooKeeper when there is no database left to run `DROP DATABASE` on. Please note that it does not remove `ReplicatedMergeTree` replicas (so you may need `SYSTEM DROP REPLICA` as well). Shard and replica names are the names that were specified in the `Replicated` engine arguments when creating the database. These names can also be obtained from the `database_shard_name` and `database_replica_name` columns in `system.clusters`. If the `FROM SHARD` clause is missing, then `replica_name` must be a full replica name in the `shard_name|replica_name` format.
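+
+For example (assuming a `Replicated` database named `my_db`; the shard and replica names below are placeholders):
+
+``` sql
+-- find replicas of the database and their status
+SELECT database_shard_name, database_replica_name, is_active
+FROM system.clusters
+WHERE cluster = 'my_db';
+
+-- drop a dead replica by its full name
+SYSTEM DROP DATABASE REPLICA 'shard1|replica2' FROM DATABASE my_db;
+```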
+
## DROP UNCOMPRESSED CACHE
Reset the uncompressed data cache.
diff --git a/docs/ru/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest.md b/docs/ru/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest.md
new file mode 100644
index 000000000000..2f8c6bb6760c
--- /dev/null
+++ b/docs/ru/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest.md
@@ -0,0 +1,117 @@
+---
+slug: /ru/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest
+sidebar_position: 300
+sidebar_label: kolmogorovSmirnovTest
+---
+
+# kolmogorovSmirnovTest {#kolmogorovSmirnovTest}
+
+Applies the Kolmogorov-Smirnov test to two independent samples.
+
+**Syntax**
+
+``` sql
+kolmogorovSmirnovTest([alternative, computation_method])(sample_data, sample_index)
+```
+
+Sample values are taken from the `sample_data` column. If `sample_index` equals 0, the value in that row belongs to the first sample. Otherwise the value belongs to the second sample.
+Samples must belong to continuous, one-dimensional probability distributions.
+
+**Arguments**
+
+- `sample_data` — sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md).
+- `sample_index` — sample indexes. [Integer](../../../sql-reference/data-types/int-uint.md).
+
+**Parameters**
+
+- `alternative` — alternative hypothesis. (Optional, default: `'two-sided'`.) [String](../../../sql-reference/data-types/string.md).
+    Let F(x) and G(x) be the CDFs of the first and second samples respectively.
+    - `'two-sided'`
+        The null hypothesis is that the samples come from the same distribution, i.e. F(x) = G(x) for all x.
+        The alternative is that the samples belong to different distributions.
+    - `'greater'`
+        The null hypothesis is that elements of the first sample are stochastically almost surely smaller than elements of the second sample,
+        i.e. the CDF of the first sample lies above, and hence to the left of, the CDF of the second sample.
+        This means that F(x) >= G(x) for all x, and the alternative in this case is that F(x) < G(x) for at least one x.
+    - `'less'`
+        The null hypothesis is that elements of the first sample are stochastically almost surely greater than elements of the second sample,
+        i.e. the CDF of the first sample lies below, and hence to the right of, the CDF of the second sample.
+        This means that F(x) <= G(x) for all x, and the alternative in this case is that F(x) > G(x) for at least one x.
+- `computation_method` — the method used to compute the p-value. (Optional, default: `'auto'`.) [String](../../../sql-reference/data-types/string.md).
+    - `'exact'` - the exact probability distribution of the test statistic is computed. Computationally expensive and wasteful for large samples.
+    - `'asymp'` (`'asymptotic'`) - an approximation is used. For large samples, the approximate and exact results are nearly identical.
+    - `'auto'` - the exact value is computed (with the `'exact'` method) when the maximum size of the two samples does not exceed 10'000.
+
+**Returned values**
+
+[Tuple](../../../sql-reference/data-types/tuple.md) with two elements:
+
+- calculated statistic. [Float64](../../../sql-reference/data-types/float.md).
+- calculated p-value. [Float64](../../../sql-reference/data-types/float.md).
+
+
+**Example**
+
+Query:
+
+``` sql
+SELECT kolmogorovSmirnovTest('less', 'exact')(value, num)
+FROM
+(
+    SELECT
+        randNormal(0, 10) AS value,
+        0 AS num
+    FROM numbers(10000)
+    UNION ALL
+    SELECT
+        randNormal(0, 10) AS value,
+        1 AS num
+    FROM numbers(10000)
+)
+```
+
+Result:
+
+``` text
+┌─kolmogorovSmirnovTest('less', 'exact')(value, num)─┐
+│ (0.009899999999999996,0.37528595205132287) │
+└────────────────────────────────────────────────────┘
+```
+
+Note:
+The p-value is greater than 0.05 (for a confidence level of 95%), so the null hypothesis is not rejected.
+
+
+Query:
+
+``` sql
+SELECT kolmogorovSmirnovTest('two-sided', 'exact')(value, num)
+FROM
+(
+    SELECT
+        randStudentT(10) AS value,
+        0 AS num
+    FROM numbers(100)
+    UNION ALL
+    SELECT
+        randNormal(0, 10) AS value,
+        1 AS num
+    FROM numbers(100)
+)
+```
+
+Result:
+
+``` text
+┌─kolmogorovSmirnovTest('two-sided', 'exact')(value, num)─┐
+│ (0.4100000000000002,6.61735760482795e-8) │
+└─────────────────────────────────────────────────────────┘
+```
+
+Note:
+The p-value is less than 0.05 (for a confidence level of 95%), so the null hypothesis is rejected.
+
+
+**See Also**
+
+- [Kolmogorov-Smirnov test](https://ru.wikipedia.org/wiki/%D0%9A%D1%80%D0%B8%D1%82%D0%B5%D1%80%D0%B8%D0%B9_%D1%81%D0%BE%D0%B3%D0%BB%D0%B0%D1%81%D0%B8%D1%8F_%D0%9A%D0%BE%D0%BB%D0%BC%D0%BE%D0%B3%D0%BE%D1%80%D0%BE%D0%B2%D0%B0)
diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp
index 5768e744f94e..8925f50fe973 100644
--- a/programs/local/LocalServer.cpp
+++ b/programs/local/LocalServer.cpp
@@ -26,12 +26,13 @@
#include
#include
#include
+#include
#include
#include
#include
#include
#include
-#include
+#include
#include
#include
#include
@@ -133,6 +134,11 @@ void LocalServer::initialize(Poco::Util::Application & self)
config().getUInt("max_io_thread_pool_size", 100),
config().getUInt("max_io_thread_pool_free_size", 0),
config().getUInt("io_thread_pool_queue_size", 10000));
+
+ OutdatedPartsLoadingThreadPool::initialize(
+ config().getUInt("max_outdated_parts_loading_thread_pool_size", 16),
+ 0, // We don't need any threads once all the parts are loaded
+ config().getUInt("outdated_part_loading_thread_pool_queue_size", 10000));
}
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp
index 8c0d50bae55c..bbd536d93004 100644
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@@ -41,10 +41,9 @@
#include
#include
#include
-#include
#include
#include
-#include
+#include
#include
#include
#include
@@ -778,6 +777,11 @@ try
server_settings.max_backups_io_thread_pool_free_size,
server_settings.backups_io_thread_pool_queue_size);
+ OutdatedPartsLoadingThreadPool::initialize(
+ server_settings.max_outdated_parts_loading_thread_pool_size,
+ 0, // We don't need any threads once all the parts are loaded
+ server_settings.outdated_part_loading_thread_pool_queue_size);
+
/// Initialize global local cache for remote filesystem.
if (config().has("local_cache_for_remote_fs"))
{
@@ -1852,7 +1856,7 @@ try
LOG_INFO(log, "Closed all listening sockets.");
/// Killing remaining queries.
- if (server_settings.shutdown_wait_unfinished_queries)
+ if (!server_settings.shutdown_wait_unfinished_queries)
global_context->getProcessList().killAllQueries();
if (current_connections)
diff --git a/programs/server/config.xml b/programs/server/config.xml
index 1aeda624db2d..51aa04ba0e5d 100644
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@@ -476,6 +476,14 @@
     <allow_no_password>1</allow_no_password>
     <allow_implicit_no_password>1</allow_implicit_no_password>
+
+    <default_password_type>sha256_password</default_password_type>
+
+
+    <bcrypt_workfactor>12</bcrypt_workfactor>
+
\n"
+ f"This is an automated comment for commit {pr_info.sha} with "
+ f"description of existing statuses. It's updated for the latest CI running\n"
+ f"The full report is available [here]({report_url})\n"
+ f"{worst_state}\n\n
"
+ "
Check name
Description
Status
\n"
+ ""
+ )
+    # group checks by the name to get the worst one per each
+    grouped_statuses = {}  # type: Dict[CheckDescription, CommitStatuses]
+    for status in statuses:
+        cd = None
+        for c in CHECK_DESCRIPTIONS:
+            if c.match_func(status.context):
+                cd = c
+                break
+
+        if cd is None or cd == CHECK_DESCRIPTIONS[-1]:
+            # This is the case for either non-found description or a fallback
+            cd = CheckDescription(
+                status.context,
+                CHECK_DESCRIPTIONS[-1].description,
+                CHECK_DESCRIPTIONS[-1].match_func,
+            )
+
+        if cd in grouped_statuses:
+            grouped_statuses[cd].append(status)
+        else:
+            grouped_statuses[cd] = [status]
+
+    table_rows = []  # type: List[str]
+    for desc, gs in grouped_statuses.items():
+        table_rows.append(
+ f"