diff --git a/ci/defs/job_configs.py b/ci/defs/job_configs.py index ad718adeb758..c82f929857e7 100644 --- a/ci/defs/job_configs.py +++ b/ci/defs/job_configs.py @@ -149,7 +149,7 @@ "./ci/jobs/scripts/docker_in_docker.sh", ], ), - run_in_docker=f"altinityinfra/integration-tests-runner+root+--memory={LIMITED_MEM}+--privileged+--dns-search='.'+--security-opt seccomp=unconfined+--cap-add=SYS_PTRACE+{docker_sock_mount}+--volume=clickhouse_integration_tests_volume:/var/lib/docker+--cgroupns=host+--env=CLICKHOUSE_TEST_STAT_URL=$CLICKHOUSE_TEST_STAT_URL+--env=CLICKHOUSE_TEST_STAT_LOGIN=$CLICKHOUSE_TEST_STAT_LOGIN+--env=CLICKHOUSE_TEST_STAT_PASSWORD=$CLICKHOUSE_TEST_STAT_PASSWORD", + run_in_docker=f"altinityinfra/integration-tests-runner+root+--memory={LIMITED_MEM}+--privileged+--dns-search='.'+--security-opt seccomp=unconfined+--cap-add=SYS_PTRACE+{docker_sock_mount}+--volume=clickhouse_integration_tests_volume:/var/lib/docker+--cgroupns=host+--ulimit nofile=262144:262144+--env=CLICKHOUSE_TEST_STAT_URL=$CLICKHOUSE_TEST_STAT_URL+--env=CLICKHOUSE_TEST_STAT_LOGIN=$CLICKHOUSE_TEST_STAT_LOGIN+--env=CLICKHOUSE_TEST_STAT_PASSWORD=$CLICKHOUSE_TEST_STAT_PASSWORD", ) diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index 73ff6a883a06..7f8c80571db2 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: VERSION_REVISION has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. -SET(VERSION_REVISION 54519) +SET(VERSION_REVISION 54520) SET(VERSION_MAJOR 26) SET(VERSION_MINOR 3) -SET(VERSION_PATCH 12) -SET(VERSION_GITHASH fa3aa24e79104d29f8bfde078fc586ec6ac3a565) +SET(VERSION_PATCH 13) +SET(VERSION_GITHASH d23c7536b980c34b39c850b08ef23c509f06aaaa) SET(VERSION_DESCRIBE v26.3.12.20001.altinityantalya) SET(VERSION_STRING 26.3.12.20001.altinityantalya) # end of autochange diff --git a/contrib/delta-kernel-rs b/contrib/delta-kernel-rs index c7dd984421fd..609a93b33560 160000 --- a/contrib/delta-kernel-rs +++ b/contrib/delta-kernel-rs @@ -1 +1 @@ -Subproject commit c7dd984421fdcce76becf16dcfab85c908398c59 +Subproject commit 609a93b33560cd64d2d062e000ecd609fe5d0edd diff --git a/contrib/mongo-c-driver b/contrib/mongo-c-driver index 529d7f0af3b8..a26c8e251fec 160000 --- a/contrib/mongo-c-driver +++ b/contrib/mongo-c-driver @@ -1 +1 @@ -Subproject commit 529d7f0af3b86d7da6ce90562cf9bcd10e8a3a0a +Subproject commit a26c8e251fec147258e89b7e85e9adfa259fc42f diff --git a/contrib/mongo-c-driver-cmake/CMakeLists.txt b/contrib/mongo-c-driver-cmake/CMakeLists.txt index 0b608517607c..fa856376c93f 100644 --- a/contrib/mongo-c-driver-cmake/CMakeLists.txt +++ b/contrib/mongo-c-driver-cmake/CMakeLists.txt @@ -5,13 +5,13 @@ if(NOT USE_MONGODB) endif() set(libbson_VERSION_MAJOR 2) -set(libbson_VERSION_MINOR 2) -set(libbson_VERSION_PATCH 2) -set(libbson_VERSION 2.2.2) +set(libbson_VERSION_MINOR 3) +set(libbson_VERSION_PATCH 0) +set(libbson_VERSION 2.3.0) set(libmongoc_VERSION_MAJOR 2) -set(libmongoc_VERSION_MINOR 2) -set(libmongoc_VERSION_PATCH 2) -set(libmongoc_VERSION 2.2.2) +set(libmongoc_VERSION_MINOR 3) +set(libmongoc_VERSION_PATCH 0) +set(libmongoc_VERSION 2.3.0) set(LIBBSON_SOURCES_ROOT "${ClickHouse_SOURCE_DIR}/contrib/mongo-c-driver/src") set(LIBBSON_SOURCE_DIR "${LIBBSON_SOURCES_ROOT}/libbson/src") diff --git a/contrib/replxx b/contrib/replxx index 0b6f23e14141..3a55a1bbee3c 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit 0b6f23e14141f30b446be13c18a0aeedc8121780 +Subproject commit 3a55a1bbee3c4ccdbad38c0f1da4ce722e337807 diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 933bd4d32aca..23083b9c53c1 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -730,7 +730,7 @@ SYSTEM STOP VIEWS Enable periodic refreshing for the given view or all refreshable views. No immediate refresh is triggered. -If the view is in a Replicated or Shared database, `START VIEW` undoes the effect of `STOP VIEW`, and `START REPLICATED VIEW` undoes the effect of `STOP REPLICATED VIEW`. +If the view is in a Replicated or Shared database, `START VIEW` undoes the effect of `STOP VIEW`, and `START REPLICATED VIEW` undoes the effect of `STOP REPLICATED VIEW`. `START VIEW` also undoes the effect of `PAUSE VIEW`. ```sql SYSTEM START VIEW [db.]name @@ -739,7 +739,26 @@ SYSTEM START VIEW [db.]name SYSTEM START VIEWS ``` -### SYSTEM CANCEL VIEW {#cancel-view} +### SYSTEM PAUSE VIEW, PAUSE VIEWS {#pause-view-pause-views} + +Disable periodic refreshing of the given view or all refreshable views. +Unlike `SYSTEM STOP VIEW`, `SYSTEM PAUSE VIEW` does not interrupt a refresh that is already in progress: the running refresh is allowed to finish, and only subsequent refreshes are prevented. + +Undo with `SYSTEM START VIEW` or `SYSTEM START VIEWS`. + +:::note +The paused state does not persist across server restarts. After a restart, views will resume their configured refresh schedules. +In Replicated or Shared databases, `SYSTEM PAUSE VIEW` only affects the current replica. +::: + +```sql +SYSTEM PAUSE VIEW [db.]name +``` +```sql +SYSTEM PAUSE VIEWS +``` + +### SYSTEM REFRESH VIEW {#refresh-view} If there's a refresh in progress for the given view on the current replica, interrupt and cancel it. Otherwise do nothing. diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index d61b7d5b86df..c78567a5fef1 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -358,7 +358,7 @@ enum class AccessType : uint8_t M(SYSTEM_SWARM, "SYSTEM STOP SWARM MODE, SYSTEM START SWARM MODE, STOP SWARM MODE, START SWARM MODE", GLOBAL, SYSTEM) \ M(SYSTEM_PULLING_REPLICATION_LOG, "SYSTEM STOP PULLING REPLICATION LOG, SYSTEM START PULLING REPLICATION LOG", TABLE, SYSTEM) \ M(SYSTEM_CLEANUP, "SYSTEM STOP CLEANUP, SYSTEM START CLEANUP", TABLE, SYSTEM) \ - M(SYSTEM_VIEWS, "SYSTEM REFRESH VIEW, SYSTEM START VIEWS, SYSTEM STOP VIEWS, SYSTEM START VIEW, SYSTEM STOP VIEW, SYSTEM CANCEL VIEW, REFRESH VIEW, START VIEWS, STOP VIEWS, START VIEW, STOP VIEW, CANCEL VIEW", VIEW, SYSTEM) \ + M(SYSTEM_VIEWS, "SYSTEM REFRESH VIEW, SYSTEM START VIEWS, SYSTEM STOP VIEWS, SYSTEM START VIEW, SYSTEM STOP VIEW, SYSTEM PAUSE VIEWS, SYSTEM PAUSE VIEW, SYSTEM CANCEL VIEW, REFRESH VIEW, START VIEWS, STOP VIEWS, START VIEW, STOP VIEW, PAUSE VIEWS, PAUSE VIEW, CANCEL VIEW", VIEW, SYSTEM) \ M(SYSTEM_DISTRIBUTED_SENDS, "SYSTEM STOP DISTRIBUTED SENDS, SYSTEM START DISTRIBUTED SENDS, STOP DISTRIBUTED SENDS, START DISTRIBUTED SENDS", TABLE, SYSTEM_SENDS) \ M(SYSTEM_REPLICATED_SENDS, "SYSTEM STOP REPLICATED SENDS, SYSTEM START REPLICATED SENDS, STOP REPLICATED SENDS, START REPLICATED SENDS", TABLE, SYSTEM_SENDS) \ M(SYSTEM_SENDS, "SYSTEM STOP SENDS, SYSTEM START SENDS, STOP SENDS, START SENDS", GROUP, SYSTEM) \ diff --git a/src/AggregateFunctions/AggregateFunctionSingleValueOrNull.cpp b/src/AggregateFunctions/AggregateFunctionSingleValueOrNull.cpp index 5be288f64bb1..d8230528eb55 100644 --- a/src/AggregateFunctions/AggregateFunctionSingleValueOrNull.cpp +++ b/src/AggregateFunctions/AggregateFunctionSingleValueOrNull.cpp @@ -92,7 +92,7 @@ struct AggregateFunctionSingleValueOrNullData { ColumnNullable & col = typeid_cast(to); col.getNullMapColumn().insertDefault(); - data().insertResultInto(col.getNestedColumn(), result_type); + data().insertResultInto(col.getNestedColumn(), removeNullable(result_type)); } } }; @@ -169,7 +169,7 @@ class AggregateFunctionSingleValueOrNull final void deserialize(AggregateDataPtr place, ReadBuffer & buf, std::optional /* version */, Arena * arena) const override { - data(place).read(buf, *serialization, result_type, arena); + data(place).read(buf, *serialization, value_type, arena); } bool allocatesMemoryInArena() const override { return singleValueTypeAllocatesMemoryInArena(value_type->getTypeId()); } diff --git a/src/AggregateFunctions/Combinators/AggregateFunctionOrFill.h b/src/AggregateFunctions/Combinators/AggregateFunctionOrFill.h index 485998bacc97..9efb6242e97d 100644 --- a/src/AggregateFunctions/Combinators/AggregateFunctionOrFill.h +++ b/src/AggregateFunctions/Combinators/AggregateFunctionOrFill.h @@ -310,6 +310,42 @@ class AggregateFunctionOrFill final : public IAggregateFunctionHelperinsertResultInto(place, to, arena); } } + else if (nested_function->isState()) + { + /// Mirror the flag-set branch for State-nested combinators: routing + /// flag-unset rows through `to.insertDefault()` would call + /// `ColumnAggregateFunction::ensureOwnership()` on the inner column and + /// reset its `src`, leaving subsequent flag-set rows pushing externally- + /// owned state pointers without `src` protection (double-destroy under + /// `MemorySanitizer`, issue #105462). The state at `place` is already + /// default-initialized by `create()` above, so it is safe to forward. + if constexpr (UseNull) + { + if (!result_is_nullable || inner_nullable) + { + if constexpr (merge) + nested_function->insertMergeResultInto(place, to, arena); + else + nested_function->insertResultInto(place, to, arena); + } + else + { + ColumnNullable & col = typeid_cast(to); + col.getNullMapColumn().getData().push_back(static_cast(1)); + if constexpr (merge) + nested_function->insertMergeResultInto(place, col.getNestedColumn(), arena); + else + nested_function->insertResultInto(place, col.getNestedColumn(), arena); + } + } + else + { + if constexpr (merge) + nested_function->insertMergeResultInto(place, to, arena); + else + nested_function->insertResultInto(place, to, arena); + } + } else to.insertDefault(); } diff --git a/src/AggregateFunctions/UniquesHashSet.h b/src/AggregateFunctions/UniquesHashSet.h index bbbd5af70367..142ee94cd970 100644 --- a/src/AggregateFunctions/UniquesHashSet.h +++ b/src/AggregateFunctions/UniquesHashSet.h @@ -2,6 +2,7 @@ #include +#include #include #include @@ -465,6 +466,9 @@ class UniquesHashSet : private HashTableAllocatorWithStackMemory<(1ULL << UNIQUE if (m_size > UNIQUES_HASH_MAX_SIZE) throw Poco::Exception("Cannot write UniquesHashSet: too large size_degree."); + /// A null `buf` here would indicate upstream state corruption (e.g. a double-destroyed state). + chassert(buf); + DB::writeBinaryLittleEndian(skip_degree, wb); DB::writeVarUInt(m_size, wb); diff --git a/src/Analyzer/FunctionNode.cpp b/src/Analyzer/FunctionNode.cpp index 52ef493021fd..b80993faf405 100644 --- a/src/Analyzer/FunctionNode.cpp +++ b/src/Analyzer/FunctionNode.cpp @@ -286,7 +286,8 @@ ASTPtr FunctionNode::toASTImpl(const ConvertToASTOptions & options) const /// tuple, and adding a type may significantly increase query size. /// It should be safe because set type for `column IN tuple` is deduced from `column` type. if (isNameOfInFunction(function_name) && argument_nodes.size() > 1 && argument_nodes[1]->getNodeType() == QueryTreeNodeType::CONSTANT - && !static_cast(argument_nodes[1].get())->hasSourceExpression()) + && !static_cast(argument_nodes[1].get())->hasSourceExpression() + && !isArray(argument_nodes[1]->getResultType())) { auto expression_list_ast = make_intrusive(); diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp index f9e0c368325d..4f6b74022732 100644 --- a/src/Backups/BackupEntriesCollector.cpp +++ b/src/Backups/BackupEntriesCollector.cpp @@ -408,7 +408,7 @@ void BackupEntriesCollector::gatherDatabasesMetadata() case ASTBackupQuery::ElementType::ALL: { - for (const auto & [database_name, database] : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = true})) + for (const auto & [database_name, database] : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = true})) { if (!element.except_databases.contains(database_name)) { diff --git a/src/Client/BuzzHouse/Generator/SessionSettings.cpp b/src/Client/BuzzHouse/Generator/SessionSettings.cpp index dc576d371904..107566db278e 100644 --- a/src/Client/BuzzHouse/Generator/SessionSettings.cpp +++ b/src/Client/BuzzHouse/Generator/SessionSettings.cpp @@ -1206,7 +1206,7 @@ static std::unordered_map serverSettings2 = { {"shared_merge_tree_sync_parts_on_partition_operations", trueOrFalseSettingNoOracle}, {"short_circuit_function_evaluation_for_nulls", trueOrFalseSetting}, {"short_circuit_function_evaluation_for_nulls_threshold", probRangeSetting}, - {"show_data_lake_catalogs_in_system_tables", trueOrFalseSettingNoOracle}, + {"show_remote_databases_in_system_tables", trueOrFalseSettingNoOracle}, {"show_create_query_identifier_quoting_rule", CHSetting( [](RandomGenerator & rg, FuzzConfig &) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index b2cb3861694d..dfa4234f2042 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1378,6 +1378,15 @@ void ClientBase::processOrdinaryQuery(String query, ASTPtr parsed_query) /// Also checks if query execution should be cancelled. void ClientBase::receiveResult(ASTPtr parsed_query, Int32 signals_before_stop, bool partial_result_on_first_cancel) { + /// The connection may already be torn down — e.g. `sendQuery` failed to + /// write the query to a server that died mid-transfer and called + /// `disconnect()`, after which `processOrdinaryQuery` calls us in a + /// best-effort attempt to drain remaining data. With the receive side + /// gone there is nothing to poll for; bail out so we don't call + /// `connection->poll()` (and friends) on a disconnected connection. + if (!connection->isConnected()) + return; + // TODO: get the poll_interval from commandline. const auto receive_timeout = connection_parameters.timeouts.receive_timeout; constexpr size_t default_poll_interval = 1000000; /// in microseconds diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index aa92e71dbf17..cdd00017861a 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -1335,12 +1335,15 @@ std::optional Connection::getResolvedAddress() const bool Connection::poll(size_t timeout_microseconds) { + ensureConnected(); return in->poll(timeout_microseconds); } bool Connection::hasReadPendingData() const { + if (!in) + return false; return last_input_packet_type.has_value() || in->hasBufferedData(); } @@ -1369,20 +1372,26 @@ UInt64 Connection::receivePacketType() if (last_input_packet_type) return *last_input_packet_type; - UInt64 type; + ensureConnected(); + + UInt64 type = 0; readVarUInt(type, *in); return last_input_packet_type.emplace(type); } +void Connection::ensureConnected() const +{ + /// We are trying to send something to already disconnected connection, + /// this means that we continue using Connection after exception. + if (!in) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Connection to {} is terminated", getDescription()); +} Packet Connection::receivePacket() { try { - /// We are trying to send something to already disconnected connection, - /// this means that we continue using Connection after exception. - if (!in) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Connection to {} is terminated", getDescription()); + ensureConnected(); Packet res; diff --git a/src/Client/Connection.h b/src/Client/Connection.h index 4627176459d2..4116374dbfb1 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -344,6 +344,8 @@ class Connection : public IServerConnection void initBlockLogsInput(); void initBlockProfileEventsInput(); + void ensureConnected() const; + [[noreturn]] void throwUnexpectedPacket(UInt64 packet_type, const char * expected, TimeoutSetter * timeout_setter = nullptr); }; diff --git a/src/Client/LineReader.cpp b/src/Client/LineReader.cpp index f3fad1ae81b5..0d711ff32a52 100644 --- a/src/Client/LineReader.cpp +++ b/src/Client/LineReader.cpp @@ -8,8 +8,6 @@ #include #include #include -#include -#include #pragma clang diagnostic ignored "-Wreserved-identifier" @@ -59,11 +57,8 @@ namespace DB /// Allows delaying the start of query execution until the entirety of query is inserted. bool LineReader::hasInputData() const { - timeval timeout = {0, 0}; - fd_set fds{}; - FD_ZERO(&fds); - FD_SET(in_fd, &fds); - return select(1, &fds, nullptr, nullptr, &timeout) == 1; + pollfd pfd{.fd = in_fd, .events = POLLIN, .revents = 0}; + return poll(&pfd, 1, 0) == 1 && (pfd.revents & POLLIN); } replxx::Replxx::completions_t LineReader::Suggest::getCompletions(const String & prefix, size_t prefix_length, const char * word_break_characters) diff --git a/src/Common/FailPoint.cpp b/src/Common/FailPoint.cpp index 80c79b12d197..d8ee99f58cb3 100644 --- a/src/Common/FailPoint.cpp +++ b/src/Common/FailPoint.cpp @@ -62,6 +62,7 @@ static struct InitFiu ONCE(rmt_lightweight_update_sleep_after_block_allocation) \ ONCE(rmt_merge_task_sleep_in_prepare) \ ONCE(s3_read_buffer_throw_expired_token) \ + ONCE(s3_send_request_throw_expired_token) \ ONCE(distributed_cache_fail_request_in_the_middle_of_request) \ ONCE(object_storage_queue_fail_commit_once) \ ONCE(distributed_cache_fail_continue_request) \ @@ -168,7 +169,9 @@ static struct InitFiu PAUSEABLE_ONCE(drop_database_before_exclusive_ddl_lock) \ REGULAR(storage_merge_tree_background_schedule_merge_fail) \ REGULAR(patch_parts_reverse_column_order) \ - REGULAR(wide_part_writer_fail_in_add_streams) + REGULAR(wide_part_writer_fail_in_add_streams) \ + REGULAR(compact_part_writer_fail_in_add_streams) \ + REGULAR(query_metric_log_delay_collect) namespace FailPoints { diff --git a/src/Common/QueryFuzzer.cpp b/src/Common/QueryFuzzer.cpp index 9abe575dc23e..fffe0b773cf0 100644 --- a/src/Common/QueryFuzzer.cpp +++ b/src/Common/QueryFuzzer.cpp @@ -3649,6 +3649,7 @@ void QueryFuzzer::fuzz(ASTPtr & ast) Type::START_REPLICATED_VIEW, Type::STOP_VIEW, Type::STOP_REPLICATED_VIEW, + Type::PAUSE_VIEW, Type::CANCEL_VIEW, Type::WAIT_VIEW, }; diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index fc231ca292e2..ba022db0f83a 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -6993,9 +6993,9 @@ Query Iceberg table using the snapshot that was current at a specific timestamp. DECLARE(Int64, iceberg_snapshot_id, 0, R"( Query Iceberg table using the specific snapshot id. )", 0) \ - DECLARE(Bool, show_data_lake_catalogs_in_system_tables, false, R"( -Enables showing data lake catalogs in system tables. -)", 0) \ + DECLARE_WITH_ALIAS(Bool, show_remote_databases_in_system_tables, false, R"( +Enables showing remote databases (data lake catalogs, MySQL, PostgreSQL) in system tables. +)", 0, show_data_lake_catalogs_in_system_tables) \ DECLARE(Bool, delta_lake_enable_expression_visitor_logging, false, R"( Enables Test level logs of DeltaLake expression visitor. These logs can be too verbose even for test logging. )", 0) \ diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 428f34345f22..50cc8c572561 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -81,6 +81,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() {"max_skip_unavailable_shards_num", 0, 0, "New setting to limit the number of shards that can be silently skipped when skip_unavailable_shards is enabled."}, {"max_skip_unavailable_shards_ratio", 0, 0, "New setting to limit the ratio of shards that can be silently skipped when skip_unavailable_shards is enabled."}, {"allow_experimental_database_s3_tables", false, false, "New setting to enable experimental database S3 tables (AWS Iceberg REST catalog)."}, + {"show_remote_databases_in_system_tables", false, false, "Renamed from `show_data_lake_catalogs_in_system_tables` and broadened to also hide `MySQL` and `PostgreSQL` databases from `system.tables`, `system.columns` and `system.completions` by default, since enumerating their tables requires expensive remote calls. Users who relied on the previous behavior must set this setting to `true`. The old name is kept as an alias."}, }); addSettingsChanges(settings_changes_history, "26.2", { @@ -104,7 +105,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() {"use_page_cache_for_local_disks", false, false, "New setting to use userspace page cache for local disks"}, {"use_page_cache_for_object_storage", false, false, "New setting to use userspace page cache for object storage table functions"}, {"use_statistics_cache", false, true, "Enable statistics cache"}, - {"apply_row_policy_after_final", false, true, "Enabling apply_row_policy_after_final by default, as if was in 25.8 before #87303"}, + {"apply_row_policy_after_final", true, true, "Enabling apply_row_policy_after_final by default, as if was in 25.8 before #87303"}, {"ignore_format_null_for_explain", false, true, "FORMAT Null is now ignored for EXPLAIN queries by default"}, {"input_format_connection_handling", false, false, "New setting to allow parsing and processing remaining data in the buffer if the connection closes unexpectedly"}, {"input_format_max_block_wait_ms", 0, 0, "New setting to limit maximum wait time in milliseconds before a block is emitted by input format"}, @@ -193,7 +194,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() {"aggregate_function_input_format", "state", "state", "New setting to control AggregateFunction input format during INSERT operations. Setting Value set to state by default"}, {"delta_lake_snapshot_start_version", -1, -1, "New setting."}, {"delta_lake_snapshot_end_version", -1, -1, "New setting."}, - {"apply_row_policy_after_final", false, false, "New setting to control if row policies and PREWHERE are applied after FINAL processing for *MergeTree tables"}, + {"apply_row_policy_after_final", true, true, "New setting to control if row policies and PREWHERE are applied after FINAL processing for *MergeTree tables"}, {"apply_prewhere_after_final", false, false, "New setting. When enabled, PREWHERE conditions are applied after FINAL processing."}, {"compatibility_s3_presigned_url_query_in_path", false, false, "New setting."}, {"serialize_string_in_memory_with_zero_byte", true, true, "New setting"}, diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index c14e8f494e0d..7a511293a948 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -670,7 +670,12 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( } if (col.getVariantByLocalDiscriminator(i).size() < variant_limits[i]) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of variant {} is expected to be not less than {} according to discriminators, but it is {}", variant_names[i], variant_limits[i], col.getVariantByLocalDiscriminator(i).size()); + throw Exception( + settings.native_format ? ErrorCodes::INCORRECT_DATA : ErrorCodes::LOGICAL_ERROR, + "Size of variant {} is expected to be not less than {} according to discriminators, but it is {}", + variant_names[i], + variant_limits[i], + col.getVariantByLocalDiscriminator(i).size()); variant_offsets.push_back(col.getVariantByLocalDiscriminator(i).size() - variant_limits[i]); } diff --git a/src/Databases/DataLake/DatabaseDataLake.h b/src/Databases/DataLake/DatabaseDataLake.h index 5154b8225d18..b6e918af4ba0 100644 --- a/src/Databases/DataLake/DatabaseDataLake.h +++ b/src/Databases/DataLake/DatabaseDataLake.h @@ -28,7 +28,7 @@ class DatabaseDataLake final : public IDatabase, WithContext UUID getUUID() const override { return db_uuid; } bool shouldBeEmptyOnDetach() const override { return false; } - bool isDatalakeCatalog() const override { return true; } + bool isRemoteDatabase() const override { return true; } bool empty() const override; diff --git a/src/Databases/DataLake/GlueCatalog.cpp b/src/Databases/DataLake/GlueCatalog.cpp index 047184a1ede3..0d5e907d91a8 100644 --- a/src/Databases/DataLake/GlueCatalog.cpp +++ b/src/Databases/DataLake/GlueCatalog.cpp @@ -437,6 +437,27 @@ void GlueCatalog::setCredentials(TableMetadata & metadata) const } } +ICatalog::CredentialsRefreshCallback GlueCatalog::getCredentialsConfigurationCallback(const DB::StorageID & storage_id) +{ + /// The AWS SDK credentials provider chain (instance profile, STS assume-role, + /// web-identity, etc.) refreshes its cached credentials internally before + /// expiry. The bug we are fixing is that `setCredentials` captures the result + /// of `GetAWSCredentials` once at table-load time and embeds the access key, + /// secret, and session token as static literals in the storage args, so the + /// S3 client is pinned to a snapshot that goes stale on long reads. This + /// callback re-asks the same provider for current credentials each time + /// `ReadBufferFromS3` reports an `ExpiredToken`, letting the read recover. + return [this, storage_id]() -> std::shared_ptr + { + LOG_DEBUG(log, "Refreshing AWS credentials for {} after expired token", storage_id.getNameForLogs()); + auto credentials = credentials_provider->GetAWSCredentials(); + return std::make_shared( + credentials.GetAWSAccessKeyId(), + credentials.GetAWSSecretKey(), + credentials.GetSessionToken()); + }; +} + bool GlueCatalog::empty() const { auto all_databases = getDatabases(""); diff --git a/src/Databases/DataLake/GlueCatalog.h b/src/Databases/DataLake/GlueCatalog.h index 6208bb284195..21674c25edc7 100644 --- a/src/Databases/DataLake/GlueCatalog.h +++ b/src/Databases/DataLake/GlueCatalog.h @@ -71,6 +71,20 @@ class GlueCatalog final : public ICatalog, private DB::WithContext bool updateMetadata(const String & namespace_name, const String & table_name, const String & new_metadata_path, Poco::JSON::Object::Ptr new_snapshot) const override; void dropTable(const String & namespace_name, const String & table_name) const override; + /// Returns a callback that re-vends fresh AWS credentials from the configured + /// credentials provider chain. Invoked by `ReadBufferFromS3` when an S3 call + /// fails with `ExpiredToken`, so that a long-running read can recover without + /// the user having to restart the query. + ICatalog::CredentialsRefreshCallback getCredentialsConfigurationCallback(const DB::StorageID & storage_id) override; + + /// Resolves the precise Iceberg timestamp type for `column_name` by searching the current schema + /// in the Iceberg `metadata_object`. Falls back to `"timestamp_ns"` when `glue_column_type` is + /// `"timestamp_nano"`, or `"timestamp"` otherwise, when the column is not found in the metadata. + static String resolveTimestampTypeFromMetadata( + const Poco::JSON::Object::Ptr & metadata_object, + const String & column_name, + const String & glue_column_type); + private: void createNamespaceIfNotExists(const String & namespace_name) const; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 513f6190d3ba..9112adf427d9 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -543,7 +543,7 @@ static bool getRMVCoordinationInfo( String data; if (!zookeeper->tryGet(*coordination_path, data, &stats)) return false; - coordination_znode.parse(data); + coordination_znode.parse(data, /*running_znode_exists=*/ false, log); return true; } catch (...) diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index f148f3bcfc92..9e55415223b9 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -180,14 +180,20 @@ class IDatabase : public std::enable_shared_from_this /// Get name of database engine. virtual String getEngineName() const = 0; - /// External database (i.e. PostgreSQL/Datalake/...) does not support any of ClickHouse internal tables: + /// Database engines that do not own ClickHouse table metadata cannot contain arbitrary ClickHouse table engines: /// - *MergeTree /// - Distributed /// - RocksDB /// - ... virtual bool isExternal() const { return true; } - virtual bool isDatalakeCatalog() const { return false; } + /// True for databases whose contents live on a remote service that we don't + /// want to enumerate implicitly in system tables (data lake catalogs, MySQL, PostgreSQL, ...). + /// Such databases are hidden from system.tables / system.columns / system.completions + /// unless `show_remote_databases_in_system_tables` is enabled. + /// This is distinct from `isExternal()` (which classifies whether the engine supports + /// ClickHouse internal table types). + virtual bool isRemoteDatabase() const { return false; } /// Load a set of existing tables. /// You can call only once, right after the object is created. diff --git a/src/Databases/MySQL/DatabaseMySQL.h b/src/Databases/MySQL/DatabaseMySQL.h index bf4ebe447bbe..5ac2e10474d4 100644 --- a/src/Databases/MySQL/DatabaseMySQL.h +++ b/src/Databases/MySQL/DatabaseMySQL.h @@ -51,6 +51,8 @@ class DatabaseMySQL final : public DatabaseWithAltersOnDiskBase, WithContext String getEngineName() const override { return "MySQL"; } UUID getUUID() const override { return db_uuid; } + bool isRemoteDatabase() const override { return true; } + bool shouldBeEmptyOnDetach() const override { return false; } bool empty() const override; diff --git a/src/Databases/PostgreSQL/DatabasePostgreSQL.h b/src/Databases/PostgreSQL/DatabasePostgreSQL.h index 184aa284bbb9..59d49e37940a 100644 --- a/src/Databases/PostgreSQL/DatabasePostgreSQL.h +++ b/src/Databases/PostgreSQL/DatabasePostgreSQL.h @@ -38,6 +38,8 @@ class DatabasePostgreSQL final : public DatabaseWithAltersOnDiskBase, WithContex String getEngineName() const override { return "PostgreSQL"; } UUID getUUID() const override { return db_uuid; } + bool isRemoteDatabase() const override { return true; } + String getMetadataPath() const override { return metadata_path; } bool shouldBeEmptyOnDetach() const override { return false; } diff --git a/src/Dictionaries/MongoDBDictionarySource.cpp b/src/Dictionaries/MongoDBDictionarySource.cpp index d51be0ea9b32..437dc9639d5b 100644 --- a/src/Dictionaries/MongoDBDictionarySource.cpp +++ b/src/Dictionaries/MongoDBDictionarySource.cpp @@ -100,6 +100,7 @@ void registerDictionarySourceMongoDB(DictionarySourceFactory & factory) } configuration->checkHosts(context); + configuration->checkCollection(); return std::make_unique(dict_struct, std::move(configuration), std::make_shared(sample_block)); }; diff --git a/src/Disks/DiskObjectStorage/ObjectStorages/S3/diskSettings.cpp b/src/Disks/DiskObjectStorage/ObjectStorages/S3/diskSettings.cpp index e770e496a1f9..75bb749683d7 100644 --- a/src/Disks/DiskObjectStorage/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/DiskObjectStorage/ObjectStorages/S3/diskSettings.cpp @@ -221,7 +221,6 @@ getClient(const S3::URI & url, const S3Settings & settings, ContextPtr context, access_key_id = s3_updated_credentials->getAccessKeyId(); secret_access_key = s3_updated_credentials->getSecretAccessKey(); session_token = s3_updated_credentials->getSessionToken(); - LOG_DEBUG(getLogger("getClient"), "Got new access tokens {} {} {}", access_key_id, secret_access_key, session_token); } } diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index edea32189296..2b8a017207aa 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -223,7 +223,19 @@ FunctionCast::WrapperType FunctionCast::createWrapper(const DataTypePtr & from_t { /// In case when converting to Nullable type, we apply different parsing rule, /// that will not throw an exception but return NULL in case of malformed input. - FunctionPtr function = FunctionConvertFromString::createFromSettings(settings); + FunctionPtr function; + switch (settings.cast_string_to_date_time_mode) + { + case FormatSettings::DateTimeInputFormat::Basic: + function = FunctionConvertFromString::createFromSettings(settings); + break; + case FormatSettings::DateTimeInputFormat::BestEffort: + function = FunctionConvertFromString::createFromSettings(settings); + break; + case FormatSettings::DateTimeInputFormat::BestEffortUS: + function = FunctionConvertFromString::createFromSettings(settings); + break; + } return createFunctionAdaptor(function, from_type); } else if (!can_apply_accurate_cast) diff --git a/src/Functions/FunctionsExternalDictionaries.h b/src/Functions/FunctionsExternalDictionaries.h index 8b767263c8f5..540987259a61 100644 --- a/src/Functions/FunctionsExternalDictionaries.h +++ b/src/Functions/FunctionsExternalDictionaries.h @@ -618,6 +618,47 @@ class FunctionDictGetNoType final : public IFunction private: + /// When maskedExecute evaluates a short-circuit argument only on the rows where the + /// dictionary key was not found (mask == 1), it expands the result back to full size by + /// filling mask == 0 rows with NULLs via ColumnNullable::expand. Those rows are about to + /// be overwritten by the dictionary result, but castColumnAccurate rejects the column if + /// it contains any NULLs while casting to a non-Nullable type. + /// This helper clears the spurious null-map bits at mask == 0 positions so the cast only + /// fails when the user-provided default genuinely evaluates to NULL on a row that needs it. + static void clearMaskedNullsBeforeCast( + IColumn & column, + const IColumn::Filter & mask, + const DataTypePtr & result_type) + { + if (result_type->isNullable()) + return; + + if (auto * nullable = typeid_cast(&column)) + { + auto & null_map = nullable->getNullMapData(); + chassert(null_map.size() == mask.size()); + for (size_t i = 0; i < null_map.size(); ++i) + { + if (!mask[i]) + null_map[i] = 0; + } + /// Recurse in case nested is e.g. Tuple(Nullable(...)). + clearMaskedNullsBeforeCast(nullable->getNestedColumn(), mask, result_type); + } + else if (auto * tuple_col = typeid_cast(&column)) + { + if (const auto * tuple_type = typeid_cast(result_type.get())) + { + const size_t n = std::min(tuple_col->tupleSize(), tuple_type->getElements().size()); + for (size_t col_idx = 0; col_idx < n; ++col_idx) + { + clearMaskedNullsBeforeCast( + tuple_col->getColumn(col_idx), mask, tuple_type->getElements()[col_idx]); + } + } + } + } + std::pair getDefaultsShortCircuit( IColumn::Filter && default_mask, const DataTypePtr & result_type, @@ -626,8 +667,11 @@ class FunctionDictGetNoType final : public IFunction ColumnWithTypeAndName column_before_cast = last_argument; maskedExecute(column_before_cast, default_mask); + auto mutable_col = IColumn::mutate(column_before_cast.column->convertToFullColumnIfConst()); + clearMaskedNullsBeforeCast(*mutable_col, default_mask, result_type); + ColumnWithTypeAndName column_to_cast = { - column_before_cast.column->convertToFullColumnIfConst(), + std::move(mutable_col), column_before_cast.type, column_before_cast.name}; diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index d3b907a177b6..f5ad61b63d44 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -48,6 +48,7 @@ namespace S3RequestSetting namespace FailPoints { extern const char s3_read_buffer_throw_expired_token[]; + extern const char s3_send_request_throw_expired_token[]; } namespace ErrorCodes @@ -539,6 +540,19 @@ Aws::S3::Model::GetObjectResult ReadBufferFromS3::sendRequest(size_t attempt, si if (client_ptr->isClientForDisk()) ProfileEvents::increment(ProfileEvents::DiskS3GetObject); + /// Simulate a real `ExpiredToken` error returned from S3, used by integration tests for + /// the credentials refresh callback path in `processException`. Unlike + /// `s3_read_buffer_throw_expired_token` (which is gated by `if (impl)` in `nextImpl` and + /// therefore only fires on multi-fill streaming reads), this failpoint fires inside + /// `sendRequest` itself, so it covers both `nextImpl`-driven reads and the + /// `readBigAt` range-read path used by Parquet column-chunk reads. + fiu_do_on(FailPoints::s3_send_request_throw_expired_token, + { + throw S3Exception( + Aws::S3::S3Errors::UNKNOWN, + "Unable to parse ExceptionName: ExpiredToken Message: The provided token has expired."); + }); + ProfileEventTimeIncrement watch(ProfileEvents::ReadBufferFromS3InitMicroseconds); // We do not know in advance how many bytes we are going to consume, to avoid blocking estimated it from below diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 8fa0d90cda93..bb98124f5dba 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -277,6 +277,12 @@ void readStringUntilEquals(String & s, ReadBuffer & buf) readStringUntilCharsInto<'='>(s, buf); } +void readStringUntilColon(String & s, ReadBuffer & buf) +{ + s.clear(); + readStringUntilCharsInto<':'>(s, buf); +} + template void readNullTerminated>(PODArray & s, ReadBuffer & buf); template void readNullTerminated(String & s, ReadBuffer & buf); diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 7fdf9baf2a18..7f3ea930ff63 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -138,7 +138,11 @@ inline void readFloatBinary(T & x, ReadBuffer & buf) readPODBinary(x, buf); } -inline void readStringBinary(std::string & s, ReadBuffer & buf, size_t max_string_size = DEFAULT_MAX_STRING_SIZE) +/// Templated on the allocator so this works with both `std::string` and +/// `StringWithMemoryTracking` (and any other `std::basic_string` with +/// a custom allocator). +template +inline void readStringBinary(std::basic_string, Allocator> & s, ReadBuffer & buf, size_t max_string_size = DEFAULT_MAX_STRING_SIZE) { size_t size = 0; readVarUInt(size, buf); @@ -402,6 +406,7 @@ void skipStringUntilWhitespace(ReadBuffer & buf); void readStringUntilAmpersand(String & s, ReadBuffer & buf); void readStringUntilEquals(String & s, ReadBuffer & buf); +void readStringUntilColon(String & s, ReadBuffer & buf); /** Read string in CSV format. diff --git a/src/Interpreters/ActionLocksManager.cpp b/src/Interpreters/ActionLocksManager.cpp index a77ba1fc9bcf..8667a5686440 100644 --- a/src/Interpreters/ActionLocksManager.cpp +++ b/src/Interpreters/ActionLocksManager.cpp @@ -22,6 +22,7 @@ namespace ActionLocks extern const StorageActionBlockType ViewRefresh = 10; extern const StorageActionBlockType VirtualPartsUpdate = 11; extern const StorageActionBlockType ReduceBlockingParts = 12; + extern const StorageActionBlockType ViewRefreshPause = 13; } diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 7ba46ced8d2d..1d36d0762880 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1750,8 +1750,17 @@ void ActionsDAG::removeTrivialWrappers() child = child->children[0]; for (auto *& output : outputs) - while (is_trivial_wrapper(output)) - output = output->children[0]; + { + if (is_trivial_wrapper(output)) + { + auto original_name = output->result_name; + while (is_trivial_wrapper(output)) + output = output->children[0]; + + if (output->result_name != original_name) + output = &addAlias(*output, original_name); + } + } removeUnusedActions(); } diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index 40ac3edacda9..9a85ebf9ac1e 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -352,7 +352,7 @@ class ActionsDAG const Node & materializeNode(const Node & node, bool materialize_sparse = true); /// Remove materialize() and identity() wrapper functions from the DAG. - /// These are transparent wrappers that don't change values. Removing them projection matching for queries through views. + /// These are transparent wrappers that don't change values. Removing them helps projection matching for queries through views. void removeTrivialWrappers(); enum class MatchColumnsMode : uint8_t diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 8f331cec548b..010d5aab49ca 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -87,7 +87,7 @@ namespace Setting { extern const SettingsBool fsync_metadata; extern const SettingsBool allow_experimental_analyzer; - extern const SettingsBool show_data_lake_catalogs_in_system_tables; + extern const SettingsBool show_remote_databases_in_system_tables; } namespace MergeTreeSetting @@ -114,7 +114,7 @@ class DatabaseNameHints : public IHints<> const bool need_to_check_access_for_databases = !access->isGranted(AccessType::SHOW_DATABASES); Names result; - auto databases_list = database_catalog.getDatabases(GetDatabasesOptions{.with_datalake_catalogs = true}); + auto databases_list = database_catalog.getDatabases(GetDatabasesOptions{.with_remote_databases = true}); for (const auto & database_name : databases_list | boost::adaptors::map_keys) { if (need_to_check_access_for_databases && !access->isGranted(AccessType::SHOW_DATABASES, database_name)) @@ -342,7 +342,7 @@ void DatabaseCatalog::shutdownImpl(std::function shutdown_system_logs) }) == uuid_map.end()); databases.clear(); - databases_without_datalake_catalogs.clear(); + databases_without_remote.clear(); referential_dependencies.clear(); loading_dependencies.clear(); @@ -580,16 +580,16 @@ void DatabaseCatalog::assertDatabaseExists(const String & database_name) const } } -bool DatabaseCatalog::hasDatalakeCatalogs() const +bool DatabaseCatalog::hasRemoteDatabases() const { std::lock_guard lock{databases_mutex}; - return databases.size() != databases_without_datalake_catalogs.size(); + return databases.size() != databases_without_remote.size(); } -bool DatabaseCatalog::isDatalakeCatalog(const String & database_name) const +bool DatabaseCatalog::isRemoteDatabase(const String & database_name) const { std::lock_guard lock{databases_mutex}; - return databases.contains(database_name) && !databases_without_datalake_catalogs.contains(database_name); + return databases.contains(database_name) && !databases_without_remote.contains(database_name); } void DatabaseCatalog::assertDatabaseDoesntExist(const String & database_name) const @@ -610,8 +610,8 @@ void DatabaseCatalog::attachDatabase(const String & database_name, const Databas std::lock_guard lock{databases_mutex}; assertDatabaseDoesntExistUnlocked(database_name); databases.emplace(database_name, database); - if (!database->isDatalakeCatalog()) - databases_without_datalake_catalogs.emplace(database_name, database); + if (!database->isRemoteDatabase()) + databases_without_remote.emplace(database_name, database); NOEXCEPT_SCOPE({ UUID db_uuid = database->getUUID(); @@ -639,8 +639,8 @@ DatabasePtr DatabaseCatalog::detachDatabase(ContextPtr local_context, const Stri removeUUIDMapping(db_uuid); databases.erase(database_name); } - if (auto it = databases_without_datalake_catalogs.find(database_name); it != databases_without_datalake_catalogs.end()) - databases_without_datalake_catalogs.erase(it); + if (auto it = databases_without_remote.find(database_name); it != databases_without_remote.end()) + databases_without_remote.erase(it); } if (!db) { @@ -711,11 +711,11 @@ void DatabaseCatalog::updateDatabaseName(const String & old_name, const String & databases.erase(it); databases.emplace(new_name, db); - auto no_catalogs_it = databases_without_datalake_catalogs.find(old_name); - if (no_catalogs_it != databases_without_datalake_catalogs.end()) + auto local_it = databases_without_remote.find(old_name); + if (local_it != databases_without_remote.end()) { - databases_without_datalake_catalogs.erase(no_catalogs_it); - databases_without_datalake_catalogs.emplace(new_name, db); + databases_without_remote.erase(local_it); + databases_without_remote.emplace(new_name, db); } for (const auto & table_name : tables_in_database) @@ -839,10 +839,10 @@ bool DatabaseCatalog::isDatabaseExist(std::string_view database_name) const Databases DatabaseCatalog::getDatabases(GetDatabasesOptions options) const { std::lock_guard lock{databases_mutex}; - if (options.with_datalake_catalogs) + if (options.with_remote_databases) return databases; - return databases_without_datalake_catalogs; + return databases_without_remote; } bool DatabaseCatalog::isTableExist(const DB::StorageID & table_id, ContextPtr context_) const @@ -1156,7 +1156,7 @@ void DatabaseCatalog::loadMarkedAsDroppedTables() std::map> dropped_metadata; String path = fs::path("metadata_dropped") / ""; - auto db_map = getDatabases(GetDatabasesOptions{.with_datalake_catalogs = true}); + auto db_map = getDatabases(GetDatabasesOptions{.with_remote_databases = true}); std::set metadata_disk_list; for (const auto & [_, db] : db_map) { @@ -2049,7 +2049,7 @@ void DatabaseCatalog::reloadDisksTask() disks.swap(disks_to_reload); } - for (auto & database : getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (auto & database : getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { // WARNING: In case of `async_load_databases = true` getTablesIterator() call wait for all table in the database to be loaded. // WARNING: It means that no database will be able to update configuration until all databases are fully loaded. @@ -2215,8 +2215,8 @@ std::pair TableNameHints::getExtendedHintForTable(const String & { /// load all available databases from the DatabaseCatalog instance auto & database_catalog = DatabaseCatalog::instance(); - /// NOTE Skip datalake catalogs to avoid unnecessary access to remote catalogs (can be expensive) - auto all_databases = database_catalog.getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + /// NOTE Skip remote databases (data lake catalogs, MySQL, PostgreSQL) to avoid unnecessary access to remote services (can be expensive) + auto all_databases = database_catalog.getDatabases(GetDatabasesOptions{.with_remote_databases = false}); for (const auto & [db_name, db] : all_databases) { @@ -2239,8 +2239,9 @@ Names TableNameHints::getAllRegisteredNames() const { if (!database) return {}; - /// DataLakeCatalog::getAllTableNames lists all tables from remote catalog - expensive. Skip when user opted out. - if (database->isDatalakeCatalog() && context && !context->getSettingsRef()[Setting::show_data_lake_catalogs_in_system_tables]) + /// Remote databases (data lake catalogs, MySQL, PostgreSQL) typically list tables via a remote + /// service, which is expensive. Skip when user opted out of seeing them in system tables. + if (database->isRemoteDatabase() && context && !context->getSettingsRef()[Setting::show_remote_databases_in_system_tables]) return {}; return database->getAllTableNames(context); } diff --git a/src/Interpreters/DatabaseCatalog.h b/src/Interpreters/DatabaseCatalog.h index b5dfb94d6689..7640b2aa29a6 100644 --- a/src/Interpreters/DatabaseCatalog.h +++ b/src/Interpreters/DatabaseCatalog.h @@ -88,7 +88,11 @@ class BackgroundSchedulePoolTaskHolder; struct GetDatabasesOptions { - bool with_datalake_catalogs{false}; + /// Include remote databases (data lake catalogs, MySQL, PostgreSQL). + /// These are excluded by default because listing their tables can be expensive + /// (network calls to remote services). Controlled by the + /// `show_remote_databases_in_system_tables` setting in system.tables/columns/completions. + bool with_remote_databases{false}; }; /// For some reason Context is required to get Storage from Database object. @@ -153,14 +157,13 @@ class DatabaseCatalog : boost::noncopyable, WithMutableContext DatabasePtr getDatabase(const UUID & uuid) const; DatabasePtr tryGetDatabase(const UUID & uuid) const; bool isDatabaseExist(std::string_view database_name) const; - /// Datalake catalogs are implement at IDatabase level in ClickHouse. - /// In general case Datalake catalog is a some remote service which contains iceberg/delta tables. - /// Sometimes this service charges money for requests. With this flag we explicitly protect ourself - /// to not accidentally query external non-free service for some trivial things like - /// autocompletion hints or system.tables / system.columns queries. We have a setting which allow to show - /// these databases everywhere, but user must explicitly specify it. - /// Note: system.databases always passes with_datalake_catalogs = true because listing a database name - /// is purely local metadata and never requires calls to an external catalog service. + /// Remote databases (data lake catalogs, MySQL, PostgreSQL) are implemented at IDatabase level in ClickHouse. + /// Listing their tables typically requires calls to a remote service (sometimes paid). + /// GetDatabasesOptions::with_remote_databases explicitly protects us from accidentally querying the remote service for trivial + /// things like autocompletion hints or system.tables / system.columns queries. + /// The `show_remote_databases_in_system_tables` setting allows the user to opt in. + /// Note: system.databases always passes with_remote_databases = true because listing a database + /// name is purely local metadata and never requires calls to a remote service. Databases getDatabases(GetDatabasesOptions options) const; /// Same as getDatabase(const String & database_name), but if database_name is empty, current database of local_context is used @@ -268,8 +271,8 @@ class DatabaseCatalog : boost::noncopyable, WithMutableContext bool canPerformReplicatedDDLQueries() const; void updateMetadataFile(const String & database_name, const ASTPtr & create_query); - bool hasDatalakeCatalogs() const; - bool isDatalakeCatalog(const String & database_name) const; + bool hasRemoteDatabases() const; + bool isRemoteDatabase(const String & database_name) const; private: // The global instance of database catalog. unique_ptr is to allow @@ -317,7 +320,7 @@ class DatabaseCatalog : boost::noncopyable, WithMutableContext mutable std::mutex databases_mutex; Databases databases TSA_GUARDED_BY(databases_mutex); - Databases databases_without_datalake_catalogs TSA_GUARDED_BY(databases_mutex); + Databases databases_without_remote TSA_GUARDED_BY(databases_mutex); UUIDToStorageMap uuid_map; /// Referential dependencies between tables: table "A" depends on table "B" diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index f5fc47c6c3bb..a3481a53223b 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -216,7 +216,7 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) command_ast->formatForErrorMessage(), rewritten_command_ast->formatForErrorMessage()); } } - + /// When session_timezone is set, string literals compared to DateTime columns /// must be wrapped with explicit timezone to avoid misinterpretation in the /// background mutation thread which lacks the session context. diff --git a/src/Interpreters/InterpreterCheckQuery.cpp b/src/Interpreters/InterpreterCheckQuery.cpp index 16d44311d8c2..b689d6b6cdcf 100644 --- a/src/Interpreters/InterpreterCheckQuery.cpp +++ b/src/Interpreters/InterpreterCheckQuery.cpp @@ -395,7 +395,7 @@ InterpreterCheckQuery::InterpreterCheckQuery(const ASTPtr & query_ptr_, ContextP static Strings getAllDatabases(const ContextPtr & context) { Strings res; - const auto & databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + const auto & databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); res.reserve(databases.size()); for (const auto & [database_name, _] : databases) { diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 296a5b69d03d..cb82a691f35d 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -207,7 +207,7 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) auto db_num_limit = getContext()->getGlobalContext()->getServerSettings()[ServerSetting::max_database_num_to_throw].value; if (db_num_limit > 0 && !internal) { - size_t db_count = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = true}).size(); + size_t db_count = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = true}).size(); std::initializer_list system_databases = { DatabaseCatalog::TEMPORARY_DATABASE, diff --git a/src/Interpreters/InterpreterShowColumnsQuery.cpp b/src/Interpreters/InterpreterShowColumnsQuery.cpp index c7b7c58d9fa0..1477ae0b3b33 100644 --- a/src/Interpreters/InterpreterShowColumnsQuery.cpp +++ b/src/Interpreters/InterpreterShowColumnsQuery.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -171,9 +172,13 @@ WHERE BlockIO InterpreterShowColumnsQuery::execute() { + const auto & query = query_ptr->as(); + String database = getContext()->resolveDatabase(query.database); auto query_context = Context::createCopy(getContext()); query_context->makeQueryContext(); query_context->setCurrentQueryId(""); + if (DatabaseCatalog::instance().isRemoteDatabase(database)) + query_context->setSetting("show_remote_databases_in_system_tables", true); return executeQuery(getRewrittenQuery(), query_context, QueryFlags{ .internal = true }).second; } diff --git a/src/Interpreters/InterpreterShowIndexesQuery.cpp b/src/Interpreters/InterpreterShowIndexesQuery.cpp index 6556e4cdff07..c09e6ed21f25 100644 --- a/src/Interpreters/InterpreterShowIndexesQuery.cpp +++ b/src/Interpreters/InterpreterShowIndexesQuery.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -122,9 +123,13 @@ ORDER BY index_type, expression, seq_in_index;)", database, table, where_express BlockIO InterpreterShowIndexesQuery::execute() { + const auto & query = query_ptr->as(); + String database = getContext()->resolveDatabase(query.database); auto query_context = Context::createCopy(getContext()); query_context->makeQueryContext(); query_context->setCurrentQueryId(""); + if (DatabaseCatalog::instance().isRemoteDatabase(database)) + query_context->setSetting("show_remote_databases_in_system_tables", true); return executeQuery(getRewrittenQuery(), query_context, QueryFlags{ .internal = true }).second; } diff --git a/src/Interpreters/InterpreterShowTablesQuery.cpp b/src/Interpreters/InterpreterShowTablesQuery.cpp index fe819ee4a6dc..780dc39c4a8e 100644 --- a/src/Interpreters/InterpreterShowTablesQuery.cpp +++ b/src/Interpreters/InterpreterShowTablesQuery.cpp @@ -235,11 +235,11 @@ BlockIO InterpreterShowTablesQuery::execute() auto query_context = Context::createCopy(getContext()); query_context->makeQueryContext(); query_context->setCurrentQueryId(""); - if (DatabaseCatalog::instance().isDatalakeCatalog(database)) + if (DatabaseCatalog::instance().isRemoteDatabase(database)) { - /// HACK: force the setting so that system.tables includes tables from the requested data lake catalog. - /// system.databases already shows all catalogs unconditionally, so no override is needed for SHOW DATABASES. - query_context->setSetting("show_data_lake_catalogs_in_system_tables", true); + /// HACK: force the setting so that system.tables includes tables from the requested remote database. + /// system.databases already shows all databases unconditionally, so no override is needed for SHOW DATABASES. + query_context->setSetting("show_remote_databases_in_system_tables", true); } return executeQuery(rewritten_query, std::move(query_context), QueryFlags{ .internal = true }).second; } diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 5ce0c2a9ef97..f961c05862da 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -162,6 +162,7 @@ namespace ActionLocks extern const StorageActionBlockType PullReplicationLog; extern const StorageActionBlockType Cleanup; extern const StorageActionBlockType ViewRefresh; + extern const StorageActionBlockType ViewRefreshPause; } namespace @@ -220,6 +221,8 @@ AccessType getRequiredAccessType(StorageActionBlockType action_type) return AccessType::SYSTEM_CLEANUP; if (action_type == ActionLocks::ViewRefresh) return AccessType::SYSTEM_VIEWS; + if (action_type == ActionLocks::ViewRefreshPause) + return AccessType::SYSTEM_VIEWS; throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown action type: {}", std::to_string(action_type)); } @@ -258,7 +261,7 @@ void InterpreterSystemQuery::startStopAction(StorageActionBlockType action_type, } else { - for (auto & elem : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (auto & elem : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { startStopActionInDatabase(action_type, start, elem.first, elem.second, getContext(), log); } @@ -843,12 +846,20 @@ BlockIO InterpreterSystemQuery::execute() break; case Type::START_VIEW: case Type::START_VIEWS: + /// `SYSTEM START VIEW` must undo both `SYSTEM STOP VIEW` and `SYSTEM PAUSE VIEW`. + /// Each call drops the corresponding lock (if any) and invokes `refresher->start()`; + /// `start` is idempotent so calling it twice is safe. startStopAction(ActionLocks::ViewRefresh, true); + startStopAction(ActionLocks::ViewRefreshPause, true); break; case Type::STOP_VIEW: case Type::STOP_VIEWS: startStopAction(ActionLocks::ViewRefresh, false); break; + case Type::PAUSE_VIEW: + case Type::PAUSE_VIEWS: + startStopAction(ActionLocks::ViewRefreshPause, false); + break; case Type::START_REPLICATED_VIEW: for (const auto & task : getRefreshTasks()) task->startReplicated(); @@ -1350,7 +1361,7 @@ void InterpreterSystemQuery::restartReplicas(ContextMutablePtr system_context) bool access_is_granted_globally = access->isGranted(AccessType::SYSTEM_RESTART_REPLICA); bool show_tables_is_granted_globally = access->isGranted(AccessType::SHOW_TABLES); - for (auto & elem : catalog.getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (auto & elem : catalog.getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { if (elem.second->isExternal()) continue; @@ -1412,7 +1423,7 @@ void InterpreterSystemQuery::dropReplica(ASTSystemQuery & query) } else if (query.is_drop_whole_replica) { - auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); auto access = getContext()->getAccess(); bool access_is_granted_globally = access->isGranted(AccessType::SYSTEM_DROP_REPLICA); @@ -1454,7 +1465,7 @@ void InterpreterSystemQuery::dropReplica(ASTSystemQuery & query) String remote_replica_path = fs::path(query.replica_zk_path) / "replicas" / query.replica; /// This check is actually redundant, but it may prevent from some user mistakes - for (auto & elem : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (auto & elem : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { DatabasePtr & database = elem.second; for (auto iterator = database->getTablesIterator(getContext()); iterator->isValid(); iterator->next()) @@ -1781,7 +1792,7 @@ void InterpreterSystemQuery::dropDatabaseReplica(ASTSystemQuery & query) } else if (query.is_drop_whole_replica) { - auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); auto access = getContext()->getAccess(); bool access_is_granted_globally = access->isGranted(AccessType::SYSTEM_DROP_REPLICA); @@ -1814,7 +1825,7 @@ void InterpreterSystemQuery::dropDatabaseReplica(ASTSystemQuery & query) getContext()->checkAccess(AccessType::SYSTEM_DROP_REPLICA); /// This check is actually redundant, but it may prevent from some user mistakes - for (auto & elem : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (auto & elem : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) if (auto * replicated = dynamic_cast(elem.second.get())) check_not_local_replica(replicated, full_replica_name, query_replica_zk_path); @@ -1964,7 +1975,7 @@ void InterpreterSystemQuery::loadOrUnloadPrimaryKeysImpl(bool load) getContext()->checkAccess(load ? AccessType::SYSTEM_LOAD_PRIMARY_KEY : AccessType::SYSTEM_UNLOAD_PRIMARY_KEY); LOG_TRACE(log, "{} primary keys for all tables", load ? "Loading" : "Unloading"); - for (auto & database : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (auto & database : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { for (auto it = database.second->getTablesIterator(getContext()); it->isValid(); it->next()) { @@ -2368,6 +2379,8 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() case Type::STOP_VIEW: case Type::STOP_VIEWS: case Type::STOP_REPLICATED_VIEW: + case Type::PAUSE_VIEW: + case Type::PAUSE_VIEWS: case Type::CANCEL_VIEW: case Type::TEST_VIEW: { diff --git a/src/Interpreters/MutationsDateTimeLiteralVisitor.cpp b/src/Interpreters/MutationsDateTimeLiteralVisitor.cpp index 2573d86b23ae..ba30921bcdf4 100644 --- a/src/Interpreters/MutationsDateTimeLiteralVisitor.cpp +++ b/src/Interpreters/MutationsDateTimeLiteralVisitor.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -163,6 +164,23 @@ class RewriteDateTimeLiteralsMatcher { if (auto * function = ast->as()) visit(*function, data); + else if (auto * assignment = ast->as()) + visit(*assignment, data); + } + + /// Wrap string literal in UPDATE SET when the target column is DateTime without explicit timezone + static void visit(ASTAssignment & assignment, Data & data) + { + auto dt = getDateTimeColumnType(assignment.column_name, data.columns); + if (!dt) + return; + + auto & expr = assignment.children.at(0); + if (const auto * lit = expr->as(); lit && lit->value.getType() == Field::Types::String) + { + expr = wrapWithTimezone(expr, dt, data.session_timezone); + data.modified = true; + } } static void visit(ASTFunction & function, Data & data) diff --git a/src/Interpreters/ServerAsynchronousMetrics.cpp b/src/Interpreters/ServerAsynchronousMetrics.cpp index d92ad49dd9e1..21b49a6c91a1 100644 --- a/src/Interpreters/ServerAsynchronousMetrics.cpp +++ b/src/Interpreters/ServerAsynchronousMetrics.cpp @@ -293,7 +293,7 @@ void ServerAsynchronousMetrics::updateImpl(TimePoint update_time, TimePoint curr } { - auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); size_t max_queue_size = 0; size_t max_inserts_in_queue = 0; @@ -495,7 +495,7 @@ void ServerAsynchronousMetrics::updateMutationAndDetachedPartsStats() DetachedPartsStats current_values{}; MutationStats current_mutation_stats{}; - for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { if (db.second->isExternal()) continue; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index d387532e04de..dd7bd76a5e9c 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -2106,7 +2106,7 @@ static void executeASTFuzzerQueries(const ASTPtr & ast, const ContextMutablePtr std::pair executeQuery( - const String & query, + std::string_view query, ContextMutablePtr context, QueryFlags flags, QueryProcessingStage::Enum stage) diff --git a/src/Interpreters/executeQuery.h b/src/Interpreters/executeQuery.h index 5e3545d854f0..ceab8f3f1073 100644 --- a/src/Interpreters/executeQuery.h +++ b/src/Interpreters/executeQuery.h @@ -84,7 +84,7 @@ void executeQuery( /// Correctly formatting the results (according to INTO OUTFILE and FORMAT sections) /// must be done separately. std::pair executeQuery( - const String & query, /// Query text without INSERT data. The latter must be written to BlockIO::out. + std::string_view query, /// Query text without INSERT data. The latter must be written to BlockIO::out. ContextMutablePtr context, /// DB, tables, data types, storage engines, functions, aggregate functions... QueryFlags flags = {}, QueryProcessingStage::Enum stage = QueryProcessingStage::Complete /// To which stage the query must be executed. diff --git a/src/Interpreters/loadMetadata.cpp b/src/Interpreters/loadMetadata.cpp index 5bdb8e2cee0a..301dbb3930e0 100644 --- a/src/Interpreters/loadMetadata.cpp +++ b/src/Interpreters/loadMetadata.cpp @@ -603,7 +603,7 @@ void convertDatabasesEnginesIfNeed(const LoadTaskPtrs & load_metadata, ContextMu // Wait for all table to be loaded and started waitLoad(TablesLoaderForegroundPoolId, load_metadata); - for (const auto & [name, _] : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (const auto & [name, _] : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) if (name != DatabaseCatalog::SYSTEM_DATABASE) maybeConvertOrdinaryDatabaseToAtomic(context, name); diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index 4c68316c3707..0a2a7e78042b 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -463,6 +463,7 @@ void ASTSystemQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & setti case Type::START_REPLICATED_VIEW: case Type::STOP_VIEW: case Type::STOP_REPLICATED_VIEW: + case Type::PAUSE_VIEW: case Type::CANCEL_VIEW: case Type::WAIT_VIEW: { @@ -622,6 +623,7 @@ void ASTSystemQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & setti case Type::STOP_THREAD_FUZZER: case Type::START_VIEWS: case Type::STOP_VIEWS: + case Type::PAUSE_VIEWS: case Type::CLEAR_PAGE_CACHE: case Type::STOP_REPLICATED_DDL_QUERIES: case Type::START_REPLICATED_DDL_QUERIES: diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index 70749eecc3c5..f61d921b3655 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -128,6 +128,8 @@ class ASTSystemQuery : public IAST, public ASTQueryWithOnCluster STOP_VIEW, STOP_VIEWS, STOP_REPLICATED_VIEW, + PAUSE_VIEW, + PAUSE_VIEWS, CANCEL_VIEW, TEST_VIEW, LOAD_PRIMARY_KEY, diff --git a/src/Parsers/ParserSystemQuery.cpp b/src/Parsers/ParserSystemQuery.cpp index 29ff87ba1afa..430b90c577b6 100644 --- a/src/Parsers/ParserSystemQuery.cpp +++ b/src/Parsers/ParserSystemQuery.cpp @@ -566,6 +566,7 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected & case Type::START_REPLICATED_VIEW: case Type::STOP_VIEW: case Type::STOP_REPLICATED_VIEW: + case Type::PAUSE_VIEW: case Type::CANCEL_VIEW: if (!parseDatabaseAndTableAsAST(pos, expected, res->database, res->table)) return false; @@ -573,6 +574,7 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected & case Type::START_VIEWS: case Type::STOP_VIEWS: + case Type::PAUSE_VIEWS: case Type::FREE_MEMORY: break; diff --git a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp index 0c0d77713726..0e2c7419bd6a 100644 --- a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp +++ b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp @@ -249,6 +249,14 @@ bool QueryDAG::build(QueryPlan::Node & node) outputs.insert(outputs.begin(), filter_node); } + /// Remove materialize() and identity() wrappers from the combined DAG. + /// This must happen after all expressions are merged and filter nodes are added + /// back to outputs, because: + /// 1. Removing wrappers before merging would change output column names (e.g., + /// "materialize(name)" → "name"), causing subsequent merges to fail when they + /// try to match input names to the modified output names. + /// 2. removeTrivialWrappers calls removeUnusedActions, which can invalidate raw + /// pointers (like filter_nodes) to nodes that are no longer reachable from outputs. if (dag) dag->removeTrivialWrappers(); diff --git a/src/Server/ReplicasStatusHandler.cpp b/src/Server/ReplicasStatusHandler.cpp index dd90198229be..6d6b497f2aa4 100644 --- a/src/Server/ReplicasStatusHandler.cpp +++ b/src/Server/ReplicasStatusHandler.cpp @@ -55,7 +55,7 @@ void ReplicasStatusHandler::handleRequest(HTTPServerRequest & request, HTTPServe bool ok = true; WriteBufferFromOwnString message; - auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); /// Iterate through all the replicated tables. for (const auto & db : databases) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 4e5951687084..c5dfc5e8620c 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -2237,7 +2237,7 @@ void TCPHandler::processQuery(std::shared_ptr & state) if (part_uuids_to_ignore.has_value()) state->part_uuids_to_ignore = std::move(part_uuids_to_ignore); - readStringBinary(state->query_id, *in); + readStringBinary(state->query_id, *in, MAX_HELLO_STRING_SIZE); /// In interserver mode, /// initial_user can be empty in case of Distributed INSERT via Buffer/Kafka, @@ -2327,7 +2327,12 @@ void TCPHandler::processQuery(std::shared_ptr & state) throw exception; /// NOLINT } - std::string data(salt); + /// `StringWithMemoryTracking` so the duplicate of the (potentially + /// large) query body that the hash needs goes through the throwing + /// memory-tracker path. With a plain `std::string` the append below + /// would allocate through `allocNoThrow` and could push the server + /// past `max_server_memory_usage` without throwing. + StringWithMemoryTracking data(salt); // For backward compatibility if (nonce.has_value()) data += std::to_string(nonce.value()); diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 1e78790b4f5a..7540baf0a073 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -83,8 +84,13 @@ struct QueryState std::unique_ptr block_out; Block block_for_insert; - /// Query text. - String query; + /// Query text. Uses `StringWithMemoryTracking` so that the resize on + /// receive goes through the throwing memory-tracker path. A client + /// sending an oversized query body trips `MEMORY_LIMIT_EXCEEDED` + /// cleanly, rather than driving the server's RSS past + /// `max_server_memory_usage` via `allocNoThrow` and getting + /// cgroup-OOM-killed. + StringWithMemoryTracking query; std::shared_ptr plan_and_sets; /// Parsed query ASTPtr parsed_query; diff --git a/src/Storages/MaterializedView/RefreshTask.cpp b/src/Storages/MaterializedView/RefreshTask.cpp index 0e5f3a9006a9..ed6202b13416 100644 --- a/src/Storages/MaterializedView/RefreshTask.cpp +++ b/src/Storages/MaterializedView/RefreshTask.cpp @@ -80,11 +80,12 @@ namespace ErrorCodes RefreshTask::RefreshTask( StorageMaterializedView * view_, ContextPtr context, const DB::ASTRefreshStrategy & strategy, bool attach, bool coordinated, bool empty, bool is_restore_from_backup) - : log(getLogger("RefreshTask")) - , view(view_) + : view(view_) , refresh_schedule(strategy) , refresh_append(strategy.append) { + createLogger(view->getStorageID()); + auto component_guard = Coordination::setCurrentComponent("RefreshTask::RefreshTask"); if (strategy.settings != nullptr) refresh_settings.applyChanges(strategy.settings->changes); @@ -111,9 +112,13 @@ RefreshTask::RefreshTask( /// currently both DatabaseReplicated and DatabaseShared seem to require this behavior. if (!replica_path_existed) { - if (!attach && !is_restore_from_backup && - !zookeeper->isFeatureEnabled(KeeperFeatureFlag::MULTI_READ)) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Keeper server doesn't support multi-reads."); + if (!attach && !is_restore_from_backup) + { + /// (It would be possible to avoid using these features, if needed.) + if (!zookeeper->isFeatureEnabled(KeeperFeatureFlag::MULTI_READ) || + !zookeeper->isFeatureEnabled(KeeperFeatureFlag::CREATE_IF_NOT_EXISTS)) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Keeper server doesn't have all feature flags required by refreshable MV: MULTI_READ, CREATE_IF_NOT_EXISTS"); + } zookeeper->createAncestors(coordination.path); Coordination::Requests ops; @@ -149,6 +154,18 @@ RefreshTask::RefreshTask( } } +void RefreshTask::createLogger(const StorageID & storage_id) +{ + std::lock_guard lock(logger_mutex); + current_logger = ::getLogger(fmt::format("RefreshTask({})", storage_id.getFullTableName())); +} + +LoggerPtr RefreshTask::getLogger() +{ + std::lock_guard lock(logger_mutex); + return current_logger; +} + OwnedRefreshTask RefreshTask::create( StorageMaterializedView * view, ContextMutablePtr context, @@ -160,10 +177,12 @@ OwnedRefreshTask RefreshTask::create( { auto task = std::make_shared(view, context, strategy, attach, coordinated, empty, is_restore_from_backup); - task->refresh_task = context->getSchedulePool().createTask(view->getStorageID(), "RefreshTask", - [self = task.get()] { self->refreshTask(); }); + task->scheduling_task = context->getSchedulePool().createTask(view->getStorageID(), "RefreshSched", + [self = task.get()] { self->doScheduling(/*is_shutdown=*/ false); }); + task->execution_task = context->getSchedulePool().createTask(view->getStorageID(), "RefreshExec", + [self = task.get()] { self->executeRefresh(); }); - task->refresh_task_watch_callback = std::make_shared([w = task->coordination.watches, task_waker = task->refresh_task->getWatchCallback()](const Coordination::WatchResponse & response) + task->watch_callback = std::make_shared([w = task->coordination.watches, task_waker = task->scheduling_task->getWatchCallback()](const Coordination::WatchResponse & response) { w->root_watch_active.store(false); w->children_watch_active.store(false); @@ -215,11 +234,28 @@ void RefreshTask::shutdown() } /// If we're in DatabaseReplicated, interrupt replicated CREATE/EXCHANGE/DROP queries in refresh task. - /// Without this we can deadlock waiting for refresh_task because this shutdown happens from the same DDL thread for which CREATE/EXCHANGE/DROP wait. + /// Without this we can deadlock waiting for execution_task because this shutdown happens from the same DDL thread for which CREATE/EXCHANGE/DROP wait. execution.cancel_ddl_queries.request_stop(); - /// Wait for the task to return and prevent it from being scheduled in future. - refresh_task->deactivate(); + /// Wait for the tasks to return and prevent them from being scheduled in future. + scheduling_task->deactivate(); + execution_task->deactivate(); + + /// Best-effort final update of information in zookeeper, to reflect that this replica is not + /// running a refresh anymore. + try + { + doScheduling(/*is_shutdown=*/ true); + } + catch (...) + { + /// Avoid throwing from shutdown(). + /// If we failed to write to zookeeper, other replicas won't start refresh until our + /// zookeeper session expires (+ grace period). This is not a problem if this + /// shutdown() is caused by server shutdown or by table DROP, but may be bad if it's a DETACH + /// (and we'll hold on to the session indefinitely). + tryLogCurrentException(getLogger(), "Keeper error during RMV shutdown"); + } /// Remove from RefreshSet on DROP, without waiting for the IStorage to be destroyed. /// This matters because a table may get dropped and immediately created again with the same name, @@ -236,35 +272,45 @@ void RefreshTask::shutdown() refresh_cv.notify_all(); } -void RefreshTask::drop(ContextPtr context) +void RefreshTask::drop(ContextPtr context, bool is_shared_db) { - if (coordination.coordinated) - { - auto component_guard = Coordination::setCurrentComponent("RefreshTask::drop"); - auto zookeeper = context->getZooKeeper(); + if (!coordination.coordinated) + return; - zookeeper->tryRemove(coordination.path + "/replicas/" + coordination.replica_name); + auto component_guard = Coordination::setCurrentComponent("RefreshTask::drop"); + auto zookeeper = context->getZooKeeper(); - /// Redundant, refreshTask() is supposed to clean up after itself, but let's be paranoid. - removeRunningZnodeIfMine(zookeeper); + zookeeper->tryRemove(coordination.path + "/replicas/" + coordination.replica_name); - /// If no replicas left, remove the coordination znode. - Coordination::Requests ops; - ops.emplace_back(zkutil::makeRemoveRequest(coordination.path + "/replicas", -1)); - String paused_path = coordination.path + "/paused"; - if (zookeeper->exists(paused_path)) - ops.emplace_back(zkutil::makeRemoveRequest(paused_path, -1)); - ops.emplace_back(zkutil::makeRemoveRequest(coordination.path, -1)); - Coordination::Responses responses; - auto code = zookeeper->tryMulti(ops, responses); - if (responses[0]->error != Coordination::Error::ZNOTEMPTY && responses[0]->error != Coordination::Error::ZNONODE) - zkutil::KeeperMultiException::check(code, ops, responses); + /// If no replicas left, remove the coordination znode. + /// In Shared DB, let SharedDatabaseCatalog handle it. + if (is_shared_db) + return; + + /// If no replicas left, remove the coordination znode. + Coordination::Requests ops; + ops.emplace_back(zkutil::makeRemoveRequest(coordination.path + "/replicas", -1)); + String paused_path = coordination.path + "/paused"; + if (zookeeper->exists(paused_path)) + ops.emplace_back(zkutil::makeRemoveRequest(paused_path, -1)); + String running_path = coordination.path + "/running"; + if (zookeeper->exists(running_path)) + { + /// shutdown() was supposed to delete it. + LOG_ERROR(getLogger(), "Unexpected 'running' znode when dropping refreshable materialized view."); + ops.emplace_back(zkutil::makeRemoveRequest(running_path, -1)); } + ops.emplace_back(zkutil::makeRemoveRequest(coordination.path, -1)); + Coordination::Responses responses; + auto code = zookeeper->tryMulti(ops, responses); + if (responses[0]->error != Coordination::Error::ZNOTEMPTY && responses[0]->error != Coordination::Error::ZNONODE) + zkutil::KeeperMultiException::check(code, ops, responses); } void RefreshTask::rename(StorageID new_id, StorageID new_inner_table_id) { std::lock_guard guard(mutex); + createLogger(new_id); if (set_handle) set_handle.rename(new_id, refresh_append ? std::nullopt : std::make_optional(new_inner_table_id)); } @@ -319,7 +365,7 @@ void RefreshTask::alterRefreshParams(const DB::ASTRefreshStrategy & new_strategy RefreshTask::Info RefreshTask::getInfo() const { std::lock_guard guard(mutex); - return Info {.view_id = set_handle.getID(), .state = state, .next_refresh_time = next_refresh_time, .znode = coordination.root_znode, .replica_name = coordination.replica_name, .refresh_running = coordination.running_znode_exists, .progress = execution.progress.getValues(), .unexpected_error = scheduling.unexpected_error}; + return Info {.view_id = set_handle.getID(), .state = state, .next_refresh_time = next_refresh_time, .znode = coordination.root_znode, .replica_name = coordination.replica_name, .refresh_running = coordination.root_znode.refresh_running, .progress = execution.progress.getValues(), .unexpected_error = scheduling.unexpected_error}; } void RefreshTask::start() @@ -334,9 +380,23 @@ void RefreshTask::start() void RefreshTask::stop() { std::lock_guard guard(mutex); + bool was_already_stopped = std::exchange(scheduling.stop_requested, true); + /// Always interrupt the in-flight refresh. This matters in the PAUSE-then-STOP sequence: + /// `SYSTEM PAUSE VIEW` leaves the running refresh alone but sets `stop_requested`, and a + /// subsequent `SYSTEM STOP VIEW` must still cancel it. `interruptExecution` is idempotent + /// (guarded by `execution.interrupt_execution`) so repeated calls are safe. + interruptExecution(); + if (!was_already_stopped) + scheduleRefresh(guard); +} + +void RefreshTask::pause() +{ + std::lock_guard guard(mutex); + /// Do NOT interrupt the currently running refresh. Only prevent future refreshes. + /// If `stop_requested` was already set (e.g. by `SYSTEM STOP VIEW`), this is a no-op. if (std::exchange(scheduling.stop_requested, true)) return; - interruptExecution(); scheduleRefresh(guard); } @@ -401,7 +461,7 @@ void RefreshTask::wait() { if (!view) throw Exception(ErrorCodes::TABLE_IS_DROPPED, "The table was dropped or detached"); - if (!coordination.running_znode_exists && !coordination.root_znode.last_attempt_succeeded && coordination.root_znode.last_attempt_time.time_since_epoch().count() != 0) + if (!coordination.root_znode.refresh_running && !coordination.root_znode.last_attempt_succeeded && coordination.root_znode.last_attempt_time.time_since_epoch().count() != 0) throw Exception(ErrorCodes::REFRESH_FAILED, "Refresh failed{}: {}", coordination.coordinated ? " (on replica " + coordination.root_znode.last_attempt_replica + ")" : "", coordination.root_znode.last_attempt_error.empty() ? "Replica went away" : coordination.root_znode.last_attempt_error); @@ -474,176 +534,256 @@ void RefreshTask::notify() void RefreshTask::setFakeTime(std::optional t) { std::unique_lock lock(mutex); - scheduling.fake_clock.store(t.value_or(INT64_MIN), std::memory_order_relaxed); + Int64 val = t.value_or(INT64_MIN); + LOG_INFO(getLogger(), "Set fake time: {}", val); + scheduling.fake_clock.store(val, std::memory_order_relaxed); /// Reschedule task with shorter delay if currently scheduled. - refresh_task->scheduleAfter(100, /*overwrite*/ true, /*only_if_scheduled*/ true); + scheduling_task->scheduleAfter(100, /*overwrite*/ true, /*only_if_scheduled*/ true); } -void RefreshTask::refreshTask() +void RefreshTask::doScheduling(bool is_shutdown) { - auto component_guard = Coordination::setCurrentComponent("RefreshTask::refreshTask"); + auto component_guard = Coordination::setCurrentComponent("RefreshTask::doScheduling"); std::unique_lock lock(mutex); - auto schedule_keeper_retry = [&] { - chassert(lock.owns_lock()); - chassert(state == RefreshState::Scheduling); - coordination.watches->should_reread_znodes.store(true); - refresh_task->scheduleAfter(5000); - }; + /// The way this function generally works is: + /// * Look at state in zookeeper and in memory and at current time. + /// * If some change is needed (e.g. write to zookeeper or start a refresh), make that change, + /// do scheduling_task->schedule() (to inspect the new state after the change), and return. + /// (We avoid making multiple changes in one iteration because that just adds more + /// opportunities for bugs. This function should be able to pick up from ~any state anyway.) + /// * If no change is needed, we setState, optionally scheduling_task->scheduleAfter, and return. + /// * On error, we scheduling_task->schedule/scheduleAfter and return. try { - bool refreshed_just_now = false; - /// Whoever breaks out of this loop should assign state. - while (true) + setState(RefreshState::Scheduling, lock); + + std::shared_ptr zookeeper; + if (coordination.coordinated) + zookeeper = view->getContext()->getZooKeeper(); + readZnodesIfNeeded(zookeeper, lock); + chassert(lock.owns_lock()); + + /// Sync 3 pieces of information about currently running refresh: + /// * coordination.root_znode.refresh_running + /// * coordination.running_znode_exists + /// * execution.state + /// (Why are there as many as 3? We need an ephemeral znode to notice server crashes, a + /// non-ephemeral znode to tolerate brief zookeeper connection loss, and in-memory state to + /// communicate with the thread that executes refresh.) + + auto running_znode_missing_since = coordination.running_znode_missing_since; + coordination.running_znode_missing_since.reset(); // reassigned below if still missing + + if (coordination.root_znode.refresh_running && coordination.root_znode.last_attempt_replica == coordination.replica_name) { - setState(RefreshState::Scheduling, lock); - execution.interrupt_execution.store(false); + /// Our replica is allegedly running a refresh. - updateDependenciesIfNeeded(lock); + if (is_shutdown) + { + chassert(execution.state != ExecutionState::State::Running); + if (execution.state == ExecutionState::State::Requested) + { + /// execution_task was deactivated by shutdown before refresh started. + execution.znode.last_attempt_error = "shutdown"; + execution.znode.refresh_running = false; + execution.state = ExecutionState::State::Finished; + } + } - std::shared_ptr zookeeper; - if (coordination.coordinated) - zookeeper = view->getContext()->getZooKeeper(); - readZnodesIfNeeded(zookeeper, lock); - chassert(lock.owns_lock()); - /// Check if another replica is already running a refresh. - if (coordination.running_znode_exists) + switch (execution.state) { - if (coordination.root_znode.last_attempt_replica == coordination.replica_name) + case ExecutionState::State::None: { - LOG_ERROR(log, "Znode {} indicates that this replica is running a refresh, but it isn't. Likely a bug.", coordination.path + "/running"); -#ifdef DEBUG_OR_SANITIZER_BUILD - abortOnFailedAssertion("Unexpected refresh lock in keeper"); -#else - coordination.running_znode_exists = false; - if (coordination.coordinated) - removeRunningZnodeIfMine(zookeeper); - schedule_keeper_retry(); + LOG_WARNING(getLogger(), "RMV znode says this replica is running refresh, but it isn't. Maybe we crashed and restarted recently, and the state is left over from the crashed run?"); + CoordinationZnode znode = coordination.root_znode; + znode.refresh_running = false; + updateCoordinationState(znode, /*running=*/ false, zookeeper, lock); + scheduling_task->schedule(); break; -#endif } - else + case ExecutionState::State::Finished: { - setState(RefreshState::RunningOnAnotherReplica, lock); + /// Report refresh completion (successful or not) to the znode. + if (execution.znode.version == coordination.root_znode.version) + { + if (!updateCoordinationState(execution.znode, /*running=*/ false, zookeeper, lock)) + return; + chassert(!coordination.root_znode.refresh_running); + + if (coordination.root_znode.last_attempt_succeeded) + { + lock.unlock(); + view->getContext()->getRefreshSet().notifyDependents(view->getStorageID()); + lock.lock(); + } + } + else + { + LOG_ERROR(getLogger(), "RMV znode was updated (version {} -> {}) while refresh was running (without changing refresh_running and last_attempt_replica). This should only be possible if another server has the same replica name as me.", execution.znode.version, coordination.root_znode.version); + CoordinationZnode znode = coordination.root_znode; + znode.refresh_running = false; + updateCoordinationState(znode, /*running=*/ false, zookeeper, lock); + } + + chassert(lock.owns_lock()); + execution.state = ExecutionState::State::None; + /// Go to Scheduled state after each refresh, even if for a moment before + /// starting the next refresh. This gives `wait()` a chance to complete. + setState(RefreshState::Scheduled, lock); + scheduling_task->schedule(); break; } - } - - chassert(lock.owns_lock()); + case ExecutionState::State::Requested: + case ExecutionState::State::Running: + { + if (!coordination.running_znode_exists) + { + LOG_WARNING(getLogger(), "Re-creating ephemeral znode '{}', presumably lost on zookeeper reconnect.", coordination.path + "/running"); + updateCoordinationState(coordination.root_znode, /*running=*/ true, zookeeper, lock, /*only_running_znode=*/ true); + } - if (scheduling.stop_requested || coordination.paused_znode_exists || view->getContext()->getRefreshSet().refreshesStopped() || coordination.read_only) - { - /// Exit the task and wait for the user to start or resume, which will schedule the task again. - setState(RefreshState::Disabled, lock); - break; + setState(RefreshState::Running, lock); + break; + } } - /// Check if it's time to refresh. - auto start_time = currentTime(); - auto start_time_seconds = std::chrono::floor(start_time); - Stopwatch stopwatch; - auto [when, timeslot, start_znode] = determineNextRefreshTime(start_time_seconds); - next_refresh_time = when; - bool out_of_schedule = scheduling.out_of_schedule_refresh_requested; - if (out_of_schedule) - { - chassert(start_znode.attempt_number > 0); - start_znode.attempt_number -= 1; - } - else if (start_time < when) - { - size_t delay_ms = std::chrono::duration_cast(when - start_time).count(); - /// If we're in a test that fakes the clock, poll every 100ms. - if (scheduling.fake_clock.load(std::memory_order_relaxed) != INT64_MIN) - delay_ms = 100; - refresh_task->scheduleAfter(delay_ms); - setState(RefreshState::Scheduled, lock); - break; - } - else if (timeslot >= scheduling.dependencies_satisfied_until) + return; + } + else + { + if (execution.state != ExecutionState::State::None) { - setState(RefreshState::WaitingForDependencies, lock); - break; + LOG_ERROR(getLogger(), "RMV refresh is running locally, but keeper says there's no running refresh on this replica. This should only be possible after keeper was unavailable for a while (over 1-2 minutes)."); + if (execution.state == ExecutionState::State::Finished) + execution.state = ExecutionState::State::None; + else + interruptExecution(); } - if (refreshed_just_now) + if (coordination.root_znode.refresh_running) { - /// If doing two refreshes in a row, go through Scheduled state first, - /// to give wait() a chance to complete. - setState(RefreshState::Scheduled, lock); - refresh_task->schedule(); - break; - } + /// Another replica is allegedly running a refresh. - /// Write to keeper. - if (!updateCoordinationState(start_znode, true, zookeeper, lock)) + if (!coordination.running_znode_exists) + { + /// If ephemeral znode unexpectedly disappears, wait for this long to give the + /// owner of the znode a chance to re-create it. + /// Currently hard-coded as 1.25x the keeper session timeout, but it doesn't + /// necessarily need to be longer than session timeout, since the grace period + /// starts after the session already expired. + UInt64 grace_period_ms = zookeeper->getSessionTimeoutMS(); + grace_period_ms += grace_period_ms / 4; + + std::chrono::system_clock::time_point now = currentTime(); + if (!running_znode_missing_since.has_value()) + { + LOG_INFO(getLogger(), "RMV coordination znode says refresh is running on replica '{}', but there's no corresponding ephemeral znode. Waiting for {} ms before assuming that the replica crashed.", coordination.root_znode.last_attempt_replica, grace_period_ms); + running_znode_missing_since = now; + } + coordination.running_znode_missing_since = running_znode_missing_since; + + std::chrono::system_clock::time_point deadline = *coordination.running_znode_missing_since + std::chrono::milliseconds(grace_period_ms); + if (now >= deadline) + { + LOG_WARNING(getLogger(), "Replica '{}' appears to have crashed while executing a refresh. Clearing the lock in zookeeper to allow another replica to start a new refresh.", coordination.root_znode.last_attempt_replica); + + CoordinationZnode znode = coordination.root_znode; + chassert(znode.refresh_running); + znode.refresh_running = false; + updateCoordinationState(znode, /*running=*/ false, zookeeper, lock); + scheduling_task->schedule(); + return; + } + else + { + scheduling_task->scheduleAfter(std::chrono::duration_cast(deadline - now).count()); + } + } + + setState(RefreshState::RunningOnAnotherReplica, lock); + return; + } + else if (coordination.running_znode_exists) { - schedule_keeper_retry(); + LOG_ERROR(getLogger(), "RMV coordination znode says no refresh is running, but the ephemeral 'running' znode exists. This should be impossible."); + chassert(false); + updateCoordinationState(coordination.root_znode, /*running=*/ false, zookeeper, lock); + scheduling_task->schedule(); return; } - chassert(lock.owns_lock()); - - /// Perform a refresh. + } - setState(RefreshState::Running, lock); - scheduling.out_of_schedule_refresh_requested = false; - bool append = refresh_append; - int32_t root_znode_version = coordination.coordinated ? coordination.root_znode.version : -1; - CurrentMetrics::Increment metric_inc(CurrentMetrics::RefreshingViews); + if (is_shutdown) + return; // we just needed to propagate information into zookeeper - String log_comment = fmt::format("refresh of {}", view->getStorageID().getFullTableName()); - if (start_znode.attempt_number > 1) - log_comment += fmt::format(" (attempt {}/{})", start_znode.attempt_number, refresh_settings[RefreshSetting::refresh_retries] + 1); + /// Decide when to do the next refresh. - lock.unlock(); + updateDependenciesIfNeeded(lock); + chassert(lock.owns_lock()); - String error_message; - auto new_table_uuid = executeRefreshUnlocked(append, root_znode_version, start_time, stopwatch, log_comment, error_message); - bool refreshed = new_table_uuid.has_value(); + if (scheduling.stop_requested || coordination.paused_znode_exists || view->getContext()->getRefreshSet().refreshesStopped() || coordination.read_only) + { + setState(RefreshState::Disabled, lock); + return; + } - lock.lock(); + auto start_time = currentTime(); + auto start_time_seconds = std::chrono::floor(start_time); + auto [when, timeslot, start_znode] = determineNextRefreshTime(start_time_seconds); + next_refresh_time = when; + bool out_of_schedule = scheduling.out_of_schedule_refresh_requested; + if (out_of_schedule) + { + chassert(start_znode.attempt_number > 0); + start_znode.attempt_number -= 1; + } + else if (start_time < when) + { + size_t delay_ms = std::chrono::duration_cast(when - start_time).count(); + /// If we're in a test that fakes the clock, poll every 100ms. + if (scheduling.fake_clock.load(std::memory_order_relaxed) != INT64_MIN) + delay_ms = 100; + scheduling_task->scheduleAfter(delay_ms); + setState(RefreshState::Scheduled, lock); + return; + } + else if (timeslot >= scheduling.dependencies_satisfied_until) + { + setState(RefreshState::WaitingForDependencies, lock); + return; + } - setState(RefreshState::Scheduling, lock); + /// The time to start next refresh is now! - auto end_time_seconds = std::chrono::floor(currentTime()); - auto znode = coordination.root_znode; - znode.last_attempt_time = end_time_seconds; - znode.last_attempt_error = error_message; - if (refreshed) - { - znode.last_attempt_succeeded = true; - znode.last_completed_timeslot = refresh_schedule.timeslotForCompletedRefresh(znode.last_completed_timeslot, start_time_seconds, end_time_seconds, out_of_schedule); - znode.last_success_time = start_time_seconds; - znode.last_success_duration = std::chrono::milliseconds(stopwatch.elapsedMilliseconds()); - znode.last_success_table_uuid = *new_table_uuid; - znode.previous_attempt_error = ""; - znode.attempt_number = 0; - znode.randomize(); - } + /// Write to keeper. + if (!updateCoordinationState(start_znode, /*running=*/ true, zookeeper, lock)) + return; + chassert(lock.owns_lock()); - bool ok = updateCoordinationState(znode, false, zookeeper, lock); - chassert(lock.owns_lock()); - if (!ok) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Refresh coordination znode was changed while refresh was in progress."); + scheduling.out_of_schedule_refresh_requested = false; - if (refreshed) - { - lock.unlock(); - view->getContext()->getRefreshSet().notifyDependents(view->getStorageID()); - lock.lock(); - } + chassert(execution.state == ExecutionState::State::None); + execution.interrupt_execution.store(false); + execution.znode = coordination.root_znode; + execution.start_time = start_time; + execution.out_of_schedule = out_of_schedule; + execution.state = ExecutionState::State::Requested; - refreshed_just_now = true; - } + execution_task->schedule(); + setState(RefreshState::Running, lock); } catch (Coordination::Exception &) { - tryLogCurrentException(log, "Keeper error"); + tryLogCurrentException(getLogger(), "Keeper error"); if (!lock.owns_lock()) lock.lock(); - schedule_keeper_retry(); + + chassert(state == RefreshState::Scheduling); + coordination.watches->should_reread_znodes.store(true); + scheduling_task->scheduleAfter(5000); } catch (...) { @@ -652,30 +792,77 @@ void RefreshTask::refreshTask() scheduling.stop_requested = true; scheduling.unexpected_error = getCurrentExceptionMessage(true); coordination.watches->should_reread_znodes.store(true); - coordination.running_znode_exists = false; + interruptExecution(); setState(RefreshState::Scheduling, lock); - refresh_task->schedule(); - lock.unlock(); + scheduling_task->schedule(); - tryLogCurrentException(log, - "Exception in refresh scheduling. The view will be stopped."); + tryLogCurrentException(getLogger(), + "Unexpected exception in refresh scheduling. The view will be stopped."); #ifdef DEBUG_OR_SANITIZER_BUILD - /// There's at least one legitimate case where this may happen: if the user (DEFINER) was dropped. - /// But it's unexpected in tests. - /// Note that Coordination::Exception is caught separately above, so transient keeper errors - /// don't go here and are just retried. abortOnFailedAssertion("Unexpected exception in refresh scheduling"); -#else - if (coordination.coordinated) - removeRunningZnodeIfMine(view->getContext()->getZooKeeper()); #endif } } -std::optional RefreshTask::executeRefreshUnlocked(bool append, int32_t root_znode_version, std::chrono::system_clock::time_point start_time, const Stopwatch & stopwatch, const String & log_comment, String & out_error_message) +void RefreshTask::executeRefresh() +{ + std::unique_lock lock(mutex); + + chassert(execution.state == ExecutionState::State::Requested); + execution.state = ExecutionState::State::Running; + + Stopwatch stopwatch; + int32_t root_znode_version = execution.znode.version; + String error_message; + std::optional new_table_uuid; + + String log_comment = fmt::format("refresh of {}", view->getStorageID().getFullTableName()); + if (execution.znode.attempt_number > 1) + log_comment += fmt::format(" (attempt {}/{})", execution.znode.attempt_number, refresh_settings[RefreshSetting::refresh_retries] + 1); + + lock.unlock(); + try + { + CurrentMetrics::Increment metric_inc(CurrentMetrics::RefreshingViews); + new_table_uuid = executeRefreshUnlocked(root_znode_version, log_comment, error_message); + } + catch (...) + { + tryLogCurrentException(getLogger(), "Unexpected exception during refresh"); + chassert(false); + error_message = getCurrentExceptionMessage(true); + } + lock.lock(); + + auto start_time_seconds = std::chrono::floor(execution.start_time); + auto end_time_seconds = std::chrono::floor(currentTime()); + CoordinationZnode znode = execution.znode; + znode.last_attempt_time = end_time_seconds; + znode.last_attempt_error = error_message; + znode.refresh_running = false; + if (new_table_uuid.has_value()) + { + znode.last_attempt_succeeded = true; + znode.last_completed_timeslot = refresh_schedule.timeslotForCompletedRefresh(znode.last_completed_timeslot, start_time_seconds, end_time_seconds, execution.out_of_schedule); + znode.last_success_time = start_time_seconds; + znode.last_success_duration = std::chrono::milliseconds(stopwatch.elapsedMilliseconds()); + znode.last_success_table_uuid = *new_table_uuid; + znode.previous_attempt_error = ""; + znode.attempt_number = 0; + znode.randomize(); + } + execution.znode = znode; + + chassert(execution.state == ExecutionState::State::Running); + execution.state = ExecutionState::State::Finished; + + scheduling_task->schedule(); +} + +std::optional RefreshTask::executeRefreshUnlocked(int32_t root_znode_version, const String & log_comment, String & out_error_message) { StorageID view_storage_id = view->getStorageID(); - LOG_DEBUG(log, "Refreshing view {}", view_storage_id.getFullTableName()); + LOG_DEBUG(getLogger(), "Refreshing view"); execution.progress.reset(); static constexpr bool internal = true; @@ -690,12 +877,13 @@ std::optional RefreshTask::executeRefreshUnlocked(bool append, int32_t roo String query_for_logging; UInt64 normalized_query_hash = 0; std::shared_ptr query_span = std::make_shared("query"); + Stopwatch stopwatch; try { refresh_context = view->createRefreshContext(log_comment); - if (!append) + if (!refresh_append) { refresh_context->setParentTable(view_storage_id.uuid); refresh_context->setDDLQueryCancellation(execution.cancel_ddl_queries.get_token()); @@ -708,7 +896,7 @@ std::optional RefreshTask::executeRefreshUnlocked(bool append, int32_t roo query_for_logging = "(create target table)"; normalized_query_hash = normalizedQueryHash(query_for_logging, false); QueryScope query_scope; - std::tie(refresh_query, query_scope) = view->prepareRefresh(append, refresh_context, table_to_drop); + std::tie(refresh_query, query_scope) = view->prepareRefresh(refresh_append, refresh_context, table_to_drop); new_table_id = refresh_query->table_id; /// Add the query to system.processes and allow it to be killed with KILL QUERY. @@ -740,7 +928,7 @@ std::optional RefreshTask::executeRefreshUnlocked(bool append, int32_t roo /// We log the refresh as one INSERT SELECT query, but the timespan and exceptions also /// cover the surrounding CREATE, EXCHANGE, and DROP queries. query_log_elem = logQueryStart( - start_time, refresh_context, query_for_logging, normalized_query_hash, refresh_query, pipeline, + currentTime(), refresh_context, query_for_logging, normalized_query_hash, refresh_query, pipeline, &interpreter, /*internal*/ internal, view_storage_id.database_name, view_storage_id.table_name, /*async_insert*/ false); @@ -783,7 +971,7 @@ std::optional RefreshTask::executeRefreshUnlocked(bool append, int32_t roo } /// Exchange tables. - if (!append) + if (!refresh_append) { query_for_logging = "(exchange tables)"; normalized_query_hash = normalizedQueryHash(query_for_logging, false); @@ -928,6 +1116,7 @@ RefreshTask::determineNextRefreshTime(std::chrono::sys_seconds now) znode.last_attempt_replica = coordination.replica_name; znode.last_attempt_error = ""; znode.last_attempt_succeeded = false; + znode.refresh_running = true; return {when, timeslot, znode}; } @@ -936,7 +1125,7 @@ void RefreshTask::scheduleRefresh(std::lock_guard &) { if (state != RefreshState::Running) state = RefreshState::Scheduling; - refresh_task->schedule(); + scheduling_task->schedule(); } void RefreshTask::setState(RefreshState s, std::unique_lock & lock) @@ -965,12 +1154,12 @@ void RefreshTask::readZnodesIfNeeded(std::shared_ptr zookeepe if (!coordination.watches->root_watch_active.load()) { coordination.watches->root_watch_active.store(true); - zookeeper->existsWatch(coordination.path, nullptr, refresh_task_watch_callback); + zookeeper->existsWatch(coordination.path, nullptr, watch_callback); } if (!coordination.watches->children_watch_active.load()) { coordination.watches->children_watch_active.store(true); - zookeeper->getChildrenWatch(coordination.path, nullptr, refresh_task_watch_callback); + zookeeper->getChildrenWatch(coordination.path, nullptr, watch_callback); } Strings paths {coordination.path, coordination.path + "/running", coordination.path + "/paused"}; @@ -984,9 +1173,11 @@ void RefreshTask::readZnodesIfNeeded(std::shared_ptr zookeepe if (responses[i].error != Coordination::Error::ZOK && responses[i].error != Coordination::Error::ZNONODE) throw Coordination::Exception::fromPath(responses[i].error, paths[i]); - coordination.root_znode.parse(responses[0].data); + bool running_znode_exists = responses[1].error == Coordination::Error::ZOK; + + coordination.root_znode.parse(responses[0].data, running_znode_exists, getLogger()); coordination.root_znode.version = responses[0].stat.version; - coordination.running_znode_exists = responses[1].error == Coordination::Error::ZOK; + coordination.running_znode_exists = running_znode_exists; coordination.paused_znode_exists = responses[2].error == Coordination::Error::ZOK; if (coordination.root_znode.last_completed_timeslot != prev_last_completed_timeslot) @@ -997,22 +1188,27 @@ void RefreshTask::readZnodesIfNeeded(std::shared_ptr zookeepe } } -bool RefreshTask::updateCoordinationState(CoordinationZnode root, bool running, std::shared_ptr zookeeper, std::unique_lock & lock) +bool RefreshTask::updateCoordinationState(CoordinationZnode root, bool running, std::shared_ptr zookeeper, std::unique_lock & lock, bool only_running_znode) { chassert(lock.owns_lock()); int32_t version = -1; if (coordination.coordinated) { Coordination::Requests ops; - ops.emplace_back(zkutil::makeSetRequest(coordination.path, root.toString(), root.version)); + if (only_running_znode) + ops.emplace_back(zkutil::makeCheckRequest(coordination.path, root.version)); + else + ops.emplace_back(zkutil::makeSetRequest(coordination.path, root.toString(), root.version)); if (running) { ops.emplace_back( - zkutil::makeCreateRequest(coordination.path + "/running", coordination.replica_name, zkutil::CreateMode::Ephemeral)); + zkutil::makeCreateRequest(coordination.path + "/running", coordination.replica_name, zkutil::CreateMode::Ephemeral, /*ignore_if_exists=*/ true)); } else { - ops.emplace_back(zkutil::makeRemoveRequest(coordination.path + "/running", -1)); + /// (Avoid `try_remove = true` because it requires a keeper feature flag TRY_REMOVE that we're otherwise not using.) + if (coordination.running_znode_exists) + ops.emplace_back(zkutil::makeRemoveRequest(coordination.path + "/running", -1)); } Coordination::Responses responses; @@ -1022,11 +1218,25 @@ bool RefreshTask::updateCoordinationState(CoordinationZnode root, bool running, lock.lock(); if (running && responses[0]->error == Coordination::Error::ZBADVERSION) + { /// Lost the race, this is normal, don't log a stack trace. + /// Trigger a re-read of znodes just in case, though it shouldn't be necessary because of watches. + /// (Can we get into a situation where such re-reads keep returning stale data, and + /// write attempts keep failing with version mismatch, and we keep needlessly + /// busy-waiting and DOSing keeper? + /// Based on how keeper server works, this shouldn't happen: keeper provides + /// read-after-write consistency within a session even for failed writes. The next read + /// should always see the newer znode version that caused the conflict. Idk whether + /// this is the case in vanilla zookeeper as well.) + coordination.watches->should_reread_znodes.store(true); + scheduling_task->schedule(); return false; + } zkutil::KeeperMultiException::check(code, ops, responses); - version = dynamic_cast(*responses[0]).stat.version; - + if (only_running_znode) + version = root.version; + else + version = dynamic_cast(*responses[0]).stat.version; } coordination.root_znode = root; coordination.root_znode.version = version; @@ -1034,17 +1244,6 @@ bool RefreshTask::updateCoordinationState(CoordinationZnode root, bool running, return true; } -void RefreshTask::removeRunningZnodeIfMine(std::shared_ptr zookeeper) -{ - Coordination::Stat stat; - String data; - if (zookeeper->tryGet(coordination.path + "/running", data, &stat) && data == coordination.replica_name) - { - LOG_WARNING(log, "Removing unexpectedly lingering znode {}", coordination.path + "/running"); - zookeeper->tryRemove(coordination.path + "/running", stat.version); - } -} - void RefreshTask::interruptExecution() { chassert(!mutex.try_lock()); @@ -1054,7 +1253,7 @@ void RefreshTask::interruptExecution() if (execution.executor) { execution.executor->cancel(); - LOG_DEBUG(log, "Cancelling refresh in {}", set_handle.getID().getFullNameNotQuoted()); + LOG_DEBUG(getLogger(), "Cancelling refresh in {}", set_handle.getID().getFullNameNotQuoted()); } } @@ -1133,7 +1332,7 @@ std::tuple RefreshTask::getAndLockTargetTable(const /// X may still not have all data. ReplicatedMergeTree shutdown stops /// data part exchange, so there's no hope of getting all data out of X. /// In this case we retry table lookup in hopes of seeing the new table Y. - LOG_DEBUG(log, "Retrying after exception when syncing replica: {}", e.message()); + LOG_DEBUG(getLogger(), "Retrying after exception when syncing replica: {}", e.message()); exception = std::current_exception(); prev_table_dropped_locally = false; continue; @@ -1174,10 +1373,20 @@ void RefreshTask::CoordinationZnode::randomize() String RefreshTask::CoordinationZnode::toString() const { + /// "format version" should be incremented when making incompatible change, to make older + /// servers refuse to parse it. We should probably never do that. + /// + /// For backwards compatible changes, just add new fields at the end (!), and old servers will + /// ignore them. + /// + /// Removing a field is complicated enough that maybe we should never do it. Procedure would be: + /// 1. Make the field optional in `parse` but keep writing it here. + /// 2. Update all servers and make sure they update all RMV znodes (e.g. do a refresh). + /// 3. Stop writing the field here but keep recognizing (and ignoring) it in `parse`. + /// 4. Update all servers etc. + /// 5. Remove the field from `parse`. + WriteBufferFromOwnString out; - /// "format version" should be incremented when making incompatible change, to make older servers - /// refuse to parse it. For backwards compatible changes, just add new fields at the end and old - /// servers will ignore them. out << "format version: 1\n" << "last_completed_timeslot: " << Int64(last_completed_timeslot.time_since_epoch().count()) << "\n" << "last_success_time: " << Int64(last_success_time.time_since_epoch().count()) << "\n" @@ -1189,33 +1398,90 @@ String RefreshTask::CoordinationZnode::toString() const << "last_attempt_succeeded: " << last_attempt_succeeded << "\n" << "previous_attempt_error: " << escape << previous_attempt_error << "\n" << "attempt_number: " << attempt_number << "\n" - << "randomness: " << randomness << "\n"; + << "randomness: " << randomness << "\n" + << "refresh_running: " << refresh_running << "\n"; return out.str(); } -void RefreshTask::CoordinationZnode::parse(const String & data) +void RefreshTask::CoordinationZnode::parse(const String & data, bool running_znode_exists, const LoggerPtr & log_) { ReadBufferFromString in(data); - Int64 last_completed_timeslot_int; - Int64 last_success_time_int; - Int64 last_success_duration_int; - Int64 last_attempt_time_int; - in >> "format version: 1\n" - >> "last_completed_timeslot: " >> last_completed_timeslot_int >> "\n" - >> "last_success_time: " >> last_success_time_int >> "\n" - >> "last_success_duration_ms: " >> last_success_duration_int >> "\n" - >> "last_success_table_uuid: " >> last_success_table_uuid >> "\n" - >> "last_attempt_time: " >> last_attempt_time_int >> "\n" - >> "last_attempt_replica: " >> escape >> last_attempt_replica >> "\n" - >> "last_attempt_error: " >> escape >> last_attempt_error >> "\n" - >> "last_attempt_succeeded: " >> last_attempt_succeeded >> "\n" - >> "previous_attempt_error: " >> escape >> previous_attempt_error >> "\n" - >> "attempt_number: " >> attempt_number >> "\n" - >> "randomness: " >> randomness >> "\n"; - last_completed_timeslot = std::chrono::sys_seconds(std::chrono::seconds(last_completed_timeslot_int)); - last_success_time = std::chrono::sys_seconds(std::chrono::seconds(last_success_time_int)); - last_success_duration = std::chrono::milliseconds(last_success_duration_int); - last_attempt_time = std::chrono::sys_seconds(std::chrono::seconds(last_attempt_time_int)); + + String next_field_name; + auto advance_to_next_field = [&] + { + next_field_name.clear(); + if (in.eof()) + return; + assertChar('\n', in); + if (in.eof()) + return; + readStringUntilColon(next_field_name, in); + assertString(": ", in); + }; + auto try_read_field = [&](const char * name, auto & out) -> bool + { + using T = std::remove_reference_t; + + if (next_field_name != name) + return false; + + if constexpr (std::is_same_v) + { + in >> escape >> out; + } + else if constexpr (std::is_same_v) + { + Int64 v; + in >> v; + out = std::chrono::sys_seconds(std::chrono::seconds(v)); + } + else if constexpr (std::is_same_v) + { + Int64 v; + in >> v; + out = std::chrono::milliseconds(v); + } + else + { + in >> out; + } + + advance_to_next_field(); + return true; + }; + + auto required_field = [&](const char * name, auto & out) + { + if (!try_read_field(name, out)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "RMV coordination znode fields are missing or reordered: not found field '{}'", name); + }; + auto optional_field = [&](const char * name, auto & out, auto default_value) + { + if (!try_read_field(name, out)) + out = default_value; + }; + + in >> "format version: 1"; + advance_to_next_field(); + + required_field("last_completed_timeslot", last_completed_timeslot); + required_field("last_success_time", last_success_time); + required_field("last_success_duration_ms", last_success_duration); + required_field("last_success_table_uuid", last_success_table_uuid); + required_field("last_attempt_time", last_attempt_time); + required_field("last_attempt_replica", last_attempt_replica); + required_field("last_attempt_error", last_attempt_error); + required_field("last_attempt_succeeded", last_attempt_succeeded); + required_field("previous_attempt_error", previous_attempt_error); + required_field("attempt_number", attempt_number); + required_field("randomness", randomness); + optional_field("refresh_running", refresh_running, running_znode_exists); + + if (!next_field_name.empty()) + { + LOG_INFO(log_, "Unrecognized field '{}' in RMV coordination znode. Maybe the znode was written by a newer version of the server that added this field, or maybe parsing is broken.", next_field_name); + } } } diff --git a/src/Storages/MaterializedView/RefreshTask.h b/src/Storages/MaterializedView/RefreshTask.h index d0144931138b..73b8a6ad01a1 100644 --- a/src/Storages/MaterializedView/RefreshTask.h +++ b/src/Storages/MaterializedView/RefreshTask.h @@ -61,7 +61,7 @@ class RefreshTask : public std::enable_shared_from_this /// Ok to call even if startup() wasn't called or failed. void shutdown(); /// Call when dropping the table, after shutdown(). Removes coordination znodes if needed. - void drop(ContextPtr context); + void drop(ContextPtr context, bool is_shared_db); /// Call when renaming the materialized view. void rename(StorageID new_id, StorageID new_inner_table_id); /// Call when changing refresh params (ALTER MODIFY REFRESH). @@ -78,6 +78,9 @@ class RefreshTask : public std::enable_shared_from_this /// no refreshes will run. void start(); void stop(); + /// Like `stop`, but does not interrupt the currently running refresh; only prevents future + /// refreshes from starting. Resumed by `start`. + void pause(); void startReplicated(); void stopReplicated(const String & reason); @@ -149,13 +152,18 @@ class RefreshTask : public std::enable_shared_from_this /// on average, on the replica that generated the shortest delay. We could use nonuniform distribution to complensate, but this is easier.) Int64 randomness = 0; + /// Whether any replica is executing a refresh right now. + /// May be inaccurate if the replica that's executing refresh lost zookeeper connection for + /// a long time (long enough for its ephemeral znode to expire + grace period). + bool refresh_running = false; + /// Znode version. Not serialized. int32_t version = -1; void randomize(); // assigns `randomness` String toString() const; - void parse(const String & data); + void parse(const String & data, bool running_znode_exists, const LoggerPtr & log_); }; /// Just for observability. @@ -196,6 +204,11 @@ class RefreshTask : public std::enable_shared_from_this bool paused_znode_exists = false; std::shared_ptr watches = std::make_shared(); + /// Time when we first saw that `root_znode.refresh_running && !running_znode_exists`. + /// If far enough in the past, the replica that started the refresh has probably crashed, + /// and we should update the znode to say refresh_running = false. + std::optional running_znode_missing_since {}; + /// Whether we use Keeper to coordinate refresh across replicas. If false, we don't write to Keeper, /// but we still use the same in-memory structs (CoordinationZnode etc), as if it's coordinated (with one replica). bool coordinated = false; @@ -204,19 +217,38 @@ class RefreshTask : public std::enable_shared_from_this String replica_name; }; + /// Information about the currently running refresh. struct ExecutionState { + enum class State + { + None, + /// doScheduling() decided to run a refresh, executeRefresh() didn't start yet. + Requested, + /// executeRefresh() is in progress. + Running, + /// executeRefresh() completed, doScheduling() didn't propagate the result to zookeeper yet. + Finished, + }; + /// Protects interrupt_execution and executor. /// Can be locked while holding `mutex`. std::mutex executor_mutex; /// If there's a refresh in progress, it can be aborted by setting this flag and cancel()ling /// this executor. Refresh task will then reconsider what to do, re-checking `stop_requested`, - /// `cancel_requested`, etc. + /// `out_of_schedule_refresh_requested`, etc. std::atomic_bool interrupt_execution {false}; PipelineExecutor * executor = nullptr; /// Interrupts internal CREATE/EXCHANGE/DROP queries that refresh does. Only used during shutdown. StopSource cancel_ddl_queries; Progress progress; + + State state = State::None; + /// Contains information about the completed refresh, and znode version number at the start + /// of refresh. + CoordinationZnode znode; + std::chrono::system_clock::time_point start_time; + bool out_of_schedule = false; }; struct SchedulingState @@ -237,7 +269,9 @@ class RefreshTask : public std::enable_shared_from_this std::atomic fake_clock {INT64_MIN}; }; - LoggerPtr log = nullptr; + std::mutex logger_mutex; + LoggerPtr current_logger = nullptr; + StorageMaterializedView * view; /// Protects all fields below. @@ -248,13 +282,16 @@ class RefreshTask : public std::enable_shared_from_this RefreshSchedule refresh_schedule; RefreshSettings refresh_settings; std::vector initial_dependencies; - bool refresh_append; + const bool refresh_append; RefreshSet::Handle set_handle; - /// Calls refreshTask() from background thread. - BackgroundSchedulePoolTaskHolder refresh_task; - Coordination::WatchCallbackPtr refresh_task_watch_callback; + /// Calls doScheduling() from background thread. + BackgroundSchedulePoolTaskHolder scheduling_task; + /// Calls executeRefresh() from background thread. + BackgroundSchedulePoolTaskHolder execution_task; + + Coordination::WatchCallbackPtr watch_callback; CoordinationState coordination; ExecutionState execution; @@ -270,17 +307,24 @@ class RefreshTask : public std::enable_shared_from_this std::mutex replica_sync_mutex; UUID last_synced_inner_uuid = UUIDHelpers::Nil; - /// The main loop of the refresh task. It examines the state, sees what needs to be - /// done and does it. If there's nothing to do at the moment, returns; it's then scheduled again, - /// when needed, by public methods or by timer. + /// We have two "threads": one for managing scheduling and zookeeper state, one for executing + /// refresh. Two are needed because we may need to update zookeeper state during refresh. /// - /// Public methods just provide inputs for the refreshTask()'s decisions - /// (e.g. stop_requested, cancel_requested), they don't do anything significant themselves. - void refreshTask(); + /// doScheduling(), started through scheduling_task, is the scheduling / zookeeper management. + /// It runs whenever anything changes (e.g. znodes change, or refresh completes, or retry timer fires). + /// It looks at the state of everything and decides what needs to be done. + /// Public methods just provide inputs for the doScheduling()'s decisions + /// (e.g. stop_requested, out_of_schedule_refresh_requested), they don't do anything significant themselves. + /// If is_shutdown, both background tasks were stopped, and we only need to write to zookeeper + /// to reflect that this replica is not running a refresh anymore. + /// + /// executeRefresh(), started through execution_task, is where actual refresh SQL query runs. + void doScheduling(bool is_shutdown); + void executeRefresh(); /// Perform an actual refresh: create new table, run INSERT SELECT, exchange tables, drop old table. - /// Mutex must be unlocked. Called only from refresh_task. Doesn't throw. - std::optional executeRefreshUnlocked(bool append, int32_t root_znode_version, std::chrono::system_clock::time_point start_time, const Stopwatch & stopwatch, const String & log_comment, String & out_error_message); + /// Mutex must be unlocked. + std::optional executeRefreshUnlocked(int32_t root_znode_version, const String & log_comment, String & out_error_message); /// Assigns dependencies_satisfied_until. void updateDependenciesIfNeeded(std::unique_lock & lock); @@ -289,13 +333,21 @@ class RefreshTask : public std::enable_shared_from_this determineNextRefreshTime(std::chrono::sys_seconds now); void readZnodesIfNeeded(std::shared_ptr zookeeper, std::unique_lock & lock); - bool updateCoordinationState(CoordinationZnode root, bool running, std::shared_ptr zookeeper, std::unique_lock & lock); - void removeRunningZnodeIfMine(std::shared_ptr zookeeper); + /// Update the root znode and create/remove-if-exists the 'running' znode, + /// atomically, conditionally on the root znode version number. + /// If `only_running_znode`, the root znode is not updated, but its version is still checked. + /// If version number doesn't match, schedules a doScheduling() call + /// with should_reread_znodes = true, and returns false. + /// If coordination is disabled, just update in-memory struct without writing to zookeeper. + bool updateCoordinationState(CoordinationZnode root, bool running, std::shared_ptr zookeeper, std::unique_lock & lock, bool only_running_znode = false); void setState(RefreshState s, std::unique_lock & lock); void scheduleRefresh(std::lock_guard & lock); void interruptExecution(); std::chrono::system_clock::time_point currentTime() const; + + void createLogger(const StorageID & storage_id); + LoggerPtr getLogger(); }; using RefreshTaskPtr = std::shared_ptr; diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index 668017b6f671..785ed4125984 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -86,6 +86,12 @@ class IMergeTreeReader : private boost::noncopyable virtual bool canSkipMark(size_t, size_t) { return false; } + /// Returns true if this reader can skip whole marks via `canSkipMark` for at least some inputs. + /// Independent of any particular mark index. Used by callers that need to know upfront whether + /// the reader chain may filter marks before the PREWHERE step runs — for example, to decide + /// whether `read_mark_ranges` with `row_count == 0` can be attributed to the PREWHERE predicate. + virtual bool canSkipAnyMark() const { return false; } + virtual void updateAllMarkRanges(const MarkRanges & ranges) { all_mark_ranges = ranges; } protected: diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 20280ab948b2..3a27828ae8c9 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -850,6 +850,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() const { global_ctx->merging_skip_indexes.clear(); global_ctx->skip_indexes_by_column.clear(); + global_ctx->text_indexes_to_merge.clear(); } bool use_adaptive_granularity = global_ctx->new_data_part->index_granularity_info.mark_type.adaptive; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index fc4bdb843eb1..00c6baea63c1 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -11,6 +12,12 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int FAULT_INJECTED; +} + +namespace FailPoints +{ + extern const char compact_part_writer_fail_in_add_streams[]; } MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( @@ -83,11 +90,18 @@ void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & name_and compression_codec = CompressionCodecFactory::instance().get(effective_codec_desc, nullptr, default_codec, true); UInt64 codec_id = compression_codec->getHash(); - auto & stream = streams_by_codec[codec_id]; - if (!stream) - stream = std::make_shared(plain_hashing, compression_codec); + /// Exception safety: if `make_shared` throws, the map is not modified, avoiding null entries in `cancel`. + auto it = streams_by_codec.find(codec_id); + if (it == streams_by_codec.end()) + { + fiu_do_on(FailPoints::compact_part_writer_fail_in_add_streams, + { + throw Exception(ErrorCodes::FAULT_INJECTED, "Injected failure in Compact part writer addStreams"); + }); + it = streams_by_codec.emplace(codec_id, std::make_shared(plain_hashing, compression_codec)).first; + } - compressed_streams.emplace(stream_name, stream); + compressed_streams.emplace(stream_name, it->second); }; ISerialization::EnumerateStreamsSettings enumerate_settings; diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index bbb63c14397d..a01e94f21a3e 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -558,8 +559,26 @@ const ActionsDAG::Node & MergeTreeIndexConditionSet::traverseDAG(const ActionsDA atom_node_ptr->type == ActionsDAG::ActionType::FUNCTION || (atom_node_ptr->type == ActionsDAG::ActionType::COLUMN && !WhichDataType(atom_node_ptr->result_type).isSet())) { - auto bit_wrapper_function = FunctionFactory::instance().get("__bitWrapperFunc", context); - result_node = &result_dag.addFunction(bit_wrapper_function, {atom_node_ptr}, {}); + /// `__bitWrapperFunc` is defined only for integer arguments. If the atom result type + /// is not integer (e.g. `Float`, `BFloat16`), wrapping it would throw the internal + /// "It's a bug!" exception from `__bitWrapperFunc` at execution time. Fall back to + /// `UNKNOWN_FIELD` so that the index does not prune granules and the query goes + /// through the regular filter path. + if (WhichDataType(removeNullable(atom_node_ptr->result_type)).isInteger()) + { + auto bit_wrapper_function = FunctionFactory::instance().get("__bitWrapperFunc", context); + result_node = &result_dag.addFunction(bit_wrapper_function, {atom_node_ptr}, {}); + } + else + { + ColumnWithTypeAndName unknown_field_column_with_type; + + unknown_field_column_with_type.name = calculateConstantActionNodeName(UNKNOWN_FIELD); + unknown_field_column_with_type.type = std::make_shared(); + unknown_field_column_with_type.column = unknown_field_column_with_type.type->createColumnConst(1, UNKNOWN_FIELD); + + result_node = &result_dag.addColumn(unknown_field_column_with_type); + } } } else diff --git a/src/Storages/MergeTree/MergeTreeReadTask.cpp b/src/Storages/MergeTree/MergeTreeReadTask.cpp index 9f04c36ae787..0eb5d5b33b51 100644 --- a/src/Storages/MergeTree/MergeTreeReadTask.cpp +++ b/src/Storages/MergeTree/MergeTreeReadTask.cpp @@ -442,4 +442,11 @@ void MergeTreeReadTask::addPrewhereUnmatchedMarks(const MarkRanges & mark_ranges prewhere_unmatched_marks.insert(prewhere_unmatched_marks.end(), mark_ranges_.begin(), mark_ranges_.end()); } +bool MergeTreeReadTask::readersChainCanSkipMarksBeforePrewhere() const +{ + /// Only `prepared_index` (a `MergeTreeReaderIndex`) sits ahead of the PREWHERE readers in the + /// reader chain and is able to skip whole marks via `canSkipMark`. + return readers.prepared_index && readers.prepared_index->canSkipAnyMark(); +} + } diff --git a/src/Storages/MergeTree/MergeTreeReadTask.h b/src/Storages/MergeTree/MergeTreeReadTask.h index 673f07daa52b..3afd1eac4ab7 100644 --- a/src/Storages/MergeTree/MergeTreeReadTask.h +++ b/src/Storages/MergeTree/MergeTreeReadTask.h @@ -196,6 +196,13 @@ struct MergeTreeReadTask : private boost::noncopyable void addPrewhereUnmatchedMarks(const MarkRanges & mark_ranges_); const MarkRanges & getPrewhereUnmatchedMarks() { return prewhere_unmatched_marks; } + /// Returns true if a reader earlier in the chain than PREWHERE can skip whole marks based on + /// secondary indexes (skip-index or projection-index). When true, marks that appear in + /// `read_mark_ranges` with `row_count == 0` may have been filtered before PREWHERE evaluated + /// them, so they must not be attributed to the PREWHERE predicate in the QueryConditionCache. + /// See Issue #104781. + bool readersChainCanSkipMarksBeforePrewhere() const; + Readers releaseReaders() { return std::move(readers); } size_t getNumMarksToRead() const { return mark_ranges.getNumberOfMarks(); } diff --git a/src/Storages/MergeTree/MergeTreeReaderIndex.cpp b/src/Storages/MergeTree/MergeTreeReaderIndex.cpp index 54b9c1b42aeb..dff64a5f0bd3 100644 --- a/src/Storages/MergeTree/MergeTreeReaderIndex.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderIndex.cpp @@ -10,6 +10,16 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +bool MergeTreeReaderIndex::canSkipAnyMark() const +{ + /// `canSkipMark` only ever returns true when a skip-index or projection-index read result is present. + /// `lazy_materializing_rows` alone affects row-level filtering inside a granule and never returns + /// true from `canSkipMark`, so a reader configured only for lazy materialization does not skip + /// whole marks. + return index_read_result + && (index_read_result->skip_index_read_result || index_read_result->projection_index_read_result); +} + MergeTreeReaderIndex::MergeTreeReaderIndex(const IMergeTreeReader * main_reader_, MergeTreeIndexReadResultPtr index_read_result_, const PaddedPODArray * lazy_materializing_rows_) : IMergeTreeReader( main_reader_->data_part_info_for_read, diff --git a/src/Storages/MergeTree/MergeTreeReaderIndex.h b/src/Storages/MergeTree/MergeTreeReaderIndex.h index 68f37585c80e..51b72e667de8 100644 --- a/src/Storages/MergeTree/MergeTreeReaderIndex.h +++ b/src/Storages/MergeTree/MergeTreeReaderIndex.h @@ -34,6 +34,8 @@ class MergeTreeReaderIndex : public IMergeTreeReader bool canSkipMark(size_t mark, size_t current_task_last_mark) override; + bool canSkipAnyMark() const override; + size_t getResultColumnCount() const override { return 1; } bool producesFilterOnly() const override { return true; } diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index c552a227fca9..52e719b9d461 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -263,7 +263,13 @@ MergeTreeSelectProcessor::readCurrentTask(MergeTreeReadTask & current_task, IMer .chunk = std::move(chunk), .num_read_rows = res.num_read_rows, .num_read_bytes = res.num_read_bytes, .is_finished = false}; } - if (reader_settings.use_query_condition_cache && prewhere_info) + /// Suppress PREWHERE attribution when a reader earlier in the chain (skip-index or projection-index) + /// can skip whole marks before PREWHERE evaluates them. In that case `res.read_mark_ranges` may + /// include marks that were filtered by the index, and recording them under the PREWHERE predicate's + /// hash would poison the QueryConditionCache: later queries that share the same PREWHERE predicate + /// hash would skip those marks even though the predicate alone matches their rows. See Issue #104781. + if (reader_settings.use_query_condition_cache && prewhere_info + && !current_task.readersChainCanSkipMarksBeforePrewhere()) current_task.addPrewhereUnmatchedMarks(res.read_mark_ranges); return {Chunk(), res.num_read_rows, res.num_read_bytes, false}; @@ -277,8 +283,12 @@ ChunkAndProgress MergeTreeSelectProcessor::read() { if (!task || algorithm->needNewTask(*task)) { - /// Update the query condition cache for filters in PREWHERE stage - if (reader_settings.use_query_condition_cache && task && prewhere_info) + /// Update the query condition cache for filters in PREWHERE stage. + /// Skip the write when a reader earlier in the chain (skip-index or projection-index) + /// could have filtered marks before PREWHERE saw them, to avoid attributing those + /// marks to the PREWHERE predicate hash. See Issue #104781. + if (reader_settings.use_query_condition_cache && task && prewhere_info + && !task->readersChainCanSkipMarksBeforePrewhere()) { for (const auto * output : prewhere_info->prewhere_actions.getOutputs()) { diff --git a/src/Storages/ObjectStorage/Azure/Configuration.cpp b/src/Storages/ObjectStorage/Azure/Configuration.cpp index 23aa8307bf7d..22969d6dd305 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.cpp +++ b/src/Storages/ObjectStorage/Azure/Configuration.cpp @@ -345,7 +345,7 @@ void AzureStorageParsedArguments::initializeForOneLake(ASTs & args, ContextPtr c fillBlobsFromURLCommon(connection_url, ".com", ".dfs.fabric.microsoft.com"); - connection_params.endpoint.additional_params = "resource=REDACTED&directory=REDACTED&recursive=REDACTED"; + connection_params.endpoint.container_already_exists = true; auto request_settings = AzureBlobStorage::getRequestSettings(context->getSettingsRef()); connection_params.client_options = AzureBlobStorage::getClientOptions(context, context->getSettingsRef(), *request_settings, /*for_disk=*/ false); diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/KernelHelper.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLake/KernelHelper.cpp index 08ec182bd9f5..6d68436fedc9 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLake/KernelHelper.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/KernelHelper.cpp @@ -5,11 +5,13 @@ #include #include #include +#include #include namespace DB::ErrorCodes { extern const int NOT_IMPLEMENTED; + extern const int BAD_ARGUMENTS; } namespace DB::S3AuthSetting @@ -20,6 +22,47 @@ namespace DB::S3AuthSetting namespace DeltaLake { +namespace +{ + +/// Forwards to `ffi::set_builder_option`, translating a kernel error into a `DB::Exception`. +/// The Rust FFI decodes the `(ptr, len)` slices as `&str`. Validate the value up front so a +/// user supplied credential, region, endpoint or SAS token that is not valid UTF-8 (e.g. raw +/// `MD5(...)` bytes) is rejected with a clear `BAD_ARGUMENTS` naming the option, instead of the +/// opaque `DELTA_KERNEL_ERROR` the FFI would otherwise return. The FFI also propagates the error +/// (rather than aborting), so `unwrapResult` still guards keys and any future decode failure. +void setBuilderOption(ffi::EngineBuilder * builder, const std::string & name, const std::string & value) +{ + if (!DB::UTF8::isValidUTF8(reinterpret_cast(value.data()), value.size())) + throw DB::Exception( + DB::ErrorCodes::BAD_ARGUMENTS, + "Option '{}' for the DeltaLake engine contains invalid UTF-8 bytes; " + "the delta-kernel-rs FFI requires valid UTF-8 input.", + name); + + KernelUtils::unwrapResult( + ffi::set_builder_option(builder, KernelUtils::toDeltaString(name), KernelUtils::toDeltaString(value)), + "set_builder_option"); +} + +/// RAII guard that frees an `EngineBuilder` unless released. +/// `ffi::builder_build` consumes the builder on success; if configuring the builder throws before +/// that (invalid option, unsupported auth, malformed connection string), the builder must be freed +/// via `ffi::free_engine_builder` to avoid leaking it. +class BuilderGuard +{ +public: + explicit BuilderGuard(ffi::EngineBuilder * builder_) : builder(builder_) {} + ~BuilderGuard() { if (builder) ffi::free_engine_builder(builder); } + BuilderGuard(const BuilderGuard &) = delete; + BuilderGuard & operator=(const BuilderGuard &) = delete; + ffi::EngineBuilder * release() { auto * b = builder; builder = nullptr; return b; } +private: + ffi::EngineBuilder * builder = nullptr; +}; + +} + /// A helper class to manage S3-compatible storage types. class S3KernelHelper final : public IKernelHelper { @@ -55,10 +98,11 @@ class S3KernelHelper final : public IKernelHelper KernelUtils::toDeltaString(table_location), &KernelUtils::allocateError), "get_engine_builder"); + BuilderGuard guard(builder); auto set_option = [&](const std::string & name, const std::string & value) { - ffi::set_builder_option(builder, KernelUtils::toDeltaString(name), KernelUtils::toDeltaString(value)); + setBuilderOption(builder, name, value); }; const auto & credentials = client->getCredentials(); @@ -104,7 +148,7 @@ class S3KernelHelper final : public IKernelHelper url.endpoint, url.uri_str, region, url.bucket, no_sign, !access_key_id.empty(), !secret_access_key.empty(), !token.empty()); - return builder; + return guard.release(); } private: diff --git a/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp index b156a28d00a6..5734f56a5e38 100644 --- a/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp +++ b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp @@ -128,6 +128,8 @@ void WriteBufferFromHDFS::sync() void WriteBufferFromHDFS::finalizeImpl() { + WriteBufferFromFileBase::finalizeImpl(); + if (blob_log) { blob_log->addEvent( diff --git a/src/Storages/RocksDB/StorageSystemRocksDB.cpp b/src/Storages/RocksDB/StorageSystemRocksDB.cpp index 7c2e9ed2ea20..25c2e4f0df14 100644 --- a/src/Storages/RocksDB/StorageSystemRocksDB.cpp +++ b/src/Storages/RocksDB/StorageSystemRocksDB.cpp @@ -54,7 +54,7 @@ void StorageSystemRocksDB::fillData(MutableColumns & res_columns, ContextPtr con using RocksDBStoragePtr = std::shared_ptr; std::map> tables; - for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { if (db.second->isExternal()) continue; diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index f5c2e4ba0bc0..38809d5cbb4f 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -30,17 +30,18 @@ #include #include -#include -#include -#include #include #include -#include -#include -#include #include +#include #include +#include #include +#include +#include +#include +#include +#include #include @@ -77,6 +78,14 @@ namespace ErrorCodes namespace ActionLocks { extern const StorageActionBlockType ViewRefresh; + extern const StorageActionBlockType ViewRefreshPause; +} + +String StorageMaterializedView::generateInnerTableName(const StorageID & view_id) +{ + if (view_id.hasUUID()) + return ".inner_id." + toString(view_id.uuid); + return ".inner." + view_id.getTableName(); } /// Remove columns from target_header that does not exist in src_header @@ -172,13 +181,13 @@ StorageMaterializedView::StorageMaterializedView( if (point_to_itself_by_uuid || point_to_itself_by_name) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Materialized view {} cannot point to itself", table_id_.getFullTableName()); + auto db_engine = DatabaseCatalog::instance().getDatabase(table_id_.database_name)->getEngineName(); + bool is_replicated_db = db_engine == "Replicated"; + if (query.refresh_strategy) { fixed_uuid = query.refresh_strategy->append; - auto db = DatabaseCatalog::instance().getDatabase(getStorageID().database_name); - bool is_replicated_db = db->getEngineName() == "Replicated"; - /// Decide whether to enable coordination. if (is_replicated_db) { @@ -460,6 +469,14 @@ void StorageMaterializedView::drop() if (getInMemoryMetadataPtr()->sql_security_type == SQLSecurityType::DEFINER) ViewDefinerDependencies::instance().removeViewDependencies(table_id); + bool is_shared_catalog = false; + + if (refresher) + refresher->drop(getContext(), is_shared_catalog); + + if (is_shared_catalog) + return; + /// Sync flag and the setting make sense for Atomic databases only. /// However, with Atomic databases, IStorage::drop() can be called only from a background task in DatabaseCatalog. /// Running synchronous DROP from that task leads to deadlock. @@ -470,9 +487,6 @@ void StorageMaterializedView::drop() /// but DROP acquires DDLGuard for the name of MV. And we cannot acquire second DDLGuard for the inner name in DROP, /// because it may lead to lock-order-inversion (DDLGuards must be acquired in lexicographical order). dropInnerTableIfAny(/* sync */ false, getContext()); - - if (refresher) - refresher->drop(getContext()); } void StorageMaterializedView::dropInnerTableIfAny(bool sync, ContextPtr local_context) @@ -930,6 +944,10 @@ ActionLock StorageMaterializedView::getActionLock(StorageActionBlockType type) { if (type == ActionLocks::ViewRefresh && refresher) refresher->stop(); + /// `SYSTEM PAUSE VIEW` prevents future refreshes but does not interrupt the currently running + /// refresh. `SYSTEM START VIEW` undoes it by clearing `stop_requested` via `onActionLockRemove`. + else if (type == ActionLocks::ViewRefreshPause && refresher) + refresher->pause(); if (has_inner_table) { if (auto target_table = tryGetTargetTable()) @@ -947,7 +965,7 @@ bool StorageMaterializedView::isRemote() const void StorageMaterializedView::onActionLockRemove(StorageActionBlockType action_type) { - if (action_type == ActionLocks::ViewRefresh && refresher) + if ((action_type == ActionLocks::ViewRefresh || action_type == ActionLocks::ViewRefreshPause) && refresher) refresher->start(); } @@ -970,13 +988,6 @@ void StorageMaterializedView::updateTargetTableId(std::optional database target_table_id.table_name = *std::move(table_name); } -String StorageMaterializedView::generateInnerTableName(const StorageID & view_id) -{ - if (view_id.hasUUID()) - return ".inner_id." + toString(view_id.uuid); - return ".inner." + view_id.getTableName(); -} - std::optional StorageMaterializedView::supportedPrewhereColumns() const { auto table = tryGetTargetTable(); diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 329882a13a1e..c2e2e47e71e6 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -1502,7 +1502,7 @@ StorageMerge::DatabaseTablesIterators StorageMerge::DatabaseNameOrRegexp::getDat else { /// database_name argument is a regexp - auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = true}); + auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = true}); for (const auto & db : databases) { diff --git a/src/Storages/StorageMongoDB.cpp b/src/Storages/StorageMongoDB.cpp index 514be58ea48b..6a6fbeca720e 100644 --- a/src/Storages/StorageMongoDB.cpp +++ b/src/Storages/StorageMongoDB.cpp @@ -49,6 +49,7 @@ namespace DB namespace ErrorCodes { + extern const int BAD_ARGUMENTS; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int NOT_IMPLEMENTED; } @@ -68,6 +69,15 @@ void MongoDBConfiguration::checkHosts(const ContextPtr & context) const context->getRemoteHostFilter().checkHostAndPort(host.name, toString(host.port)); } +void MongoDBConfiguration::checkCollection() const +{ + /// The C driver builds the namespace as "." and asserts that the collection part is non-empty. + /// It treats the name as a NUL-terminated C string, so any embedded NUL truncates it and can produce an + /// effectively empty collection name, which aborts the process inside the driver. + if (collection.empty() || collection.find('\0') != String::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "MongoDB collection name must be non-empty and must not contain NUL characters"); +} + StorageMongoDB::StorageMongoDB( const StorageID & table_id_, MongoDBConfiguration configuration_, @@ -149,6 +159,7 @@ MongoDBConfiguration StorageMongoDB::getConfigurationFromCollection(MutableNamed boost::split(configuration.oid_fields, named_collection->get("oid_columns"), boost::is_any_of(",")); configuration.checkHosts(context); + configuration.checkCollection(); return configuration; } @@ -229,6 +240,7 @@ static MongoDBConfiguration getConfigurationImpl(const StorageID * table_id, AST "MongoDB('host:port', 'database', 'collection', 'user', 'password'[, options[, oid_columns]]) or MongoDB('uri', 'collection'[, oid columns])."); configuration.checkHosts(context); + configuration.checkCollection(); return configuration; } diff --git a/src/Storages/StorageMongoDB.h b/src/Storages/StorageMongoDB.h index 3107539c597f..17c6bb9b06bf 100644 --- a/src/Storages/StorageMongoDB.h +++ b/src/Storages/StorageMongoDB.h @@ -47,6 +47,7 @@ struct MongoDBConfiguration std::unordered_set oid_fields = {"_id"}; void checkHosts(const ContextPtr & context) const; + void checkCollection() const; bool isOidColumn(const std::string & name) const { diff --git a/src/Storages/System/StorageSystemClusters.cpp b/src/Storages/System/StorageSystemClusters.cpp index 0df9f53ff796..7397341159ad 100644 --- a/src/Storages/System/StorageSystemClusters.cpp +++ b/src/Storages/System/StorageSystemClusters.cpp @@ -59,7 +59,7 @@ void StorageSystemClusters::fillData(MutableColumns & res_columns, ContextPtr co for (const auto & name_and_cluster : context->getClusters()) writeCluster(res_columns, columns_mask, name_and_cluster, /* replicas_info_getter= */ {}); - const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); for (const auto & name_and_database : databases) { if (const auto * replicated = typeid_cast(name_and_database.second.get())) diff --git a/src/Storages/System/StorageSystemColumns.cpp b/src/Storages/System/StorageSystemColumns.cpp index 7276d31b9cfa..ea40c6735956 100644 --- a/src/Storages/System/StorageSystemColumns.cpp +++ b/src/Storages/System/StorageSystemColumns.cpp @@ -27,7 +27,7 @@ namespace DB namespace Setting { extern const SettingsSeconds lock_acquire_timeout; - extern const SettingsBool show_data_lake_catalogs_in_system_tables; + extern const SettingsBool show_remote_databases_in_system_tables; } StorageSystemColumns::StorageSystemColumns(const StorageID & table_id_) @@ -440,7 +440,7 @@ void ReadFromSystemColumns::initializePipeline(QueryPipelineBuilder & pipeline, const auto & context = getContext(); const auto & settings = context->getSettingsRef(); - const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = settings[Setting::show_data_lake_catalogs_in_system_tables]}); + const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = settings[Setting::show_remote_databases_in_system_tables]}); for (const auto & [database_name, database] : databases) { if (database_name == DatabaseCatalog::TEMPORARY_DATABASE) diff --git a/src/Storages/System/StorageSystemCompletions.cpp b/src/Storages/System/StorageSystemCompletions.cpp index d4740d7be21d..ef82b8da86bf 100644 --- a/src/Storages/System/StorageSystemCompletions.cpp +++ b/src/Storages/System/StorageSystemCompletions.cpp @@ -33,7 +33,7 @@ namespace Setting { extern const SettingsUInt64 readonly; extern const SettingsSeconds lock_acquire_timeout; - extern const SettingsBool show_data_lake_catalogs_in_system_tables; + extern const SettingsBool show_remote_databases_in_system_tables; } static constexpr const char * DATABASE_CONTEXT = "database"; @@ -109,7 +109,7 @@ void fillDataWithDatabasesTablesColumns(MutableColumns & res_columns, const Cont { const auto & access = context->getAccess(); const auto & settings = context->getSettingsRef(); - const auto & databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = settings[Setting::show_data_lake_catalogs_in_system_tables]}); + const auto & databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = settings[Setting::show_remote_databases_in_system_tables]}); for (const auto & [database_name, database_ptr] : databases) { if (!access->isGranted(AccessType::SHOW_DATABASES) || !access->isGranted(AccessType::SHOW_DATABASES, database_name)) @@ -303,8 +303,7 @@ void fillDataWithPolicies(MutableColumns & res_columns, const ContextPtr & conte void fillDataWithDictionaries(MutableColumns & res_columns, const ContextPtr & context) { const auto & access = context->getAccess(); - if (!access->isGranted(AccessType::SHOW_DICTIONARIES)) - return; + const bool need_to_check_access_for_dictionaries = !access->isGranted(AccessType::SHOW_DICTIONARIES); const auto & external_dictionaries = context->getExternalDictionariesLoader(); for (const auto & load_result : external_dictionaries.getLoadResults()) @@ -320,7 +319,7 @@ void fillDataWithDictionaries(MutableColumns & res_columns, const ContextPtr & c dict_id.table_name = load_result.name; String db_or_tag = dict_id.database_name.empty() ? IDictionary::NO_DATABASE_TAG : dict_id.database_name; - if (!access->isGranted(AccessType::SHOW_DICTIONARIES, db_or_tag, dict_id.table_name)) + if (need_to_check_access_for_dictionaries && !access->isGranted(AccessType::SHOW_DICTIONARIES, db_or_tag, dict_id.table_name)) continue; res_columns[0]->insert(dict_id.table_name); res_columns[1]->insert(DICTIONARY_CONTEXT); diff --git a/src/Storages/System/StorageSystemDataSkippingIndices.cpp b/src/Storages/System/StorageSystemDataSkippingIndices.cpp index 16de220d712f..6266080fe5cc 100644 --- a/src/Storages/System/StorageSystemDataSkippingIndices.cpp +++ b/src/Storages/System/StorageSystemDataSkippingIndices.cpp @@ -274,7 +274,7 @@ void ReadFromSystemDataSkippingIndices::initializePipeline(QueryPipelineBuilder { MutableColumnPtr column = ColumnString::create(); - const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); for (const auto & [database_name, database] : databases) { if (database_name == DatabaseCatalog::TEMPORARY_DATABASE) diff --git a/src/Storages/System/StorageSystemDatabaseReplicas.cpp b/src/Storages/System/StorageSystemDatabaseReplicas.cpp index 4d2fef70a0a1..3c9afa1047b4 100644 --- a/src/Storages/System/StorageSystemDatabaseReplicas.cpp +++ b/src/Storages/System/StorageSystemDatabaseReplicas.cpp @@ -291,7 +291,7 @@ void StorageSystemDatabaseReplicas::read( const bool need_to_check_access_for_databases = !access->isGranted(AccessType::SHOW_DATABASES); std::map replicated_databases; - for (const auto & [db_name, db_data] : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (const auto & [db_name, db_data] : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { if (!dynamic_cast(db_data.get())) continue; diff --git a/src/Storages/System/StorageSystemDatabases.cpp b/src/Storages/System/StorageSystemDatabases.cpp index fee7227aed44..b5d69747110c 100644 --- a/src/Storages/System/StorageSystemDatabases.cpp +++ b/src/Storages/System/StorageSystemDatabases.cpp @@ -22,7 +22,6 @@ namespace ErrorCodes extern const int UNKNOWN_DATABASE; } - ColumnsDescription StorageSystemDatabases::getColumnsDescription() { auto description = ColumnsDescription @@ -119,10 +118,10 @@ void StorageSystemDatabases::fillData(MutableColumns & res_columns, ContextPtr c { const auto access = context->getAccess(); const bool need_to_check_access_for_databases = !access->isGranted(AccessType::SHOW_DATABASES); - /// Data lake catalog databases are always shown in system.databases regardless of show_data_lake_catalogs_in_system_tables. - /// Listing a database name is purely local metadata and never requires expensive calls to an external catalog service. - /// The setting only guards operations like system.tables / system.columns that enumerate a catalog's contents. - const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = true}); + /// Remote databases are always shown in system.databases regardless of show_remote_databases_in_system_tables. + /// Listing a database name is purely local metadata and never requires expensive calls to an external service. + /// The setting only guards operations like system.tables / system.columns that enumerate a database's contents. + const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = true}); ColumnPtr filtered_databases_column = getFilteredDatabases(databases, predicate, context); for (size_t i = 0; i < filtered_databases_column->size(); ++i) diff --git a/src/Storages/System/StorageSystemDictionaries.cpp b/src/Storages/System/StorageSystemDictionaries.cpp index 1926093f56d3..49a2856be5db 100644 --- a/src/Storages/System/StorageSystemDictionaries.cpp +++ b/src/Storages/System/StorageSystemDictionaries.cpp @@ -104,13 +104,10 @@ ColumnsDescription StorageSystemDictionaries::getColumnsDescription() void StorageSystemDictionaries::fillData(MutableColumns & res_columns, ContextPtr context, const ActionsDAG::Node *, std::vector) const { const auto access = context->getAccess(); - const bool check_access_for_dictionaries = access->isGranted(AccessType::SHOW_DICTIONARIES); + const bool need_to_check_access_for_dictionaries = !access->isGranted(AccessType::SHOW_DICTIONARIES); const auto & external_dictionaries = context->getExternalDictionariesLoader(); - if (!check_access_for_dictionaries) - return; - for (const auto & load_result : external_dictionaries.getLoadResults()) { const auto dict_ptr = std::dynamic_pointer_cast(load_result.object); @@ -121,7 +118,7 @@ void StorageSystemDictionaries::fillData(MutableColumns & res_columns, ContextPt StorageID dict_id = getDictionaryID(load_result, dict_ptr); String db_or_tag = dict_id.database_name.empty() ? IDictionary::NO_DATABASE_TAG : dict_id.database_name; - if (!access->isGranted(AccessType::SHOW_DICTIONARIES, db_or_tag, dict_id.table_name)) + if (need_to_check_access_for_dictionaries && !access->isGranted(AccessType::SHOW_DICTIONARIES, db_or_tag, dict_id.table_name)) continue; size_t i = 0; diff --git a/src/Storages/System/StorageSystemDistributionQueue.cpp b/src/Storages/System/StorageSystemDistributionQueue.cpp index a047606fc687..c08a2734a083 100644 --- a/src/Storages/System/StorageSystemDistributionQueue.cpp +++ b/src/Storages/System/StorageSystemDistributionQueue.cpp @@ -121,7 +121,7 @@ void StorageSystemDistributionQueue::fillData(MutableColumns & res_columns, Cont const bool check_access_for_databases = !access->isGranted(AccessType::SHOW_TABLES); std::map> tables; - for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { /// Check if database can contain distributed tables if (db.second->isExternal()) diff --git a/src/Storages/System/StorageSystemGraphite.cpp b/src/Storages/System/StorageSystemGraphite.cpp index 1f5e4d46e7b0..f652bb9b09c8 100644 --- a/src/Storages/System/StorageSystemGraphite.cpp +++ b/src/Storages/System/StorageSystemGraphite.cpp @@ -35,7 +35,7 @@ ColumnsDescription StorageSystemGraphite::getColumnsDescription() */ static StorageSystemGraphite::Configs getConfigs(ContextPtr context) { - const Databases databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + const Databases databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); StorageSystemGraphite::Configs graphite_configs; for (const auto & db : databases) diff --git a/src/Storages/System/StorageSystemHybridWatermarks.cpp b/src/Storages/System/StorageSystemHybridWatermarks.cpp index 38099e4be5f2..ee38713e48f3 100644 --- a/src/Storages/System/StorageSystemHybridWatermarks.cpp +++ b/src/Storages/System/StorageSystemHybridWatermarks.cpp @@ -61,7 +61,7 @@ void StorageSystemHybridWatermarks::fillData( /// Enumerate only Hybrid tables (StorageDistributed with getName() == "Hybrid"). std::map> tables; - for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { if (db.second->isExternal()) continue; diff --git a/src/Storages/System/StorageSystemIcebergHistory.cpp b/src/Storages/System/StorageSystemIcebergHistory.cpp index f7ca27e65955..7067bb66745a 100644 --- a/src/Storages/System/StorageSystemIcebergHistory.cpp +++ b/src/Storages/System/StorageSystemIcebergHistory.cpp @@ -93,7 +93,7 @@ void StorageSystemIcebergHistory::fillData([[maybe_unused]] MutableColumns & res if (show_tables_granted) { - auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = true}); + auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = true}); for (const auto & db: databases) { /// with last flag we are filtering out all non iceberg table diff --git a/src/Storages/System/StorageSystemKafkaConsumers.cpp b/src/Storages/System/StorageSystemKafkaConsumers.cpp index e81677451b41..ca17621942ec 100644 --- a/src/Storages/System/StorageSystemKafkaConsumers.cpp +++ b/src/Storages/System/StorageSystemKafkaConsumers.cpp @@ -266,7 +266,7 @@ void StorageSystemKafkaConsumers::fillData(MutableColumns & res_columns, Context return; add_rows(it, kafka_table); }; - auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); for (const auto & db : databases) { for (auto it = db.second->getTablesIterator(context); it->isValid(); it->next()) diff --git a/src/Storages/System/StorageSystemMutations.cpp b/src/Storages/System/StorageSystemMutations.cpp index 3691951f34ab..9fe9c9ed7aef 100644 --- a/src/Storages/System/StorageSystemMutations.cpp +++ b/src/Storages/System/StorageSystemMutations.cpp @@ -71,7 +71,7 @@ void StorageSystemMutations::fillData(MutableColumns & res_columns, ContextPtr c /// Collect a set of *MergeTree tables. std::map> merge_tree_tables; - for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { /// Check if database can contain MergeTree tables if (db.second->isExternal()) diff --git a/src/Storages/System/StorageSystemObjectStorageQueueSettings.cpp b/src/Storages/System/StorageSystemObjectStorageQueueSettings.cpp index d9b08ae474cb..8acebb7c0a03 100644 --- a/src/Storages/System/StorageSystemObjectStorageQueueSettings.cpp +++ b/src/Storages/System/StorageSystemObjectStorageQueueSettings.cpp @@ -58,7 +58,7 @@ void StorageSystemObjectStorageQueueSettings::fillData( const bool show_tables_granted = access->isGranted(AccessType::SHOW_TABLES); if (show_tables_granted) { - auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); for (const auto & db : databases) { for (auto iterator = db.second->getTablesIterator(context); iterator->isValid(); iterator->next()) diff --git a/src/Storages/System/StorageSystemPartMovesBetweenShards.cpp b/src/Storages/System/StorageSystemPartMovesBetweenShards.cpp index a7e043ff24eb..876e5539bb9e 100644 --- a/src/Storages/System/StorageSystemPartMovesBetweenShards.cpp +++ b/src/Storages/System/StorageSystemPartMovesBetweenShards.cpp @@ -59,7 +59,7 @@ void StorageSystemPartMovesBetweenShards::fillData(MutableColumns & res_columns, const bool check_access_for_databases = !access->isGranted(AccessType::SHOW_TABLES); std::map> replicated_tables; - for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { /// Check if database can contain replicated tables if (db.second->isExternal()) diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp index 4d59daa5e26a..894a3857d3bf 100644 --- a/src/Storages/System/StorageSystemPartsBase.cpp +++ b/src/Storages/System/StorageSystemPartsBase.cpp @@ -118,7 +118,7 @@ StoragesInfoStream::StoragesInfoStream(std::optional filter_by_datab const bool check_access_for_tables = !access->isGranted(AccessType::SHOW_TABLES); { - Databases databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + Databases databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); /// Add column 'database'. MutableColumnPtr database_column_mut = ColumnString::create(); diff --git a/src/Storages/System/StorageSystemProjections.cpp b/src/Storages/System/StorageSystemProjections.cpp index 961c3c53619e..3e52a223b9f1 100644 --- a/src/Storages/System/StorageSystemProjections.cpp +++ b/src/Storages/System/StorageSystemProjections.cpp @@ -270,7 +270,7 @@ void ReadFromSystemProjections::initializePipeline(QueryPipelineBuilder & pipeli { MutableColumnPtr column = ColumnString::create(); - const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false}); + const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false}); for (const auto & [database_name, database] : databases) { if (database_name == DatabaseCatalog::TEMPORARY_DATABASE) diff --git a/src/Storages/System/StorageSystemReplicas.cpp b/src/Storages/System/StorageSystemReplicas.cpp index 6061f305f88f..df9987f7ce2b 100644 --- a/src/Storages/System/StorageSystemReplicas.cpp +++ b/src/Storages/System/StorageSystemReplicas.cpp @@ -174,7 +174,7 @@ void StorageSystemReplicas::read( /// We collect a set of replicated tables. std::map> replicated_tables; - for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { /// Check if database can contain replicated tables if (db.second->isExternal()) diff --git a/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp b/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp index 9e8faab689d6..434cfdc9b7b5 100644 --- a/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp +++ b/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp @@ -56,7 +56,7 @@ void StorageSystemReplicatedPartitionExports::fillData(MutableColumns & res_colu const bool check_access_for_databases = !access->isGranted(AccessType::SHOW_TABLES); std::map> replicated_merge_tree_tables; - for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { /// skip data lakes if (db.second->isExternal()) diff --git a/src/Storages/System/StorageSystemReplicationQueue.cpp b/src/Storages/System/StorageSystemReplicationQueue.cpp index 150b181eeae6..7d9692b53e3a 100644 --- a/src/Storages/System/StorageSystemReplicationQueue.cpp +++ b/src/Storages/System/StorageSystemReplicationQueue.cpp @@ -78,7 +78,7 @@ void StorageSystemReplicationQueue::fillData(MutableColumns & res_columns, Conte const bool check_access_for_databases = !access->isGranted(AccessType::SHOW_TABLES); std::map> replicated_tables; - for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = false})) + for (const auto & db : DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = false})) { /// Check if database can contain replicated tables if (db.second->isExternal()) diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index a8e32acddeb5..2bf1cb48e071 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -43,7 +43,7 @@ namespace Setting extern const SettingsSeconds lock_acquire_timeout; extern const SettingsUInt64 select_sequential_consistency; extern const SettingsBool show_table_uuid_in_table_create_query_if_not_nil; - extern const SettingsBool show_data_lake_catalogs_in_system_tables; + extern const SettingsBool show_remote_databases_in_system_tables; } namespace detail @@ -53,7 +53,7 @@ ColumnPtr getFilteredDatabases(const ActionsDAG::Node * predicate, ContextPtr co MutableColumnPtr column = ColumnString::create(); const auto & settings = context->getSettingsRef(); - const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_datalake_catalogs = settings[Setting::show_data_lake_catalogs_in_system_tables]}); + const auto databases = DatabaseCatalog::instance().getDatabases(GetDatabasesOptions{.with_remote_databases = settings[Setting::show_remote_databases_in_system_tables]}); for (const auto & database_name : databases | boost::adaptors::map_keys) { if (database_name == DatabaseCatalog::TEMPORARY_DATABASE) diff --git a/tests/integration/test_database_glue/test.py b/tests/integration/test_database_glue/test.py index 4738a3ac6977..111c331ce2dd 100644 --- a/tests/integration/test_database_glue/test.py +++ b/tests/integration/test_database_glue/test.py @@ -194,7 +194,7 @@ def generate_arrow_data(num_rows=5): return table def create_clickhouse_glue_database( - started_cluster, node, name, additional_settings={}, with_credentials=True + started_cluster, node, name, additional_settings={}, query_settings={}, with_credentials=True ): settings = { "catalog_type": "glue", @@ -262,6 +262,7 @@ def started_cluster(): user_configs=[], stay_alive=True, with_glue_catalog=True, + with_zookeeper=True ) sts = cluster.add_instance( @@ -508,6 +509,35 @@ def test_empty_table(started_cluster): assert len(node.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}`")) == 0 +def test_cloud_mode(started_cluster): + node = started_cluster.instances["node1"] + + test_ref = f"test_cloud_mode_{uuid.uuid4()}" + table_name = f"{test_ref}_table" + root_namespace = f"{test_ref}_namespace" + + namespace = f"{root_namespace}_A" + catalog = load_catalog_impl(started_cluster) + catalog.create_namespace(namespace) + + table = create_table(catalog, namespace, table_name) + + + num_rows = 10 + df = generate_arrow_data(num_rows) + table.append(df) + + create_clickhouse_glue_database( + started_cluster, + node, + CATALOG_NAME, query_settings={"cloud_mode": 1} + ) + + assert int( + node.query(f"SELECT count() FROM {CATALOG_NAME}.`{namespace}.{table_name}`") + ) == num_rows + + def test_timestamps(started_cluster): node = started_cluster.instances["node1"] @@ -859,6 +889,7 @@ def test_sts_smoke(started_cluster): "aws_role_arn": "arn::role", "aws_role_session_name": "wrongsession", }, + query_settings={}, with_credentials=False, ) @@ -884,6 +915,7 @@ def test_sts_smoke(started_cluster): "aws_role_arn": "arn::role", "aws_role_session_name": "miniorole", }, + query_settings={}, with_credentials=False, ) @@ -894,3 +926,117 @@ def test_sts_smoke(started_cluster): # Cleanup node.query(f"DROP DATABASE IF EXISTS {db_name_fail} SYNC") node.query(f"DROP DATABASE IF EXISTS {db_name_success} SYNC") + + +def test_sts_credential_refresh_on_expired_token(started_cluster): + """When an S3 read fails mid-query with an `ExpiredToken`-style error, + `GlueCatalog::getCredentialsConfigurationCallback` should fire and the read + should recover with freshly-vended STS credentials. + + Without the override (the previous behavior), the catalog inherits the default + `std::nullopt` from `ICatalog::getCredentialsConfigurationCallback`, + `ReadBufferFromS3::processException` skips the refresh branch, and the read + surfaces the `ExpiredToken` to the user. + + The `s3_send_request_throw_expired_token` failpoint fires inside + `ReadBufferFromS3::sendRequest`, so it covers both `nextImpl`-driven + streaming reads and the `readBigAt` range-read path that Parquet column-chunk + reads go through. It is `ONCE`, so only the first S3 GET fails — the + subsequent retry triggered by the credentials refresh callback succeeds. + """ + node = started_cluster.instances["node1"] + + test_ref = f"refresh_token_{uuid.uuid4().hex}" + table_name = f"{test_ref}_table" + root_namespace = f"{test_ref}_namespace" + + catalog = load_catalog_impl(started_cluster) + catalog.create_namespace(root_namespace) + + schema = Schema( + NestedField(field_id=1, name="id", field_type=StringType(), required=False), + NestedField(field_id=2, name="value", field_type=DoubleType(), required=False), + ) + + rows = [ + {"id": "row1", "value": 10.0}, + {"id": "row2", "value": 20.0}, + {"id": "row3", "value": 30.0}, + ] + expected_sum = sum(r["value"] for r in rows) + create_table( + catalog, root_namespace, table_name, schema, PartitionSpec(), DEFAULT_SORT_ORDER, dir=table_name + ).append(pa.Table.from_pylist(rows)) + + db_name = f"db_{test_ref.replace('-', '_')}" + create_clickhouse_glue_database( + started_cluster, + node, + db_name, + additional_settings={ + "aws_role_arn": "arn::role", + "aws_role_session_name": "miniorole", + }, + with_credentials=False, + ) + + # Sanity check: query works without the failpoint. + pre_failpoint = node.query( + f"SELECT sum(value) FROM {db_name}.`{root_namespace}.{table_name}`" + ).strip() + assert pre_failpoint == str(int(expected_sum)), ( + f"Baseline query returned {pre_failpoint}, expected {int(expected_sum)}" + ) + + qid_select = uuid.uuid4().hex + node.query("SYSTEM ENABLE FAILPOINT s3_send_request_throw_expired_token") + try: + result = node.query( + f"SELECT sum(value) FROM {db_name}.`{root_namespace}.{table_name}` " + "SETTINGS max_threads = 1", + query_id=qid_select, + ) + finally: + node.query("SYSTEM DISABLE FAILPOINT s3_send_request_throw_expired_token") + + assert result.strip() == str(int(expected_sum)), ( + f"Expected sum {int(expected_sum)} after refresh-driven retry, got: {result}" + ) + + node.query("SYSTEM FLUSH LOGS system.text_log") + + # Diagnostic: confirm the failpoint actually tripped. If this assertion fails, + # the test environment did not reach the credential-refresh code path at all + # (e.g. column chunks completed in a single `nextImpl` call); the refresh-log + # assertion below would be vacuous. + failpoint_fired = int( + node.query( + "SELECT count() FROM system.text_log " + "WHERE message LIKE '%Caught exception while reading S3 object%' " + "AND message LIKE '%ExpiredToken%' " + "AND event_time >= now() - INTERVAL 5 MINUTE" + ).strip() + ) + assert failpoint_fired >= 1, ( + "Failpoint `s3_send_request_throw_expired_token` did not fire — " + "`processException` was never invoked, so the credentials-refresh " + "code path was not exercised." + ) + + # The `LOG_DEBUG` in `GlueCatalog::getCredentialsConfigurationCallback` must + # have fired — its absence would mean the override is not in place and the + # retry was attributable to some other path. + refresh_log_count = int( + node.query( + "SELECT count() FROM system.text_log " + "WHERE message LIKE '%Refreshing AWS credentials%after expired token%' " + "AND event_time >= now() - INTERVAL 5 MINUTE" + ).strip() + ) + assert refresh_log_count >= 1, ( + "The failpoint fired but `GlueCatalog::getCredentialsConfigurationCallback` " + "was not invoked — the override is missing or not wired into " + "`StorageS3Configuration::createObjectStorage`" + ) + + node.query(f"DROP DATABASE IF EXISTS {db_name} SYNC") diff --git a/tests/integration/test_mysql57_database_engine/test.py b/tests/integration/test_mysql57_database_engine/test.py index 47aef042698e..6ba431318e5e 100644 --- a/tests/integration/test_mysql57_database_engine/test.py +++ b/tests/integration/test_mysql57_database_engine/test.py @@ -112,7 +112,14 @@ def test_mysql_ddl_for_mysql_database(started_cluster): "ALTER TABLE `test_database`.`test_table` ADD COLUMN `add_column` int(11)" ) assert "add_column" in clickhouse_node.query( - "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'test_database'" + "SHOW COLUMNS FROM test_database.test_table" + ) + assert "PRIMARY" in clickhouse_node.query( + "SHOW INDEX FROM test_database.test_table" + ) + assert "add_column" in clickhouse_node.query( + "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'test_database'", + settings={"show_remote_databases_in_system_tables": 1}, ) time.sleep( @@ -122,7 +129,8 @@ def test_mysql_ddl_for_mysql_database(started_cluster): "ALTER TABLE `test_database`.`test_table` DROP COLUMN `add_column`" ) assert "add_column" not in clickhouse_node.query( - "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'test_database'" + "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'test_database'", + settings={"show_remote_databases_in_system_tables": 1}, ) mysql_node.query("DROP TABLE `test_database`.`test_table`;") @@ -335,7 +343,8 @@ def test_column_comments_for_mysql_database_engine(started_cluster): "ALTER TABLE `test_database`.`test_table` ADD COLUMN `add_column` int(11) COMMENT 'add_column comment'" ) assert "add_column comment" in clickhouse_node.query( - "SELECT comment FROM system.columns WHERE table = 'test_table' AND database = 'test_database'" + "SELECT comment FROM system.columns WHERE table = 'test_table' AND database = 'test_database'", + settings={"show_remote_databases_in_system_tables": 1}, ) clickhouse_node.query("DROP DATABASE test_database") diff --git a/tests/integration/test_mysql_database_engine/test.py b/tests/integration/test_mysql_database_engine/test.py index 7d09ca2da9eb..80781a362bd4 100644 --- a/tests/integration/test_mysql_database_engine/test.py +++ b/tests/integration/test_mysql_database_engine/test.py @@ -107,7 +107,14 @@ def test_mysql_ddl_for_mysql_database(started_cluster): "ALTER TABLE `test_database`.`test_table` ADD COLUMN `add_column` int(11)" ) assert "add_column" in clickhouse_node.query( - "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'test_database'" + "SHOW COLUMNS FROM test_database.test_table" + ) + assert "PRIMARY" in clickhouse_node.query( + "SHOW INDEX FROM test_database.test_table" + ) + assert "add_column" in clickhouse_node.query( + "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'test_database'", + settings={"show_remote_databases_in_system_tables": 1}, ) time.sleep( @@ -117,7 +124,8 @@ def test_mysql_ddl_for_mysql_database(started_cluster): "ALTER TABLE `test_database`.`test_table` DROP COLUMN `add_column`" ) assert "add_column" not in clickhouse_node.query( - "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'test_database'" + "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'test_database'", + settings={"show_remote_databases_in_system_tables": 1}, ) mysql_node.query("DROP TABLE `test_database`.`test_table`;") @@ -318,7 +326,8 @@ def test_column_comments_for_mysql_database_engine(started_cluster): "ALTER TABLE `test_database`.`test_table` ADD COLUMN `add_column` int(11) COMMENT 'add_column comment'" ) assert "add_column comment" in clickhouse_node.query( - "SELECT comment FROM system.columns WHERE table = 'test_table' AND database = 'test_database'" + "SELECT comment FROM system.columns WHERE table = 'test_table' AND database = 'test_database'", + settings={"show_remote_databases_in_system_tables": 1}, ) clickhouse_node.query("DROP DATABASE test_database") diff --git a/tests/integration/test_mysql_protocol/test.py b/tests/integration/test_mysql_protocol/test.py index 94de7fbebb4a..cb8ce0802b25 100644 --- a/tests/integration/test_mysql_protocol/test.py +++ b/tests/integration/test_mysql_protocol/test.py @@ -959,7 +959,7 @@ def test_mysql_dotnet_client(started_cluster): [ "bash", "-c", - f"dotnet run -- --host {node.hostname} --port {server_port} --username default --password 123", + f"cd /testapp && dotnet run -- --host {node.hostname} --port {server_port} --username default --password 123", ], ) # there is some thrash at the beggining of output, so it's better to use `in` instead of `==`` diff --git a/tests/integration/test_native_incorrect_data_deserialization/__init__.py b/tests/integration/test_native_incorrect_data_deserialization/__init__.py new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/tests/integration/test_native_incorrect_data_deserialization/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/integration/test_native_incorrect_data_deserialization/test.py b/tests/integration/test_native_incorrect_data_deserialization/test.py new file mode 100644 index 000000000000..f917d441e499 --- /dev/null +++ b/tests/integration/test_native_incorrect_data_deserialization/test.py @@ -0,0 +1,110 @@ +# pylint: disable=redefined-outer-name + +import logging + +import pytest + +from helpers.cluster import ClickHouseCluster + + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance("node") + + +def error_code(text): + for code in ( + "INCORRECT_DATA", + "LOGICAL_ERROR", + "CANNOT_READ_ALL_DATA", + "CANNOT_PARSE_INPUT_ASSERTION_FAILED", + ): + if code in text: + return code + return "NO_ERROR_CODE" + + +def assert_error_code(error, expected, expected_message): + code = error_code(error) + logging.debug("Error code: %s", code) + logging.debug("Error output:\n%s", error) + assert code == expected + assert expected_message in error + + +def write_var_uint(value): + result = bytearray() + while value >= 0x80: + result.append((value & 0x7F) | 0x80) + value >>= 7 + result.append(value) + return bytes(result) + + +def u64(value): + return value.to_bytes(8, "little") + + +def native_block(name, type_name, rows, data): + name = name.encode() + type_name = type_name.encode() + return ( + write_var_uint(1) + + write_var_uint(rows) + + write_var_uint(len(name)) + + name + + write_var_uint(len(type_name)) + + type_name + + data + ) + + +def native_stdin(payload): + return payload.decode("latin1") + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + node.query( + "CREATE TABLE variant_data (x Variant(String, UInt8)) ENGINE = Memory", + settings={"allow_experimental_variant_type": "1"}, + ) + yield cluster + + finally: + cluster.shutdown() + + +def test_variant_size_less_than_discriminators_limit(started_cluster, tmp_path): + payload = native_block( + "x", + "Variant(String, UInt8)", + 4, + u64(0) + bytes([1, 1, 1, 1]) + bytes([7, 8]), + ) + + data_path = tmp_path / "variant_size_less_than_discriminators_limit.native" + data_path.write_bytes(payload) + node.copy_file_to_container( + str(data_path), "/tmp/variant_size_less_than_discriminators_limit.native" + ) + + error = node.exec_in_container( + [ + "bash", + "-c", + ( + "set +e; output=$(clickhouse local --send_logs_level=fatal " + "--query \"SELECT count(), toTypeName(x) " + "FROM file('/tmp/variant_size_less_than_discriminators_limit.native', 'Native', 'x Variant(String, UInt8)') " + "FORMAT TSV\" 2>&1); printf '%s' \"$output\"" + ), + ], + ) + + assert_error_code( + error, + "INCORRECT_DATA", + "Size of variant UInt8 is expected to be not less than 4 according to discriminators, but it is 2", + ) diff --git a/tests/integration/test_postgresql_database_engine/test.py b/tests/integration/test_postgresql_database_engine/test.py index 042291c5731f..456b1904473b 100644 --- a/tests/integration/test_postgresql_database_engine/test.py +++ b/tests/integration/test_postgresql_database_engine/test.py @@ -69,13 +69,17 @@ def test_postgres_database_engine_with_postgres_ddl(started_cluster): assert "test_table" in node1.query("SHOW TABLES FROM postgres_database") cursor.execute("ALTER TABLE test_table ADD COLUMN data Text") + assert "data" in node1.query("SHOW COLUMNS FROM postgres_database.test_table") + assert "PRIMARY" in node1.query("SHOW INDEX FROM postgres_database.test_table") assert "data" in node1.query( - "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'postgres_database'" + "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'postgres_database'", + settings={"show_remote_databases_in_system_tables": 1}, ) cursor.execute("ALTER TABLE test_table DROP COLUMN data") assert "data" not in node1.query( - "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'postgres_database'" + "SELECT name FROM system.columns WHERE table = 'test_table' AND database = 'postgres_database'", + settings={"show_remote_databases_in_system_tables": 1}, ) node1.query("DROP DATABASE postgres_database") @@ -285,7 +289,8 @@ def test_predefined_connection_configuration(started_cluster): node1.query("CREATE DATABASE postgres_database ENGINE = PostgreSQL(postgres1)") result = node1.query( - "select create_table_query from system.tables where database ='postgres_database'" + "select create_table_query from system.tables where database ='postgres_database'", + settings={"show_remote_databases_in_system_tables": 1}, ) print(f"kssenii: {result}") assert result.strip().endswith( @@ -449,7 +454,8 @@ def test_numeric_detach_attach(started_cluster): def get_actual_clickhouse_column_types(): res = node1.query( - "SELECT name, type FROM system.columns WHERE database = 'postgres_database' AND table = 'test_table'" + "SELECT name, type FROM system.columns WHERE database = 'postgres_database' AND table = 'test_table'", + settings={"show_remote_databases_in_system_tables": 1}, ) return dict(line.split('\t') for line in res.splitlines()) diff --git a/tests/integration/test_refreshable_mv_keeper_loss/__init__.py b/tests/integration/test_refreshable_mv_keeper_loss/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/integration/test_refreshable_mv_keeper_loss/configs/remote_servers.xml b/tests/integration/test_refreshable_mv_keeper_loss/configs/remote_servers.xml new file mode 100644 index 000000000000..93d8f890f40f --- /dev/null +++ b/tests/integration/test_refreshable_mv_keeper_loss/configs/remote_servers.xml @@ -0,0 +1,16 @@ + + + + + + node1 + 9000 + + + node2 + 9000 + + + + + diff --git a/tests/integration/test_refreshable_mv_keeper_loss/configs/settings.xml b/tests/integration/test_refreshable_mv_keeper_loss/configs/settings.xml new file mode 100644 index 000000000000..d15acfaa303b --- /dev/null +++ b/tests/integration/test_refreshable_mv_keeper_loss/configs/settings.xml @@ -0,0 +1,8 @@ + + + + 1 + Etc/UTC + + + diff --git a/tests/integration/test_refreshable_mv_keeper_loss/configs/zookeeper.xml b/tests/integration/test_refreshable_mv_keeper_loss/configs/zookeeper.xml new file mode 100644 index 000000000000..6f249703c08e --- /dev/null +++ b/tests/integration/test_refreshable_mv_keeper_loss/configs/zookeeper.xml @@ -0,0 +1,21 @@ + + + + zoo1 + 2181 + + + zoo2 + 2181 + + + zoo3 + 2181 + + + 5000 + 5000 + + diff --git a/tests/integration/test_refreshable_mv_keeper_loss/test.py b/tests/integration/test_refreshable_mv_keeper_loss/test.py new file mode 100644 index 000000000000..05eca0c6c6e6 --- /dev/null +++ b/tests/integration/test_refreshable_mv_keeper_loss/test.py @@ -0,0 +1,163 @@ +import time +from datetime import datetime, timedelta + +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__, zookeeper_config_path="configs/zookeeper.xml") + +node1 = cluster.add_instance( + "node1", + main_configs=["configs/remote_servers.xml"], + user_configs=["configs/settings.xml"], + with_zookeeper=True, + keeper_required_feature_flags=["multi_read", "create_if_not_exists"], + macros={"shard": 1, "replica": 1}, +) + +node2 = cluster.add_instance( + "node2", + main_configs=["configs/remote_servers.xml"], + user_configs=["configs/settings.xml"], + with_zookeeper=True, + keeper_required_feature_flags=["multi_read", "create_if_not_exists"], + macros={"shard": 1, "replica": 2}, +) + + +@pytest.fixture(scope="module", autouse=True) +def started_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def view_state(table): + """Per-replica `system.view_refreshes` snapshot, for failure messages.""" + rows = [] + for n in [node1, node2]: + info = n.query( + "SELECT status, exception, retry, last_refresh_replica, " + "toInt64(last_refresh_time) AS last, toInt64(next_refresh_time) AS next, " + "toInt64(last_success_time) AS succ " + f"FROM system.view_refreshes WHERE view = '{table}' FORMAT Vertical" + ).strip() + rows.append(f"--- {n.name} ---\n{info}") + return "\n".join(rows) + + +def wait_for_status(node, table, expected, timeout=60): + deadline = time.time() + timeout + last = None + while time.time() < deadline: + last = node.query( + f"SELECT status FROM system.view_refreshes WHERE view = '{table}'" + ).strip() + if last == expected: + return + time.sleep(0.2) + raise AssertionError( + f"{node.name}: view '{table}' did not reach status '{expected}' within {timeout}s " + f"(last: '{last}')\n{view_state(table)}" + ) + + +def test_keeper_session_loss_during_coordinated_refresh(started_cluster): + """ + Regression test: when a coordinated RMV refresh is running on replica B and B + loses its keeper session (long enough for the 'running' ephemeral to expire), + replica A must NOT start a duplicate refresh for the same timeslot. + """ + if node1.is_built_with_sanitizer(): + # SELECT below balloons under sanitizers → flaky timing. + pytest.skip("Disabled for sanitizers (timing-sensitive)") + + # Replicated database is the only flavor that supports coordinated RMVs. + node1.query("DROP DATABASE IF EXISTS re ON CLUSTER default SYNC") + node1.query( + "CREATE DATABASE re ON CLUSTER default " + "ENGINE = Replicated('/clickhouse/databases/re', '{shard}', '{replica}')" + ) + + # Separate target table so we can SYSTEM SYNC REPLICA on it deterministically + # before counting rows. + node1.query( + "CREATE TABLE re.tgt (t DateTime, i UInt64) " + "ENGINE = ReplicatedMergeTree() ORDER BY tuple()" + ) + + # APPEND-mode coordinated RMV. The SELECT runs ~30s (sleepEachRow with + # max_block_size=1), giving a wide window to kill keeper mid-refresh. + # EVERY 1 YEAR puts the natural schedule far from any clock boundary, so + # neither replica spontaneously refreshes during setup. EMPTY skips the + # initial refresh at CREATE time. + node1.query( + """ + CREATE MATERIALIZED VIEW re.a + REFRESH EVERY 1 YEAR APPEND + TO re.tgt + EMPTY + AS SELECT now() AS t, sum(sleepEachRow(1) + 1) AS i + FROM numbers(30) SETTINGS max_block_size = 1 + """ + ) + + wait_for_status(node1, "a", "Scheduled") + wait_for_status(node2, "a", "Scheduled") + + # Stop the view on node1 so node2 deterministically wins the race for the + # 'running' ephemeral znode of the next refresh. + node1.query("SYSTEM STOP VIEW re.a") + + next_refresh = node2.query( + "SELECT next_refresh_time FROM system.view_refreshes WHERE view = 'a'" + ).strip() + assert next_refresh, "Expected node2 to have a scheduled next refresh" + next_refresh_dt = datetime.strptime(next_refresh, "%Y-%m-%d %H:%M:%S") + # node1's wake time must land in the same yearly timeslot as node2's, + # otherwise determineNextRefreshTime would advance to next year and APPEND + # for a different timeslot, masking the bug. 5 minutes leaves room for + # retry backoff to grow without crossing the year boundary. + node1_fake = (next_refresh_dt + timedelta(minutes=5)).strftime("%Y-%m-%d %H:%M:%S") + + # Fake clock is per-replica, so this only triggers node2's refresh. + node2.query(f"SYSTEM TEST VIEW re.a SET FAKE TIME '{next_refresh}'") + + wait_for_status(node2, "a", "Running", timeout=30) + wait_for_status(node1, "a", "RunningOnAnotherReplica", timeout=30) + + # Restart all keeper nodes. With session_timeout_ms=5000 (see zookeeper.xml) + # and >5s of total quorum-loss below, every client session expires, so + # node2's 'running' ephemeral disappears -- but node2's SELECT keeps + # running because query execution is independent of keeper. This is the + # bug's trigger condition and mirrors the user's manual repro. + started_cluster.stop_zookeeper_nodes(["zoo1", "zoo2", "zoo3"]) + started_cluster.start_zookeeper_nodes(["zoo1", "zoo2", "zoo3"]) + started_cluster.wait_zookeeper_nodes_to_start(["zoo1", "zoo2", "zoo3"], timeout=60) + + # Make node1 want to refresh for the same yearly timeslot. With the fix, + # node1 sees `refresh_running=true` in the root znode and waits for node2 + # to finish; without the fix, node1 races and APPENDs a duplicate row. + node1.query(f"SYSTEM TEST VIEW re.a SET FAKE TIME '{node1_fake}'") + node1.query("SYSTEM START VIEW re.a") + + # Wait for the dust to settle: node2's refresh completes (~30s SELECT + + # post-keeper INSERT) and propagates to keeper, then both replicas idle. + wait_for_status(node1, "a", "Scheduled", timeout=120) + wait_for_status(node2, "a", "Scheduled", timeout=120) + + # Pull both replicas' parts before counting -- otherwise the count races + # against ReplicatedMergeTree replication. + node1.query("SYSTEM SYNC REPLICA re.tgt") + node2.query("SYSTEM SYNC REPLICA re.tgt") + + rows = node1.query("SELECT toInt64(t), i, _part FROM re.tgt ORDER BY t") + final_count = int(node1.query("SELECT count() FROM re.tgt").strip()) + + assert final_count == 1, ( + f"Expected 1 row APPENDed for the timeslot, got {final_count}. " + f"Rows:\n{rows}" + ) diff --git a/tests/integration/test_ssh/test.py b/tests/integration/test_ssh/test.py index c28d0aeb1630..8f200a53a845 100644 --- a/tests/integration/test_ssh/test.py +++ b/tests/integration/test_ssh/test.py @@ -1,5 +1,8 @@ import os +import re +import socket import subprocess +import time import paramiko import pytest @@ -158,3 +161,153 @@ def test_paramiko_password(started_cluster): # Secsh channel 1 open FAILED: : Administratively prohibited client.close() + + +def _server_open_files(): + pid = instance.get_process_pid("clickhouse") + assert pid is not None, "could not locate clickhouse PID" + out = instance.exec_in_container( + ["bash", "-c", f"ls -1 /proc/{pid}/fd | wc -l"] + ) + return int(out.strip()) + + +def _hold_server_fds(port, count): + sockets = [] + for _ in range(count): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.connect((instance.ip_address, port)) + sockets.append(s) + return sockets + + +def test_ssh_interactive_pty_with_high_fds(started_cluster): + """ + Regression test for the `select(2)` / `fd_set` overflow inside + `replxx::Terminal::wait_for_input` when the embedded clickhouse-client is + reached over SSH PTY in a process whose fd table extends past + `FD_SETSIZE` (1024 on glibc). + + With the bug present, the PTY-slave fd handed to replxx ends up >= 1024; + `select(nfds > FD_SETSIZE, ...)` returns -1/EINVAL on every iteration, + `wait_for_input` busy-loops, and the interactive shell never observes + user input — so the query result never comes back to the SSH peer. Under + ASan the same code path trips a stack-buffer-overflow on the on-stack + `fd_set`. + + The test asserts the positive behavior: an interactive PTY session with + > 1024 fds open on the server returns the expected query result within a + bounded time. + """ + # Inflate the server-side fd count to well above FD_SETSIZE (1024) so the + # fds opened by the new SSH session (TCP socket, PTY master/slave, libssh + # internal pipes) are guaranteed to land at >= 1024. We pick a margin large + # enough that the test is deterministic regardless of how many fds the + # server has already allocated for its own internals, but small enough to + # stay well under `max_connections` (bumped to 8192 in the test config). + DUMMY_CONNECTIONS = 1500 + + baseline_files = _server_open_files() + keepalive = _hold_server_fds(9000, DUMMY_CONNECTIONS) + try: + # Sanity check: the server-process fd count rose by at least most of + # our connections. Poco's `TCPServer` has a single accept thread, so + # the kernel can complete all 1500 TCP handshakes well before + # userspace `accept(2)` has drained the kernel accept queue and + # allocated fds. Poll until the count crosses the threshold instead + # of reading it once. + threshold = DUMMY_CONNECTIONS * 9 // 10 + deadline = time.time() + 30 + while time.time() < deadline: + after_files = _server_open_files() + delta = after_files - baseline_files + if delta >= threshold: + break + time.sleep(0.5) + else: + raise AssertionError( + f"expected clickhouse-server fd count to grow by ~{DUMMY_CONNECTIONS}, " + f"got delta={delta} (baseline={baseline_files}, after={after_files}); " + "Poco's accept thread did not drain the kernel accept queue" + ) + + # Interactive PTY session via paramiko `invoke_shell`. We cannot use + # `exec_command` here: it is non-interactive and never reaches + # `replxx::Terminal::wait_for_input`, which is the function the bug + # lives in. + pkey = paramiko.Ed25519Key.from_private_key_file( + f"{SCRIPT_DIR}/keys/lucy_ed25519" + ) + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + client.connect( + hostname=instance.ip_address, + port=9022, + username="lucy", + pkey=pkey, + timeout=30, + ) + try: + channel = client.invoke_shell(term="xterm", width=80, height=24) + channel.settimeout(20) + try: + # Drain the initial banner / prompt so the post-query output + # is easy to identify. We don't assert on the banner content + # (it changes across versions); break once the output has + # been quiet for a short idle window. + banner_deadline = time.time() + 10 + last_data = time.time() + while time.time() < banner_deadline: + if channel.recv_ready(): + channel.recv(65536) + last_data = time.time() + elif time.time() - last_data > 0.5: + break + else: + time.sleep(0.05) + + channel.sendall("SELECT 1 FORMAT TSV\n") + + # Wait for the query result. With the bug, `wait_for_input` + # is spinning on EINVAL and no input is ever delivered to + # the client, so the result never appears and we time out. + # + # `SELECT 1 FORMAT TSV` returns the bare line "1", followed + # by the client footer `" row in set. Elapsed: ..."`. + # The query itself is echoed back by replxx (with ANSI + # colors), so a naive `b"1" in buf` would match the echoed + # query and produce a false positive even on a hung session. + # The footer string is printed only after the result has + # been emitted and never appears in echoed input, so use it + # as the unambiguous completion marker. Strip ANSI escapes + # before searching, since replxx interleaves SGR / cursor + # codes throughout the output stream. + ansi_re = re.compile(rb"\x1b\[[0-9;?]*[A-Za-z]") + buf = b"" + got_result = False + deadline = time.time() + 15 + while time.time() < deadline: + if channel.recv_ready(): + buf += channel.recv(65536) + clean = ansi_re.sub(b"", buf) + if b"1 row in set" in clean: + got_result = True + break + else: + time.sleep(0.05) + + assert got_result, ( + "interactive SSH PTY session did not return query result " + "within timeout — likely select(2)/fd_set overflow hang " + f"in replxx::Terminal::wait_for_input. raw output: {buf!r}" + ) + finally: + channel.close() + finally: + client.close() + finally: + for s in keepalive: + try: + s.close() + except OSError: + pass diff --git a/tests/integration/test_tcp_query_body_oversized_read/__init__.py b/tests/integration/test_tcp_query_body_oversized_read/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/integration/test_tcp_query_body_oversized_read/configs/no_log_noise.xml b/tests/integration/test_tcp_query_body_oversized_read/configs/no_log_noise.xml new file mode 100644 index 000000000000..8c6c6c93fb64 --- /dev/null +++ b/tests/integration/test_tcp_query_body_oversized_read/configs/no_log_noise.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/tests/integration/test_tcp_query_body_oversized_read/test.py b/tests/integration/test_tcp_query_body_oversized_read/test.py new file mode 100644 index 000000000000..5db160eaae69 --- /dev/null +++ b/tests/integration/test_tcp_query_body_oversized_read/test.py @@ -0,0 +1,126 @@ +""" +Regression test for the native TCP handler: a client that sends a query body +larger than the memory limit must be rejected cleanly, without driving the +server's RSS past `max_server_memory_usage` and tripping the cgroup OOM +killer. + +The previous implementation in `src/Server/TCPHandler.cpp` read the query +string via `readStringBinary(state->query, *in)`, where `state->query` was +`std::string`. The resize on receive went through `operator new` -> +`Memory::trackMemoryFromC` -> `allocNoThrow`, which never throws on per- +server / per-query memory limits. A single client could push the server's +RSS past `max_server_memory_usage` and get cgroup-OOM-killed. + +The fix changes `state->query` to `StringWithMemoryTracking`, whose +allocator goes through `CurrentMemoryTracker::alloc` (the throwing path). +The oversized resize now throws `MEMORY_LIMIT_EXCEEDED` and the server +stays alive. + +This test runs ClickHouse in a 1 GiB Docker memory cgroup, sends a single +~950 MiB query body, and asserts: + - the server is still running afterwards, + - no cgroup OOM kill was recorded, + - the server log records the `MEMORY_LIMIT_EXCEEDED` exception. +""" + +import time + +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +# 1 GiB container memory limit. ClickHouse will detect this via cgroup v2 and +# derive `max_server_memory_usage = 0.9 * 1 GiB = 921.6 MiB`. +node = cluster.add_instance( + "node", + main_configs=["configs/no_log_noise.xml"], + mem_limit="1g", + stay_alive=True, +) + + +@pytest.fixture(scope="module", autouse=True) +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def _cgroup_oom_kill_count(n): + """Read kernel-recorded OOM kills from the container's cgroup v2 stats.""" + out = n.exec_in_container( + ["bash", "-c", "cat /sys/fs/cgroup/memory.events 2>/dev/null || true"], + user="root", + nothrow=True, + ) + for line in (out or "").splitlines(): + key, _, value = line.strip().partition(" ") + if key == "oom_kill": + return int(value) + return 0 + + +def test_tcp_query_body_oversized_read(): + if node.is_built_with_sanitizer(): + pytest.skip("Disabled for sanitizers — they perturb the memory cap.") + + # 1. Confirm the server saw and applied the 1 GiB cgroup limit. Without + # this, the assertions below would not be exercising the per-server + # memory limit at all. + assert node.contains_in_log("Available RAM: 1.00 GiB"), ( + "server did not detect the 1 GiB cgroup limit at startup" + ) + assert node.contains_in_log("Changed setting 'max_server_memory_usage'"), ( + "server did not derive max_server_memory_usage from available RAM" + ) + + pid_before = node.get_process_pid("clickhouse") + oom_kill_before = _cgroup_oom_kill_count(node) + + # 2. Send a ~950 MiB query body via the native TCP protocol from the + # host-side client. The client process is on the host (outside the + # container's cgroup) so it can buffer the query body without being + # OOM-killed; only the server is on the diet. + # `ignore_error=True` because the server rejects mid-send and the + # client typically observes the disconnection. + big_query = "SELECT length('" + "x" * (950 * 1024 * 1024) + "')" + node.query( + big_query, + settings={"max_memory_usage": 1000000, "max_query_size": 1000}, + ignore_error=True, + timeout=60, + ) + + # 3. Give the server a moment to settle. + time.sleep(1) + + # 4. Server must still be alive — no OOM kill, no crash. + pid_after = node.get_process_pid("clickhouse") + assert pid_after is not None, ( + "expected ClickHouse to stay alive after rejecting the oversized " + "query, but the process is gone" + ) + assert pid_after == pid_before, ( + f"expected same ClickHouse process to keep running, " + f"got PID {pid_before} -> {pid_after}" + ) + assert node.query("SELECT 1").strip() == "1", ( + "server is unresponsive after rejecting the oversized query" + ) + + # 5. The kernel cgroup OOM killer must not have fired. + oom_kill_after = _cgroup_oom_kill_count(node) + assert oom_kill_after == oom_kill_before, ( + f"unexpected cgroup OOM kill: counter {oom_kill_before} -> {oom_kill_after}" + ) + + # 6. Confirm the rejection went through `MEMORY_LIMIT_EXCEEDED`, not via + # some other code path (which would not protect against memory growth + # from the unbounded read). + assert node.contains_in_log("MEMORY_LIMIT_EXCEEDED"), ( + "expected MEMORY_LIMIT_EXCEEDED in server log after oversized query" + ) diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index c3beecf6ca52..064b7c1eaa76 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -168,7 +168,7 @@ SYSTEM MOVES ['SYSTEM STOP MOVES','SYSTEM START MOVES','STOP MOVES','START MOVES SYSTEM SWARM ['SYSTEM STOP SWARM MODE','SYSTEM START SWARM MODE','STOP SWARM MODE','START SWARM MODE'] GLOBAL SYSTEM SYSTEM PULLING REPLICATION LOG ['SYSTEM STOP PULLING REPLICATION LOG','SYSTEM START PULLING REPLICATION LOG'] TABLE SYSTEM SYSTEM CLEANUP ['SYSTEM STOP CLEANUP','SYSTEM START CLEANUP'] TABLE SYSTEM -SYSTEM VIEWS ['SYSTEM REFRESH VIEW','SYSTEM START VIEWS','SYSTEM STOP VIEWS','SYSTEM START VIEW','SYSTEM STOP VIEW','SYSTEM CANCEL VIEW','REFRESH VIEW','START VIEWS','STOP VIEWS','START VIEW','STOP VIEW','CANCEL VIEW'] VIEW SYSTEM +SYSTEM VIEWS ['SYSTEM REFRESH VIEW','SYSTEM START VIEWS','SYSTEM STOP VIEWS','SYSTEM START VIEW','SYSTEM STOP VIEW','SYSTEM PAUSE VIEWS','SYSTEM PAUSE VIEW','SYSTEM CANCEL VIEW','REFRESH VIEW','START VIEWS','STOP VIEWS','START VIEW','STOP VIEW','PAUSE VIEWS','PAUSE VIEW','CANCEL VIEW'] VIEW SYSTEM SYSTEM DISTRIBUTED SENDS ['SYSTEM STOP DISTRIBUTED SENDS','SYSTEM START DISTRIBUTED SENDS','STOP DISTRIBUTED SENDS','START DISTRIBUTED SENDS'] TABLE SYSTEM SENDS SYSTEM REPLICATED SENDS ['SYSTEM STOP REPLICATED SENDS','SYSTEM START REPLICATED SENDS','STOP REPLICATED SENDS','START REPLICATED SENDS'] TABLE SYSTEM SENDS SYSTEM SENDS ['SYSTEM STOP SENDS','SYSTEM START SENDS','STOP SENDS','START SENDS'] \N SYSTEM diff --git a/tests/queries/0_stateless/03913_datalake_restful_catalog_bad_format.sh b/tests/queries/0_stateless/03913_datalake_restful_catalog_bad_format.sh index fb32a9b0c162..fcac9822b4bf 100755 --- a/tests/queries/0_stateless/03913_datalake_restful_catalog_bad_format.sh +++ b/tests/queries/0_stateless/03913_datalake_restful_catalog_bad_format.sh @@ -27,7 +27,7 @@ SETTINGS $CLICKHOUSE_CLIENT -q " SELECT database || '.' || name FROM system.tables WHERE database = '${NEW_DB_NAME}' AND engine = 'MergeTree' -SETTINGS show_data_lake_catalogs_in_system_tables = 1; +SETTINGS show_remote_databases_in_system_tables = 1; " $CLICKHOUSE_CLIENT -q "DROP DATABASE IF EXISTS ${NEW_DB_NAME};" diff --git a/tests/queries/0_stateless/04056_mutation_session_timezone_datetime_literals.reference b/tests/queries/0_stateless/04056_mutation_session_timezone_datetime_literals.reference index 179770dad9a3..ff58247073db 100644 --- a/tests/queries/0_stateless/04056_mutation_session_timezone_datetime_literals.reference +++ b/tests/queries/0_stateless/04056_mutation_session_timezone_datetime_literals.reference @@ -15,4 +15,8 @@ after delete lc 1 2000-01-01 01:02:03 before delete nullable dt64 1 2000-01-01 01:02:03.123 before delete nullable dt64 2 2000-01-01 04:05:06.456 after delete nullable dt64 1 2000-01-01 01:02:03.123 +update set dt 1 946735200 +update set dt 2 946735200 +update set dt64 1 946735200 +update set dt64 2 946735200 explicit tz 1 2000-01-01 01:02:03 diff --git a/tests/queries/0_stateless/04056_mutation_session_timezone_datetime_literals.sql b/tests/queries/0_stateless/04056_mutation_session_timezone_datetime_literals.sql index bd48cc2b1ad7..60a49ae9d3c0 100644 --- a/tests/queries/0_stateless/04056_mutation_session_timezone_datetime_literals.sql +++ b/tests/queries/0_stateless/04056_mutation_session_timezone_datetime_literals.sql @@ -66,6 +66,26 @@ SELECT 'before delete nullable dt64', id, time FROM test_mutation_tz64_nullable ALTER TABLE test_mutation_tz64_nullable DELETE WHERE time >= '2000-01-01 02:00:00'; SELECT 'after delete nullable dt64', id, time FROM test_mutation_tz64_nullable ORDER BY id; +-- ALTER UPDATE SET DateTime column to a string literal — the literal must be +-- interpreted in session timezone, not server timezone +DROP TABLE IF EXISTS test_mutation_tz_set SYNC; +CREATE TABLE test_mutation_tz_set (id UInt32, time DateTime) ENGINE = MergeTree ORDER BY id; + +INSERT INTO test_mutation_tz_set VALUES (1, '2000-01-01 00:00:00'); +ALTER TABLE test_mutation_tz_set UPDATE time = '2000-01-01 07:00:00' WHERE id = 1; +-- Verify UPDATE SET used session timezone: insert the same literal and compare unix timestamps +INSERT INTO test_mutation_tz_set VALUES (2, '2000-01-01 07:00:00'); +SELECT 'update set dt', id, toUnixTimestamp(time) FROM test_mutation_tz_set ORDER BY id; + +-- Same for DateTime64 +DROP TABLE IF EXISTS test_mutation_tz64_set SYNC; +CREATE TABLE test_mutation_tz64_set (id UInt32, time DateTime64(3)) ENGINE = MergeTree ORDER BY id; + +INSERT INTO test_mutation_tz64_set VALUES (1, '2000-01-01 00:00:00.000'); +ALTER TABLE test_mutation_tz64_set UPDATE time = '2000-01-01 07:00:00.123' WHERE id = 1; +INSERT INTO test_mutation_tz64_set VALUES (2, '2000-01-01 07:00:00.123'); +SELECT 'update set dt64', id, toUnixTimestamp(time) FROM test_mutation_tz64_set ORDER BY id; + -- DateTime with explicit timezone should NOT be rewritten (timezone is already determined) DROP TABLE IF EXISTS test_mutation_tz_explicit SYNC; CREATE TABLE test_mutation_tz_explicit (id UInt32, time DateTime('UTC')) ENGINE = MergeTree ORDER BY id; @@ -80,4 +100,6 @@ DROP TABLE test_mutation_tz_upd SYNC; DROP TABLE test_mutation_tz_nullable SYNC; DROP TABLE test_mutation_tz_lc SYNC; DROP TABLE test_mutation_tz64_nullable SYNC; +DROP TABLE test_mutation_tz_set SYNC; +DROP TABLE test_mutation_tz64_set SYNC; DROP TABLE test_mutation_tz_explicit SYNC; diff --git a/tests/queries/0_stateless/04070_view_with_projection_not_found_column.reference b/tests/queries/0_stateless/04070_view_with_projection_not_found_column.reference new file mode 100644 index 000000000000..2f35ae532c86 --- /dev/null +++ b/tests/queries/0_stateless/04070_view_with_projection_not_found_column.reference @@ -0,0 +1 @@ +42 42 diff --git a/tests/queries/0_stateless/04070_view_with_projection_not_found_column.sql b/tests/queries/0_stateless/04070_view_with_projection_not_found_column.sql new file mode 100644 index 000000000000..edad9a92d679 --- /dev/null +++ b/tests/queries/0_stateless/04070_view_with_projection_not_found_column.sql @@ -0,0 +1,24 @@ +DROP TABLE IF EXISTS test_proj; +DROP VIEW IF EXISTS test_proj_view; + +CREATE TABLE test_proj +( + id UInt64, + name String, + PROJECTION by_name + ( + SELECT * + ORDER BY name + ) +) +ENGINE = MergeTree +ORDER BY id; + +INSERT INTO test_proj SELECT number, toString(number) FROM numbers(1000000); + +CREATE VIEW test_proj_view AS SELECT * FROM test_proj; + +SELECT * FROM test_proj_view WHERE name = '42'; + +DROP VIEW test_proj_view; +DROP TABLE test_proj; diff --git a/tests/queries/0_stateless/04082_materialize_skip_indexes_text_index.reference b/tests/queries/0_stateless/04082_materialize_skip_indexes_text_index.reference new file mode 100644 index 000000000000..e227e7866023 --- /dev/null +++ b/tests/queries/0_stateless/04082_materialize_skip_indexes_text_index.reference @@ -0,0 +1,12 @@ +After merge with materialize_skip_indexes_on_merge=0 +idx_minmax 0 +idx_text 0 +After explicit MATERIALIZE INDEX +idx_minmax 1 +idx_text 1 +After merge with materialize_skip_indexes_on_merge=1 +idx_minmax 1 +idx_text 1 +After merge with exclude idx_text +idx_minmax 1 +idx_text 0 diff --git a/tests/queries/0_stateless/04082_materialize_skip_indexes_text_index.sql b/tests/queries/0_stateless/04082_materialize_skip_indexes_text_index.sql new file mode 100644 index 000000000000..1e773cf9d848 --- /dev/null +++ b/tests/queries/0_stateless/04082_materialize_skip_indexes_text_index.sql @@ -0,0 +1,82 @@ +-- Regression test for https://github.com/ClickHouse/ClickHouse/issues/101666 +-- materialize_skip_indexes_on_merge=false must suppress text (full-text) indexes during merge, +-- not only minmax/set/bloom_filter etc. Text indexes use a separate container +-- (text_indexes_to_merge) which was previously not cleared by the setting. + +SET mutations_sync = 2; + +DROP TABLE IF EXISTS t_text_idx; + +CREATE TABLE t_text_idx +( + id UInt64, + s String, + INDEX idx_text s TYPE text(tokenizer='splitByNonAlpha') GRANULARITY 1, + INDEX idx_minmax id TYPE minmax GRANULARITY 1 +) +ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 4, materialize_skip_indexes_on_merge = 0; + +INSERT INTO t_text_idx SELECT number, 'hello world' FROM numbers(20); +INSERT INTO t_text_idx SELECT number + 20, 'hello world' FROM numbers(20); + +OPTIMIZE TABLE t_text_idx FINAL; + +-- Both indexes should have no data after merge with materialize_skip_indexes_on_merge=0 +SELECT 'After merge with materialize_skip_indexes_on_merge=0'; +SELECT name, data_compressed_bytes > 0 AS has_data +FROM system.data_skipping_indices +WHERE database = currentDatabase() AND table = 't_text_idx' +ORDER BY name; + +-- Explicit MATERIALIZE INDEX should still rebuild them +ALTER TABLE t_text_idx MATERIALIZE INDEX idx_text; +ALTER TABLE t_text_idx MATERIALIZE INDEX idx_minmax; + +SELECT 'After explicit MATERIALIZE INDEX'; +SELECT name, data_compressed_bytes > 0 AS has_data +FROM system.data_skipping_indices +WHERE database = currentDatabase() AND table = 't_text_idx' +ORDER BY name; + +-- Reset setting and verify indexes are built during merge again +ALTER TABLE t_text_idx MODIFY SETTING materialize_skip_indexes_on_merge = 1; + +TRUNCATE TABLE t_text_idx; +INSERT INTO t_text_idx SELECT number, 'hello world' FROM numbers(20); +INSERT INTO t_text_idx SELECT number + 20, 'hello world' FROM numbers(20); + +OPTIMIZE TABLE t_text_idx FINAL; + +SELECT 'After merge with materialize_skip_indexes_on_merge=1'; +SELECT name, data_compressed_bytes > 0 AS has_data +FROM system.data_skipping_indices +WHERE database = currentDatabase() AND table = 't_text_idx' +ORDER BY name; + +DROP TABLE t_text_idx; + +-- Also test exclude_materialize_skip_indexes_on_merge with text indexes +DROP TABLE IF EXISTS t_text_exclude; + +CREATE TABLE t_text_exclude +( + id UInt64, + s String, + INDEX idx_text s TYPE text(tokenizer='splitByNonAlpha') GRANULARITY 1, + INDEX idx_minmax id TYPE minmax GRANULARITY 1 +) +ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 4, exclude_materialize_skip_indexes_on_merge = 'idx_text'; + +INSERT INTO t_text_exclude SELECT number, 'hello world' FROM numbers(20); +INSERT INTO t_text_exclude SELECT number + 20, 'hello world' FROM numbers(20); + +OPTIMIZE TABLE t_text_exclude FINAL; + +-- Only idx_text should be excluded; idx_minmax should have data +SELECT 'After merge with exclude idx_text'; +SELECT name, data_compressed_bytes > 0 AS has_data +FROM system.data_skipping_indices +WHERE database = currentDatabase() AND table = 't_text_exclude' +ORDER BY name; + +DROP TABLE t_text_exclude; diff --git a/tests/queries/0_stateless/04103_cast_string_to_date_time_mode_nullable.reference b/tests/queries/0_stateless/04103_cast_string_to_date_time_mode_nullable.reference new file mode 100644 index 000000000000..a38e984442cb --- /dev/null +++ b/tests/queries/0_stateless/04103_cast_string_to_date_time_mode_nullable.reference @@ -0,0 +1,7 @@ +2020-02-01 20:00:00 +2020-02-01 20:00:00 +2020-01-02 20:00:00 +2020-01-02 20:00:00 +\N +\N +2020-02-01 20:00:00 diff --git a/tests/queries/0_stateless/04103_cast_string_to_date_time_mode_nullable.sql b/tests/queries/0_stateless/04103_cast_string_to_date_time_mode_nullable.sql new file mode 100644 index 000000000000..da890add4fb0 --- /dev/null +++ b/tests/queries/0_stateless/04103_cast_string_to_date_time_mode_nullable.sql @@ -0,0 +1,22 @@ +-- Tags: no-fasttest +-- Test that cast_string_to_date_time_mode setting works with Nullable(DateTime) CAST +-- https://github.com/ClickHouse/ClickHouse/issues/101840 + +SET session_timezone = 'UTC'; + +-- best_effort mode: non-Nullable (baseline) +SELECT CAST('2020-02-01T20:00:00Z' AS DateTime) SETTINGS cast_string_to_date_time_mode = 'best_effort'; +-- best_effort mode: Nullable should also work (was returning NULL before the fix) +SELECT CAST('2020-02-01T20:00:00Z' AS Nullable(DateTime)) SETTINGS cast_string_to_date_time_mode = 'best_effort'; + +-- best_effort_us mode: non-Nullable (baseline) +SELECT CAST('01/02/2020 20:00:00' AS DateTime) SETTINGS cast_string_to_date_time_mode = 'best_effort_us'; +-- best_effort_us mode: Nullable should also work (was returning NULL before the fix) +SELECT CAST('01/02/2020 20:00:00' AS Nullable(DateTime)) SETTINGS cast_string_to_date_time_mode = 'best_effort_us'; + +-- Invalid strings should still return NULL for Nullable (not throw) +SELECT CAST('not_a_date' AS Nullable(DateTime)) SETTINGS cast_string_to_date_time_mode = 'best_effort'; +SELECT CAST('not_a_date' AS Nullable(DateTime)) SETTINGS cast_string_to_date_time_mode = 'best_effort_us'; + +-- Basic mode should still work as before +SELECT CAST('2020-02-01 20:00:00' AS Nullable(DateTime)) SETTINGS cast_string_to_date_time_mode = 'basic'; diff --git a/tests/queries/0_stateless/04105_system_pause_view.reference b/tests/queries/0_stateless/04105_system_pause_view.reference new file mode 100644 index 000000000000..e97f918273c3 --- /dev/null +++ b/tests/queries/0_stateless/04105_system_pause_view.reference @@ -0,0 +1,8 @@ +<1: pause allows current refresh to complete> 5 1 1 +<2: start resumes paused view> 42 +<3: stop cancels the refresh> 0 1 +<3a: stop after pause cancels the refresh> 0 1 +<4: pause views disables all> 2 2 +<5: start views resumes all> 2 2 +<6: granted pause works> +<6: denied pause errors as expected> diff --git a/tests/queries/0_stateless/04105_system_pause_view.sh b/tests/queries/0_stateless/04105_system_pause_view.sh new file mode 100755 index 000000000000..92f1c732cc44 --- /dev/null +++ b/tests/queries/0_stateless/04105_system_pause_view.sh @@ -0,0 +1,200 @@ +#!/usr/bin/env bash +# Tags: atomic-database, memory-engine, no-parallel +# The test uses `SYSTEM PAUSE VIEWS` and `SYSTEM START VIEWS` which affect all +# refreshable views on the server, so it must not run concurrently with other +# tests that create refreshable materialized views. + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# Set session timezone to UTC to make all DateTime formatting and parsing use UTC, because refresh +# scheduling is done in UTC. +CLICKHOUSE_CLIENT="`echo "$CLICKHOUSE_CLIENT" | sed 's/--session_timezone[= ][^ ]*//g'`" +CLICKHOUSE_CLIENT="`echo "$CLICKHOUSE_CLIENT --allow_materialized_view_with_bad_select=0 --session_timezone Etc/UTC"`" + +$CLICKHOUSE_CLIENT -q "create view refreshes as select * from system.view_refreshes where database = '$CLICKHOUSE_DATABASE' order by view" + +# Helper: wait until the view's status matches the expected one. +wait_status() { + local view_name=$1 + local expected=$2 + while [ "`$CLICKHOUSE_CLIENT -q "select status from refreshes where view = '$view_name' -- $LINENO" | xargs`" != "$expected" ] + do + sleep 0.1 + done +} + +# --------------------------------------------------------------------------- +# Test 1: SYSTEM PAUSE VIEW does NOT interrupt the current refresh. +# --------------------------------------------------------------------------- + +# A slow refresh: 5 rows * 1 second each, long enough to observe `Running` and pause. +$CLICKHOUSE_CLIENT -q " + create table src (x Int64) engine Memory; + insert into src select * from numbers(5) settings max_block_size=1; + create materialized view p refresh every 1 year (x Int64) engine Memory empty as + select x + sleepEachRow(1) as x from src settings max_block_size = 1, max_threads = 1; + system refresh view p;" + +wait_status p Running + +# Pause while the refresh is running. This should NOT cancel the ongoing refresh. +$CLICKHOUSE_CLIENT -q "system pause view p;" + +# The currently running refresh must complete successfully. +$CLICKHOUSE_CLIENT -q "system wait view p;" + +# After pause, the view must be disabled (no new refresh scheduled). +wait_status p Disabled + +$CLICKHOUSE_CLIENT -q " + select '<1: pause allows current refresh to complete>', + (select count() from p), + (select exception = '' from refreshes where view = 'p'), + (select last_success_time is not null from refreshes where view = 'p');" + +# SYSTEM START VIEW should resume a paused view. +$CLICKHOUSE_CLIENT -q " + truncate src; + insert into src values (42); + system start view p; + system refresh view p; + system wait view p; + select '<2: start resumes paused view>', * from p; + system stop view p; + drop table p; + drop table src;" + +# --------------------------------------------------------------------------- +# Test 2 (contrast): SYSTEM STOP VIEW DOES interrupt the current refresh. +# --------------------------------------------------------------------------- + +$CLICKHOUSE_CLIENT -q " + create table src (x Int64) engine Memory; + insert into src select * from numbers(5) settings max_block_size=1; + create materialized view s refresh every 1 year (x Int64) engine Memory empty as + select x + sleepEachRow(1) as x from src settings max_block_size = 1, max_threads = 1; + system refresh view s;" + +wait_status s Running + +$CLICKHOUSE_CLIENT -q "system stop view s;" + +# STOP should interrupt the refresh; status quickly becomes Disabled with a cancellation error. +wait_status s Disabled + +$CLICKHOUSE_CLIENT -q " + select '<3: stop cancels the refresh>', + (select count() from s), + (select exception != '' from refreshes where view = 's'); + drop table s; + drop table src;" + +# --------------------------------------------------------------------------- +# Test 3: SYSTEM STOP VIEW after SYSTEM PAUSE VIEW still interrupts the +# in-flight refresh. Regression test for the PAUSE-then-STOP interaction: +# once paused, a subsequent STOP must still cancel the running refresh, even +# though PAUSE has already set the same `stop_requested` guard. +# --------------------------------------------------------------------------- + +$CLICKHOUSE_CLIENT -q " + create table src (x Int64) engine Memory; + insert into src select * from numbers(10) settings max_block_size=1; + create materialized view ps refresh every 1 year (x Int64) engine Memory empty as + select x + sleepEachRow(1) as x from src settings max_block_size = 1, max_threads = 1; + system refresh view ps;" + +wait_status ps Running + +# Pause first; refresh keeps running because PAUSE does not interrupt. +$CLICKHOUSE_CLIENT -q "system pause view ps;" + +# Now STOP must still interrupt the in-flight refresh. +$CLICKHOUSE_CLIENT -q "system stop view ps;" + +wait_status ps Disabled + +$CLICKHOUSE_CLIENT -q " + select '<3a: stop after pause cancels the refresh>', + (select count() from ps), + (select exception != '' from refreshes where view = 'ps'); + drop table ps; + drop table src;" + +# --------------------------------------------------------------------------- +# Test 4: SYSTEM PAUSE VIEWS pauses all refreshable views on this replica. +# --------------------------------------------------------------------------- + +$CLICKHOUSE_CLIENT -q " + create table src (x Int64) engine Memory; + insert into src values (1); + create materialized view va refresh every 1 second (x Int64) engine Memory empty + as select x from src; + create materialized view vb refresh every 1 second (x Int64) engine Memory empty + as select x from src; + system wait view va; + system wait view vb;" + +$CLICKHOUSE_CLIENT -q "system pause views;" + +wait_status va Disabled +wait_status vb Disabled + +$CLICKHOUSE_CLIENT -q " + select '<4: pause views disables all>', + countIf(status = 'Disabled'), count() + from refreshes where view in ('va', 'vb');" + +$CLICKHOUSE_CLIENT -q "system start views;" + +# After start views, each view should leave the Disabled state. +while [ "`$CLICKHOUSE_CLIENT -q "select countIf(status = 'Disabled') from refreshes where view in ('va', 'vb') -- $LINENO" | xargs`" != '0' ] +do + sleep 0.1 +done + +$CLICKHOUSE_CLIENT -q " + select '<5: start views resumes all>', + countIf(status != 'Disabled'), count() + from refreshes where view in ('va', 'vb'); + system stop view va; + system stop view vb; + drop table va; + drop table vb; + drop table src;" + +# --------------------------------------------------------------------------- +# Test 4: Access control for SYSTEM PAUSE VIEW. +# --------------------------------------------------------------------------- + +test_user="user_04105_$CLICKHOUSE_DATABASE" +$CLICKHOUSE_CLIENT -q " + create user $test_user; + create table src (x Int64) engine Memory; + insert into src values (1); + create materialized view granted refresh every 1 year (x Int64) engine Memory empty as select x from src; + create materialized view denied refresh every 1 year (x Int64) engine Memory empty as select x from src; + system wait view granted; + system wait view denied; + grant system views on $CLICKHOUSE_DATABASE.granted to $test_user;" + +# Pausing a view without SYSTEM VIEWS privilege should fail. +$CLICKHOUSE_CLIENT --user $test_user -q " + system pause view $CLICKHOUSE_DATABASE.denied; -- {serverError ACCESS_DENIED}" + +# Pausing a view for which the user has SYSTEM VIEWS should succeed. +$CLICKHOUSE_CLIENT --user $test_user -q " + system pause view $CLICKHOUSE_DATABASE.granted;" +wait_status granted Disabled + +echo '<6: granted pause works>' +echo '<6: denied pause errors as expected>' + +$CLICKHOUSE_CLIENT -q " + system stop view granted; + system stop view denied; + drop table granted; + drop table denied; + drop table src; + drop user $test_user;" diff --git a/tests/queries/0_stateless/04113_compact_part_writer_cancel_on_exception.reference b/tests/queries/0_stateless/04113_compact_part_writer_cancel_on_exception.reference new file mode 100644 index 000000000000..08839f6bb296 --- /dev/null +++ b/tests/queries/0_stateless/04113_compact_part_writer_cancel_on_exception.reference @@ -0,0 +1 @@ +200 diff --git a/tests/queries/0_stateless/04113_compact_part_writer_cancel_on_exception.sql b/tests/queries/0_stateless/04113_compact_part_writer_cancel_on_exception.sql new file mode 100644 index 000000000000..b7f9121ea644 --- /dev/null +++ b/tests/queries/0_stateless/04113_compact_part_writer_cancel_on_exception.sql @@ -0,0 +1,28 @@ +-- Tags: no-parallel, no-random-merge-tree-settings +-- Regression test: `MergeTreeDataPartWriterCompact::cancel` must not dereference +-- a null `shared_ptr` when `addStreams` fails before fully constructing the stream. + +DROP TABLE IF EXISTS t_compact_cancel; + +CREATE TABLE t_compact_cancel (a UInt64, b String, c Float64) +ENGINE = MergeTree ORDER BY a +SETTINGS min_bytes_for_wide_part = '10G', min_rows_for_wide_part = 10000000; + +-- Prevent background merges from racing with the failpoint. +SYSTEM STOP MERGES t_compact_cancel; + +INSERT INTO t_compact_cancel SELECT number, toString(number), number FROM numbers(100); +INSERT INTO t_compact_cancel SELECT number, toString(number), number FROM numbers(100, 100); + +-- Force the Compact writer's `addStreams` to throw during OPTIMIZE (merge). +SYSTEM ENABLE FAILPOINT compact_part_writer_fail_in_add_streams; +SYSTEM START MERGES t_compact_cancel; + +OPTIMIZE TABLE t_compact_cancel FINAL; -- {serverError FAULT_INJECTED} + +SYSTEM DISABLE FAILPOINT compact_part_writer_fail_in_add_streams; + +-- The server must still be alive and the table readable. +SELECT count() FROM t_compact_cancel; + +DROP TABLE t_compact_cancel; diff --git a/tests/queries/0_stateless/04210_show_remote_databases_in_system_tables.reference b/tests/queries/0_stateless/04210_show_remote_databases_in_system_tables.reference new file mode 100644 index 000000000000..16078ff584ee --- /dev/null +++ b/tests/queries/0_stateless/04210_show_remote_databases_in_system_tables.reference @@ -0,0 +1,20 @@ +--- system.databases always shows remote databases, regardless of the setting --- +MySQL +PostgreSQL +MySQL +PostgreSQL +--- system.tables hides PostgreSQL by default (count must be 0 without contacting the server) --- +0 +--- system.tables hides MySQL by default --- +0 +--- system.columns hides PostgreSQL by default --- +0 +--- system.columns hides MySQL by default --- +0 +--- system.completions does not list remote databases by default (count must be 0 without contacting the server) --- +0 +--- the old setting name still works as an alias --- +0 +--- system.settings exposes both the new name and the alias --- +show_data_lake_catalogs_in_system_tables show_remote_databases_in_system_tables +show_remote_databases_in_system_tables diff --git a/tests/queries/0_stateless/04210_show_remote_databases_in_system_tables.sh b/tests/queries/0_stateless/04210_show_remote_databases_in_system_tables.sh new file mode 100755 index 000000000000..03144201fcb7 --- /dev/null +++ b/tests/queries/0_stateless/04210_show_remote_databases_in_system_tables.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +# Tag justification: depends on libpq and libmysql (PostgreSQL and MySQL database engines), +# which are not built in fast test. + +set -e + +CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=fatal + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +PG_DB="${CLICKHOUSE_DATABASE}_pg_remote" +MYSQL_DB="${CLICKHOUSE_DATABASE}_mysql_remote" + +$CLICKHOUSE_CLIENT -q "DROP DATABASE IF EXISTS ${PG_DB}; DROP DATABASE IF EXISTS ${MYSQL_DB};" + +# PostgreSQL database engine does not connect at CREATE time, so a non-existent host is fine here. +$CLICKHOUSE_CLIENT -q "CREATE DATABASE ${PG_DB} ENGINE = PostgreSQL('192.0.2.1:5432', 'fake_db', 'user', 'password');" + +# MySQL probes the server at CREATE time, but is tolerant of failures on ATTACH. +# Use a short connect_timeout / single try so the tolerated failure happens fast. +$CLICKHOUSE_CLIENT -q "ATTACH DATABASE ${MYSQL_DB} ENGINE = MySQL('192.0.2.1:3306', 'fake_db', 'user', 'password') SETTINGS connect_timeout = 1, connection_max_tries = 1;" + +$CLICKHOUSE_CLIENT -n -q " +SELECT '--- system.databases always shows remote databases, regardless of the setting ---'; +SELECT engine FROM system.databases WHERE name IN ('${PG_DB}', '${MYSQL_DB}') ORDER BY engine SETTINGS show_remote_databases_in_system_tables = 0; +SELECT engine FROM system.databases WHERE name IN ('${PG_DB}', '${MYSQL_DB}') ORDER BY engine SETTINGS show_remote_databases_in_system_tables = 1; + +SELECT '--- system.tables hides PostgreSQL by default (count must be 0 without contacting the server) ---'; +USE ${PG_DB}; +SELECT count() FROM system.tables WHERE database = currentDatabase() SETTINGS show_remote_databases_in_system_tables = 0; + +SELECT '--- system.tables hides MySQL by default ---'; +USE ${MYSQL_DB}; +SELECT count() FROM system.tables WHERE database = currentDatabase() SETTINGS show_remote_databases_in_system_tables = 0; + +SELECT '--- system.columns hides PostgreSQL by default ---'; +USE ${PG_DB}; +SELECT count() FROM system.columns WHERE database = currentDatabase() SETTINGS show_remote_databases_in_system_tables = 0; + +SELECT '--- system.columns hides MySQL by default ---'; +USE ${MYSQL_DB}; +SELECT count() FROM system.columns WHERE database = currentDatabase() SETTINGS show_remote_databases_in_system_tables = 0; + +SELECT '--- system.completions does not list remote databases by default (count must be 0 without contacting the server) ---'; +SELECT count() FROM system.completions WHERE word IN ('${PG_DB}', '${MYSQL_DB}') AND context = 'database' SETTINGS show_remote_databases_in_system_tables = 0; + +SELECT '--- the old setting name still works as an alias ---'; +USE ${PG_DB}; +SELECT count() FROM system.tables WHERE database = currentDatabase() SETTINGS show_data_lake_catalogs_in_system_tables = 0; + +SELECT '--- system.settings exposes both the new name and the alias ---'; +SELECT name, alias_for FROM system.settings WHERE name IN ('show_remote_databases_in_system_tables', 'show_data_lake_catalogs_in_system_tables') ORDER BY name; + +USE ${CLICKHOUSE_DATABASE}; +DROP DATABASE ${PG_DB}; +DROP DATABASE ${MYSQL_DB}; +" diff --git a/tests/queries/0_stateless/04258_set_skip_index_float_where_issue_105355.reference b/tests/queries/0_stateless/04258_set_skip_index_float_where_issue_105355.reference new file mode 100644 index 000000000000..f6e28a37fec4 --- /dev/null +++ b/tests/queries/0_stateless/04258_set_skip_index_float_where_issue_105355.reference @@ -0,0 +1,7 @@ +2 +2 +2 +2 +1.5 +2 +2 diff --git a/tests/queries/0_stateless/04258_set_skip_index_float_where_issue_105355.sql b/tests/queries/0_stateless/04258_set_skip_index_float_where_issue_105355.sql new file mode 100644 index 000000000000..489b7d371afd --- /dev/null +++ b/tests/queries/0_stateless/04258_set_skip_index_float_where_issue_105355.sql @@ -0,0 +1,38 @@ +-- Regression test for https://github.com/ClickHouse/ClickHouse/issues/105355 +-- `__bitWrapperFunc` only supports integer arguments. The set skip index used +-- to wrap any WHERE atom (including atoms whose result type is `Float`) with +-- `__bitWrapperFunc`, which threw the internal "It's a bug!" exception at +-- execution time. After the fix, non-integer atom result types fall back to +-- `UNKNOWN_FIELD` and the query goes through the regular filter path. + +DROP TABLE IF EXISTS t_105355_uint; +DROP TABLE IF EXISTS t_105355_float; + +CREATE TABLE t_105355_uint (c0 UInt32, INDEX idx c0 TYPE set(100) GRANULARITY 4) + ENGINE = MergeTree() ORDER BY c0; +INSERT INTO t_105355_uint VALUES (2); + +-- Float64 atom from arithmetic +SELECT c0 FROM t_105355_uint WHERE c0 + 0.1; + +-- Float64 atom from `log` (non-zero result) +SELECT c0 FROM t_105355_uint WHERE log(c0); + +-- Float64 atom from negation of `log` +SELECT c0 FROM t_105355_uint WHERE -log(c0); + +-- BFloat16 atom +SELECT c0 FROM t_105355_uint WHERE c0 + toBFloat16(0.1); + +-- Float column as a direct WHERE atom (also reachable from an INPUT) +CREATE TABLE t_105355_float (f Float32, INDEX idx f TYPE set(100) GRANULARITY 4) + ENGINE = MergeTree() ORDER BY f; +INSERT INTO t_105355_float VALUES (1.5); +SELECT f FROM t_105355_float WHERE f; + +-- Sanity check: integer atoms still use the index and return correct results +SELECT c0 FROM t_105355_uint WHERE c0 = 2; +SELECT c0 FROM t_105355_uint WHERE c0 > 0; + +DROP TABLE t_105355_uint; +DROP TABLE t_105355_float; diff --git a/tests/queries/0_stateless/04259_dictgetordefault_nullable_default_short_circuit.reference b/tests/queries/0_stateless/04259_dictgetordefault_nullable_default_short_circuit.reference new file mode 100644 index 000000000000..4146f4617364 --- /dev/null +++ b/tests/queries/0_stateless/04259_dictgetordefault_nullable_default_short_circuit.reference @@ -0,0 +1,20 @@ +a0 +a1 +fb_99 +fb_100 +String +('a0',10) +('a1',11) +('fb_99',1099) +('fb_100',1100) +Tuple(\n a String,\n b UInt32) +('a0',10) +('a1',11) +('fb_99',1099) +('fb_100',1100) +Tuple(\n a String,\n b UInt32) +('a0',10) +('a1',11) +('fb_99',1099) +('fb_100',1100) +Tuple(\n a String,\n b UInt32) diff --git a/tests/queries/0_stateless/04259_dictgetordefault_nullable_default_short_circuit.sql b/tests/queries/0_stateless/04259_dictgetordefault_nullable_default_short_circuit.sql new file mode 100644 index 000000000000..536f8f5e974e --- /dev/null +++ b/tests/queries/0_stateless/04259_dictgetordefault_nullable_default_short_circuit.sql @@ -0,0 +1,79 @@ +-- Tags: no-parallel-replicas +-- no-parallel-replicas: Dictionary source tables are not available on parallel-replica workers. + +-- https://github.com/ClickHouse/ClickHouse/issues/104511 +-- dictGetOrDefault should not throw CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN when +-- the default argument is a Nullable expression and short-circuit evaluation is enabled. +-- clearMaskedNullsBeforeCast handles: Nullable(T), Tuple(Nullable(T)), +-- Nullable(Tuple(T)), Nullable(Tuple(Nullable(T))). + +SET short_circuit_function_evaluation = 'enable'; + +DROP TABLE IF EXISTS test_dict_nullable_default_src; +DROP TABLE IF EXISTS test_dict_nullable_default_keys; +DROP DICTIONARY IF EXISTS test_dict_nullable_default_single; +DROP DICTIONARY IF EXISTS test_dict_nullable_default_tuple; + +CREATE TABLE test_dict_nullable_default_src (id UInt64, a String, b UInt32) ENGINE = MergeTree ORDER BY id; +INSERT INTO test_dict_nullable_default_src VALUES (0, 'a0', 10), (1, 'a1', 11); + +CREATE TABLE test_dict_nullable_default_keys (x UInt64) ENGINE = MergeTree ORDER BY x; +INSERT INTO test_dict_nullable_default_keys VALUES (0), (1), (99), (100); + +CREATE DICTIONARY test_dict_nullable_default_single +( + `id` UInt64, + `a` String DEFAULT '' +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE 'test_dict_nullable_default_src')) +LAYOUT(DIRECT()); + +CREATE DICTIONARY test_dict_nullable_default_tuple +( + `id` UInt64, + `a` String DEFAULT '', + `b` UInt32 DEFAULT 0 +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE 'test_dict_nullable_default_src')) +LAYOUT(HASHED()) +LIFETIME(0); + +-- Nullable(String) -> String, single attribute with mixed found/not-found keys. +SELECT dictGetOrDefault('test_dict_nullable_default_single', 'a', x, + toNullable(concat('fb_', toString(x)))) AS v +FROM test_dict_nullable_default_keys; +SELECT toTypeName(dictGetOrDefault('test_dict_nullable_default_single', 'a', toUInt64(0), + toNullable(toString(x)))) FROM test_dict_nullable_default_keys LIMIT 1; + +-- Tuple(Nullable(String), Nullable(UInt32)) -> Tuple(String, UInt32). +SELECT dictGetOrDefault('test_dict_nullable_default_tuple', ('a', 'b'), x, + (toNullable(concat('fb_', toString(x))), toNullable(toUInt32(x + 1000)))) AS v +FROM test_dict_nullable_default_keys; +SELECT toTypeName(dictGetOrDefault('test_dict_nullable_default_tuple', ('a', 'b'), toUInt64(0), + (toNullable(toString(x)), toNullable(toUInt32(x))))) FROM test_dict_nullable_default_keys LIMIT 1; + +-- Nullable(Tuple(String, UInt32)) -> Tuple(String, UInt32). +SELECT dictGetOrDefault('test_dict_nullable_default_tuple', ('a', 'b'), x, + toNullable((concat('fb_', toString(x)), toUInt32(x + 1000)))) AS v +FROM test_dict_nullable_default_keys; +SELECT toTypeName(dictGetOrDefault('test_dict_nullable_default_tuple', ('a', 'b'), toUInt64(0), + toNullable((toString(x), toUInt32(x))))) FROM test_dict_nullable_default_keys LIMIT 1; + +-- Nullable(Tuple(Nullable(String), Nullable(UInt32))) -> Tuple(String, UInt32). +SELECT dictGetOrDefault('test_dict_nullable_default_tuple', ('a', 'b'), x, + toNullable((toNullable(concat('fb_', toString(x))), toNullable(toUInt32(x + 1000))))) AS v +FROM test_dict_nullable_default_keys; +SELECT toTypeName(dictGetOrDefault('test_dict_nullable_default_tuple', ('a', 'b'), toUInt64(0), + toNullable((toNullable(toString(x)), toNullable(toUInt32(x)))))) FROM test_dict_nullable_default_keys LIMIT 1; + +-- Negative: a default that genuinely evaluates to NULL on a not-found row must still fail. +SELECT dictGetOrDefault('test_dict_nullable_default_single', 'a', x, + materialize(NULL::Nullable(String))) +FROM test_dict_nullable_default_keys; -- { serverError CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN } + +DROP DICTIONARY IF EXISTS test_dict_nullable_default_single; +DROP DICTIONARY IF EXISTS test_dict_nullable_default_tuple; +DROP TABLE IF EXISTS test_dict_nullable_default_keys; +DROP TABLE IF EXISTS test_dict_nullable_default_src; diff --git a/tests/queries/0_stateless/04262_singleValueOrNull_json_deserialize.reference b/tests/queries/0_stateless/04262_singleValueOrNull_json_deserialize.reference new file mode 100644 index 000000000000..f2c122e01968 --- /dev/null +++ b/tests/queries/0_stateless/04262_singleValueOrNull_json_deserialize.reference @@ -0,0 +1,5 @@ +{"a":1} +{"a":1} +\N +1 \N +2 \N diff --git a/tests/queries/0_stateless/04262_singleValueOrNull_json_deserialize.sql b/tests/queries/0_stateless/04262_singleValueOrNull_json_deserialize.sql new file mode 100644 index 000000000000..e92383088c95 --- /dev/null +++ b/tests/queries/0_stateless/04262_singleValueOrNull_json_deserialize.sql @@ -0,0 +1,27 @@ +-- Tags: no-fasttest + +-- Regression test for https://github.com/ClickHouse/ClickHouse/issues/103630 +-- singleValueOrNull(JSON) crashed with SEGFAULT during deserialization because +-- result_type (Nullable(JSON)) was passed instead of value_type (JSON) to +-- SingleValueDataGenericWithColumn::read, causing a type/serialization mismatch. + +SET enable_json_type = 1; + +-- Test 1: singleValueOrNull with JSON works correctly without MergeTree (no serialize/deserialize cycle). +SELECT singleValueOrNull(j) FROM (SELECT '{"a":1}'::JSON AS j); +SELECT singleValueOrNull(j) FROM (SELECT '{"a":1}'::JSON AS j FROM numbers(3)); +SELECT singleValueOrNull(j) FROM (SELECT if(number % 2, '{"a":1}', '{"b":2}')::JSON AS j FROM numbers(4)); + +-- Test 2: Deserialization from MergeTree must not crash. +-- Note: singleValueOrNull serialize/read methods do not persist first_value/is_null flags (see TODO in source), +-- so the deserialized state always appears empty and returns NULL. This is a known pre-existing limitation. +-- This test verifies the SEGFAULT is fixed (the server does not crash during deserialization). +DROP TABLE IF EXISTS t_single_value_or_null_json; +CREATE TABLE t_single_value_or_null_json (id UInt32, s AggregateFunction(singleValueOrNull, JSON)) ENGINE = MergeTree ORDER BY id; +INSERT INTO t_single_value_or_null_json SELECT 1, singleValueOrNullState(j) FROM (SELECT '{"a":1}'::JSON AS j); +INSERT INTO t_single_value_or_null_json SELECT 2, singleValueOrNullState(j) FROM (SELECT '{"b":"hello"}'::JSON AS j); + +-- This SELECT triggered SEGFAULT before the fix (deserialization of aggregate state). +SELECT id, singleValueOrNullMerge(s) FROM t_single_value_or_null_json GROUP BY id ORDER BY id; + +DROP TABLE t_single_value_or_null_json; diff --git a/tests/queries/0_stateless/04262_uniq_state_or_null_with_rollup.reference b/tests/queries/0_stateless/04262_uniq_state_or_null_with_rollup.reference new file mode 100644 index 000000000000..8e54d13db7eb --- /dev/null +++ b/tests/queries/0_stateless/04262_uniq_state_or_null_with_rollup.reference @@ -0,0 +1,6 @@ +uniqStateOrNull WITH ROLLUP 18 +uniqStateOrNull WITH CUBE 18 +uniqStateOrNull WITH TOTALS 17 +uniqStateOrDefault WITH ROLLUP 18 +uniqOrNullState WITH ROLLUP 18 +plain GROUP BY 17 diff --git a/tests/queries/0_stateless/04262_uniq_state_or_null_with_rollup.sql b/tests/queries/0_stateless/04262_uniq_state_or_null_with_rollup.sql new file mode 100644 index 000000000000..0f62fee1ba6f --- /dev/null +++ b/tests/queries/0_stateless/04262_uniq_state_or_null_with_rollup.sql @@ -0,0 +1,35 @@ +-- Regression test for https://github.com/ClickHouse/ClickHouse/issues/105462 +-- Server segfaults on `uniqStateOrNull(Nullable) ... GROUP BY ... WITH ROLLUP` +-- (and the CUBE / TOTALS / OrDefault variants). +-- The synthesised "all-grouped-up" row holds an `-OrNull`/`-OrDefault`-wrapped +-- state whose inner `UniquesHashSet` was never properly constructed, and the +-- subsequent `write()` of that empty hash set dereferenced a null `buf`. + +DROP TABLE IF EXISTS t_105462; + +CREATE TABLE t_105462 (h Nullable(UInt16)) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO t_105462 SELECT if(number % 5 = 0, NULL, toUInt16(number)) FROM numbers(20); + +-- Each query must complete without crashing the server. We do not assert +-- specific lengths because they encode internal `UniquesHashSet` layout. + +SELECT 'uniqStateOrNull WITH ROLLUP', count() FROM ( + SELECT hex(uniqStateOrNull(h)) AS s FROM t_105462 GROUP BY h WITH ROLLUP); + +SELECT 'uniqStateOrNull WITH CUBE', count() FROM ( + SELECT hex(uniqStateOrNull(h)) AS s FROM t_105462 GROUP BY h WITH CUBE); + +SELECT 'uniqStateOrNull WITH TOTALS', count() FROM ( + SELECT hex(uniqStateOrNull(h)) AS s FROM t_105462 GROUP BY h WITH TOTALS); + +SELECT 'uniqStateOrDefault WITH ROLLUP', count() FROM ( + SELECT hex(uniqStateOrDefault(h)) AS s FROM t_105462 GROUP BY h WITH ROLLUP); + +SELECT 'uniqOrNullState WITH ROLLUP', count() FROM ( + SELECT hex(uniqOrNullState(h)) AS s FROM t_105462 GROUP BY h WITH ROLLUP); + +-- Sanity check: the same query without the rollup modifier still works. +SELECT 'plain GROUP BY', count() FROM ( + SELECT hex(uniqStateOrNull(h)) AS s FROM t_105462 GROUP BY h); + +DROP TABLE t_105462; diff --git a/tests/queries/0_stateless/04267_mongodb_empty_collection_crash.reference b/tests/queries/0_stateless/04267_mongodb_empty_collection_crash.reference new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/queries/0_stateless/04267_mongodb_empty_collection_crash.sql b/tests/queries/0_stateless/04267_mongodb_empty_collection_crash.sql new file mode 100644 index 000000000000..fd2d59810dd5 --- /dev/null +++ b/tests/queries/0_stateless/04267_mongodb_empty_collection_crash.sql @@ -0,0 +1,25 @@ +-- Tags: no-fasttest + +-- An empty collection name or a name containing NUL bytes used to abort the server inside the C driver +-- (assertion failure in `_mongoc_cursor_collection`: `collection_len > 0`). +-- See https://github.com/ClickHouse/ClickHouse/issues/105773. + +-- Table function: 6-argument form `mongodb('host:port', database, collection, user, password, structure)`. +SELECT * FROM mongodb('test', 'db', '', 'user', 'pass', 'x Int32'); -- { serverError BAD_ARGUMENTS } +SELECT * FROM mongodb('test', 'db', '\0', 'user', 'pass', 'x Int32'); -- { serverError BAD_ARGUMENTS } +SELECT * FROM mongodb('test', 'db', 'col\0lection', 'user', 'pass', 'x Int32'); -- { serverError BAD_ARGUMENTS } +SELECT * FROM mongodb('test', 'db', '\0col', 'user', 'pass', 'x Int32'); -- { serverError BAD_ARGUMENTS } + +-- Table function: 3-argument form `mongodb(uri, collection, structure)`. +SELECT * FROM mongodb('mongodb://some-cluster:27017/db', '', 'x Int32'); -- { serverError BAD_ARGUMENTS } +SELECT * FROM mongodb('mongodb://some-cluster:27017/db', '\0', 'x Int32'); -- { serverError BAD_ARGUMENTS } +SELECT * FROM mongodb('mongodb://some-cluster:27017/db', 'col\0lection', 'x Int32'); -- { serverError BAD_ARGUMENTS } +SELECT * FROM mongodb('mongodb://some-cluster:27017/db', '\0col', 'x Int32'); -- { serverError BAD_ARGUMENTS } + +-- The same problem reachable via the storage engine: `CREATE TABLE ... ENGINE = MongoDB(...)`. +DROP TABLE IF EXISTS t_mongo_empty_collection; +CREATE TABLE t_mongo_empty_collection (x Int32) ENGINE = MongoDB('some-cluster:27017', 'db', '', 'user', 'pass'); -- { serverError BAD_ARGUMENTS } +CREATE TABLE t_mongo_empty_collection (x Int32) ENGINE = MongoDB('some-cluster:27017', 'db', '\0', 'user', 'pass'); -- { serverError BAD_ARGUMENTS } +CREATE TABLE t_mongo_empty_collection (x Int32) ENGINE = MongoDB('mongodb://some-cluster:27017/db', ''); -- { serverError BAD_ARGUMENTS } +CREATE TABLE t_mongo_empty_collection (x Int32) ENGINE = MongoDB('mongodb://some-cluster:27017/db', '\0'); -- { serverError BAD_ARGUMENTS } +DROP TABLE IF EXISTS t_mongo_empty_collection; diff --git a/tests/queries/0_stateless/04268_mongodb_dictionary_empty_collection.reference b/tests/queries/0_stateless/04268_mongodb_dictionary_empty_collection.reference new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/queries/0_stateless/04268_mongodb_dictionary_empty_collection.sql b/tests/queries/0_stateless/04268_mongodb_dictionary_empty_collection.sql new file mode 100644 index 000000000000..3a20837111dc --- /dev/null +++ b/tests/queries/0_stateless/04268_mongodb_dictionary_empty_collection.sql @@ -0,0 +1,22 @@ +-- Tags: no-fasttest + +-- Regression test for the dictionary path of https://github.com/ClickHouse/ClickHouse/issues/105773. +-- An empty collection name (or one containing NUL bytes) used to abort the server inside the C driver +-- when the dictionary source was loaded; now it must fail with BAD_ARGUMENTS. + +DROP DICTIONARY IF EXISTS dict_mongo_empty_collection; +CREATE DICTIONARY dict_mongo_empty_collection (id UInt64, name String) +PRIMARY KEY id +SOURCE(MONGODB(URI 'mongodb://example.test:27017/db' COLLECTION '')) +LIFETIME(0) +LAYOUT(FLAT()); +SELECT * FROM dict_mongo_empty_collection; -- { serverError BAD_ARGUMENTS } +DROP DICTIONARY dict_mongo_empty_collection; + +CREATE DICTIONARY dict_mongo_empty_collection (id UInt64, name String) +PRIMARY KEY id +SOURCE(MONGODB(URI 'mongodb://example.test:27017/db' COLLECTION '\0')) +LIFETIME(0) +LAYOUT(FLAT()); +SELECT * FROM dict_mongo_empty_collection; -- { serverError BAD_ARGUMENTS } +DROP DICTIONARY dict_mongo_empty_collection; diff --git a/tests/queries/0_stateless/04268_system_dictionaries_partial_revoke.reference b/tests/queries/0_stateless/04268_system_dictionaries_partial_revoke.reference new file mode 100644 index 000000000000..19567e7d2bac --- /dev/null +++ b/tests/queries/0_stateless/04268_system_dictionaries_partial_revoke.reference @@ -0,0 +1,2 @@ +test_dict_04268 +1 diff --git a/tests/queries/0_stateless/04268_system_dictionaries_partial_revoke.sh b/tests/queries/0_stateless/04268_system_dictionaries_partial_revoke.sh new file mode 100755 index 000000000000..974cfcc74a8b --- /dev/null +++ b/tests/queries/0_stateless/04268_system_dictionaries_partial_revoke.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# Test that system.dictionaries shows dictionaries when SHOW DICTIONARIES +# has a partial revoke on an unrelated table. + +USER="test_user_04268_${CLICKHOUSE_DATABASE}" +ROLE="test_role_04268_${CLICKHOUSE_DATABASE}" +DICT="${CLICKHOUSE_DATABASE}.test_dict_04268" + +${CLICKHOUSE_CLIENT} -q "DROP DICTIONARY IF EXISTS ${DICT}" +${CLICKHOUSE_CLIENT} -q "DROP USER IF EXISTS ${USER}" +${CLICKHOUSE_CLIENT} -q "DROP ROLE IF EXISTS ${ROLE}" + +${CLICKHOUSE_CLIENT} -q " + CREATE DICTIONARY ${DICT} (id UInt64, value String) + PRIMARY KEY id + SOURCE(CLICKHOUSE(TABLE 'numbers' DB 'system')) + LIFETIME(MIN 0 MAX 300) + LAYOUT(FLAT()) +" + +# Create role with SHOW DICTIONARIES and a partial revoke on an unrelated table +${CLICKHOUSE_CLIENT} -q "CREATE ROLE ${ROLE}" +${CLICKHOUSE_CLIENT} -q "GRANT SHOW DICTIONARIES, SELECT ON *.* TO ${ROLE}" +${CLICKHOUSE_CLIENT} -q "REVOKE SHOW DICTIONARIES, SELECT ON system.non_existing_table FROM ${ROLE}" + +${CLICKHOUSE_CLIENT} -q "CREATE USER ${USER} IDENTIFIED WITH no_password" +${CLICKHOUSE_CLIENT} -q "GRANT ${ROLE} TO ${USER}" + +# The dictionary should be visible despite the partial revoke +${CLICKHOUSE_CLIENT} --user "${USER}" -q "SELECT name FROM system.dictionaries WHERE database = '${CLICKHOUSE_DATABASE}' AND name = 'test_dict_04268'" + +# The dictionary should also appear in system.completions +${CLICKHOUSE_CLIENT} --user "${USER}" -q "SELECT count() > 0 FROM system.completions WHERE word = 'test_dict_04268'" + +# Cleanup +${CLICKHOUSE_CLIENT} -q "DROP DICTIONARY IF EXISTS ${DICT}" +${CLICKHOUSE_CLIENT} -q "DROP USER IF EXISTS ${USER}" +${CLICKHOUSE_CLIENT} -q "DROP ROLE IF EXISTS ${ROLE}" diff --git a/tests/queries/0_stateless/04273_uniq_state_or_null_rollup_consumers.reference b/tests/queries/0_stateless/04273_uniq_state_or_null_rollup_consumers.reference new file mode 100644 index 000000000000..a826e3b6b8f0 --- /dev/null +++ b/tests/queries/0_stateless/04273_uniq_state_or_null_rollup_consumers.reference @@ -0,0 +1,6 @@ +uniqMerge over rollup 16 +uniqMerge over rollup, OrDefault 16 +aggregating merge tree insert 16 +two-level single-thread 18 +multi-threaded merge 18 +memory engine, uniqMerge over rollup 16 diff --git a/tests/queries/0_stateless/04273_uniq_state_or_null_rollup_consumers.sql b/tests/queries/0_stateless/04273_uniq_state_or_null_rollup_consumers.sql new file mode 100644 index 000000000000..86b5e801cd14 --- /dev/null +++ b/tests/queries/0_stateless/04273_uniq_state_or_null_rollup_consumers.sql @@ -0,0 +1,53 @@ +-- Consumers of an aggregate state produced by `*StateOrNull` / `*StateOrDefault` +-- under `WITH ROLLUP` / `CUBE` / `TOTALS`: re-aggregation via `Merge`, +-- persistence into `AggregatingMergeTree`, two-level and multi-threaded +-- aggregator variants, and a `Memory`-engine source. + +DROP TABLE IF EXISTS source_105462; +DROP TABLE IF EXISTS agg_t_105462; +DROP TABLE IF EXISTS source_mem_105462; + +CREATE TABLE source_105462 (h Nullable(UInt16)) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO source_105462 SELECT if(number % 5 = 0, NULL, toUInt16(number)) FROM numbers(20); + +-- `Merge` consumes the rollup-produced state column, reaching +-- `UniquesHashSet::merge` rather than `UniquesHashSet::write`. +SELECT 'uniqMerge over rollup', uniqMerge(s) +FROM (SELECT uniqStateOrNull(h) AS s FROM source_105462 GROUP BY h WITH ROLLUP); + +SELECT 'uniqMerge over rollup, OrDefault', uniqMerge(s) +FROM (SELECT uniqStateOrDefault(h) AS s FROM source_105462 GROUP BY h WITH ROLLUP); + +-- Persist the rollup-produced state into `AggregatingMergeTree`, reaching +-- `UniquesHashSet::merge` via `ColumnAggregateFunction::insertMergeFrom` and the +-- background-merge / `MergeTreeDataWriter::mergeBlock` path. +-- `uniqMerge` over the persisted column is the deterministic check (commutative +-- and associative under any row split), unlike `count` which depends on how the +-- `INSERT` pipeline parallelises the rollup output. +CREATE TABLE agg_t_105462 (k UInt8, s AggregateFunction(uniq, UInt16)) + ENGINE = AggregatingMergeTree ORDER BY k; +INSERT INTO agg_t_105462 + SELECT toUInt8(coalesce(h, 999)) AS k, uniqStateOrNull(h) + FROM source_105462 GROUP BY h WITH ROLLUP; +SELECT 'aggregating merge tree insert', uniqMerge(s) FROM agg_t_105462; + +-- Two-level and multi-threaded aggregator variants: same state shape reached +-- through different scheduling. +SELECT 'two-level single-thread', count() FROM ( + SELECT hex(uniqStateOrNull(h)) FROM source_105462 GROUP BY h WITH ROLLUP + SETTINGS group_by_two_level_threshold = 1, group_by_two_level_threshold_bytes = 1, max_threads = 1); + +SELECT 'multi-threaded merge', count() FROM ( + SELECT hex(uniqStateOrNull(h)) FROM source_105462 GROUP BY h WITH ROLLUP + SETTINGS max_threads = 4); + +-- `Memory`-engine source: confirms the consumer paths above work independently +-- of `MergeTree`. +CREATE TABLE source_mem_105462 (h Nullable(UInt16)) ENGINE = Memory; +INSERT INTO source_mem_105462 SELECT if(number % 5 = 0, NULL, toUInt16(number)) FROM numbers(20); +SELECT 'memory engine, uniqMerge over rollup', uniqMerge(s) +FROM (SELECT uniqStateOrNull(h) AS s FROM source_mem_105462 GROUP BY h WITH ROLLUP); + +DROP TABLE source_105462; +DROP TABLE agg_t_105462; +DROP TABLE source_mem_105462; diff --git a/tests/queries/0_stateless/04275_104781_qcc_prewhere_skip_index_attribution.reference b/tests/queries/0_stateless/04275_104781_qcc_prewhere_skip_index_attribution.reference new file mode 100644 index 000000000000..701fe98e1773 --- /dev/null +++ b/tests/queries/0_stateless/04275_104781_qcc_prewhere_skip_index_attribution.reference @@ -0,0 +1,6 @@ +truth 500002 +after_trigger 500002 +after_trigger_no_cache 500002 +after_trigger_workaround 500002 +sanity_prewhere_1 49999 +sanity_prewhere_2 49999 diff --git a/tests/queries/0_stateless/04275_104781_qcc_prewhere_skip_index_attribution.sql b/tests/queries/0_stateless/04275_104781_qcc_prewhere_skip_index_attribution.sql new file mode 100644 index 000000000000..6b1202942255 --- /dev/null +++ b/tests/queries/0_stateless/04275_104781_qcc_prewhere_skip_index_attribution.sql @@ -0,0 +1,110 @@ +-- Tags: no-parallel-replicas, no-parallel +-- Tag no-parallel: messes with the server-level query condition cache + +-- Regression test for #104781: a `SELECT ... PREWHERE pk_prefix = X WHERE non_pk IN [...]` +-- against a column with a `bloom_filter` skip index poisons the query condition cache for +-- the PREWHERE predicate. A subsequent benign `SELECT count() ... WHERE pk_prefix = X` +-- then under-counts. +-- +-- The bug requires `use_skip_indexes_on_data_read = 1` (default since #93407): the +-- bloom-filter reader runs ahead of PREWHERE and drops marks via `canSkipMark`. Without +-- the fix, those marks are incorrectly attributed to the PREWHERE predicate, even though +-- the predicate matches all rows. +-- +-- Settings pinned per-query so that CI randomization cannot disable the bug ingredients: +-- * `use_query_condition_cache = 1` - the cache that can be poisoned. +-- * `use_skip_indexes_on_data_read = 1` - puts `MergeTreeReaderIndex` ahead of +-- PREWHERE in the reader chain. +-- * `optimize_move_to_prewhere = 1` and +-- `query_plan_optimize_prewhere = 1` - guarantee `WHERE pk_prefix = X` is hashed +-- under the same PREWHERE-predicate key as +-- the explicit-PREWHERE trigger query. +-- * `optimize_use_implicit_projections = 0` - disable `_exact_count_projection` so the +-- `count()` queries traverse the data path +-- and can hit the poisoned cache entry. With +-- the implicit projection the count is read +-- from per-part metadata and bypasses QCC. + +SET allow_experimental_analyzer = 1; + +DROP TABLE IF EXISTS t_104781; + +CREATE TABLE t_104781 +( + id String, + project_id String, + created_at DateTime64(3) DEFAULT now64(3), + started_at DateTime64(6), + INDEX idx_id_bloom id TYPE bloom_filter(0.01) GRANULARITY 1 +) +ENGINE = ReplacingMergeTree(created_at) +PARTITION BY toYYYYMM(started_at) +ORDER BY (project_id, started_at, id) +SETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192; + +-- 500000 rows in partition 202604, all `project_id = 'P1'`. +INSERT INTO t_104781 (id, project_id, started_at) +SELECT concat('id-', toString(number)), 'P1', + toDateTime64('2026-04-01 00:00:00', 6) + INTERVAL number SECOND +FROM numbers(500000); + +-- 2 rows in partition 202605, same `project_id`, in a separate part. +INSERT INTO t_104781 (id, project_id, started_at) +SELECT concat('id-fresh-', toString(number)), 'P1', + toDateTime64('2026-05-15 00:00:00', 6) + INTERVAL number SECOND +FROM numbers(2); + +SYSTEM DROP QUERY CONDITION CACHE; + +SELECT 'truth', count() FROM t_104781 WHERE project_id = 'P1' +SETTINGS use_query_condition_cache = 1, use_skip_indexes_on_data_read = 1, + optimize_move_to_prewhere = 1, query_plan_optimize_prewhere = 1, + optimize_use_implicit_projections = 0; + +-- Trigger: PREWHERE on a primary-key-prefix column + WHERE on a non-PK column with `IN`. +-- The bloom-filter index on `id` drops most marks at read time; before the fix, those +-- marks were attributed to the PREWHERE predicate, poisoning the cache. +SELECT id FROM t_104781 +PREWHERE project_id = 'P1' +WHERE id IN ['anything-1', 'anything-2'] +FORMAT Null +SETTINGS use_query_condition_cache = 1, use_skip_indexes_on_data_read = 1; + +-- With the fix, this returns the full 500002. Before the fix, it returns a smaller, wrong +-- count (the exact under-count is bloom-filter-FPR-dependent). +SELECT 'after_trigger', count() FROM t_104781 WHERE project_id = 'P1' +SETTINGS use_query_condition_cache = 1, use_skip_indexes_on_data_read = 1, + optimize_move_to_prewhere = 1, query_plan_optimize_prewhere = 1, + optimize_use_implicit_projections = 0; + +-- Sanity: cache-off path must match `truth`. +SELECT 'after_trigger_no_cache', count() FROM t_104781 WHERE project_id = 'P1' +SETTINGS use_query_condition_cache = 0, optimize_use_implicit_projections = 0; + +-- Sanity: the `use_skip_indexes_on_data_read = 0` workaround (per @nihalzp's bisect) +-- avoids the bug too. +SYSTEM DROP QUERY CONDITION CACHE; +SELECT id FROM t_104781 +PREWHERE project_id = 'P1' +WHERE id IN ['anything-1', 'anything-2'] +FORMAT Null +SETTINGS use_query_condition_cache = 1, use_skip_indexes_on_data_read = 0; +SELECT 'after_trigger_workaround', count() FROM t_104781 WHERE project_id = 'P1' +SETTINGS use_query_condition_cache = 1, use_skip_indexes_on_data_read = 1, + optimize_move_to_prewhere = 1, query_plan_optimize_prewhere = 1, + optimize_use_implicit_projections = 0; + +-- Sanity: a plain PREWHERE-only query (no bloom-filter-backed `WHERE IN`) still benefits +-- from the PREWHERE-side query condition cache. The fix only suppresses the PREWHERE +-- write when a skip-index reader is in the chain; ordinary cases are unaffected. +DROP TABLE IF EXISTS t_104781_sanity; +CREATE TABLE t_104781_sanity (x UInt32, y UInt32) ENGINE = MergeTree ORDER BY x; +INSERT INTO t_104781_sanity SELECT number, number FROM numbers(100000); +SYSTEM DROP QUERY CONDITION CACHE; +SELECT 'sanity_prewhere_1', count() FROM t_104781_sanity PREWHERE x > 50000 +SETTINGS use_query_condition_cache = 1, optimize_use_implicit_projections = 0; +SELECT 'sanity_prewhere_2', count() FROM t_104781_sanity PREWHERE x > 50000 +SETTINGS use_query_condition_cache = 1, optimize_use_implicit_projections = 0; + +DROP TABLE t_104781; +DROP TABLE t_104781_sanity; diff --git a/tests/queries/0_stateless/04277_distr_in_negative_array.reference b/tests/queries/0_stateless/04277_distr_in_negative_array.reference new file mode 100644 index 000000000000..573541ac9702 --- /dev/null +++ b/tests/queries/0_stateless/04277_distr_in_negative_array.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/04277_distr_in_negative_array.sql b/tests/queries/0_stateless/04277_distr_in_negative_array.sql new file mode 100644 index 000000000000..5e93ee142bab --- /dev/null +++ b/tests/queries/0_stateless/04277_distr_in_negative_array.sql @@ -0,0 +1 @@ +SELECT countIf(dummy IN [1, -1]) from remote('127.0.0.{1,2}', 'system', 'one') settings empty_result_for_aggregation_by_empty_set=0; diff --git a/tests/queries/0_stateless/04300_deltalake_ffi_panic_guard.reference b/tests/queries/0_stateless/04300_deltalake_ffi_panic_guard.reference new file mode 100644 index 000000000000..1f31ca18f879 --- /dev/null +++ b/tests/queries/0_stateless/04300_deltalake_ffi_panic_guard.reference @@ -0,0 +1 @@ +panicked across the FFI boundary diff --git a/tests/queries/0_stateless/04300_deltalake_ffi_panic_guard.sh b/tests/queries/0_stateless/04300_deltalake_ffi_panic_guard.sh new file mode 100755 index 000000000000..3e2cf2522ab9 --- /dev/null +++ b/tests/queries/0_stateless/04300_deltalake_ffi_panic_guard.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-msan +# Tag no-fasttest: depends on delta-kernel-rs and AWS (not built/available in fast test) +# Tag no-msan: delta-kernel-rs is not built with MSan + +# Regression test for https://github.com/ClickHouse/ClickHouse/issues/104509 (failure mode 2). +# +# A NUL byte is valid UTF-8, so it passes the option decoding, but it is an invalid HTTP header +# value. While signing the S3 request, `object_store` does `.unwrap()` on the resulting +# `InvalidHeaderValue` and panics on a `TokioBackgroundExecutor` worker; the kernel executor then +# panics on the calling thread inside `ffi::snapshot`. Because the FFI is `extern "C"` (nounwind), +# that panic used to reach `panic_cannot_unwind` and abort the server. The FFI now wraps `snapshot` +# and `builder_build` in `catch_unwind`, converting the panic into a `DELTA_KERNEL_ERROR` exception. +# +# Written as a `.sh` test through `clickhouse-local`: on the unfixed binary the process aborts, so +# stdout is empty and mismatches the reference (clean FAIL); on the fixed binary the guarded panic +# is reported and grep matches (PASS). Running in `clickhouse-local` keeps a crash on the unfixed +# binary from poisoning the live server. + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL --query "SELECT * FROM deltaLake('https://clickhouse-public-datasets.s3.amazonaws.com/delta_lake/hits/', '\0', 'x') SETTINGS allow_experimental_delta_kernel_rs = 1" 2>&1 \ + | grep -o -m1 'panicked across the FFI boundary' diff --git a/tests/queries/0_stateless/04301_deltalake_cdf_ffi_panic_guard.reference b/tests/queries/0_stateless/04301_deltalake_cdf_ffi_panic_guard.reference new file mode 100644 index 000000000000..1f31ca18f879 --- /dev/null +++ b/tests/queries/0_stateless/04301_deltalake_cdf_ffi_panic_guard.reference @@ -0,0 +1 @@ +panicked across the FFI boundary diff --git a/tests/queries/0_stateless/04301_deltalake_cdf_ffi_panic_guard.sh b/tests/queries/0_stateless/04301_deltalake_cdf_ffi_panic_guard.sh new file mode 100755 index 000000000000..7256fbf3c158 --- /dev/null +++ b/tests/queries/0_stateless/04301_deltalake_cdf_ffi_panic_guard.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-msan +# Tag no-fasttest: depends on delta-kernel-rs and AWS (not built/available in fast test) +# Tag no-msan: delta-kernel-rs is not built with MSan + +# Regression test for https://github.com/ClickHouse/ClickHouse/issues/104509 on the Change Data +# Feed path. With `delta_lake_snapshot_start_version` set, the query goes through +# `ffi::table_changes_from_version` instead of `ffi::snapshot`. That entry point reads the log +# (object_store I/O) too, so the same invalid-HTTP-header credential (a NUL byte in the S3 access +# key) panics on a `TokioBackgroundExecutor` worker; the executor then panics on the calling thread +# inside `table_changes_from_version`, which is `extern "C"` (nounwind) and aborts the server. The +# CDF FFI entry points are now wrapped in `catch_unwind`, converting the panic into a +# `DELTA_KERNEL_ERROR` exception. +# +# Written as a `.sh` test through `clickhouse-local`: on the unfixed binary the process aborts, so +# stdout is empty and mismatches the reference (clean FAIL); on the fixed binary the guarded panic +# is reported and grep matches (PASS). Running in `clickhouse-local` keeps a crash on the unfixed +# binary from poisoning the live server. + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL --query "SELECT * FROM deltaLake('https://clickhouse-public-datasets.s3.amazonaws.com/delta_lake/hits/', '\0', 'x') SETTINGS allow_experimental_delta_kernel_rs = 1, delta_lake_snapshot_start_version = 0" 2>&1 \ + | grep -o -m1 'panicked across the FFI boundary'