From a1ab343af453febdf4ed18bb2a4cd85ecd29c501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Fri, 1 Dec 2023 12:44:14 +0100 Subject: [PATCH] rabbit_feature_flags: Protect concurrent reloads of the registry [Why] When a node joins another node, it resets its feature flags registry (the registry is unloaded) and it copies the feature flags states from the remote cluster. Before this patch, there was a small window where a concurrent use of the registry right between these two steps would reload a registry from the default/empty states, which does not reflect the intent. This could happen because another node is running peer discovery and queries the cluster membership of the node joining a cluster. [How] We acquire the registry reload lock around the reset+copy and in `rabbit_ff_registry_wrapper`. --- deps/rabbit/src/rabbit_db_cluster.erl | 19 ++++++++++++-- .../rabbit/src/rabbit_ff_registry_wrapper.erl | 26 ++++++++++++++----- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/deps/rabbit/src/rabbit_db_cluster.erl b/deps/rabbit/src/rabbit_db_cluster.erl index e54af8b75e37..b4121fa5e6e0 100644 --- a/deps/rabbit/src/rabbit_db_cluster.erl +++ b/deps/rabbit/src/rabbit_db_cluster.erl @@ -118,8 +118,23 @@ join(RemoteNode, NodeType) end end, - ok = rabbit_db:reset(), - rabbit_feature_flags:copy_feature_states_after_reset(RemoteNode), + %% We acquire the feature flags registry reload lock because + %% between the time we reset the registry (as part of + %% `rabbit_db:reset/0' and the states copy from the remote node, + %% there could be a concurrent reload of the registry (for instance + %% because of peer discovery on another node) with the + %% default/empty states. + %% + %% To make this work, the lock is also acquired from + %% `rabbit_ff_registry_wrapper'. + rabbit_ff_registry_factory:acquire_state_change_lock(), + try + ok = rabbit_db:reset(), + rabbit_feature_flags:copy_feature_states_after_reset( + RemoteNode) + after + rabbit_ff_registry_factory:release_state_change_lock() + end, ?LOG_INFO( "DB: joining cluster using remote nodes:~n~tp", [ClusterNodes], diff --git a/deps/rabbit/src/rabbit_ff_registry_wrapper.erl b/deps/rabbit/src/rabbit_ff_registry_wrapper.erl index 5d8555e84a0f..904ed3e010f1 100644 --- a/deps/rabbit/src/rabbit_ff_registry_wrapper.erl +++ b/deps/rabbit/src/rabbit_ff_registry_wrapper.erl @@ -52,7 +52,7 @@ get(FeatureName) -> case rabbit_ff_registry:get(FeatureName) of init_required -> - _ = rabbit_ff_registry_factory:initialize_registry(), + initialize_registry(), get(FeatureName); Ret -> Ret @@ -74,7 +74,7 @@ get(FeatureName) -> list(Which) -> case rabbit_ff_registry:list(Which) of init_required -> - _ = rabbit_ff_registry_factory:initialize_registry(), + initialize_registry(), list(Which); Ret -> Ret @@ -93,7 +93,7 @@ list(Which) -> states() -> case rabbit_ff_registry:states() of init_required -> - _ = rabbit_ff_registry_factory:initialize_registry(), + initialize_registry(), states(); Ret -> Ret @@ -115,7 +115,7 @@ states() -> is_supported(FeatureName) -> case rabbit_ff_registry:is_supported(FeatureName) of init_required -> - _ = rabbit_ff_registry_factory:initialize_registry(), + initialize_registry(), is_supported(FeatureName); Ret -> Ret @@ -137,7 +137,7 @@ is_supported(FeatureName) -> is_enabled(FeatureName) -> case rabbit_ff_registry:is_enabled(FeatureName) of init_required -> - _ = rabbit_ff_registry_factory:initialize_registry(), + initialize_registry(), is_enabled(FeatureName); Ret -> Ret @@ -150,8 +150,22 @@ is_enabled(FeatureName) -> inventory() -> case rabbit_ff_registry:inventory() of init_required -> - _ = rabbit_ff_registry_factory:initialize_registry(), + initialize_registry(), inventory(); Ret -> Ret end. + +initialize_registry() -> + %% We acquire the feature flags registry reload lock here to make sure we + %% don't reload the registry in the middle of a cluster join. Indeed, the + %% registry is reset and feature flags states are copied from a remote + %% node. Therefore, there is a small window where the registry is not + %% loaded and the states on disk do not reflect the intent. + rabbit_ff_registry_factory:acquire_state_change_lock(), + try + _ = rabbit_ff_registry_factory:initialize_registry(), + ok + after + rabbit_ff_registry_factory:release_state_change_lock() + end.