Skip to content

Commit 50cae13

Browse files
committed
rabbit_mnesia: add retries
Occasionally, clustering fails with the log pasted below. I believe this is caused by nodes starting in parallel, which sometimes leads to crashes in Mnesia. Hopefully, with retries, we'll handle this more gracefully.
```
Feature flags: nodes `rmq-ct-cluster_size_3_2-2-21072@localhost` and `rmq-ct-cluster_size_3_2-1-21000@localhost` are compatible
Mnesia('rmq-ct-cluster_size_3_2-2-21072@localhost'): ** ERROR ** (ignoring core) ** FATAL ** mnesia_monitor crashed: {{badmatch, <0.203.0>, Ref<0.1988436133.884998146.137464>}}, {mnesia_monitor, handle_info, 2, [{file, "mnesia_monitor.erl"}, {line, 583}]}, gen_server, try_handle_info, 3, [{file, "gen_server.erl"}, {line, 2434}]}, gen_server, handle_msg, 3, [{file, "gen_server.erl"}, {line, 2420}]}, proc_lib, init_p_do_apply, 3, [{file, "proc_lib.erl"}, {line, 333}]}]}
Error in process <0.300.0> on node 'rmq-ct-cluster_size_3_2-2-21072@localhost' with exit value: {badarg,[{erlang,send, [mnesia_locker,{release_tid,{tid,142,<24815.431.0>}}], [{error_info,#{module => erl_erts_errors}}]}, {mnesia_locker,release_tid,1,[{file,"mnesia_locker.erl"},{line,128}]}, {mnesia_tm,commit_participant,7, [{file,"mnesia_tm.erl"},{line,1828}]}]}
Application mnesia exited with reason: stopped
BOOT FAILED
===========
Exception during startup:
Exit:{killed,{gen_server,call,[<0.280.0>,{negotiate_protocol,['rmq-ct-cluster_size_3_2-1-21000@localhost']},infinity]}}
gen_server:call/3, line 1301
mnesia_monitor:call/1, line 232
rabbit_mnesia:-check_mnesia_consistency/2-fun-0-/2, line 1002
rabbit_mnesia:with_running_or_clean_mnesia/1, line 1036
rabbit_mnesia:check_cluster_consistency/2, line 719
lists:foldl/3, line 2466
rabbit_mnesia:check_cluster_consistency/0, line 680
rabbit_prelaunch_cluster:setup/1, line 27
```
1 parent 2f382d9 commit 50cae13

File tree

1 file changed

+36
-3
lines changed

1 file changed

+36
-3
lines changed

deps/rabbit/src/rabbit_mnesia.erl

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -965,13 +965,27 @@ stop_mnesia() ->
965965
ensure_mnesia_not_running().
966966

967967
%% Sets Mnesia's `extra_db_nodes' to the given cluster nodes and returns
%% the node list Mnesia answers with.
%%
%% ClusterNodes0 is first passed through rabbit_nodes:nodes_excl_me/1
%% (presumably dropping the local node -- confirm against rabbit_nodes).
%% When CheckOtherNodes is true and Mnesia reports an empty node list
%% although at least one node was requested,
%% {error, {failed_to_cluster_with, Nodes, Reason}} is thrown right away.
%%
%% An {error, _} result from mnesia:change_config/2 is retried up to three
%% times, one second apart; once the retries are used up the same
%% failed_to_cluster_with error is thrown with the formatted Mnesia error.
change_extra_db_nodes(ClusterNodes0, CheckOtherNodes) ->
    change_extra_db_nodes(ClusterNodes0, CheckOtherNodes, 3).

%% RetriesLeft is the number of further attempts still allowed after a
%% failed one; 0 means the next {error, _} is final.
change_extra_db_nodes(ClusterNodes0, CheckOtherNodes, RetriesLeft) ->
    ClusterNodes = rabbit_nodes:nodes_excl_me(ClusterNodes0),
    case {mnesia:change_config(extra_db_nodes, ClusterNodes), ClusterNodes} of
        {{ok, []}, [_|_]} when CheckOtherNodes ->
            throw({error, {failed_to_cluster_with, ClusterNodes,
                           "Mnesia could not connect to any nodes."}});
        {{ok, Nodes}, _} ->
            Nodes;
        {{error, _} = Error, _} when RetriesLeft > 0 ->
            %% Report RetriesLeft itself: that is how many attempts,
            %% including the one about to start, are still going to be
            %% made. Logging RetriesLeft - 1 announced "0 attempts left"
            %% just before performing one more attempt.
            ?LOG_WARNING(
               "Failed to add extra Mnesia db nodes ~tp: ~tp. "
               "Retrying (~b attempts left).",
               [ClusterNodes, Error, RetriesLeft]),
            timer:sleep(1000),
            change_extra_db_nodes(ClusterNodes0, CheckOtherNodes,
                                  RetriesLeft - 1);
        {{error, _} = Error, _} ->
            throw({error, {failed_to_cluster_with, ClusterNodes,
                           rabbit_misc:format("~tp", [Error])}})
    end.
976990

977991
check_nodes_consistency(Node, {RemoteAllNodes, _, _}) ->
@@ -1011,8 +1025,27 @@ check_mnesia_consistency(Node, ProtocolVersion) ->
10111025
end
10121026
end).
10131027

1014-
negotiate_protocol([Node]) ->
1015-
mnesia_monitor:negotiate_protocol([Node]).
1028+
%% Negotiates the Mnesia protocol with a single remote node, making at
%% most three attempts in total.
negotiate_protocol(Nodes) ->
    negotiate_protocol(Nodes, 3).

%% AttemptsLeft counts the attempts still allowed, the current one
%% included. Only an exit raised by mnesia_monitor:negotiate_protocol/1
%% is handled: while more than one attempt remains, a warning is logged
%% and the call is repeated after one second; otherwise a final warning
%% is logged and [] is returned. A successful call's result is returned
%% unchanged.
negotiate_protocol([Node] = Target, AttemptsLeft) ->
    try mnesia_monitor:negotiate_protocol(Target) of
        Negotiated ->
            Negotiated
    catch
        exit:ExitReason ->
            case AttemptsLeft > 1 of
                true ->
                    ?LOG_WARNING(
                       "Mnesia protocol negotiation with node ~tp "
                       "failed: ~tp. Retrying (~b attempts left).",
                       [Node, ExitReason, AttemptsLeft - 1]),
                    timer:sleep(1000),
                    negotiate_protocol(Target, AttemptsLeft - 1);
                false ->
                    ?LOG_WARNING(
                       "Mnesia protocol negotiation with node ~tp "
                       "failed: ~tp. No retries left.",
                       [Node, ExitReason]),
                    []
            end
    end.
10161049

10171050
with_running_or_clean_mnesia(Fun) ->
10181051
IsMnesiaRunning = case mnesia:system_info(is_running) of

0 commit comments

Comments
 (0)