diff --git a/src/ra_log_pre_init.erl b/src/ra_log_pre_init.erl index 366757d8..ff348012 100644 --- a/src/ra_log_pre_init.erl +++ b/src/ra_log_pre_init.erl @@ -46,9 +46,9 @@ init([System]) -> ok -> ok catch _:Err -> ?ERROR("pre_init failed in system ~s for UId ~ts with name ~ts" - " This error may need manual intervention", - [System, UId, Name]), - throw({stop, {error, Err}}) + " This error may need manual intervention, Error ~p", + [System, UId, Name, Err]), + ok end end|| {Name, UId} <- Regd], {ok, #state{} , hibernate}. @@ -95,7 +95,7 @@ pre_init(System, UId) -> {error, Err} -> ?ERROR("pre_init failed to read config file for UId '~ts', Err ~p", [UId, Err]), - exit({pre_init_failed, Err}) + ok end; false -> ?INFO("pre_init UId '~ts' is registered but no data directory was found", diff --git a/src/ra_snapshot.erl b/src/ra_snapshot.erl index 505f3e79..29c7f42b 100644 --- a/src/ra_snapshot.erl +++ b/src/ra_snapshot.erl @@ -224,43 +224,47 @@ find_checkpoints(#?MODULE{uid = UId, module = Module, current = Current, checkpoint_directory = CheckpointDir} = State) -> - true = ra_lib:is_dir(CheckpointDir), - CurrentIdx = case Current of - undefined -> - -1; - {I, _} -> - I - end, - {ok, CPFiles0} = prim_file:list_dir(CheckpointDir), - CPFiles = lists:reverse(lists:sort(CPFiles0)), - Checkpoints = - lists:filtermap( - fun(File) -> - CP = filename:join(CheckpointDir, File), - case Module:validate(CP) of - ok -> - {ok, #{index := Idx, term := Term}} = + case ra_lib:is_dir(CheckpointDir) of + false -> + State; + true -> + CurrentIdx = case Current of + undefined -> + -1; + {I, _} -> + I + end, + {ok, CPFiles0} = prim_file:list_dir(CheckpointDir), + CPFiles = lists:reverse(lists:sort(CPFiles0)), + Checkpoints = + lists:filtermap( + fun(File) -> + CP = filename:join(CheckpointDir, File), + case Module:validate(CP) of + ok -> + {ok, #{index := Idx, term := Term}} = Module:read_meta(CP), - case Idx > CurrentIdx of - true -> - {true, {Idx, Term}}; - false -> - ?INFO("ra_snapshot: ~ts: removing " - "checkpoint ~s as was older than the " - "current snapshot.", - [UId, CP]), - delete(CheckpointDir, {Idx, Term}), - false - end; - Err -> - ?INFO("ra_snapshot: ~ts: removing checkpoint ~s as " - "did not validate. Err: ~w", - [UId, CP, Err]), - ra_lib:recursive_delete(CP), - false - end - end, CPFiles), - State#?MODULE{checkpoints = Checkpoints}. + case Idx > CurrentIdx of + true -> + {true, {Idx, Term}}; + false -> + ?INFO("ra_snapshot: ~ts: removing " + "checkpoint ~s as was older than the " + "current snapshot.", + [UId, CP]), + delete(CheckpointDir, {Idx, Term}), + false + end; + Err -> + ?INFO("ra_snapshot: ~ts: removing checkpoint ~s as " + "did not validate. Err: ~w", + [UId, CP, Err]), + ra_lib:recursive_delete(CP), + false + end + end, CPFiles), + State#?MODULE{checkpoints = Checkpoints} + end. -spec init_ets() -> ok. init_ets() -> diff --git a/test/ra_log_2_SUITE.erl b/test/ra_log_2_SUITE.erl index a03ff2c4..3bbd3a64 100644 --- a/test/ra_log_2_SUITE.erl +++ b/test/ra_log_2_SUITE.erl @@ -33,6 +33,7 @@ all_tests() -> recovery, recover_many, recovery_with_missing_directory, + recovery_with_missing_checkpoints_directory, wal_crash_recover, wal_down_read_availability, wal_down_append_throws, @@ -653,6 +654,30 @@ recovery_with_missing_directory(Config) -> ok. +recovery_with_missing_checkpoints_directory(Config) -> + %% checking that the ra system can be restarted even if the checkpoints + %% directory is missing, it will be created the next time the + %% log is initialised + logger:set_primary_config(level, debug), + UId = ?config(uid, Config), + Log0 = ra_log_init(Config), + ra_log:close(Log0), + + ServerDataDir = ra_env:server_data_dir(default, UId), + CheckpointsDir = filename:join(ServerDataDir, "checkpoints"), + ok = ra_lib:recursive_delete(CheckpointsDir), + ?assertNot(filelib:is_dir(CheckpointsDir)), + + application:stop(ra), + start_ra(Config), + + Log5 = ra_log_init(Config), + ra_log:close(Log5), + ok = ra_lib:recursive_delete(ServerDataDir), + ?assertNot(filelib:is_dir(ServerDataDir)), + + ok. + resend_write(Config) -> % logger:set_primary_config(level, debug), % simulate lost messages requiring the ra server to resend in flight