Skip to content

Commit a0c240d

Browse files
committed
Fix snapshot installation CRC failure
Due to the unwise use of term_to_binary rather than using the original binary data in the replication of snapshot states. This change: Introduces a new optional ra_snapshot callback: context/0 This is called by the sending Ra leader node to discover context and capabilities of the receiver. In this case it is used to indicate if the receiver is capabable of receiving the entire snapshot file. Receiving the entire file is the updated approach that ensures the CRC check will be done on the same binary data it was generated from. If the receiver does not have the context/0 callback or does not indicate support the old approach of sending the deserialised metat data map and any data following that is used. When a snapshot is received from an old node (i.e. _not_ including the entire file) the receiver will not validate the checksum (as it may fail due to differences in map serialisation) and instead patch up it's local file with it's own calculcated checksum. The scenario where a snapshot taken by a newer version of OTP and is then sent to a member using the old code cannot be handled and the old node will fail at snapshot checksum validation.
1 parent a9b907c commit a0c240d

File tree

7 files changed

+168
-51
lines changed

7 files changed

+168
-51
lines changed

src/ra_log_snapshot.erl

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,12 @@
1515
begin_accept/2,
1616
accept_chunk/2,
1717
complete_accept/2,
18-
begin_read/1,
18+
begin_read/2,
1919
read_chunk/3,
2020
recover/1,
2121
validate/1,
22-
read_meta/1
22+
read_meta/1,
23+
context/0
2324
]).
2425

2526
-define(MAGIC, "RASN").
@@ -68,34 +69,51 @@ begin_accept(SnapDir, Meta) ->
6869
Data]),
6970
{ok, {PartialCrc, Fd}}.
7071

72+
accept_chunk(<<?MAGIC, ?VERSION:8/unsigned, Crc:32/integer,
73+
Rest/binary>> = Chunk, {_PartialCrc, Fd}) ->
74+
% ensure we overwrite the existing header when we are receiving the
75+
% full file
76+
PartialCrc = erlang:crc32(Rest),
77+
{ok, 0} = file:position(Fd, 0),
78+
ok = file:write(Fd, Chunk),
79+
{ok, {PartialCrc, Crc, Fd}};
7180
accept_chunk(Chunk, {PartialCrc, Fd}) ->
72-
<<Crc:32/integer, Rest/binary>> = Chunk,
73-
accept_chunk(Rest, {PartialCrc, Crc, Fd});
81+
%% compatibility clause where we did not receive the full file
82+
%% do not validate Crc due to OTP 26 map key ordering changes
83+
<<_Crc:32/integer, Rest/binary>> = Chunk,
84+
accept_chunk(Rest, {PartialCrc, undefined, Fd});
7485
accept_chunk(Chunk, {PartialCrc0, Crc, Fd}) ->
7586
ok = file:write(Fd, Chunk),
7687
PartialCrc = erlang:crc32(PartialCrc0, Chunk),
7788
{ok, {PartialCrc, Crc, Fd}}.
7889

79-
complete_accept(Chunk, {PartialCrc, Fd}) ->
80-
<<Crc:32/integer, Rest/binary>> = Chunk,
81-
complete_accept(Rest, {PartialCrc, Crc, Fd});
82-
complete_accept(Chunk, {PartialCrc0, Crc, Fd}) ->
83-
ok = file:write(Fd, Chunk),
84-
ok = file:pwrite(Fd, 5, <<Crc:32/integer>>),
85-
Crc = erlang:crc32(PartialCrc0, Chunk),
90+
complete_accept(Chunk, St0) ->
91+
{ok, {CalculatedCrc, Crc, Fd}} = accept_chunk(Chunk, St0),
92+
CrcToWrite = case Crc of
93+
undefined ->
94+
CalculatedCrc;
95+
_ ->
96+
Crc
97+
end,
98+
ok = file:pwrite(Fd, 5, <<CrcToWrite:32/integer>>),
8699
ok = file:sync(Fd),
87100
ok = file:close(Fd),
101+
CalculatedCrc = CrcToWrite,
88102
ok.
89103

90-
begin_read(Dir) ->
104+
begin_read(Dir, Context) ->
91105
File = filename(Dir),
92106
case file:open(File, [read, binary, raw]) of
93107
{ok, Fd} ->
94108
case read_meta_internal(Fd) of
109+
{ok, Meta, _Crc}
110+
when map_get(can_accept_full_file, Context) ->
111+
{ok, Eof} = file:position(Fd, eof),
112+
{ok, Meta, {0, Eof, Fd}};
95113
{ok, Meta, Crc} ->
96-
{ok, DataStart} = file:position(Fd, cur),
114+
{ok, Cur} = file:position(Fd, cur),
97115
{ok, Eof} = file:position(Fd, eof),
98-
{ok, Meta, {Crc, {DataStart, Eof, Fd}}};
116+
{ok, Meta, {Crc, {Cur, Eof, Fd}}};
99117
{error, _} = Err ->
100118
_ = file:close(Fd),
101119
Err
@@ -105,6 +123,7 @@ begin_read(Dir) ->
105123
end.
106124

107125
read_chunk({Crc, ReadState}, Size, Dir) when is_integer(Crc) ->
126+
%% this the compatibility read mode for old snapshot receivers
108127
case read_chunk(ReadState, Size - 4, Dir) of
109128
{ok, Data, ReadState1} ->
110129
{ok, <<Crc:32/integer, Data/binary>>, ReadState1};
@@ -156,25 +175,28 @@ validate(Dir) ->
156175
%% entire binary body. NB: this does not do checksum validation.
157176
-spec read_meta(file:filename()) ->
158177
{ok, meta()} | {error, invalid_format |
159-
{invalid_version, integer()} |
160-
checksum_error |
161-
file_err()}.
178+
{invalid_version, integer()} |
179+
checksum_error |
180+
file_err()}.
162181
read_meta(Dir) ->
163182
File = filename(Dir),
164183
case file:open(File, [read, binary, raw]) of
165184
{ok, Fd} ->
166185
case read_meta_internal(Fd) of
167-
{ok, Meta, _} ->
186+
{ok, Meta, _Crc} ->
168187
_ = file:close(Fd),
169188
{ok, Meta};
170-
{error, _} = Err ->
189+
Err ->
171190
_ = file:close(Fd),
172191
Err
173-
end;
174-
Err ->
175-
Err
192+
end
176193
end.
177194

195+
-spec context() -> map().
196+
context() ->
197+
#{can_accept_full_file => true}.
198+
199+
178200
%% Internal
179201

180202
read_meta_internal(Fd) ->

src/ra_server_proc.erl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1565,11 +1565,12 @@ fold_log(From, Fun, Term, State) ->
15651565

15661566
send_snapshots(Me, Id, Term, {_, ToNode} = To, ChunkSize,
15671567
InstallTimeout, SnapState, Machine) ->
1568+
Context = ra_snapshot:context(SnapState, ToNode),
15681569
{ok, #{machine_version := SnapMacVer} = Meta, ReadState} =
1569-
ra_snapshot:begin_read(SnapState),
1570+
ra_snapshot:begin_read(SnapState, Context),
15701571

15711572
%% only send the snapshot if the target server can accept it
1572-
TheirMacVer = rpc:call(ToNode, ra_machine, version, [Machine]),
1573+
TheirMacVer = erpc:call(ToNode, ra_machine, version, [Machine]),
15731574

15741575
case SnapMacVer > TheirMacVer of
15751576
true ->

src/ra_snapshot.erl

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
-export([
1717
recover/1,
1818
read_meta/2,
19-
begin_read/1,
19+
begin_read/2,
2020
read_chunk/3,
2121
delete/2,
2222

@@ -36,6 +36,8 @@
3636
accept_chunk/4,
3737
abort_accept/1,
3838

39+
context/2,
40+
3941
handle_down/3,
4042
current_snapshot_dir/1
4143
]).
@@ -68,6 +70,8 @@
6870

6971
-export_type([state/0]).
7072

73+
-optional_callbacks([context/0]).
74+
7175
%% Side effect function
7276
%% Turn the current state into immutable reference.
7377
-callback prepare(Index :: ra_index(),
@@ -87,7 +91,9 @@
8791

8892
%% Read the snapshot metadata and initialise a read state used in read_chunk/1
8993
%% The read state should contain all the information required to read a chunk
90-
-callback begin_read(Location :: file:filename()) ->
94+
%% The Context is the map returned by the context/0 callback
95+
%% This can be used to inform the sender of receive capabilities.
96+
-callback begin_read(Location :: file:filename(), Context :: map()) ->
9197
{ok, Meta :: meta(), ReadState :: term()}
9298
| {error, term()}.
9399

@@ -131,6 +137,8 @@
131137
file_err() |
132138
term()}.
133139

140+
-callback context() -> map().
141+
134142
-spec init(ra_uid(), module(), file:filename()) ->
135143
state().
136144
init(UId, Mod, File) ->
@@ -310,6 +318,18 @@ abort_accept(#?MODULE{accepting = #accept{idxterm = {Idx, Term}},
310318
ok = delete(Dir, {Idx, Term}),
311319
State#?MODULE{accepting = undefined}.
312320

321+
%% get the snapshot capabilities context of a remote node
322+
-spec context(state(), node()) -> map().
323+
context(#?MODULE{module = Mod}, Node) ->
324+
try erpc:call(Node, Mod, ?FUNCTION_NAME, []) of
325+
Result ->
326+
Result
327+
catch
328+
error:{exception, undef, _} ->
329+
#{}
330+
end.
331+
332+
313333

314334
-spec handle_down(pid(), Info :: term(), state()) ->
315335
state().
@@ -335,14 +355,15 @@ delete(Dir, {Idx, Term}) ->
335355
ok = ra_lib:recursive_delete(SnapDir),
336356
ok.
337357

338-
-spec begin_read(State :: state()) ->
358+
-spec begin_read(State :: state(), Context :: map()) ->
339359
{ok, Meta :: meta(), ReadState} |
340360
{error, term()} when ReadState :: term().
341361
begin_read(#?MODULE{module = Mod,
342362
directory = Dir,
343-
current = {Idx, Term}}) ->
363+
current = {Idx, Term}},
364+
Context) when is_map(Context) ->
344365
Location = make_snapshot_dir(Dir, Idx, Term),
345-
Mod:begin_read(Location).
366+
Mod:begin_read(Location, Context).
346367

347368

348369
-spec read_chunk(ReadState, ChunkSizeBytes :: non_neg_integer(),

test/ra_SUITE.erl

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,18 @@ server_catches_up(Config) ->
674674
terminate_cluster([N3 | InitialNodes]).
675675

676676
snapshot_installation(Config) ->
677+
ra_env:configure_logger(logger),
678+
ok = logger:set_primary_config(level, debug),
679+
680+
LogFile = filename:join(?config(priv_dir, Config), "ra.log"),
681+
SaslFile = filename:join(?config(priv_dir, Config), "ra_sasl.log"),
682+
logger:add_handler(ra_handler, logger_std_h,
683+
#{config => #{file => LogFile}}),
684+
application:load(sasl),
685+
application:set_env(sasl, sasl_error_logger, {file, SaslFile}),
686+
application:stop(sasl),
687+
application:start(sasl),
688+
_ = error_logger:tty(false),
677689
N1 = nth_server_name(Config, 1),
678690
N2 = nth_server_name(Config, 2),
679691
N3 = nth_server_name(Config, 3),
@@ -710,9 +722,9 @@ snapshot_installation(Config) ->
710722
length(filelib:wildcard(
711723
filename:join([Dir, "snapshots", "*"]))) > 0
712724
end,
713-
?assert(try_n_times(fun () -> TryFun(N2Dir) end, 20)),
714-
?assert(try_n_times(fun () -> TryFun(N3Dir) end, 20)),
715-
?assert(try_n_times(fun () -> TryFun(N1Dir) end, 20)),
725+
?assert(try_n_times(fun () -> TryFun(N2Dir) end, 100)),
726+
?assert(try_n_times(fun () -> TryFun(N3Dir) end, 100)),
727+
?assert(try_n_times(fun () -> TryFun(N1Dir) end, 100)),
716728

717729
% then do some more
718730
[begin

test/ra_log_2_SUITE.erl

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -678,7 +678,8 @@ snapshot_written_after_installation(Config) ->
678678
end,
679679

680680
Meta = meta(15, 2, [?N1]),
681-
Chunk = create_snapshot_chunk(Config, Meta),
681+
Context = #{},
682+
Chunk = create_snapshot_chunk(Config, Meta, Context),
682683
SnapState0 = ra_log:snapshot_state(Log2),
683684
{ok, SnapState1} = ra_snapshot:begin_accept(Meta, SnapState0),
684685
{ok, SnapState} = ra_snapshot:accept_chunk(Chunk, 1, last, SnapState1),
@@ -726,7 +727,7 @@ snapshot_installation(Config) ->
726727

727728
%% create snapshot chunk
728729
Meta = meta(15, 2, [?N1]),
729-
Chunk = create_snapshot_chunk(Config, Meta),
730+
Chunk = create_snapshot_chunk(Config, Meta, #{}),
730731
SnapState0 = ra_log:snapshot_state(Log2),
731732
{ok, SnapState1} = ra_snapshot:begin_accept(Meta, SnapState0),
732733
{ok, SnapState} = ra_snapshot:accept_chunk(Chunk, 1, last, SnapState1),
@@ -775,7 +776,7 @@ append_after_snapshot_installation(Config) ->
775776
end),
776777
%% do snapshot
777778
Meta = meta(15, 2, [?N1]),
778-
Chunk = create_snapshot_chunk(Config, Meta),
779+
Chunk = create_snapshot_chunk(Config, Meta, #{}),
779780
SnapState0 = ra_log:snapshot_state(Log1),
780781
{ok, SnapState1} = ra_snapshot:begin_accept(Meta, SnapState0),
781782
{ok, SnapState} = ra_snapshot:accept_chunk(Chunk, 1, last, SnapState1),
@@ -806,7 +807,7 @@ written_event_after_snapshot_installation(Config) ->
806807
SnapIdx = 10,
807808
%% do snapshot in
808809
Meta = meta(SnapIdx, 2, [?N1]),
809-
Chunk = create_snapshot_chunk(Config, Meta),
810+
Chunk = create_snapshot_chunk(Config, Meta, #{}),
810811
SnapState0 = ra_log:snapshot_state(Log1),
811812
{ok, SnapState1} = ra_snapshot:begin_accept(Meta, SnapState0),
812813
{ok, SnapState} = ra_snapshot:accept_chunk(Chunk, 1, last, SnapState1),
@@ -1274,7 +1275,7 @@ meta(Idx, Term, Cluster) ->
12741275
cluster => Cluster,
12751276
machine_version => 1}.
12761277

1277-
create_snapshot_chunk(Config, #{index := Idx} = Meta) ->
1278+
create_snapshot_chunk(Config, #{index := Idx} = Meta, Context) ->
12781279
OthDir = filename:join(?config(priv_dir, Config), "snapshot_installation"),
12791280
ok = ra_lib:make_dir(OthDir),
12801281
Sn0 = ra_snapshot:init(<<"someotheruid_adsfasdf">>, ra_log_snapshot,
@@ -1288,7 +1289,7 @@ create_snapshot_chunk(Config, #{index := Idx} = Meta) ->
12881289
after 1000 ->
12891290
exit(snapshot_timeout)
12901291
end,
1291-
{ok, Meta, ChunkSt} = ra_snapshot:begin_read(Sn2),
1292+
{ok, Meta, ChunkSt} = ra_snapshot:begin_read(Sn2, Context),
12921293
{ok, Chunk, _} = ra_snapshot:read_chunk(ChunkSt, 1000000000, Sn2),
12931294
Chunk.
12941295

0 commit comments

Comments
 (0)