Skip to content

Commit

Permalink
MDEV-35806: Error in read_log_event() corrupts relay log writer, cras…
Browse files Browse the repository at this point in the history
…hes server

In Log_event::read_log_event(), don't use IO_CACHE::error of the relay log's
IO_CACHE to signal an error back to the caller. When reading the active
relay log, this flag is also being used by the IO thread, and setting it can
randomly cause the IO thread to wrongly detect IO error on writing and
permanently disable the relay log.

This was seen sporadically in test case rpl.rpl_from_mysql80. The read
error set by the SQL thread in the IO_CACHE would be interpreted as a
write error by the IO thread, which would cause it to throw a fatal
error and close the relay log. And this would later cause CHANGE
MASTER to try to purge a closed relay log, resulting in nullptr crash.

SQL thread is not able to parse an event read from the relay log. This
can happen like here when replicating unknown events from a MySQL master,
potentially also for other reasons.

Also fix a mistake in my_b_flush_io_cache() introduced back in 2001
(fa09f2c) where my_b_flush_io_cache() could wrongly return an error set
in IO_CACHE::error, even if the flush operation itself succeeded.

Also fix another sporadic failure in rpl.rpl_from_mysql80 where the outout
of MASTER_POS_WAIT() depended on timing of SQL and IO thread.

Reviewed-by: Monty <[email protected]>
Reviewed-by: Andrei Elkin <[email protected]>
Signed-off-by: Kristian Nielsen <[email protected]>
  • Loading branch information
knielsen committed Jan 24, 2025
1 parent 2543be6 commit 72e1cc8
Show file tree
Hide file tree
Showing 12 changed files with 54 additions and 31 deletions.
17 changes: 11 additions & 6 deletions client/mysqlbinlog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2702,6 +2702,7 @@ static Exit_status check_header(IO_CACHE* file,
uchar buf[PROBE_HEADER_LEN];
my_off_t tmp_pos, pos;
MY_STAT my_file_stat;
int read_error;

delete glob_description_event;
if (!(glob_description_event= new Format_description_log_event(3)))
Expand Down Expand Up @@ -2802,7 +2803,8 @@ static Exit_status check_header(IO_CACHE* file,
Format_description_log_event *new_description_event;
my_b_seek(file, tmp_pos); /* seek back to event's start */
if (!(new_description_event= (Format_description_log_event*)
Log_event::read_log_event(file, glob_description_event,
Log_event::read_log_event(file, &read_error,
glob_description_event,
opt_verify_binlog_checksum)))
/* EOF can't be hit here normally, so it's a real error */
{
Expand Down Expand Up @@ -2835,7 +2837,8 @@ static Exit_status check_header(IO_CACHE* file,
{
Log_event *ev;
my_b_seek(file, tmp_pos); /* seek back to event's start */
if (!(ev= Log_event::read_log_event(file, glob_description_event,
if (!(ev= Log_event::read_log_event(file, &read_error,
glob_description_event,
opt_verify_binlog_checksum)))
{
/* EOF can't be hit here normally, so it's a real error */
Expand Down Expand Up @@ -2948,8 +2951,10 @@ static Exit_status dump_local_log_entries(PRINT_EVENT_INFO *print_event_info,
{
char llbuff[21];
my_off_t old_off = my_b_tell(file);
int read_error;

Log_event* ev = Log_event::read_log_event(file, glob_description_event,
Log_event* ev = Log_event::read_log_event(file, &read_error,
glob_description_event,
opt_verify_binlog_checksum);
if (!ev)
{
Expand All @@ -2958,15 +2963,15 @@ static Exit_status dump_local_log_entries(PRINT_EVENT_INFO *print_event_info,
about a corruption, but treat it as EOF and move to the next binlog.
*/
if (glob_description_event->flags & LOG_EVENT_BINLOG_IN_USE_F)
file->error= 0;
else if (file->error)
read_error= 0;
else if (read_error)
{
error("Could not read entry at offset %s: "
"Error in log format or read error.",
llstr(old_off,llbuff));
goto err;
}
// else file->error == 0 means EOF, that's OK, we break in this case
// else read_error == 0 means EOF, that's OK, we break in this case

/*
Emit a warning in the event that we finished processing input
Expand Down
2 changes: 0 additions & 2 deletions mysql-test/suite/rpl/r/rpl_from_mysql80.result
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ START SLAVE IO_THREAD;
include/wait_for_slave_io_to_start.inc
START SLAVE UNTIL Master_log_file='master-bin.000001', Master_log_pos= 1178;
SELECT MASTER_POS_WAIT('master-bin.000001', 1178, 60);
MASTER_POS_WAIT('master-bin.000001', 1178, 60)
NULL
SELECT * FROM t1 ORDER BY a;
a b c
1 0
Expand Down
2 changes: 2 additions & 0 deletions mysql-test/suite/rpl/t/rpl_from_mysql80.test
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,9 @@ START SLAVE IO_THREAD;
# The position 1178 is the start of: INSERT INTO t1 VALUES (4, 0, 'skip');
# After that comes unknown MySQL 8.0 events, which we test error for below.
START SLAVE UNTIL Master_log_file='master-bin.000001', Master_log_pos= 1178;
--disable_result_log
SELECT MASTER_POS_WAIT('master-bin.000001', 1178, 60);
--enable_result_log
SELECT * FROM t1 ORDER BY a;

--source include/wait_for_slave_sql_to_stop.inc
Expand Down
2 changes: 1 addition & 1 deletion mysys/mf_iocache.c
Original file line number Diff line number Diff line change
Expand Up @@ -1735,7 +1735,7 @@ int my_b_flush_io_cache(IO_CACHE *info, int need_append_buffer_lock)
info->write_pos= info->write_buffer;
++info->disk_writes;
UNLOCK_APPEND_BUFFER;
DBUG_RETURN(info->error);
DBUG_RETURN(0);
}
}
UNLOCK_APPEND_BUFFER;
Expand Down
12 changes: 8 additions & 4 deletions sql/log.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11385,7 +11385,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
binlog_checkpoint_found= false;
for (round= 1;;)
{
while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log,
int error;
while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log, &error,
fdle, opt_master_verify_checksum))
&& ev->is_valid())
{
Expand Down Expand Up @@ -11671,7 +11672,8 @@ MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery)
return 1;
}

if ((ev= Log_event::read_log_event(&log, &fdle,
int read_error;
if ((ev= Log_event::read_log_event(&log, &read_error, &fdle,
opt_master_verify_checksum)) &&
ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
{
Expand Down Expand Up @@ -11895,10 +11897,11 @@ get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list)
Format_description_log_event *fdle;
Log_event *ev;
const char *errormsg = NULL;
int read_error;

*out_gtid_list= NULL;

if (!(ev= Log_event::read_log_event(cache, &init_fdle,
if (!(ev= Log_event::read_log_event(cache, &read_error, &init_fdle,
opt_master_verify_checksum)) ||
ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
{
Expand All @@ -11914,7 +11917,8 @@ get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list)
{
Log_event_type typ;

ev= Log_event::read_log_event(cache, fdle, opt_master_verify_checksum);
ev= Log_event::read_log_event(cache, &read_error, fdle,
opt_master_verify_checksum);
if (!ev)
{
errormsg= "Could not read GTID list event while looking for GTID "
Expand Down
15 changes: 12 additions & 3 deletions sql/log_event.cc
Original file line number Diff line number Diff line change
Expand Up @@ -916,7 +916,7 @@ int Log_event::read_log_event(IO_CACHE* file, String* packet,
DBUG_RETURN(0);
}

Log_event* Log_event::read_log_event(IO_CACHE* file,
Log_event* Log_event::read_log_event(IO_CACHE* file, int *out_error,
const Format_description_log_event *fdle,
my_bool crc_check,
my_bool print_errors)
Expand All @@ -927,6 +927,7 @@ Log_event* Log_event::read_log_event(IO_CACHE* file,
const char *error= 0;
Log_event *res= 0;

*out_error= 0;
switch (read_log_event(file, &event, fdle, BINLOG_CHECKSUM_ALG_OFF))
{
case 0:
Expand Down Expand Up @@ -976,14 +977,22 @@ Log_event* Log_event::read_log_event(IO_CACHE* file,
#endif

/*
The SQL slave thread will check if file->error<0 to know
The SQL slave thread will check *out_error to know
if there was an I/O error. Even if there is no "low-level" I/O errors
with 'file', any of the high-level above errors is worrying
enough to stop the SQL thread now ; as we are skipping the current event,
going on with reading and successfully executing other events can
only corrupt the slave's databases. So stop.
*/
file->error= -1;
*out_error= 1;
/*
Clear any error that might have been set in the IO_CACHE from a read
error, while we are still holding the relay log mutex (if reading from
the hot log). Otherwise the error might interfere unpredictably with
write operations to the same IO_CACHE in the IO thread.
*/
file->error= 0;


#ifndef MYSQL_CLIENT
if (!print_errors)
Expand Down
2 changes: 1 addition & 1 deletion sql/log_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -1344,7 +1344,7 @@ class Log_event
we detect the event's type, then call the specific event's
constructor and pass description_event as an argument.
*/
static Log_event* read_log_event(IO_CACHE* file,
static Log_event* read_log_event(IO_CACHE* file, int *out_error,
const Format_description_log_event
*description_event,
my_bool crc_check,
Expand Down
4 changes: 2 additions & 2 deletions sql/log_event_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5041,7 +5041,7 @@ int Execute_load_log_event::do_apply_event(rpl_group_info *rgi)
char fname[FN_REFLEN+10];
char *ext;
int fd;
int error= 1;
int error= 1, read_error;
IO_CACHE file;
Load_log_event *lev= 0;
Relay_log_info const *rli= rgi->rli;
Expand All @@ -5060,7 +5060,7 @@ int Execute_load_log_event::do_apply_event(rpl_group_info *rgi)
goto err;
}
if (!(lev= (Load_log_event*)
Log_event::read_log_event(&file,
Log_event::read_log_event(&file, &read_error,
rli->relay_log.description_event_for_exec,
opt_slave_sql_verify_checksum)) ||
lev->get_type_code() != NEW_LOAD_EVENT)
Expand Down
5 changes: 3 additions & 2 deletions sql/rpl_parallel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1037,14 +1037,15 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
/* The loop is here so we can try again the next relay log file on EOF. */
for (;;)
{
int error;
old_offset= cur_offset;
ev= Log_event::read_log_event(&rlog, description_event,
ev= Log_event::read_log_event(&rlog, &error, description_event,
opt_slave_sql_verify_checksum);
cur_offset= my_b_tell(&rlog);

if (ev)
break;
if (unlikely(rlog.error < 0))
if (unlikely(error))
{
errmsg= "slave SQL thread aborted because of I/O error";
err= 1;
Expand Down
9 changes: 5 additions & 4 deletions sql/rpl_rli.cc
Original file line number Diff line number Diff line change
Expand Up @@ -544,12 +544,13 @@ read_relay_log_description_event(IO_CACHE *cur_log, ulonglong start_pos,
if (my_b_tell(cur_log) >= start_pos)
break;

if (!(ev= Log_event::read_log_event(cur_log, fdev,
int read_error;
if (!(ev= Log_event::read_log_event(cur_log, &read_error, fdev,
opt_slave_sql_verify_checksum)))
{
DBUG_PRINT("info",("could not read event, cur_log->error=%d",
cur_log->error));
if (cur_log->error) /* not EOF */
DBUG_PRINT("info",("could not read event, read_error=%d",
read_error));
if (read_error) /* not EOF */
{
*errmsg= "I/O error reading event at position 4";
delete fdev;
Expand Down
7 changes: 4 additions & 3 deletions sql/slave.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7845,7 +7845,8 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size)
MYSQL_BIN_LOG::open() will write the buffered description event.
*/
old_pos= rli->event_relay_log_pos;
if ((ev= Log_event::read_log_event(cur_log,
int error;
if ((ev= Log_event::read_log_event(cur_log, &error,
rli->relay_log.description_event_for_exec,
opt_slave_sql_verify_checksum)))

Expand All @@ -7862,8 +7863,8 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size)
DBUG_RETURN(ev);
}
if (opt_reckless_slave) // For mysql-test
cur_log->error = 0;
if (unlikely(cur_log->error < 0))
error = 0;
if (unlikely(error))
{
errmsg = "slave SQL thread aborted because of I/O error";
if (hot_log)
Expand Down
8 changes: 5 additions & 3 deletions sql/sql_repl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4188,7 +4188,8 @@ bool mysql_show_binlog_events(THD* thd)
my_off_t scan_pos = BIN_LOG_HEADER_SIZE;
while (scan_pos < pos)
{
ev= Log_event::read_log_event(&log, description_event,
int error;
ev= Log_event::read_log_event(&log, &error, description_event,
opt_master_verify_checksum);
scan_pos = my_b_tell(&log);
if (ev == NULL || !ev->is_valid())
Expand Down Expand Up @@ -4263,8 +4264,9 @@ bool mysql_show_binlog_events(THD* thd)
writing about this in the server log would be confusing as it isn't
related to server operational status.
*/
int error;
for (event_count = 0;
(ev = Log_event::read_log_event(&log,
(ev = Log_event::read_log_event(&log, &error,
description_event,
(opt_master_verify_checksum ||
verify_checksum_once), false)); )
Expand Down Expand Up @@ -4308,7 +4310,7 @@ bool mysql_show_binlog_events(THD* thd)
break;
}

if (unlikely(event_count < unit->lim.get_select_limit() && log.error))
if (unlikely(event_count < unit->lim.get_select_limit() && error))
{
errmsg = "Wrong offset or I/O error";
mysql_mutex_unlock(log_lock);
Expand Down

0 comments on commit 72e1cc8

Please sign in to comment.