diff --git a/bdb/phys_rep_lsn.c b/bdb/phys_rep_lsn.c index 9b9a2ccb90..b42708e344 100644 --- a/bdb/phys_rep_lsn.c +++ b/bdb/phys_rep_lsn.c @@ -18,7 +18,7 @@ logmsg(lvl, "physrep: " __VA_ARGS__); \ } while (0) -int matchable_log_type(int rectype); +int matchable_log_type(DB_ENV *dbenv, int rectype); extern int gbl_physrep_debug; int gbl_physrep_exit_on_invalid_logstream = 0; @@ -126,7 +126,7 @@ int find_log_timestamp(bdb_state_type *bdb_state, time_t time, LOGCOPY_32(&rectype, logrec.data); normalize_rectype(&rectype); - } while (!matchable_log_type(rectype)); + } while (!matchable_log_type(bdb_state->dbenv, rectype)); my_time = get_timestamp_from_matchable_record(logrec.data); if (gbl_physrep_debug) { @@ -194,7 +194,7 @@ static int get_next_matchable(DB_LOGC *logc, LOG_INFO *info, int check_current, LOGCOPY_32(&rectype, logrec->data); normalize_rectype(&rectype); - if (matchable_log_type(rectype) && in_parent_range(&match_lsn, parent_highest, parent_lowest)) { + if (matchable_log_type(logc->dbenv, rectype) && in_parent_range(&match_lsn, parent_highest, parent_lowest)) { if (gbl_physrep_debug) { physrep_logmsg(LOGMSG_USER, "%s: Initial rec {%u:%u} is matchable\n", __func__, info->file, info->offset); @@ -228,7 +228,8 @@ static int get_next_matchable(DB_LOGC *logc, LOG_INFO *info, int check_current, LOGCOPY_32(&rectype, logrec->data); normalize_rectype(&rectype); - matchable = (matchable_log_type(rectype) && in_parent_range(&match_lsn, parent_highest, parent_lowest)); + matchable = + (matchable_log_type(logc->dbenv, rectype) && in_parent_range(&match_lsn, parent_highest, parent_lowest)); } while (!matchable); info->file = match_lsn.file; @@ -565,7 +566,7 @@ int physrep_bdb_wait_for_seqnum(bdb_state_type *bdb_state, DB_LSN *lsn, void *da LOGCOPY_32(&rectype, data); normalize_rectype(&rectype); - if (!matchable_log_type(rectype)) { + if (!matchable_log_type(bdb_state->dbenv, rectype)) { return 0; } diff --git a/berkdb/build/db.h b/berkdb/build/db.h index 7b988b2b91..a3850a1d09 100644 --- a/berkdb/build/db.h +++ b/berkdb/build/db.h @@ -207,6 +207,7 @@ struct txn_properties; #define DB_TXN_NOT_DURABLE 0x0000200 /* Do not log changes. */ #define DB_USE_ENVIRON 0x0000400 /* Use the environment. */ #define DB_USE_ENVIRON_ROOT 0x0000800 /* Use the environment if root. */ +#define DB_RECOVERY_CKP 0x0001000 /* Do recovery checkpoint */ /* * Common flags -- diff --git a/berkdb/db/db_dispatch.c b/berkdb/db/db_dispatch.c index 5ee2b73eaa..ee068d9fb1 100644 --- a/berkdb/db/db_dispatch.c +++ b/berkdb/db/db_dispatch.c @@ -105,57 +105,56 @@ dump_log_event_counts(void) { int events[] = { DB___bam_split, DB___bam_rsplit, DB___bam_adj, DB___bam_cadjust, - DB___bam_cdel, + DB___bam_cdel, DB___bam_repl, DB___bam_root, DB___bam_curadj, DB___bam_rcuradj, - DB___crdel_metasub, + DB___crdel_metasub, DB___db_addrem, DB___db_big, DB___db_ovref, DB___db_relink, - DB___db_debug, + DB___db_debug, DB___db_noop, DB___db_pg_alloc, DB___db_pg_free, DB___db_cksum, - DB___db_pg_freedata, + DB___db_pg_freedata, DB___db_pg_prepare, DB___db_pg_new, DB___dbreg_register, - DB___fop_create, + DB___fop_create, DB___fop_remove, DB___fop_write, DB___fop_rename, - DB___fop_file_remove, DB___ham_insdel, + DB___fop_file_remove, DB___ham_insdel, DB___ham_newpage, DB___ham_splitdata, DB___ham_replace, - DB___ham_copypage, + DB___ham_copypage, DB___ham_metagroup, DB___ham_groupalloc, DB___ham_curadj, - DB___ham_chgpg, DB___qam_incfirst, + DB___ham_chgpg, DB___qam_incfirst, DB___qam_mvptr, DB___qam_del, DB___qam_add, DB___qam_delext, - DB___txn_regop, DB___txn_dist_prepare, DB___txn_dist_abort, + DB___txn_regop, DB___txn_dist_prepare, DB___txn_dist_abort, DB___txn_regop_gen, DB___txn_regop_rowlocks, DB___txn_dist_commit, - DB___txn_ckp, DB___txn_child, DB___txn_xa_regop, - DB___txn_recycle + DB___txn_ckp, DB___txn_ckp_recovery, DB___txn_child, + DB___txn_xa_regop, DB___txn_recycle }; char *event_names[] = { "DB___bam_split", "DB___bam_rsplit", "DB___bam_adj", - "DB___bam_cadjust", "DB___bam_cdel", + "DB___bam_cadjust", "DB___bam_cdel", "DB___bam_repl", "DB___bam_root", "DB___bam_curadj", - "DB___bam_rcuradj", "DB___crdel_metasub", + "DB___bam_rcuradj", "DB___crdel_metasub", "DB___db_addrem", "DB___db_big", "DB___db_ovref", - "DB___db_relink", "DB___db_debug", + "DB___db_relink", "DB___db_debug", "DB___db_noop", "DB___db_pg_alloc", "DB___db_pg_free", - "DB___db_cksum", "DB___db_pg_freedata", + "DB___db_cksum", "DB___db_pg_freedata", "DB___db_pg_prepare", "DB___db_pg_new", "DB___dbreg_register", - "DB___fop_create", + "DB___fop_create", "DB___fop_remove", "DB___fop_write", "DB___fop_rename", - "DB___fop_file_remove", "DB___ham_insdel", + "DB___fop_file_remove", "DB___ham_insdel", "DB___ham_newpage", "DB___ham_splitdata", "DB___ham_replace", - "DB___ham_copypage", + "DB___ham_copypage", "DB___ham_metagroup", "DB___ham_groupalloc", "DB___ham_curadj", - "DB___ham_chgpg", "DB___qam_incfirst", - "DB___qam_mvptr", "DB___qam_del", "DB___qam_add", - "DB___qam_delext", "DB___txn_regop", - "DB___txn_dist_prepare", "DB___txn_dist_abort", + "DB___ham_chgpg", "DB___qam_incfirst", + "DB___qam_mvptr", "DB___qam_del", "DB___qam_add", "DB___qam_delext", + "DB___txn_regop", "DB___txn_dist_prepare", "DB___txn_dist_abort", "DB___txn_regop_gen", "DB___txn_regop_rowlocks", "DB___txn_dist_commit", - "DB___txn_ckp", "DB___txn_child", "DB___txn_xa_regop", - "DB___txn_recycle" + "DB___txn_ckp", "DB___txn_ckp_recovery", "DB___txn_child", + "DB___txn_xa_regop", "DB___txn_recycle" }; int i; for (i = 0; i < sizeof(events) / sizeof(events[0]); i++) { if (log_event_counts[events[i]]) logmsg(LOGMSG_USER, "%-20s %d\n", event_names[i], - log_event_counts[events[i]]); + log_event_counts[events[i]]); } } @@ -262,6 +261,8 @@ optostr(int op) return "DB___txn_dist_prepare"; case DB___txn_ckp: return "DB___txn_ckp"; + case DB___txn_ckp_recovery: + return "DB___txn_ckp_recovery"; case DB___txn_child: return "DB___txn_child"; case DB___txn_xa_regop: @@ -386,6 +387,7 @@ ufid_for_recovery_record(DB_ENV *env, DB_LSN *lsn, int rectype, case DB___txn_dist_abort: case DB___txn_regop_rowlocks: case DB___txn_ckp: + case DB___txn_ckp_recovery: case DB___txn_child: case DB___txn_xa_regop: case DB___txn_recycle: @@ -469,7 +471,7 @@ __db_dispatch(dbenv, dtab, dtabsize, db, lsnp, redo, info) if (normalize_rectype(&rectype) && (redo == DB_TXN_OPENFILES)) { LOGCOPY_64(&utxnid, &((char*)db->data)[4 + 4 + 8]); - if (rectype == DB___txn_ckp) { + if (rectype == DB___txn_ckp || rectype == DB___txn_ckp_recovery) { LOGCOPY_64(&maxutxnid, &((char*)db->data)[4 + 4 + 8 + 8 + 8 + 8 + 4 + 4]); } Pthread_mutex_lock(&dbenv->utxnid_lock); @@ -559,8 +561,10 @@ __db_dispatch(dbenv, dtab, dtabsize, db, lsnp, redo, info) /* FALLTHROUGH */ case DB_TXN_POPENFILES: if (rectype == DB___dbreg_register || - rectype == DB___txn_child || - rectype == DB___txn_ckp || rectype == DB___txn_recycle) + rectype == DB___txn_child || + rectype == DB___txn_ckp || + rectype == DB___txn_ckp_recovery || + rectype == DB___txn_recycle) return (dtab[rectype] (dbenv, db, lsnp, redo, info)); break; case DB_TXN_BACKWARD_ROLL: @@ -583,6 +587,7 @@ __db_dispatch(dbenv, dtab, dtabsize, db, lsnp, redo, info) case DB___txn_regop_rowlocks: case DB___txn_recycle: case DB___txn_ckp: + case DB___txn_ckp_recovery: case DB___db_noop: case DB___fop_file_remove: case DB___txn_child: @@ -647,6 +652,7 @@ __db_dispatch(dbenv, dtab, dtabsize, db, lsnp, redo, info) switch (rectype) { case DB___txn_recycle: case DB___txn_ckp: + case DB___txn_ckp_recovery: case DB___db_noop: make_call = 1; break; diff --git a/berkdb/dbinc_auto/txn_auto.h b/berkdb/dbinc_auto/txn_auto.h index b54f9770fe..666ec27da9 100644 --- a/berkdb/dbinc_auto/txn_auto.h +++ b/berkdb/dbinc_auto/txn_auto.h @@ -124,4 +124,17 @@ typedef struct __txn_dist_commit_args { u_int64_t timestamp; DBT dist_txnid; } __txn_dist_commit_args; + +#define DB___txn_ckp_recovery 20 +typedef struct __txn_ckp_recovery_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + DB_LSN ckp_lsn; + DB_LSN last_ckp; + int32_t timestamp; + u_int32_t rep_gen; + u_int64_t max_utxnid; +} __txn_ckp_recovery_args; + #endif diff --git a/berkdb/dbinc_auto/txn_ext.h b/berkdb/dbinc_auto/txn_ext.h index 8197b31d56..55b0c1bd30 100644 --- a/berkdb/dbinc_auto/txn_ext.h +++ b/berkdb/dbinc_auto/txn_ext.h @@ -65,7 +65,7 @@ int __txn_regop_rowlocks_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, db_recops, voi int __txn_regop_rowlocks_getallpgnos __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); int __txn_regop_rowlocks_print __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); int __txn_regop_rowlocks_read __P((DB_ENV *, void *, __txn_regop_rowlocks_args **)); -int __txn_ckp_log __P((DB_ENV *, DB_TXN *, DB_LSN *, u_int32_t, DB_LSN *, DB_LSN *, int32_t, u_int32_t, u_int64_t)); +int __txn_ckp_log __P((DB_ENV *, DB_TXN *, u_int32_t, DB_LSN *, u_int32_t, DB_LSN *, DB_LSN *, int32_t, u_int32_t, u_int64_t)); int __txn_ckp_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); int __txn_ckp_getallpgnos __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); int __txn_ckp_print __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); diff --git a/berkdb/env/env_recover.c b/berkdb/env/env_recover.c index 1f4b48ef10..647f070469 100644 --- a/berkdb/env/env_recover.c +++ b/berkdb/env/env_recover.c @@ -124,7 +124,7 @@ __checkpoint_get_recovery_lsn(DB_ENV *dbenv, DB_LSN *lsnout) LOGCOPY_32(&type, dbt.data); normalize_rectype(&type); - if (type != DB___txn_ckp) { + if (type != DB___txn_ckp && type != DB___txn_ckp_recovery) { logmsg(LOGMSG_ERROR, "checkpoint record unexpeted type %d\n", type); goto err; } @@ -774,7 +774,7 @@ int __dbenv_build_mintruncate_list(dbenv) Pthread_mutex_unlock(&dbenv->mintruncate_lk); } - if (type == DB___txn_ckp) { + if (type == DB___txn_ckp || type == DB___txn_ckp_recovery) { if ((ret = __txn_ckp_read(dbenv, rec.data, &ckp_args)) != 0) abort(); @@ -1526,7 +1526,7 @@ __db_apprec(dbenv, max_lsn, trunclsn, update, flags) logmsg(LOGMSG_ERROR, "memp_sync returned %d\n", ret); goto err; } - } else if ((ret = __txn_checkpoint(dbenv, 0, 0, DB_FORCE)) != 0) + } else if ((ret = __txn_checkpoint(dbenv, 0, 0, DB_FORCE|DB_RECOVERY_CKP)) != 0) goto err; @@ -1836,7 +1836,7 @@ __log_find_latest_checkpoint_before_lsn_try_harder(DB_ENV * dbenv, if (data.size >= sizeof(u_int32_t)) { LOGCOPY_32(&type, data.data); normalize_rectype(&type); - if (type == DB___txn_ckp) { + if (type == DB___txn_ckp || type == DB___txn_ckp_recovery) { if (log_compare(&lsn, max_lsn) < 0) { *foundlsn = lsn; free(data.data); @@ -1959,7 +1959,7 @@ __log_earliest(dbenv, logc, lowtime, lowlsn) ret == 0; ret = __log_c_get(logc, &lsn, &data, DB_NEXT)) { LOGCOPY_32(&rectype, data.data); normalize_rectype(&rectype); - if (rectype != DB___txn_ckp) + if (rectype != DB___txn_ckp && rectype != DB___txn_ckp_recovery) continue; if ((ret = __txn_ckp_read(dbenv, data.data, &ckpargs)) == 0) { cmp = log_compare(&ckpargs->ckp_lsn, &first_lsn); @@ -2137,6 +2137,7 @@ __scan_logfiles_for_asof_modsnap(dbenv) LOGCOPY_32(&rectype, data.data); normalize_rectype(&rectype); switch (rectype) { + case DB___txn_ckp_recovery: case DB___txn_ckp: if ((ret = __txn_ckp_read(dbenv, data.data, @@ -2307,6 +2308,7 @@ __recover_logfile_pglogs(dbenv, fileid_tbl) LOGCOPY_32(&rectype, data.data); normalize_rectype(&rectype); switch (rectype) { + case DB___txn_ckp_recovery: case DB___txn_ckp: if ((ret = __txn_ckp_read(dbenv, data.data, @@ -2662,7 +2664,7 @@ __env_find_verify_recover_start(dbenv, lsnp) do { LOGCOPY_32(&rectype, rec.data); normalize_rectype(&rectype); - } while ((!matchable_log_type(rectype) || log_compare(lsnp, &s_lsn) >= 0) && + } while ((!matchable_log_type(dbenv, rectype) || log_compare(lsnp, &s_lsn) >= 0) && (ret = __log_c_get(logc, lsnp, &rec, DB_PREV)) == 0); if (ret != 0) diff --git a/berkdb/log/log.c b/berkdb/log/log.c index 9b9e2d9fe0..57b132a009 100644 --- a/berkdb/log/log.c +++ b/berkdb/log/log.c @@ -327,7 +327,7 @@ __log_get_last_ckp(DB_ENV *dbenv, DB_LSN *lsn) LOGCOPY_32(&rectype, dbt.data); normalize_rectype(&rectype); - if (rectype == DB___txn_ckp) { + if (rectype == DB___txn_ckp || rectype == DB___txn_ckp_recovery) { /* found it */ (void)__log_c_close(logc); return 0; @@ -415,7 +415,7 @@ __log_recover(dblp) continue; LOGCOPY_32(&rectype, dbt.data); normalize_rectype(&rectype); - if (rectype == DB___txn_ckp) + if (rectype == DB___txn_ckp || rectype == DB___txn_ckp_recovery) /* * If we happen to run into a checkpoint, cache its * LSN so that the transaction system doesn't have diff --git a/berkdb/mp/mp_bh.c b/berkdb/mp/mp_bh.c index 634483b6ec..04b31a8916 100644 --- a/berkdb/mp/mp_bh.c +++ b/berkdb/mp/mp_bh.c @@ -643,7 +643,7 @@ berkdb_verify_lsn_written_to_disk(DB_ENV *dbenv, DB_LSN *lsn, LOGCOPY_32(&type, logent); normalize_rectype(&type); /* check that the checkpoint lsn is valid and readable */ - if (type == DB___txn_ckp && check_checkpoint) { + if ((type == DB___txn_ckp || type == DB___txn_ckp_recovery) && check_checkpoint) { __txn_ckp_args *ckp = NULL; rc = __txn_ckp_read(dbenv, logent, &ckp); diff --git a/berkdb/rep/rep_method.c b/berkdb/rep/rep_method.c index 700fe7fe17..f05e169b8d 100644 --- a/berkdb/rep/rep_method.c +++ b/berkdb/rep/rep_method.c @@ -39,6 +39,8 @@ static const char revid[] = "$Id: rep_method.c,v 1.134 2003/11/13 15:41:51 sue E #include "logmsg.h" #include #include +extern int gbl_fullrecovery; +int gbl_reproduce_ckp_bug = 0; int gbl_rep_method_max_sleep_cnt = 0; @@ -332,14 +334,18 @@ __rep_start(dbenv, dbt, gen, flags) if (!F_ISSET(rep, REP_F_MASTER)) { /* Master is not yet set. */ if (role_chg) { - if (rep->w_gen > rep->recover_gen) { - ++rep->w_gen; - __rep_set_gen(dbenv, __func__, __LINE__, rep->w_gen); - } else if (rep->gen > rep->recover_gen) { - __rep_set_gen(dbenv, __func__, __LINE__, rep->gen + 1); + if (!gbl_fullrecovery || gbl_reproduce_ckp_bug) { + if (rep->w_gen > rep->recover_gen) { + ++rep->w_gen; + __rep_set_gen(dbenv, __func__, __LINE__, rep->w_gen); + } else if (rep->gen > rep->recover_gen) { + __rep_set_gen(dbenv, __func__, __LINE__, rep->gen + 1); + } else { + __rep_set_gen(dbenv, __func__, __LINE__, + rep->recover_gen + 1); + } } else { - __rep_set_gen(dbenv, __func__, __LINE__, - rep->recover_gen + 1); + logmsg(LOGMSG_USER, "%s line %d keeping gen %d for full recovery\n", __func__, __LINE__, rep->gen); } /* * There could have been any number of failed @@ -351,9 +357,13 @@ __rep_start(dbenv, dbt, gen, flags) rep->egen); __rep_set_gen(dbenv, __func__, __LINE__, gen); } else if (rep->egen > rep->gen) { - logmsg(LOGMSG_DEBUG, "%s line %d setting gen to rep->egen " - "%d\n", __func__, __LINE__, rep->egen); - __rep_set_gen(dbenv, __func__, __LINE__, rep->egen); + if (!gbl_fullrecovery || gbl_reproduce_ckp_bug) { + logmsg(LOGMSG_DEBUG, "%s line %d setting gen to rep->egen " + "%d\n", __func__, __LINE__, rep->egen); + __rep_set_gen(dbenv, __func__, __LINE__, rep->egen); + } else { + logmsg(LOGMSG_USER, "%s line %d keeping gen %d for full recovery\n", __func__, __LINE__, rep->gen); + } } redo_prepared = 1; @@ -1072,6 +1082,8 @@ __rep_set_rep_transport(dbenv, eid, f_send) extern pthread_mutex_t rep_queue_lock; extern void send_master_req(DB_ENV *dbenv, const char *func, int line); +extern int gbl_recovery_ckp; +extern int gbl_match_on_ckp; static int __retrieve_logged_generation_commitlsn(dbenv, lsn, gen) @@ -1116,17 +1128,48 @@ __retrieve_logged_generation_commitlsn(dbenv, lsn, gen) LOGCOPY_32(&rectype, rec.data); normalize_rectype(&rectype); + int vote_on_ckp = (gbl_recovery_ckp || gbl_reproduce_ckp_bug) && gbl_match_on_ckp; + int regop_cnt = 0; while (ret == 0 && rectype != DB___txn_regop_gen && rectype != DB___txn_regop_rowlocks && rectype != DB___txn_dist_commit && - rectype != DB___txn_dist_prepare && rectype != DB___txn_regop) { + rectype != DB___txn_dist_prepare && (rectype != DB___txn_ckp || !vote_on_ckp)) { + + if (rectype == DB___txn_regop && !vote_on_ckp) { + regop_cnt++; + + /* Tolerate txn_regop to a point (unfortunately) - but only if we are not allowed to + * vote with checkpoints. The code goes away when recovery-ckp is enabled, because we + * guarantee that a ckp is only ever emitted by the cluster-master */ + + if (regop_cnt > 100) { + logmsg(LOGMSG_USER, "%s failing after %d txn-regop records\n", + __func__, regop_cnt); + goto err; + } + } if ((ret = __log_c_get(logc, &curlsn, &rec, DB_PREV)) == 0) { LOGCOPY_32(&rectype, rec.data); normalize_rectype(&rectype); } } - if (rectype == DB___txn_regop_gen) { + /* NOTE: never use DB___txn_ckp_recovery: + * it is never written by the master for that generation */ + if (rectype == DB___txn_ckp) { + __txn_ckp_args *txn_ckp_args = NULL; + if ((ret = __txn_ckp_read(dbenv, rec.data, &txn_ckp_args)) != 0) + goto err; + MUTEX_LOCK(dbenv, db_rep->rep_mutexp); + rep->committed_lsn = *lsn = curlsn; + rep->committed_gen = *gen = txn_ckp_args->rep_gen; + if (rep->gen < rep->committed_gen) { + __rep_set_gen(dbenv, __func__, __LINE__, rep->committed_gen); + __rep_set_log_gen(dbenv, __func__, __LINE__, rep->gen); + } + MUTEX_UNLOCK(dbenv, db_rep->rep_mutexp); + __os_free(dbenv, txn_ckp_args); + } else if (rectype == DB___txn_regop_gen) { __txn_regop_gen_args *txn_gen_args = NULL; if ((ret = __txn_regop_gen_read(dbenv, rec.data, &txn_gen_args)) != 0) @@ -1166,10 +1209,6 @@ __retrieve_logged_generation_commitlsn(dbenv, lsn, gen) } MUTEX_UNLOCK(dbenv, db_rep->rep_mutexp); __os_free(dbenv, txn_rl_args); - } else if (rectype == DB___txn_regop) { - logmsg(LOGMSG_ERROR, "%s returning -1 on regop-record / " - "recent-upgrade.\n", __func__); - ret = -1; } else if (rectype == DB___txn_dist_commit) { __txn_dist_commit_args *txn_dist_commit_args = NULL; if ((ret = __txn_dist_commit_read(dbenv, rec.data, @@ -1260,7 +1299,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) */ if (in_progress) { *eidp = dbenv->rep_eid; - logmsg(LOGMSG_DEBUG, "%s line %d returning %d master %s egen is %d\n", + logmsg(LOGMSG_USER, + "%s line %d returning %d master %s egen is %d\n", __func__, __LINE__, ret, *eidp, *newgen); return (0); } @@ -1268,7 +1308,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) fprintf(stderr, "%s:%d broadcasting REP_MASTER_REQ\n", __FILE__, __LINE__); #endif - logmsg(LOGMSG_DEBUG, "%s start sending master req\n", __func__); + logmsg(LOGMSG_USER, + "%s start sending master req\n", __func__); send_master_req(dbenv, __func__, __LINE__); ret = __rep_wait(dbenv, timeout / 4, eidp, newgen, 0, REP_F_EPHASE1); switch (ret) { @@ -1279,7 +1320,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) if (FLD_ISSET(dbenv->verbose, DB_VERB_REPLICATION)) __db_err(dbenv, "Found master %d", *eidp); #endif - logmsg(LOGMSG_DEBUG, "%s line %d returning %d master %s egen is %d\n", + logmsg(LOGMSG_USER, + "%s line %d returning %d master %s egen is %d\n", __func__, __LINE__, ret, *eidp, *newgen); return (0); } @@ -1308,7 +1350,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) MUTEX_LOCK(dbenv, db_rep->rep_mutexp); /* WAITSTART pushes us past point of no-return */ - logmsg(LOGMSG_DEBUG, "%s line %d setting PHASE1 clearing TALLY\n", __func__, __LINE__); + logmsg(LOGMSG_USER, + "%s line %d setting PHASE1 clearing TALLY\n", __func__, __LINE__); F_SET(rep, REP_F_EPHASE1 | REP_F_WAITSTART | REP_F_NOARCHIVE); F_CLR(rep, REP_F_TALLY); @@ -1329,7 +1372,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) /* Tally our own vote */ if (__rep_tally(dbenv, rep, rep->eid, &rep->sites, rep->egen, rep->tally_off, __func__, __LINE__) != 0) { - logmsg(LOGMSG_DEBUG, "%s line %d rep-tally failed, lockdone\n", __func__, __LINE__); + logmsg(LOGMSG_USER, + "%s line %d rep-tally failed, lockdone\n", __func__, __LINE__); goto lockdone; } __rep_cmp_vote(dbenv, rep, &rep->eid, rep->egen, &lsn, priority, @@ -1345,12 +1389,13 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) egen = rep->egen; committed_gen = rep->committed_gen; send_vote2 = (rep->sites >= rep->nsites && rep->w_priority != 0); - logmsg(LOGMSG_DEBUG, "%s line %d send_vote2 is %d, rep->sites is %d, rep->nsites is %d\n", + logmsg(LOGMSG_USER, + "%s line %d send_vote2 is %d, rep->sites is %d, rep->nsites is %d\n", __func__, __LINE__, send_vote2, rep->sites, rep->nsites); /* If we have all vote1, change to PHASE2 immediately */ if (send_vote2) { - logmsg(LOGMSG_DEBUG, "%s line %d clearing PHASE1 setting PHASE2\n", __func__, __LINE__); + logmsg(LOGMSG_USER, "%s line %d clearing PHASE1 setting PHASE2\n", __func__, __LINE__); F_SET(rep, REP_F_EPHASE2); F_CLR(rep, REP_F_EPHASE1); if (rep->winner == rep->eid) { @@ -1363,12 +1408,14 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) Pthread_mutex_unlock(&rep_candidate_lock); if (use_committed_gen) { - logmsg(LOGMSG_DEBUG, "%s line %d broadcasting REP_GEN_VOTE1 to all with committed-gen=%d gen=%d egen=%d\n", + logmsg(LOGMSG_USER, + "%s line %d broadcasting REP_GEN_VOTE1 to all with committed-gen=%d gen=%d egen=%d\n", __func__, __LINE__, committed_gen, rep->gen, egen); __rep_send_gen_vote(dbenv, &lsn, nsites, priority, tiebreaker, egen, committed_gen, db_eid_broadcast, REP_GEN_VOTE1); } else { - logmsg(LOGMSG_DEBUG, "%s line %d broadcasting REP_VOTE1 to all (committed-gen=0) gen=%d egen=%d\n", + logmsg(LOGMSG_USER, + "%s line %d broadcasting REP_VOTE1 to all (committed-gen=0) gen=%d egen=%d\n", __func__, __LINE__, rep->gen, egen); __rep_send_vote(dbenv, &lsn, nsites, priority, tiebreaker, egen, db_eid_broadcast, REP_VOTE1); @@ -1387,15 +1434,18 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) #endif /* This increments our election gen */ __rep_elect_done(dbenv, rep, 0, __func__, __LINE__); - logmsg(LOGMSG_DEBUG, "%s line %d returning %d master %s egen is %d\n", + logmsg(LOGMSG_USER, + "%s line %d returning %d master %s egen is %d\n", __func__, __LINE__, ret, *eidp, *newgen); return (0); } - logmsg(LOGMSG_DEBUG, "%s line %d going to phase2 because nomaster\n", __func__, __LINE__); + logmsg(LOGMSG_USER, + "%s line %d going to phase2 because nomaster\n", __func__, __LINE__); goto phase2; case DB_ELECTION_GENCHG: case DB_TIMEOUT: - logmsg(LOGMSG_DEBUG, "%s line %d ret is %d break\n", __func__, __LINE__, ret); + logmsg(LOGMSG_USER, + "%s line %d ret is %d break\n", __func__, __LINE__, ret); break; default: goto err; @@ -1420,7 +1470,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) __db_err(dbenv, "Egen changed from %lu to %lu", (u_long)egen, (u_long)rep->egen); #endif - logmsg(LOGMSG_DEBUG, "%s line %d rep egen changed from %d to %d, restarting\n", + logmsg(LOGMSG_USER, + "%s line %d rep egen changed from %d to %d, restarting\n", __func__, __LINE__, egen, rep->egen); goto restart; } @@ -1430,7 +1481,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) if (rep->sites > rep->nsites / 2) { /* We think we've seen enough to cast a vote. */ - logmsg(LOGMSG_DEBUG, "%s line %d have seen enough votes for vote2\n", __func__, __LINE__); + logmsg(LOGMSG_USER, + "%s line %d have seen enough votes for vote2\n", __func__, __LINE__); send_vote = rep->winner; /* * See if we won. This will make sure we @@ -1446,7 +1498,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) "Counted my vote %d", rep->votes); #endif } - logmsg(LOGMSG_DEBUG, "%s line %d setting PHASE2 clearing PHASE1\n", __func__, __LINE__); + logmsg(LOGMSG_USER, + "%s line %d setting PHASE2 clearing PHASE1\n", __func__, __LINE__); F_SET(rep, REP_F_EPHASE2); F_CLR(rep, REP_F_EPHASE1); } @@ -1460,7 +1513,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) "Not enough votes to elect: received %d of %d", rep->sites, rep->nsites); #endif - logmsg(LOGMSG_DEBUG, "%s line %d not enough vote1s, failing\n", __func__, __LINE__); + logmsg(LOGMSG_USER, + "%s line %d not enough vote1s, failing\n", __func__, __LINE__); ret = DB_REP_UNAVAIL; goto err; @@ -1469,7 +1523,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) * We have seen enough vote1's. Now we need to wait * for all the vote2's. */ - logmsg(LOGMSG_DEBUG, "%s line %d have seen enough votes to cast vote2!\n", __func__, __LINE__); + logmsg(LOGMSG_USER, + "%s line %d have seen enough votes to cast vote2!\n", __func__, __LINE__); if (send_vote != rep->eid) { #ifdef DIAGNOSTIC if (FLD_ISSET(dbenv->verbose, DB_VERB_REPLICATION) && @@ -1477,13 +1532,15 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) __db_err(dbenv, "Sending vote"); #endif if (use_committed_gen) { - logmsg(LOGMSG_DEBUG, "%s line %d sending REP_GEN_VOTE2 to %s " + logmsg(LOGMSG_USER, + "%s line %d sending REP_GEN_VOTE2 to %s " "with committed-gen=%d gen=%d egen=%d\n", __func__, __LINE__, send_vote, committed_gen, rep->gen, egen); __rep_send_gen_vote(dbenv, NULL, 0, 0, 0, egen, committed_gen, send_vote, REP_GEN_VOTE2); } else { - logmsg(LOGMSG_DEBUG, "%s line %d sending REP_VOTE2 to %s " + logmsg(LOGMSG_USER, + "%s line %d sending REP_VOTE2 to %s " "(committed-gen=0) gen=%d egen=%d\n", __func__, __LINE__, send_vote, rep->gen, egen); __rep_send_vote(dbenv, NULL, 0, 0, 0, egen, @@ -1502,7 +1559,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) case 0: /* Increment our election gen */ __rep_elect_done(dbenv, rep, 0, __func__, __LINE__); - logmsg(LOGMSG_DEBUG, "%s line %d returning %d master %s egen is %d\n", + logmsg(LOGMSG_USER, + "%s line %d returning %d master %s egen is %d\n", __func__, __LINE__, ret, *eidp, *newgen); return (0); case DB_TIMEOUT: @@ -1522,7 +1580,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) if (send_vote == rep->eid && done) { if (nsites == 1) __rep_elect_master(dbenv, rep, eidp); - logmsg(LOGMSG_DEBUG, "%s line %d elected master %s current-egen " + logmsg(LOGMSG_USER, + "%s line %d elected master %s current-egen " "%d\n", __func__, __LINE__, rep->eid, rep->egen); ret = 0; goto lockdone; @@ -1540,7 +1599,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) * from elect_init where we were unable to grow_sites. In * that case we do not want to discard all known election info. */ - logmsg(LOGMSG_DEBUG, "%s line %d ret is %d\n", __func__, __LINE__, ret); + logmsg(LOGMSG_USER, + "%s line %d ret is %d\n", __func__, __LINE__, ret); assert(ret == 0 || ret == DB_REP_UNAVAIL); if (ret == 0 || ret == DB_REP_UNAVAIL) { __rep_elect_done(dbenv, rep, 0, __func__, __LINE__); @@ -1550,7 +1610,8 @@ __rep_elect(dbenv, nsites, priority, timeout, newgen, already_master, eidp) Pthread_mutex_unlock(&rep_candidate_lock); MUTEX_UNLOCK(dbenv, db_rep->rep_mutexp); - logmsg(LOGMSG_DEBUG, "%s line %d returning %d master %s egen is %d\n", + logmsg(LOGMSG_USER, + "%s line %d returning %d master %s egen is %d\n", __func__, __LINE__, ret, *eidp, *newgen); return (ret); } @@ -1582,7 +1643,8 @@ __rep_elect_init(dbenv, lsnp, nsites, priority, beginp, otally) /* If we are already a master; simply broadcast that fact and return. */ if (F_ISSET(rep, REP_F_MASTER)) { - logmsg(LOGMSG_DEBUG, "%s line %d sending REP_NEWMASTER\n", + logmsg(LOGMSG_USER, + "%s line %d sending REP_NEWMASTER\n", __func__, __LINE__); (void)__rep_send_message(dbenv, db_eid_broadcast, REP_NEWMASTER, lsnp, NULL, 0, NULL); @@ -1607,7 +1669,8 @@ __rep_elect_init(dbenv, lsnp, nsites, priority, beginp, otally) DB_ENV_TEST_RECOVERY(dbenv, DB_TEST_ELECTINIT, ret, NULL); rep->nsites = nsites; rep->priority = priority; - logmsg(LOGMSG_DEBUG, "%s line %d setting master_id to %s\n", __func__, __LINE__, db_eid_invalid); + logmsg(LOGMSG_USER, + "%s line %d setting master_id to %s\n", __func__, __LINE__, db_eid_invalid); rep->master_id = db_eid_invalid; } DB_TEST_RECOVERY_LABEL @@ -1628,7 +1691,8 @@ __rep_elect_master(dbenv, rep, eidp) REP *rep; char **eidp; { - logmsg(LOGMSG_DEBUG, "%s line %d setting master_id to %s\n", __func__, __LINE__, rep->eid); + logmsg(LOGMSG_USER, + "%s line %d setting master_id to %s\n", __func__, __LINE__, rep->eid); rep->master_id = rep->eid; F_SET(rep, REP_F_MASTERELECT); if (eidp != NULL) @@ -1688,7 +1752,8 @@ __rep_wait(dbenv, timeout, eidp, outegen, inegen, flags) Pthread_mutex_lock(&gbl_rep_egen_lk); rc = pthread_cond_timedwait(&gbl_rep_egen_cd, &gbl_rep_egen_lk, &tm); if (rc && rc != ETIMEDOUT) - logmsg(LOGMSG_ERROR, "Err rc=%d from pthread_cond_timedwait\n", rc); + logmsg(LOGMSG_USER, + "Err rc=%d from pthread_cond_timedwait\n", rc); *outegen = rep->egen; Pthread_mutex_unlock(&gbl_rep_egen_lk); @@ -1744,7 +1809,7 @@ __rep_flush(dbenv) /* treat the end of the log as perm */ (void)__rep_send_message(dbenv, - db_eid_broadcast, REP_LOG, &lsn, &rec, DB_LOG_PERM, NULL); + db_eid_broadcast, REP_LOG, &lsn, &rec, DB_LOG_PERM, NULL); err: if ((t_ret = __log_c_close(logc)) != 0 && ret == 0) ret = t_ret; @@ -1822,13 +1887,13 @@ extern pthread_mutex_t gbl_durable_lsn_lk; static int __rep_deadlocks(dbenv, deadlocks) - DB_ENV *dbenv; - u_int64_t *deadlocks; + DB_ENV *dbenv; + u_int64_t *deadlocks; { DB_REP *db_rep = dbenv->rep_handle; REP *rep = db_rep->region; - *deadlocks = ATOMIC_LOAD64(rep->stat.retry); - return 0; + *deadlocks = ATOMIC_LOAD64(rep->stat.retry); + return 0; } /* @@ -1859,7 +1924,7 @@ __rep_stat(dbenv, statp, flags) *statp = NULL; if ((ret = __db_fchk(dbenv, - "DB_ENV->rep_stat", flags, DB_STAT_CLEAR)) != 0) + "DB_ENV->rep_stat", flags, DB_STAT_CLEAR)) != 0) return (ret); /* Allocate a stat struct to return to the user. */ @@ -1917,7 +1982,7 @@ __rep_stat(dbenv, statp, flags) queued = rep->stat.st_log_queued; memset(&rep->stat, 0, sizeof(rep->stat)); rep->stat.st_log_queued = rep->stat.st_log_queued_total = - rep->stat.st_log_queued_max = queued; + rep->stat.st_log_queued_max = queued; } if (dolock) { diff --git a/berkdb/rep/rep_record.c b/berkdb/rep/rep_record.c index c00c0a0cce..52c1453c8a 100644 --- a/berkdb/rep/rep_record.c +++ b/berkdb/rep/rep_record.c @@ -150,7 +150,8 @@ static inline int wait_for_running_transactions(DB_ENV *dbenv); #define IS_SIMPLE(R) ((R) != DB___txn_regop && (R) != DB___txn_xa_regop && \ (R) != DB___txn_regop_rowlocks && (R) != DB___txn_regop_gen && \ - (R) != DB___txn_dist_commit && (R) != DB___txn_ckp && (R) != DB___dbreg_register && \ + (R) != DB___txn_dist_commit && (R) != DB___txn_ckp && \ + (R) != DB___txn_ckp_recovery && (R) != DB___dbreg_register && \ (R) != DB___txn_dist_prepare && (R) != DB___txn_dist_abort) int gbl_rep_process_msg_print_rc; @@ -337,7 +338,8 @@ static inline void send_dupmaster(DB_ENV *dbenv, const char *func, int line) __rep_send_message(dbenv, db_eid_broadcast, REP_DUPMASTER, NULL, NULL, 0, NULL); - logmsg(LOGMSG_DEBUG, "%s line %d sending DUPMASTER\n", func, line); + logmsg(LOGMSG_USER, + "%s line %d sending DUPMASTER\n", func, line); } void send_master_req(DB_ENV *dbenv, const char *func, int line) @@ -422,20 +424,21 @@ int gbl_match_on_ckp = 1; /* * matchable_log_type -- * - * PUBLIC: int matchable_log_type __P((int)); + * PUBLIC: int matchable_log_type __P((DB_ENV *, int)); */ int -matchable_log_type(int rectype) +matchable_log_type(DB_ENV *dbenv, int rectype) { extern int gbl_only_match_commit_records; int ret; + /* XXX do not ever match DB___txn_ckp_recovery XXX */ if (gbl_only_match_commit_records) { - ret = (rectype == DB___txn_regop || - rectype == DB___txn_regop_gen || - rectype == DB___txn_dist_commit || - rectype == DB___txn_dist_abort || - rectype == DB___txn_regop_rowlocks || - (gbl_match_on_ckp && rectype == DB___txn_ckp)); + ret = ((!dbenv->attr.elect_highest_committed_gen && rectype == DB___txn_regop) || + rectype == DB___txn_regop_gen || + rectype == DB___txn_dist_commit || + rectype == DB___txn_dist_abort || + rectype == DB___txn_regop_rowlocks || + (gbl_match_on_ckp && rectype == DB___txn_ckp)); } else { switch (rectype) { case DB___txn_recycle: @@ -537,9 +540,8 @@ int send_rep_all_req(DB_ENV *dbenv, char *master_eid, DB_LSN *lsn, int flags, return send_rep_all_req_dedup(dbenv, master_eid, lsn, flags, func, line); } if (gbl_dedup_rep_all_reqs && rep_qstat_has_allreq()) { - if (gbl_verbose_fills) { - logmsg(LOGMSG_DEBUG, "BLOCKING rep_all_req from %s line %d\n", func, line); - } + logmsg(LOGMSG_USER, + "BLOCKING rep_all_req from %s line %d\n", func, line); return 0; } int rc = __rep_send_message(dbenv, master_eid, REP_ALL_REQ, lsn, NULL, flags, NULL); @@ -571,9 +573,9 @@ static void *apply_thread(void *arg) lp = dblp->reginfo.primary; bdb_thread_start_rw(); - thrman_register(THRTYPE_GENERIC); - thread_started("apply thread"); - + thrman_register(THRTYPE_GENERIC); + thread_started("apply thread"); + Pthread_mutex_lock(&rep_queue_lock); while (!db_is_exiting() && gbl_decoupled_logputs) { int pollms = (gbl_apply_thread_pollms > 0) ? @@ -1033,7 +1035,7 @@ __rep_verify_will_recover(dbenv, control, rec) LOGCOPY_32(&rectype, mylog.data); normalize_rectype(&rectype); - if ((will_recover == 1 && !matchable_log_type(rectype)) && + if ((will_recover == 1 && !matchable_log_type(dbenv, rectype)) && ((ret = __log_c_get(logc, &lsn, &mylog, DB_PREV)) == 0)){ will_recover = 0; } @@ -1241,7 +1243,7 @@ __rep_process_message(dbenv, control, rec, eidp, ret_lsnp, commit_gen, newgen, n static u_int32_t lastpr = 0; u_int32_t now; - if (gbl_rep_badgen_trace && ((now = time(NULL)) - lastpr)) { + if (((now = time(NULL)) - lastpr)) { logmsg(LOGMSG_USER, "Ignoring rp->gen %u from %s mygen is %u, " "rectype=%u cnt %u\n", rp->gen, *eidp, gen, rp->rectype, rep->stat.st_msgs_badgen); @@ -1259,7 +1261,7 @@ __rep_process_message(dbenv, control, rec, eidp, ret_lsnp, commit_gen, newgen, n */ static u_int32_t lastpr = 0; u_int32_t now; - if (gbl_rep_badgen_trace && ((now = time(NULL)) - lastpr)) { + if (((now = time(NULL)) - lastpr)) { logmsg(LOGMSG_USER, "rp->gen %u from %s is larger than " "mygen %u, rectype=%u\n", rp->gen, *eidp, gen, rp->rectype); lastpr = now; @@ -1290,7 +1292,8 @@ __rep_process_message(dbenv, control, rec, eidp, ret_lsnp, commit_gen, newgen, n __db_err(dbenv, "Updating gen from %lu to %lu", (u_long)gen, (u_long)rp->gen); #endif - logmsg(LOGMSG_DEBUG, "%s line %d setting rep->gen to %d for rectype " + logmsg(LOGMSG_USER, + "%s line %d setting rep->gen to %d for rectype " "%d\n", __func__, __LINE__, rp->gen, rp->rectype); __rep_set_gen(dbenv, __func__, __LINE__, rp->gen); gen = rp->gen; @@ -1665,7 +1668,7 @@ __rep_process_message(dbenv, control, rec, eidp, ret_lsnp, commit_gen, newgen, n logmsg(LOGMSG_USER, "%s line %d continuing REP_ALL_REQ lsn " "%d:%d\n", __func__, __LINE__, lsn.file, lsn.offset); } - fromline = __LINE__; + fromline = __LINE__; } goto errlock; @@ -1774,8 +1777,8 @@ __rep_process_message(dbenv, control, rec, eidp, ret_lsnp, commit_gen, newgen, n "Unable to get prev of [%lu][%lu]", (u_long)lsn.file, (u_long)lsn.offset); - logmsg(LOGMSG_INFO, "%s:%d sending DB_REP_OUTDATED\n", - __func__, __LINE__); + logmsg(LOGMSG_INFO, "%s:%d sending DB_REP_OUTDATED\n", + __func__, __LINE__); ret = DB_REP_OUTDATED; /* Tell the replicant he's outdated. */ if (gbl_verbose_fills) { @@ -1783,9 +1786,9 @@ __rep_process_message(dbenv, control, rec, eidp, ret_lsnp, commit_gen, newgen, n "for LSN %d:%d\n", __func__, __LINE__, lsn.file, lsn.offset); } - logmsg(LOGMSG_INFO, "%s:%d log_c_get failed to find [%d:%d]" - " and [%d:%d]: REP_VERIFY_FAIL\n", __func__, __LINE__, - lsn.file, lsn.offset,endlsn.file, endlsn.offset); + logmsg(LOGMSG_INFO, "%s:%d log_c_get failed to find [%d:%d]" + " and [%d:%d]: REP_VERIFY_FAIL\n", __func__, __LINE__, + lsn.file, lsn.offset,endlsn.file, endlsn.offset); if ((resp_rc = __rep_time_send_message(dbenv, *eidp, REP_VERIFY_FAIL, &lsn, NULL, 0, NULL, &sendtime)) != 0 && gbl_verbose_fills) { @@ -2046,14 +2049,14 @@ __rep_process_message(dbenv, control, rec, eidp, ret_lsnp, commit_gen, newgen, n /* * Skip over any records recovery can write. */ - if ((match == 0 || !matchable_log_type(rectype)) && + if ((match == 0 || !matchable_log_type(dbenv, rectype)) && (ret = __log_c_get(logc, &lsn, &mylog, DB_PREV)) == 0) { match = 0; if (gbl_berkdb_verify_skip_skipables) { LOGCOPY_32(&rectype, mylog.data); normalize_rectype(&rectype); - while (!matchable_log_type(rectype) && (ret = + while (!matchable_log_type(dbenv, rectype) && (ret = __log_c_get(logc, &lsn, &mylog, DB_PREV)) == 0) { LOGCOPY_32(&rectype, mylog.data); @@ -2078,7 +2081,7 @@ __rep_process_message(dbenv, control, rec, eidp, ret_lsnp, commit_gen, newgen, n verify_req_print = now; } - assert(lsn.file > 0); + assert(lsn.file > 0); (void)__rep_send_message(dbenv, *eidp, REP_VERIFY_REQ, &lsn, NULL, 0, NULL); @@ -2114,7 +2117,7 @@ __rep_process_message(dbenv, control, rec, eidp, ret_lsnp, commit_gen, newgen, n verify_req_print = now; } - assert(lsn.file > 0); + assert(lsn.file > 0); (void)__rep_send_message(dbenv, *eidp, REP_VERIFY_REQ, &lsn, NULL, 0, NULL); @@ -2144,8 +2147,8 @@ __rep_process_message(dbenv, control, rec, eidp, ret_lsnp, commit_gen, newgen, n * the same environment and we'll say so. */ ret = DB_REP_OUTDATED; - logmsg(LOGMSG_INFO, "%s:%d returning DB_REP_OUTDATED\n", - __func__, __LINE__); + logmsg(LOGMSG_INFO, "%s:%d returning DB_REP_OUTDATED\n", + __func__, __LINE__); if (rp->lsn.file != 1) __db_err(dbenv, @@ -2175,8 +2178,8 @@ rep_verify_err:if ((t_ret = __log_c_close(logc)) != 0 && case REP_VERIFY_FAIL: rep->stat.st_outdated++; ret = DB_REP_OUTDATED; - logmsg(LOGMSG_INFO, "%s:%d returning DB_REP_OUTDATED\n", - __func__, __LINE__); + logmsg(LOGMSG_INFO, "%s:%d returning DB_REP_OUTDATED\n", + __func__, __LINE__); fromline = __LINE__; goto errlock; case REP_VERIFY_REQ: @@ -2213,10 +2216,10 @@ rep_verify_err:if ((t_ret = __log_c_close(logc)) != 0 && if (ret == DB_NOTFOUND && __log_is_outdated(dbenv, rp->lsn.file, &old) == 0 && old != 0) { - logmsg(LOGMSG_INFO, "%s rep_verify_req returning REP_VERIFY_FAIL " - "for [%d:%d]\n", __func__, rp->lsn.file, rp->lsn.offset); + logmsg(LOGMSG_INFO, "%s rep_verify_req returning REP_VERIFY_FAIL " + "for [%d:%d]\n", __func__, rp->lsn.file, rp->lsn.offset); type = REP_VERIFY_FAIL; - } + } if (ret != 0) d = NULL; @@ -2245,7 +2248,8 @@ rep_verify_err:if ((t_ret = __log_c_close(logc)) != 0 && R_LOCK(dbenv, &dblp->reginfo); lsn = lp->lsn; R_UNLOCK(dbenv, &dblp->reginfo); - logmsg(LOGMSG_DEBUG, "%s line %d sending REP_NEWMASTER\n", + logmsg(LOGMSG_USER, + "%s line %d sending REP_NEWMASTER\n", __func__, __LINE__); (void)__rep_send_message(dbenv, *eidp, REP_NEWMASTER, &lsn, NULL, 0, NULL); @@ -2262,7 +2266,8 @@ rep_verify_err:if ((t_ret = __log_c_close(logc)) != 0 && vi_nsites = vi->nsites; vi_priority = vi->priority; vi_tiebreaker = vi->tiebreaker; - logmsg(LOGMSG_DEBUG, "%s line %d processing REP_VOTE1 from %s gen %d egen %d my-egen is %d " + logmsg(LOGMSG_USER, + "%s line %d processing REP_VOTE1 from %s gen %d egen %d my-egen is %d " "(Setting write-gen to 0)\n", __func__, __LINE__, *eidp, rp->gen, vi_egen, rep->egen); } else { @@ -2274,7 +2279,8 @@ rep_verify_err:if ((t_ret = __log_c_close(logc)) != 0 && vi_nsites = vig->nsites; vi_priority = vig->priority; vi_tiebreaker = vig->tiebreaker; - logmsg(LOGMSG_DEBUG, "%s line %d processed REP_GEN_VOTE1 from %s gen %d egen %d my-egen is %d " + logmsg(LOGMSG_USER, + "%s line %d processed REP_GEN_VOTE1 from %s gen %d egen %d my-egen is %d " "(Setting write-gen to %d)\n", __func__, __LINE__, *eidp, rp->gen, vi_egen, rep->egen, vig->last_write_gen); } @@ -2288,13 +2294,15 @@ rep_verify_err:if ((t_ret = __log_c_close(logc)) != 0 && * start over by tallying it. */ if (vi_egen < rep->egen) { - logmsg(LOGMSG_DEBUG, "%s line %d ignoring %s from %s: it's egen is %d my-egen is %d\n", + logmsg(LOGMSG_USER, + "%s line %d ignoring %s from %s: it's egen is %d my-egen is %d\n", __func__, __LINE__, rp->rectype == REP_VOTE1 ? "REP_VOTE1" : "REP_GEN_VOTE1", *eidp, vi_egen, rep->egen); goto errunlock; } if (vi_egen > rep->egen) { - logmsg(LOGMSG_DEBUG, "%s line %d reseting election for %s from %s: it's egen is %d my-egen is %d\n", + logmsg(LOGMSG_USER, + "%s line %d reseting election for %s from %s: it's egen is %d my-egen is %d\n", __func__, __LINE__, rp->rectype == REP_VOTE1 ? "REP_VOTE1" : "REP_GEN_VOTE1", *eidp, vi_egen, rep->egen); __rep_elect_done(dbenv, rep, vi_egen, __func__, __LINE__); @@ -2412,7 +2420,8 @@ rep_verify_err:if ((t_ret = __log_c_close(logc)) != 0 && #endif egen = rep->egen; committed_gen = rep->committed_gen; - logmsg(LOGMSG_DEBUG, "%s line %d Setting PHASE2 clearing PHASE1\n", __func__, __LINE__); + logmsg(LOGMSG_USER, + "%s line %d Setting PHASE2 clearing PHASE1\n", __func__, __LINE__); F_SET(rep, REP_F_EPHASE2); F_CLR(rep, REP_F_EPHASE1); if (master == rep->eid) { @@ -2425,14 +2434,16 @@ rep_verify_err:if ((t_ret = __log_c_close(logc)) != 0 && /* Vote for someone else. */ if (dbenv->attr.elect_highest_committed_gen) { - logmsg(LOGMSG_DEBUG, "%s line %d sending REP_GEN_VOTE2 to %s " + logmsg(LOGMSG_USER, + "%s line %d sending REP_GEN_VOTE2 to %s " "with committed-gen=%d gen=%d egen=%d\n", __func__, __LINE__, master, committed_gen, rep->gen, egen); __rep_send_gen_vote(dbenv, NULL, 0, 0, 0, egen, committed_gen, master, REP_GEN_VOTE2); } else { - logmsg(LOGMSG_DEBUG, "%s line %d sending REP_VOTE2 to %s " + logmsg(LOGMSG_USER, + "%s line %d sending REP_VOTE2 to %s " "(committed-gen=0) gen=%d egen=%d\n", __func__, __LINE__, master, rep->gen, egen); __rep_send_vote(dbenv, NULL, 0, 0, 0, egen, @@ -2470,7 +2481,8 @@ rep_verify_err:if ((t_ret = __log_c_close(logc)) != 0 && vi_nsites = vi->nsites; vi_priority = vi->priority; vi_tiebreaker = vi->tiebreaker; - logmsg(LOGMSG_DEBUG, "%s line %d processing REP_VOTE2 from %s gen %d egen %d my-egen is %d\n", + logmsg(LOGMSG_USER, + "%s line %d processing REP_VOTE2 from %s gen %d egen %d my-egen is %d\n", __func__, __LINE__, *eidp, rp->gen, vi_egen, rep->egen); } else { vig = (REP_GEN_VOTE_INFO *) rec->data; @@ -2481,12 +2493,14 @@ rep_verify_err:if ((t_ret = __log_c_close(logc)) != 0 && vi_nsites = vig->nsites; vi_priority = vig->priority; vi_tiebreaker = vig->tiebreaker; - logmsg(LOGMSG_DEBUG, "%s line %d processing REP_GEN_VOTE2 from %s gen %d egen %d my-egen is %d\n", + logmsg(LOGMSG_USER, + "%s line %d processing REP_GEN_VOTE2 from %s gen %d egen %d my-egen is %d\n", __func__, __LINE__, *eidp, rp->gen, vi_egen, rep->egen); } if (!IN_ELECTION_TALLY(rep) && vi_egen > rep->egen) { - logmsg(LOGMSG_DEBUG, "%s line %d not in election and vote2-egen %d " + logmsg(LOGMSG_USER, + "%s line %d not in election and vote2-egen %d " "> rep->egen (%d): returning HOLDELECTION\n", __func__, __LINE__, vi_egen, rep->egen); ret = DB_REP_HOLDELECTION; @@ -2529,7 +2543,8 @@ rep_verify_err:if ((t_ret = __log_c_close(logc)) != 0 && __db_err(dbenv, "Counted vote %d", rep->votes); #endif if (done) { - logmsg(LOGMSG_USER, "%s line %d elected master %s for egen %d\n", + logmsg(LOGMSG_USER, + "%s line %d elected master %s for egen %d\n", __func__, __LINE__, rep->eid, vi_egen); __rep_elect_master(dbenv, rep, eidp); if (newgen) *newgen = vi_egen; @@ -3726,6 +3741,7 @@ gap_check: max_lsn_dbtp = NULL; DB_TXN_APPLY, NULL); } break; + case DB___txn_ckp_recovery: case DB___txn_ckp: #if 0 LOGCOPY_TOLSN(&ckp_lsn, (u_int8_t *) rec->data + @@ -3771,12 +3787,12 @@ gap_check: max_lsn_dbtp = NULL; !(rand() % gbl_slow_rep_process_txn_freq)) { - if (gbl_slow_rep_process_txn_maxms <= gbl_slow_rep_process_txn_minms) { - gbl_slow_rep_process_txn_maxms = gbl_slow_rep_process_txn_minms + 1; - } - int range = gbl_slow_rep_process_txn_maxms - gbl_slow_rep_process_txn_minms; - int polltime = gbl_slow_rep_process_txn_minms + (rand() % range); - logmsg(LOGMSG_DEBUG, "%s polling an additional %d ms\n", __func__, polltime); + if (gbl_slow_rep_process_txn_maxms <= gbl_slow_rep_process_txn_minms) { + gbl_slow_rep_process_txn_maxms = gbl_slow_rep_process_txn_minms + 1; + } + int range = gbl_slow_rep_process_txn_maxms - gbl_slow_rep_process_txn_minms; + int polltime = gbl_slow_rep_process_txn_minms + (rand() % range); + logmsg(LOGMSG_DEBUG, "%s polling an additional %d ms\n", __func__, polltime); poll(0, 0, polltime); } } @@ -3952,10 +3968,10 @@ __rep_apply(dbenv, rp, rec, ret_lsnp, commit_gen, decoupled) int rc, now; bbtime_t start = {0}, end = {0}; - int debug_switch_replicant_latency(void); - if (debug_switch_replicant_latency() && !(time(NULL) % 4)) { - return 0; - } + int debug_switch_replicant_latency(void); + if (debug_switch_replicant_latency() && !(time(NULL) % 4)) { + return 0; + } Pthread_mutex_lock(&apply_lk); getbbtime(&start); @@ -4005,7 +4021,7 @@ int __dbenv_apply_log(DB_ENV* dbenv, unsigned int file, unsigned int offset, /* call with decoupled = 2 to differentiate from true master */ int ret = __rep_apply(dbenv, &rp, &rec, &ret_lsnp, - (gbl_is_physical_replicant) ? &rep->log_gen : &rep->gen, 2); + (gbl_is_physical_replicant) ? &rep->log_gen : &rep->gen, 2); if (ret == 0 || ret == DB_REP_ISPERM) { bdb_set_seqnum(dbenv->app_private); @@ -6899,9 +6915,9 @@ __rep_cmp_vote(dbenv, rep, eidp, egen, lsnp, priority, gen, committed_gen, tiebr * LSN is primary determinant. Then priority if LSNs * are equal, then tiebreaker if both are equal. */ - if (cmp > 0 || - (cmp == 0 && (priority > rep->w_priority || - (priority == rep->w_priority && + if (cmp > 0 || + (cmp == 0 && (priority > rep->w_priority || + (priority == rep->w_priority && (tiebreaker > rep->w_tiebreaker))))) { #ifdef DIAGNOSTIC if (FLD_ISSET(dbenv->verbose, DB_VERB_REPLICATION)) @@ -7224,7 +7240,7 @@ __rep_dorecovery(dbenv, lsnp, trunclsnp, online, undid_schema_change) if (ret) goto err; - } + } if (rectype == DB___txn_regop_gen) { if ((ret = __txn_regop_gen_read(dbenv, mylog.data, @@ -7326,13 +7342,13 @@ __rep_dorecovery(dbenv, lsnp, trunclsnp, online, undid_schema_change) } if (have_recover_lk) { - dbenv->unlock_recovery_lock(dbenv, __func__, __LINE__); + dbenv->unlock_recovery_lock(dbenv, __func__, __LINE__); } - if (logc_dist != NULL) { - __log_c_close(logc_dist); - logc_dist = NULL; - } + if (logc_dist != NULL) { + __log_c_close(logc_dist); + logc_dist = NULL; + } if ((t_ret = __log_c_close(logc)) != 0 && ret == 0) ret = t_ret; @@ -8142,9 +8158,9 @@ __truncate_repdb(dbenv) } if ((!F_ISSET(rep, REP_ISCLIENT) && !gbl_is_physical_replicant) || !db_rep->rep_db) { - logmsg(LOGMSG_FATAL, "%s:%d returning DB_NOTFOUND\n", __func__, __LINE__); + logmsg(LOGMSG_FATAL, "%s:%d returning DB_NOTFOUND\n", __func__, __LINE__); return DB_NOTFOUND; - } + } MUTEX_LOCK(dbenv, db_rep->db_mutexp); diff --git a/berkdb/rep/rep_util.c b/berkdb/rep/rep_util.c index b34597e54c..b2ba99c5cb 100644 --- a/berkdb/rep/rep_util.c +++ b/berkdb/rep/rep_util.c @@ -222,8 +222,8 @@ __rep_send_message(dbenv, eid, rtype, lsnp, dbtp, flags, usr_ptr) memcpy(&rectype, dbtp->data, sizeof(rectype)); normalize_rectype(&rectype); if (rectype == DB___txn_regop || rectype == DB___txn_regop_gen - || rectype == DB___txn_ckp || rectype == DB___txn_dist_commit || - rectype == DB___txn_regop_rowlocks) + || rectype == DB___txn_ckp || rectype == DB___txn_ckp_recovery || + rectype == DB___txn_dist_commit || rectype == DB___txn_regop_rowlocks) F_SET(&cntrl, DB_LOG_PERM); } @@ -407,7 +407,8 @@ __rep_set_gen(dbenv, func, line, gen) egen = rep->egen; if (rep->egen <= gen) egen = gen + 1; - logmsg(LOGMSG_DEBUG, "%s line %d setting rep->gen from %d to %d, egen from %d to %d\n", + logmsg(LOGMSG_USER, + "%s line %d setting rep->gen from %d to %d, egen from %d to %d\n", func, line, rep->gen, gen, rep->egen, egen); rep->gen = gen; rep->egen = egen; @@ -446,13 +447,13 @@ __rep_set_egen(dbenv, func, line, egen) REP *rep; db_rep = dbenv->rep_handle; rep = db_rep->region; - logmsg(LOGMSG_DEBUG, "%s line %d setting rep->egen from %d to %d\n", + logmsg(LOGMSG_USER, "%s line %d setting rep->egen from %d to %d\n", func, line, rep->egen, egen); rep->egen = egen; } /* - * __rep_set_loh_gen -- + * __rep_set_log_gen -- * Called as a utility function to see places where an instance's * replication log generation can be changed. * @@ -470,7 +471,7 @@ __rep_set_log_gen(dbenv, func, line, log_gen) REP *rep; db_rep = dbenv->rep_handle; rep = db_rep->region; - logmsg(LOGMSG_DEBUG, "%s line %d setting rep->log_gen from %d to %d\n", + logmsg(LOGMSG_USER, "%s line %d setting rep->log_gen from %d to %d\n", func, line, rep->log_gen, log_gen); rep->log_gen = log_gen; } diff --git a/berkdb/txn/txn.c b/berkdb/txn/txn.c index 74a89c6906..9ed7711627 100644 --- a/berkdb/txn/txn.c +++ b/berkdb/txn/txn.c @@ -93,6 +93,10 @@ void ctrace(char *format, ...); int __txn_commit_map_add_nolock(DB_ENV *, u_int64_t, DB_LSN); extern int gbl_is_physical_replicant; +extern int gbl_fullrecovery; +extern int gbl_recovery_gen; +extern int gbl_reproduce_ckp_bug; +int gbl_recovery_ckp = 1; extern int get_commit_lsn_map_switch_value(); #else @@ -1282,6 +1286,7 @@ __txn_commit_int(txnp, flags, ltranid, llid, last_commit_lsn, rlocks, inlks, abort(); } } else { + logmsg(LOGMSG_USER, "%s emitting regop-rowlocks record with generation %u\n", __func__, gen); ret = __txn_regop_rowlocks_log(dbenv, txnp, lsn_out, &context, lflags, @@ -2992,8 +2997,19 @@ __txn_checkpoint(dbenv, kbytes, minutes, flags) return (0); } - if (REP_ON(dbenv)) + if (REP_ON(dbenv)) { __rep_get_gen(dbenv, &gen); + if (gbl_fullrecovery && gen > gbl_recovery_gen) { + if (gbl_reproduce_ckp_bug) { + logmsg(LOGMSG_USER, "Reproducing ckp-bug by writing larger-generation ckp: %d vs %d\n", gbl_recovery_gen, gen); + + } else { + logmsg(LOGMSG_FATAL, "txn_checkpoint: writing larger gen-checkpoint in full-recovery mode, recovery_gen=%u, gen=%u\n", + gbl_recovery_gen, gen); + abort(); + } + } + } /* Get the fq-lock now to preserve our locking order */ dblp = dbenv->lg_handle; @@ -3046,8 +3062,10 @@ __txn_checkpoint(dbenv, kbytes, minutes, flags) max_utxnid = dbenv->next_utxnid; Pthread_mutex_unlock(&dbenv->utxnid_lock); + u_int32_t rectype = ((gbl_fullrecovery || LF_ISSET(DB_RECOVERY_CKP)) && + gbl_recovery_ckp) ? DB___txn_ckp_recovery : DB___txn_ckp; if ((ret = __dbreg_open_files_checkpoint(dbenv)) != 0 || - (ret = __txn_ckp_log(dbenv, NULL, &ckp_lsn, + (ret = __txn_ckp_log(dbenv, NULL, rectype, &ckp_lsn, DB_FLUSH |DB_LOG_PERM |DB_LOG_CHKPNT | DB_LOG_DONT_LOCK, &ckp_lsn, &last_ckp, timestamp, gen, max_utxnid)) != 0) { diff --git a/berkdb/txn/txn_auto.c b/berkdb/txn/txn_auto.c index 7c3d9b4bc8..6eb7531ee8 100644 --- a/berkdb/txn/txn_auto.c +++ b/berkdb/txn/txn_auto.c @@ -525,14 +525,15 @@ __txn_regop_read_context(argp) } /* - * PUBLIC: int __txn_ckp_log __P((DB_ENV *, DB_TXN *, DB_LSN *, + * PUBLIC: int __txn_ckp_log __P((DB_ENV *, DB_TXN *, rectype, DB_LSN *, * PUBLIC: u_int32_t, DB_LSN *, DB_LSN *, int32_t, u_int32_t)); */ int -__txn_ckp_log(dbenv, txnid, ret_lsnp, flags, +__txn_ckp_log(dbenv, txnid, rectype, ret_lsnp, flags, ckp_lsn, last_ckp, timestamp, rep_gen, max_utxnid) DB_ENV *dbenv; DB_TXN *txnid; + u_int32_t rectype; DB_LSN *ret_lsnp; u_int32_t flags; DB_LSN * ckp_lsn; @@ -544,7 +545,7 @@ __txn_ckp_log(dbenv, txnid, ret_lsnp, flags, DBT logrec; DB_TXNLOGREC *lr; DB_LSN *lsnp, null_lsn; - u_int32_t uinttmp, rectype, txn_num; + u_int32_t uinttmp, txn_num; u_int64_t uint64tmp, txn_unum; u_int npad; u_int8_t *bp; @@ -561,7 +562,6 @@ __txn_ckp_log(dbenv, txnid, ret_lsnp, flags, fprintf(stderr,"__txn_ckp_log: begin\n"); #endif - rectype = DB___txn_ckp; if (utxnid_log) { rectype += 2000; } @@ -836,7 +836,7 @@ __txn_ckp_read_int(dbenv, recbuf, do_pgswp, argpp) LOGCOPY_TOLSN(&argp->prev_lsn, bp); bp += sizeof(DB_LSN); - if (argp->type == DB___txn_ckp + 2000) { + if (argp->type == DB___txn_ckp + 2000 || argp->type == DB___txn_ckp_recovery + 2000) { LOGCOPY_64(&argp->txnid->utxnid, bp); bp += sizeof(argp->txnid->utxnid); } else { @@ -857,7 +857,7 @@ __txn_ckp_read_int(dbenv, recbuf, do_pgswp, argpp) argp->rep_gen = (u_int32_t)uinttmp; bp += sizeof(uinttmp); - if (argp->type == DB___txn_ckp + 2000) { + if (argp->type == DB___txn_ckp + 2000 || argp->type == DB___txn_ckp_recovery + 2000) { LOGCOPY_64(&argp->max_utxnid, bp); bp += sizeof(argp->max_utxnid); } else { @@ -888,16 +888,20 @@ __txn_ckp_print(dbenv, dbtp, lsnp, notused2, notused3) if ((ret = __txn_ckp_read_int(dbenv, dbtp->data, 0, &argp)) != 0) return (ret); + int type = argp->type; + if (type > 2000) + type -= 2000; (void)printf( - "[%lu][%lu]__txn_ckp%s: rec: %lu txnid %lx prevlsn [%lu][%lu] utxnid \%"PRIx64"\n", - (u_long)lsnp->file, - (u_long)lsnp->offset, - (argp->type & DB_debug_FLAG) ? "_debug" : "", - (u_long)argp->type, - (u_long)argp->txnid->txnid, - (u_long)argp->prev_lsn.file, - (u_long)argp->prev_lsn.offset, - argp->txnid->utxnid); + "[%lu][%lu]%s%s: rec: %lu txnid %lx prevlsn [%lu][%lu] utxnid \%"PRIx64"\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (type == DB___txn_ckp_recovery) ? "__txn_ckp_recovery" : "__txn_ckp", + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset, + argp->txnid->utxnid); (void)printf("\tckp_lsn: [%lu][%lu]\n", (u_long)argp->ckp_lsn.file, (u_long)argp->ckp_lsn.offset); fflush(stdout); @@ -4458,6 +4462,9 @@ __txn_init_print(dbenv, dtabp, dtabsizep) if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, __txn_dist_abort_print, DB___txn_dist_abort)) != 0) return (ret); + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __txn_ckp_print, DB___txn_ckp_recovery)) != 0) + return (ret); return (0); } @@ -4505,6 +4512,9 @@ __txn_init_getpgnos(dbenv, dtabp, dtabsizep) if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, __txn_dist_commit_getpgnos, DB___txn_dist_commit)) != 0) return (ret); + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __txn_ckp_getpgnos, DB___txn_ckp_recovery)) != 0) + return (ret); return (0); } @@ -4554,6 +4564,9 @@ __txn_init_getallpgnos(dbenv, dtabp, dtabsizep) if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, __txn_dist_commit_getallpgnos, DB___txn_dist_commit)) != 0) return (ret); + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __txn_ckp_getallpgnos, DB___txn_ckp_recovery)) != 0) + return (ret); return (0); } @@ -4601,5 +4614,8 @@ __txn_init_recover(dbenv, dtabp, dtabsizep) if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, __txn_dist_abort_recover, DB___txn_dist_abort)) != 0) return (ret); + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __txn_ckp_recover, DB___txn_ckp_recovery)) != 0) + return (ret); return (0); } diff --git a/berkdb/txn/txn_rec.c b/berkdb/txn/txn_rec.c index e8cd085173..d48998f098 100644 --- a/berkdb/txn/txn_rec.c +++ b/berkdb/txn/txn_rec.c @@ -57,6 +57,10 @@ static const char revid[] = "$Id: txn_rec.c,v 11.54 2003/10/31 23:26:11 ubell Ex #define IS_XA_TXN(R) (R->xid.size != 0) +extern int gbl_recovery_ckp; +int gbl_retrieve_gen_from_ckp = 1; +int gbl_recovery_gen = 0; + int set_commit_context(unsigned long long context, uint32_t *generation, void *plsn, void *args, unsigned int rectype); @@ -201,8 +205,11 @@ __txn_dist_commit_recover(dbenv, dbtp, lsnp, op, info) MUTEX_LOCK(dbenv, db_rep->rep_mutexp); rep->committed_gen = argp->generation; rep->committed_lsn = *lsnp; - if (argp->generation > rep->gen) + if (argp->generation > rep->gen) { __rep_set_gen(dbenv, __func__, __LINE__, argp->generation); + __rep_set_log_gen(dbenv, __func__, __LINE__, rep->gen); + gbl_recovery_gen = rep->gen; + } MUTEX_UNLOCK(dbenv, db_rep->rep_mutexp); #if defined (DEBUG_PREPARE) logmsg(LOGMSG_USER, "%s op %d removed %s from txnlist\n", @@ -371,8 +378,11 @@ __txn_dist_prepare_recover(dbenv, dbtp, lsnp, op, info) (argp->generation == rep->committed_gen && log_compare(lsnp, &rep->committed_lsn) > 0)) { rep->committed_gen = argp->generation; rep->committed_lsn = *lsnp; - if (argp->generation > rep->gen) + if (argp->generation > rep->gen) { __rep_set_gen(dbenv, __func__, __LINE__, argp->generation); + __rep_set_log_gen(dbenv, __func__, __LINE__, rep->gen); + gbl_recovery_gen = rep->gen; + } } MUTEX_UNLOCK(dbenv, db_rep->rep_mutexp); @@ -464,6 +474,9 @@ __txn_regop_gen_recover(dbenv, dbtp, lsnp, op, info) if (argp->generation > rep->gen) { __rep_set_gen(dbenv, __func__, __LINE__, argp->generation); __rep_set_log_gen(dbenv, __func__, __LINE__, rep->gen); + gbl_recovery_gen = rep->gen; + } else { + logmsg(LOGMSG_USER, "%s line %d: rep->gen is %u, not setting to %u\n", __func__, __LINE__, rep->gen, argp->generation); } MUTEX_UNLOCK(dbenv, db_rep->rep_mutexp); } else if ((dbenv->tx_timestamp != 0 && @@ -791,7 +804,10 @@ __txn_regop_rowlocks_recover(dbenv, dbtp, lsnp, op, info) rep->committed_lsn = *lsnp; if (argp->generation > rep->gen) { __rep_set_gen(dbenv, __func__, __LINE__, argp->generation); - __rep_set_gen(dbenv, __func__, __LINE__, rep->gen); + __rep_set_log_gen(dbenv, __func__, __LINE__, rep->gen); + gbl_recovery_gen = rep->gen; + } else { + logmsg(LOGMSG_USER, "%s line %d: rep->gen is %u, not setting to %u\n", __func__, __LINE__, rep->gen, argp->generation); } MUTEX_UNLOCK(dbenv, db_rep->rep_mutexp); } @@ -858,8 +874,8 @@ __txn_regop_rowlocks_recover(dbenv, dbtp, lsnp, op, info) if (NULL == lt) { if((ret = __txn_create_ltrans(dbenv, argp->ltranid, - <, lsnp, &argp->begin_lsn, - &argp->last_commit_lsn)) != 0) + <, lsnp, &argp->begin_lsn, + &argp->last_commit_lsn)) != 0) goto err; } @@ -868,13 +884,13 @@ __txn_regop_rowlocks_recover(dbenv, dbtp, lsnp, op, info) if (argp->lflags & DB_TXN_LOGICAL_BEGIN) { if (NULL == dbenv->txn_logical_commit || - (ret = dbenv->txn_logical_commit(dbenv, dbenv->app_private, - lt->ltranid, lsnp)) != 0) + (ret = dbenv->txn_logical_commit(dbenv, dbenv->app_private, + lt->ltranid, lsnp)) != 0) { logmsg(LOGMSG_ERROR, "%s: txn_logical_commit error, %d\n", __func__, ret); goto err; } - + __txn_deallocate_ltrans(dbenv, lt); } } @@ -1067,10 +1083,28 @@ __txn_ckp_recover(dbenv, dbtp, lsnp, op, info) } if (op == DB_TXN_FORWARD_ROLL) { - /* Record the max generation number that we've seen. */ if (REP_ON(dbenv)) { db_rep = dbenv->rep_handle; rep = db_rep->region; + + /* It is okay to use txn_ckp for voting after recovery-gen is enabled */ + if (gbl_retrieve_gen_from_ckp && gbl_recovery_ckp && argp->type == DB___txn_ckp) { + rep->committed_gen = argp->rep_gen; + rep->committed_lsn = *lsnp; + } + + if (argp->rep_gen > rep->gen) { + /* Tunable because without txn_ckp_recovery, a normal ckp can cause + * a fresh full-recovered database to become master incorrectly */ + if (gbl_retrieve_gen_from_ckp) { + __rep_set_gen(dbenv, __func__, __LINE__, argp->rep_gen); + __rep_set_log_gen(dbenv, __func__, __LINE__, rep->gen); + gbl_recovery_gen = rep->gen; + } + } else { + logmsg(LOGMSG_USER, "%s line %d: rep->gen is %u, not setting to %u\n", __func__, __LINE__, rep->gen, argp->rep_gen); + } + if (argp->rep_gen > rep->recover_gen) rep->recover_gen = argp->rep_gen; } diff --git a/berkdb/txn/txn_region.c b/berkdb/txn/txn_region.c index ab61d8fb6c..e135aeb92e 100644 --- a/berkdb/txn/txn_region.c +++ b/berkdb/txn/txn_region.c @@ -233,7 +233,7 @@ __txn_findlastckp(dbenv, lsnp) continue; LOGCOPY_32(&rectype, dbt.data); normalize_rectype(&rectype); - if (rectype == DB___txn_ckp) { + if (rectype == DB___txn_ckp || rectype == DB___txn_ckp_recovery) { *lsnp = lsn; break; } diff --git a/berkdb/txn/txn_stat.c b/berkdb/txn/txn_stat.c index a087aae44c..5a25118499 100644 --- a/berkdb/txn/txn_stat.c +++ b/berkdb/txn/txn_stat.c @@ -173,7 +173,7 @@ __txn_stat(dbenv, statp, flags) if (!ret) { LOGCOPY_32(&type, dbt.data); normalize_rectype(&type); - if (type == DB___txn_ckp) { + if (type == DB___txn_ckp || type == DB___txn_ckp_recovery) { ret = __txn_ckp_read(dbenv, dbt.data, &ckp); if (ret == 0) { stats->st_ckp_lsn = ckp->ckp_lsn; diff --git a/db/config.c b/db/config.c index 5656ec6e88..0d6d84eb74 100644 --- a/db/config.c +++ b/db/config.c @@ -390,6 +390,7 @@ void clear_deferred_options(void) } } +// clang-format off static char *legacy_options[] = { "allow_negative_column_size", "berkattr elect_highest_committed_gen 0", @@ -444,7 +445,11 @@ static char *legacy_options[] = { "wal_osync 1", "usenames", "setattr max_sql_idle_time 864000", + "retrieve_gen_from_ckp 0", + "recovery_ckp 0", }; +// clang-format on + int gbl_legacy_defaults = 0; int pre_read_legacy_defaults(void *_, void *__) { diff --git a/db/db_tunables.c b/db/db_tunables.c index 16200d39ce..970fc7d480 100644 --- a/db/db_tunables.c +++ b/db/db_tunables.c @@ -139,6 +139,9 @@ extern int gbl_return_long_column_names; extern int gbl_round_robin_stripes; extern int skip_clear_queue_extents; extern int gbl_rep_skip_recovery; +extern int gbl_retrieve_gen_from_ckp; +extern int gbl_recovery_ckp; +extern int gbl_reproduce_ckp_bug; extern int gbl_sample_queries; extern int gbl_sample_queries_max_queries; extern int gbl_slow_rep_process_txn_freq; diff --git a/db/db_tunables.h b/db/db_tunables.h index 4a52c1a0b1..859811a2ee 100644 --- a/db/db_tunables.h +++ b/db/db_tunables.h @@ -1049,6 +1049,13 @@ REGISTER_TUNABLE("rep_process_txn_trace", NULL, NULL, NULL, NULL); REGISTER_TUNABLE("rep_skip_recovery", "Skip recovery if truncate won't unwind a transaction. (Default: off)", TUNABLE_BOOLEAN, &gbl_rep_skip_recovery, 0, NULL, NULL, NULL, NULL); +/* 'retrieve_gen_from_ckp' / 'recovery_ckp' disabled under legacy_defaults until db moves */ +REGISTER_TUNABLE("retrieve_gen_from_ckp", "Retrieve generation from ckp records. (Default: on)", TUNABLE_BOOLEAN, + &gbl_retrieve_gen_from_ckp, 0, NULL, NULL, NULL, NULL); +REGISTER_TUNABLE("recovery_ckp", "Emit a non-matchable ckp during recovery. (Default: on)", TUNABLE_BOOLEAN, + &gbl_recovery_ckp, 0, NULL, NULL, NULL, NULL); +REGISTER_TUNABLE("reproduce_ckp_bug", "Allow full-recovery ckp-gen to exceed cluster generation. (Default: off)", + TUNABLE_BOOLEAN, &gbl_reproduce_ckp_bug, 0, NULL, NULL, NULL, NULL); REGISTER_TUNABLE("reqldiffstat", NULL, TUNABLE_INTEGER, &diffstat_thresh, READONLY, NULL, NULL, NULL, NULL); REGISTER_TUNABLE("reqltruncate", NULL, TUNABLE_INTEGER, &reqltruncate, READONLY, NULL, NULL, NULL, NULL); @@ -1770,15 +1777,12 @@ REGISTER_TUNABLE("disable_tpsc_tblvers", TUNABLE_BOOLEAN, &gbl_disable_tpsc_tblvers, NOARG, NULL, NULL, NULL, NULL); -REGISTER_TUNABLE("abort_irregular_set_durable_lsn", - "Abort incorrect calls to set_durable_lsn. (Default: off)", - TUNABLE_BOOLEAN, &gbl_abort_irregular_set_durable_lsn, - EXPERIMENTAL | INTERNAL, NULL, NULL, NULL, NULL); +REGISTER_TUNABLE("abort_irregular_set_durable_lsn", "Abort incorrect calls to set_durable_lsn. (Default: off)", + TUNABLE_BOOLEAN, &gbl_abort_irregular_set_durable_lsn, EXPERIMENTAL | INTERNAL, NULL, NULL, NULL, + NULL); -REGISTER_TUNABLE("instrument_dblist", - "Extended dblist-trace in berkley. (Default: off)", - TUNABLE_BOOLEAN, &gbl_instrument_dblist, - READONLY | EXPERIMENTAL | INTERNAL, NULL, NULL, NULL, NULL); +REGISTER_TUNABLE("instrument_dblist", "Extended dblist-trace in berkley. (Default: off)", TUNABLE_BOOLEAN, + &gbl_instrument_dblist, READONLY | EXPERIMENTAL | INTERNAL, NULL, NULL, NULL, NULL); REGISTER_TUNABLE("match_on_ckp", "Allow rep_verify_match on ckp records. (Default: on)", TUNABLE_BOOLEAN, &gbl_match_on_ckp, EXPERIMENTAL | INTERNAL, NULL, NULL, NULL, NULL); diff --git a/db/truncate_log.c b/db/truncate_log.c index b1e975a99f..b4bab993f6 100644 --- a/db/truncate_log.c +++ b/db/truncate_log.c @@ -8,7 +8,6 @@ extern int gbl_physrep_debug; extern struct dbenv *thedb; -extern int gbl_match_on_ckp; LOG_INFO find_match_lsn(void *bdb_state, cdb2_hndl_tp *repl_db, LOG_INFO start_info); diff --git a/sqlite/ext/comdb2/tranlog.c b/sqlite/ext/comdb2/tranlog.c index c6af8cd4c4..7bfe734fb2 100644 --- a/sqlite/ext/comdb2/tranlog.c +++ b/sqlite/ext/comdb2/tranlog.c @@ -532,9 +532,10 @@ u_int64_t get_timestamp_from_matchable_record(char *data) return get_timestamp_from_regop_record(data); } - if (rectype == DB___txn_ckp || (rectype == DB___txn_ckp+2000)) { - return get_timestamp_from_ckp_record(data); - } + if (rectype == DB___txn_ckp || (rectype == DB___txn_ckp+2000) || + rectype == DB___txn_ckp_recovery || (rectype == DB___txn_ckp_recovery+2000)) { + return get_timestamp_from_ckp_record(data); + } return -1; } @@ -577,7 +578,7 @@ static int tranlogColumn( case TRANLOG_COLUMN_MAXUTXNID: if (pCur->data.data) { LOGCOPY_32(&rectype, pCur->data.data); - if (rectype == DB___txn_ckp+2000) { + if (rectype == DB___txn_ckp+2000 || rectype == DB___txn_ckp_recovery+2000) { LOGCOPY_64(&maxutxnid, &((char*)pCur->data.data)[4 + 4 + 8 + 8 + 8 + 8 + 4 + 4]); sqlite3_result_int64(ctx, maxutxnid); break; @@ -632,7 +633,7 @@ static int tranlogColumn( generation = get_generation_from_regop_rowlocks_record(pCur->data.data); } - if (rectype == DB___txn_ckp) { + if (rectype == DB___txn_ckp || rectype == DB___txn_ckp_recovery) { generation = get_generation_from_ckp_record(pCur->data.data); } @@ -681,9 +682,10 @@ static int tranlogColumn( timestamp = get_timestamp_from_regop_record(pCur->data.data); } - if (rectype == DB___txn_ckp || (rectype == DB___txn_ckp+2000)) { - timestamp = get_timestamp_from_ckp_record(pCur->data.data); - } + if (rectype == DB___txn_ckp || (rectype == DB___txn_ckp+2000) || + rectype == DB___txn_ckp_recovery || (rectype == DB___txn_ckp_recovery+2000)) { + timestamp = get_timestamp_from_ckp_record(pCur->data.data); + } if (timestamp > 0) { sqlite3_result_int64(ctx, timestamp); diff --git a/tests/nogen_full_recovery.test/README b/tests/nogen_full_recovery.test/README index b35012bbf4..acd7ffac42 100644 --- a/tests/nogen_full_recovery.test/README +++ b/tests/nogen_full_recovery.test/README @@ -5,7 +5,24 @@ disabled. We believe that there are cases under the original algorithm where full-recovery can push the end of the transaction-log beyond that of the cluster, which may again incorrectly bias it towards winning an election. -There are 3 different versions of this test: +Original design of elect-highest-committed-gen did not consider the effects (or +include any tests) for full-recovery. The phy-rep/lcl-rep combination databases +demonstrated that a fully-recovered db can coerce the cluster into unwinding +committed data. This was because full-recovery called 'open_dbs' +transactionally, which produced a regop_gen record which can be higher than the +cluster's gen, which allowed it to incorrectly win an election. A similar +problem exists for checkpoint records: the issue in that case was that +fullrecovery can produce a checkpoint record which could also produced by the +cluster master- meaning that replicants can match against records which were not +produced by the cluster master for that generation. + +In order to work correctly with elect-highest-committed-gen, full-recovery must +not emit any matchable records, and it must not increment the generation. The +database instead will issue 'txn_regop' records, and 'tkn_ckp_recover' records +rather than 'txn_regop_gen' / 'txn_ckp'. Neither txn_regop nor txn_ckp_recover +records are used for voting or matching. + +Versions of this test: The default (no-testopts) version has elect-highest-committed-gen enabled, and demonstrated the original bug. It shows how full-recovery writes a regop-gen @@ -19,7 +36,9 @@ highest-committed-gen is disabled. This is the original issue that this tunable was intended to fix. This version of the test requires that the master node downgrade just as the fully-recovered node rejoins the cluster. The fully-recovered node should be guaranteed to win the election because -full-recovery has pushed it's LSN further than the cluster's LSN. +full-recovery has pushed it's LSN further than the cluster's LSN. While this +version of the test serves as proof that we can lose data, it is not reliable- +it will not always lose data- so I have disabled it. The 'enableddowngrade' version of this test shows that elect-highest-committed-gen will not allow the fully-recovered node to win the @@ -27,3 +46,15 @@ election despite the fact that it's LSN is larger. Rather, the election allows a node which has the highest written generation to win the election. So data which has replicated to a majority is not lost. This is the correct behavior. +The 'ckpbug' version of this test shows that even if fullrecovery emits non- +matchable regop records, the same issue can happen if we allow checkpoint +records to be considered for voting (committed-gen), and the checkpoint is +written at a higher generation than the cluster's generation. As with the +'disable' version of the test, this serves as proof that we can lose data, but +is not reliable, and is disabled. + +Finally, the 'rollout' version of the test demonstrates that we usually work +correctly with retrieve_gen_from_ckp and recovery_ckp disabled, even though it +is incorrect for recovery to produce any matchable record. 'recovery_ckp' +introduces a new log-record type, and must therefore be disabled until the code +has moved everywhere. diff --git a/tests/nogen_full_recovery.test/ckpbug.testopts.disabled b/tests/nogen_full_recovery.test/ckpbug.testopts.disabled new file mode 100644 index 0000000000..a427ac5e31 --- /dev/null +++ b/tests/nogen_full_recovery.test/ckpbug.testopts.disabled @@ -0,0 +1,2 @@ +reproduce_ckp_bug 1 +recovery_ckp 0 diff --git a/tests/nogen_full_recovery.test/disable.testopts b/tests/nogen_full_recovery.test/disable.testopts.disabled similarity index 100% rename from tests/nogen_full_recovery.test/disable.testopts rename to tests/nogen_full_recovery.test/disable.testopts.disabled diff --git a/tests/nogen_full_recovery.test/lrl.options b/tests/nogen_full_recovery.test/lrl.options index a7c05a7289..00bfb723aa 100644 --- a/tests/nogen_full_recovery.test/lrl.options +++ b/tests/nogen_full_recovery.test/lrl.options @@ -1,3 +1,4 @@ nowatch logmsg level info berkattr elect_highest_committed_gen 1 +debug_election 1 diff --git a/tests/nogen_full_recovery.test/rollout.testopts b/tests/nogen_full_recovery.test/rollout.testopts new file mode 100644 index 0000000000..4657abece2 --- /dev/null +++ b/tests/nogen_full_recovery.test/rollout.testopts @@ -0,0 +1,3 @@ +# Active options while rolling out code +retrieve_gen_from_ckp 0 +recovery_ckp 0 diff --git a/tests/nogen_full_recovery.test/runit b/tests/nogen_full_recovery.test/runit index b666f5e49e..c8fa8db000 100755 --- a/tests/nogen_full_recovery.test/runit +++ b/tests/nogen_full_recovery.test/runit @@ -9,6 +9,161 @@ export debug=1 . ${TESTSROOTDIR}/tools/ddl.sh . ${TESTSROOTDIR}/tools/runit_common.sh +# Create-time will be generation 2 +# txn_regop_gen (with generation) records : rectype 2016 +# a checkpoint (create-time) : rectype 2011 +# +# Copy-time will be generation 2 / different rectypes +# some number of txn_regop (no generation) records : rectype 2010 +# a recovery-checkpoint : rectype 2020 +# +# Run-time will be generation 3 +# some number of txn_regop_gen records : rectype 2016 +# a ckp-record : rectype 2011 +# +function verify_create_recovery_records +{ + export done_create=0 + export done_copy=0 + + typeset create_commit=0 + typeset create_chkpnt=0 + typeset create_generation=-1 + + typeset copy_commit=0 + typeset copy_chkpnt=0 + typeset copy_generation=-1 + + typeset run_commit=0 + typeset run_chkpnt=0 + typeset run_generation=-1 + + $CDB2SQL_EXE --tabs $CDB2_OPTIONS $DBNAME default "select lsn, rectype, generation from comdb2_transaction_logs where rectype = 2016 or rectype = 2011 or rectype = 2010 or rectype = 2020" | while read lsn rectype generation ; do + + # Create phase - expect 2016 and 2011 + if [[ "$done_create" == 0 ]]; then + + if [[ "$rectype" != "2016" && "$rectype" != "2011" ]]; then + + if [[ "$create_commit" == 0 || "$create_chkpnt" == 0 ]]; then + + echo "Found invalid create-phase rectype $rectype at lsn $lsn" + failexit "Invalid rectype in create" + + else + + echo "Finished create-phase generation $create_generation" + echo "Counted $create_commit commits and $create_chkpnt checkpoints" + export done_create=1 + + if [[ "$rectype" == "2010" ]]; then + let copy_commit=copy_commit+1 + fi + + if [[ "$rectype" == "2020" ]]; then + let copy_chkpnt=copy_chkpnt+1 + fi + fi + + else + + if [[ "$rectype" == "2016" ]]; then + let create_commit=create_commit+1 + fi + + if [[ "$rectype" == "2011" ]]; then + let create_chkpnt=create_chkpnt+1 + fi + + if [[ "$generation" != "NULL" ]]; then + + if [[ "$create_generation" == -1 ]]; then + create_generation=$generation + fi + + if [[ "$create_generation" != "$generation" ]]; then + echo "Generation mismatch $create_generation != $generation" + failexit "Generation mismatch" + fi + fi + fi + + # Copy phase - expect 2010 and 2020 + elif [[ "$done_copy" == 0 ]]; then + + if [[ "$rectype" != "2010" && "$rectype" != "2020" ]]; then + if [[ "$copy_commit" == 0 || "$copy_chkpnt" == 0 ]]; then + echo "Found invalid copy-phase rectype $rectype at lsn $lsn" + failexit "Invalid rectype in copy" + else + echo "Finished copy generation $copy_generation" + echo "Counted $copy_commit commits and $copy_chkpnt checkpoints" + + export done_copy=1 + if [[ "$rectype" == "2016" ]]; then + let run_commit=run_commit+1 + fi + + if [[ "$rectype" == "2011" ]]; then + let run_chkpnt=run_chkpnt+1 + fi + fi + + else + + if [[ "$rectype" == "2010" ]]; then + let copy_commit=copy_commit+1 + fi + if [[ "$rectype" == "2020" ]]; then + let copy_chkpnt=copy_chkpnt+1 + fi + + if [[ "$generation" != "NULL" ]]; then + + if [[ "$copy_generation" == -1 ]]; then + copy_generation=$generation + fi + + if [[ "$copy_generation" != "$generation" ]]; then + echo "Generation mismatch $copy_generation != $generation" + failexit "Generation mismatch" + fi + + if [[ "$copy_generation" != "$create_generation" ]]; then + echo "Generation mismatch $copy_generation != $create_generation" + failexit "Generation mismatch" + fi + fi + fi + else + # Run phase - expect 2016 and 2011 again + if [[ "$rectype" == "2010" || "$rectype" == "2020" ]]; then + echo "Invalid rectype $rectype for run phase" + failexit "Invalid rectype in run phase" + fi + if [[ "$rectype" == "2016" ]]; then + let run_commit=run_commit+1 + fi + if [[ "$rectype" == "2011" ]]; then + let run_chkpnt=run_chkpnt+1 + fi + if [[ "$generation" != "NULL" ]]; then + if [[ "$run_generation" == -1 ]]; then + run_generation=$generation + fi + + if [[ "$run_generation" -gt "$generation" ]]; then + echo "Generation grew backwards run-gen=$run_generation new-gen=$generation" + failexit "Backwards generation" + fi + run_generation=$generation + fi + fi + done + + echo "Record-types and generations are correct" +} + function create_lots_of_btrees { j=0 @@ -29,15 +184,34 @@ function full_recover_node export LOGDIR=$TESTDIR/logs export REP_ENV_VARS="${DBDIR}/replicant_env_vars" + export CLUSTER_GENERATION="" + export RECOVERY_GENERATION="" # Run full-recovery if [ $node == `hostname` ] ; then PARAMS="--no-global-lrl --lrl $DBDIR/${DBNAME}.lrl --fullrecovery" $COMDB2_EXE ${DBNAME} ${PARAMS} + RECOVERY_GENERATION=$($COMDB2_EXE --tool cdb2_printlog -h $DBDIR/logs -l 10-100| egrep "generation|rep_gen" | tail -1 | awk '{print $2}') else PARAMS="--no-global-lrl --lrl $DBDIR/${DBNAME}.lrl --fullrecovery" CMD="cd ${DBDIR}; source ${REP_ENV_VARS} ; $COMDB2_EXE ${DBNAME} ${PARAMS}" ssh -n -o StrictHostKeyChecking=no -tt $node "${CMD}" < /dev/null + ssh -n -o StrictHostKeyChecking=no -tt $node "$COMDB2_EXE --tool cdb2_printlog -h $DBDIR/logs -l 10-100" < /dev/null 2>/dev/null > printlog.txt + export RECOVERY_GENERATION=`egrep "generation|rep_gen" printlog.txt | tail -1 | awk '{print $2}' | tr -d '\r'` + fi + + # Retrieve cluster-generation + CLUSTER_GENERATION=$($CDB2SQL_EXE --tabs $CDB2_OPTIONS $DBNAME default "select max(generation) from comdb2_transaction_logs") + + # For all 'failure' cases, the recovered node generation will be higher than the clusters + if [[ "$DBNAME" == *"disable"* || "$DBNAME" == *"ckpbug"* ]]; then + if [[ "$RECOVERY_GENERATION" -lt "$CLUSTER_GENERATION" ]]; then + failexit "Recovered generation is $RECOVERY_GENERATION, not higher than $CLUSTER_GENERATION" + fi + else + if [[ "$RECOVERY_GENERATION" -gt "$CLUSTER_GENERATION" ]]; then + failexit "Recovered generation is $RECOVERY_GENERATION, not less than or equal to $CLUSTER_GENERATION" + fi fi # While it is down, insert another record .. @@ -45,6 +219,7 @@ function full_recover_node # records here and they would disappear .. inserting 1 record allows the same test to continue # to fail when elect-highest-committed-gen is disabled $CDB2SQL_EXE $CDB2_OPTIONS $DBNAME default "insert into t1 (a) values(1)" + CLUSTER_GENERATION=$($CDB2SQL_EXE --tabs $CDB2_OPTIONS $DBNAME default "select max(generation) from comdb2_transaction_logs") # Verify that the count has replicated (it should but make really, really sure) for n in $CLUSTER ; do @@ -76,7 +251,7 @@ function full_recover_node # The original bug in elect-highest-committed-gen doesn't require a downgrade here # The old version of election requires a downgrade .. # Downgrade on 'disable' or 'enableddowngrade' (which has elect-highest-committed-gen enabled) - if [[ "$DBNAME" == *"enableddowngrade"* || "$DBNAME" == *"disable"* ]]; then + if [[ "$DBNAME" == *"enableddowngrade"* || "$DBNAME" == *"disable"* || "$DBNAME" == *"ckpbug"* ]]; then echo "Downgrading $master" $CDB2SQL_EXE $CDB2_OPTIONS $DBNAME --host $master "exec procedure sys.cmd.send('downgrade')" >/dev/null 2>&1 & fi @@ -95,9 +270,6 @@ function full_recover_node sleep 1 fi done - - - } function run_test @@ -122,7 +294,7 @@ function run_test master=$($CDB2SQL_EXE -tabs $CDB2_OPTIONS $DBNAME default "select host from comdb2_cluster where is_master='Y' limit 1") # Just repeat until the fully recovered node becomes the master - if [[ "$DBNAME" == *"disable"* ]]; then + if [[ "$DBNAME" == *"disable"* || "$DBNAME" == *"ckpbug"* ]]; then while [[ "$master" != "$fullrecovery" ]]; do $CDB2SQL_EXE $CDB2_OPTIONS $DBNAME default "truncate table t1" $CDB2SQL_EXE $CDB2_OPTIONS $DBNAME default "insert into t1 select * from generate_series(1, 1000)" @@ -147,13 +319,13 @@ function run_test # 'FAILURE' is 'SUCCESS' for the disable testcase .. it is attempting to show the failure that # occurs if elect-highest-committed-gen is disabled echo "We got a count of $count, not $target_records" - if [[ "$DBNAME" != *"disable"* ]]; then + if [[ "$DBNAME" != *"disable"* && "$DBNAME" != *"ckpbug"* ]]; then kill_by_pidfile ${TMPDIR}/${DBNAME}.${node}.pid failexit "Count is not $target_records" else echo "Successfully reproduced bug" fi - elif [[ "$DBNAME" == *"disable"* ]]; then + elif [[ "$DBNAME" == *"disable"* || "$DBNAME" == *"ckpbug"* ]]; then echo "FAILURE: disable test failed to reproduce bug" kill_by_pidfile ${TMPDIR}/${DBNAME}.${node}.pid failexit "Count is correct even though committed-gen is disabled" @@ -170,6 +342,9 @@ function run_test # This tests both with committed-gen enabled and disabled # When committed_gen is disabled, run-test should fail. +if [[ "$DBNAME" != *"rollout"* ]]; then + verify_create_recovery_records +fi create_lots_of_btrees run_test diff --git a/tests/runtestcase b/tests/runtestcase index 7be84e745d..22673a2391 100755 --- a/tests/runtestcase +++ b/tests/runtestcase @@ -292,7 +292,7 @@ verify_physical_at_finish() { fi } -call_setup_int() { +call_setup() { [[ $COMDB2_UNITTEST == 1 ]] && return if [[ $SETUP_WARN ]]; then @@ -334,22 +334,6 @@ call_setup_int() { testdb_end_testcase 1 exit 1 elif [[ $ret != "setup successful" ]] ; then - return 1 - fi - return 0 -} - -call_setup() { - call_setup_int - local ret=$? - if [[ ret -ne 0 ]]; then - echo "!$TESTCASE: retrying setup (rc=$ret)" - echo "!$TESTCASE: retrying setup (rc=$ret)" >> ${TEST_LOG} - call_unsetup - call_setup_int - ret=$? - fi - if [[ ret -ne 0 ]]; then echo "!$TESTCASE: setup failed (rc=$ret)" >> ${TEST_LOG} echo "!$TESTCASE: setup failed (rc=$ret) see ${TESTDIR}/logs/${DBNAME}.setup" testdb_end_testcase 1 diff --git a/tests/tunables.test/t00_all_tunables.expected b/tests/tunables.test/t00_all_tunables.expected index 3f4a0e6cc0..6302b256c3 100644 --- a/tests/tunables.test/t00_all_tunables.expected +++ b/tests/tunables.test/t00_all_tunables.expected @@ -769,6 +769,7 @@ (name='receive_coherency_lease_trace', description='', type='BOOLEAN', value='OFF', read_only='N') (name='receive_start_lsn_request_trace', description='', type='BOOLEAN', value='OFF', read_only='N') (name='recover_deadlock_newmode', description='recover_deadlock_newmode', type='BOOLEAN', value='ON', read_only='N') +(name='recovery_ckp', description='Emit a non-matchable ckp during recovery. (Default: on)', type='BOOLEAN', value='ON', read_only='N') (name='recovery_pages', description='Disabled if set to 0. Othersize, number of pages to write in addition to writing datapages. This works around corner recovery cases on questionable filesystems.', type='INTEGER', value='0', read_only='N') (name='recovery_processor_poll_interval_us', description='Recovery processor wakes this often to check workers', type='INTEGER', value='1000', read_only='N') (name='recovery_processors.dump_on_full', description='Dump status on full queue.', type='BOOLEAN', value='OFF', read_only='N') @@ -830,6 +831,7 @@ (name='repmethodmaxsleep', description='Delay commits by at most this much if forced to delay by incoherent nodes.', type='INTEGER', value='300', read_only='N') (name='report_deadlock_verbose', description='If set, dump the current thread's stack for every deadlock. (Default: off)', type='BOOLEAN', value='ON', read_only='N') (name='report_decimal_conversion', description='', type='BOOLEAN', value='OFF', read_only='N') +(name='reproduce_ckp_bug', description='Allow full-recovery ckp-gen to exceed cluster generation. (Default: off)', type='BOOLEAN', value='OFF', read_only='N') (name='repsleep', description='Add a delay on replicants before completing processing a log record.', type='INTEGER', value='0', read_only='N') (name='reptimeout', description='Replication timeout', type='INTEGER', value='20', read_only='N') (name='reptimeout_lag', description='Used in replication. Once a node has received our update, we will wait REPTIMEOUT_LAG% of the time that took for all other nodes.', type='INTEGER', value='50', read_only='N') @@ -841,6 +843,7 @@ (name='request_durable_lsn_trace', description='', type='BOOLEAN', value='OFF', read_only='N') (name='requeue_on_tran_dispatch', description='Requeue transactional statement if not enough threads', type='BOOLEAN', value='ON', read_only='N') (name='reset_deadlock_race', description='reset_deadlock_race', type='BOOLEAN', value='OFF', read_only='N') +(name='retrieve_gen_from_ckp', description='Retrieve generation from ckp records. (Default: on)', type='BOOLEAN', value='ON', read_only='N') (name='retry', description='', type='INTEGER', value='10', read_only='Y') (name='return_long_column_names', description='Enables returning of long column names. (Default: ON)', type='BOOLEAN', value='ON', read_only='N') (name='revsql_force_rte', description='Force reverse sql connections to use rte. (Default: on)', type='BOOLEAN', value='ON', read_only='N')