From 9805730685de2c5390260c8b624e44d9c26ec517 Mon Sep 17 00:00:00 2001
From: Don Brady
Date: Wed, 26 Jun 2024 22:23:17 +0000
Subject: [PATCH] Guarantee that spa_load_guid is unique

The zpool reguid feature introduced the spa_load_guid, which is a
transient value used for runtime identification purposes in the ARC.
This value is not the same as the spa's persistent pool guid.

However, the value is seeded from spa_generate_guid(), which does not
check for uniqueness against the spa_load_guid of other pools.
Although extremely rare, you can end up with two different pools
sharing the same spa_load_guid value!

This change guarantees that the value is always unique and,
additionally, that it is not still in use by an async arc flush task.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Don Brady
---
 include/sys/arc.h     |   1 +
 include/sys/spa.h     |   1 +
 module/zfs/arc.c      | 119 ++++++++++++++++++++++++++++++++++++------
 module/zfs/spa_misc.c |  28 ++++++++++
 module/zfs/vdev.c     |   2 +-
 5 files changed, 134 insertions(+), 17 deletions(-)

diff --git a/include/sys/arc.h b/include/sys/arc.h
index fc33e60b7a8c..44d7d6207de7 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -332,6 +332,7 @@ void arc_flush(spa_t *spa, boolean_t retry);
 void arc_flush_async(spa_t *spa);
 void arc_tempreserve_clear(uint64_t reserve);
 int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
+boolean_t arc_async_flush_guid_inuse(uint64_t load_guid);
 
 uint64_t arc_all_memory(void);
 uint64_t arc_default_max(uint64_t min, uint64_t allmem);
diff --git a/include/sys/spa.h b/include/sys/spa.h
index ca30b60c0af7..3ca795bebe33 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1102,6 +1102,7 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_generate_guid(spa_t *spa);
+extern uint64_t spa_generate_load_guid(void);
 extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern int spa_change_guid(spa_t *spa, const uint64_t *guidp);
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 6ae818538f6a..1b9f58420d2b 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -776,6 +776,23 @@ static buf_hash_table_t buf_hash_table;
 
 uint64_t zfs_crc64_table[256];
 
+/*
+ * Asynchronous ARC flush
+ *
+ * We track these in a list for arc_async_flush_guid_inuse().
+ * Used for both L1 and L2 async teardown.
+ */
+static list_t arc_async_flush_list;
+static kmutex_t arc_async_flush_lock;
+
+typedef struct arc_async_flush {
+	uint64_t	af_spa_guid;
+	taskq_ent_t	af_tqent;
+	uint_t		af_cache_level;	/* 1 or 2, to differentiate L1 vs L2 */
+	list_node_t	af_node;
+} arc_async_flush_t;
+
+
 /*
  * Level 2 ARC
  */
@@ -4423,19 +4440,52 @@ arc_flush(spa_t *spa, boolean_t retry)
 	arc_flush_impl(spa != NULL ? spa_load_guid(spa) : 0, retry);
 }
 
+static arc_async_flush_t *
+arc_async_flush_add(uint64_t spa_guid, uint_t level)
+{
+	arc_async_flush_t *af = kmem_alloc(sizeof (*af), KM_SLEEP);
+	af->af_spa_guid = spa_guid;
+	af->af_cache_level = level;
+	taskq_init_ent(&af->af_tqent);
+	list_link_init(&af->af_node);
+
+	mutex_enter(&arc_async_flush_lock);
+	list_insert_tail(&arc_async_flush_list, af);
+	mutex_exit(&arc_async_flush_lock);
+
+	return (af);
+}
+
+static void
+arc_async_flush_remove(uint64_t spa_guid, uint_t level)
+{
+	mutex_enter(&arc_async_flush_lock);
+	for (arc_async_flush_t *af = list_head(&arc_async_flush_list);
+	    af != NULL; af = list_next(&arc_async_flush_list, af)) {
+		if (af->af_spa_guid == spa_guid &&
+		    af->af_cache_level == level) {
+			list_remove(&arc_async_flush_list, af);
+			kmem_free(af, sizeof (*af));
+			break;
+		}
+	}
+	mutex_exit(&arc_async_flush_lock);
+}
+
 static void
 arc_flush_task(void *arg)
 {
-	uint64_t guid = *((uint64_t *)arg);
+	arc_async_flush_t *af = arg;
 	hrtime_t start_time = gethrtime();
+	uint64_t spa_guid = af->af_spa_guid;
 
-	arc_flush_impl(guid, B_FALSE);
-	kmem_free(arg, sizeof (uint64_t *));
+	arc_flush_impl(spa_guid, B_FALSE);
+	arc_async_flush_remove(spa_guid, af->af_cache_level);
 
 	uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time);
 	if (elaspsed > 0) {
 		zfs_dbgmsg("spa %llu arc flushed in %llu ms",
-		    (u_longlong_t)guid, (u_longlong_t)elaspsed);
+		    (u_longlong_t)spa_guid, (u_longlong_t)elaspsed);
 	}
 }
 
@@ -4452,15 +4502,36 @@ arc_flush_task(void *arg)
 void
 arc_flush_async(spa_t *spa)
 {
-	uint64_t *guidp = kmem_alloc(sizeof (uint64_t *), KM_SLEEP);
+	uint64_t spa_guid = spa_load_guid(spa);
+	arc_async_flush_t *af = arc_async_flush_add(spa_guid, 1);
 
-	*guidp = spa_load_guid(spa);
+	/*
+	 * Note that arc_flush_task() needs arc_async_flush_lock in order to
+	 * remove the af list node.  Holding the lock while we dispatch here
+	 * avoids racing with that removal while af is still in use.
+	 */
+	mutex_enter(&arc_async_flush_lock);
+	taskq_dispatch_ent(arc_flush_taskq, arc_flush_task,
+	    af, TQ_SLEEP, &af->af_tqent);
+	mutex_exit(&arc_async_flush_lock);
+}
 
-	if (taskq_dispatch(arc_flush_taskq, arc_flush_task, guidp,
-	    TQ_SLEEP) == TASKQID_INVALID) {
-		arc_flush_impl(*guidp, B_FALSE);
-		kmem_free(guidp, sizeof (uint64_t *));
+/*
+ * Check if a guid is still in-use as part of an async teardown task
+ */
+boolean_t
+arc_async_flush_guid_inuse(uint64_t spa_guid)
+{
+	mutex_enter(&arc_async_flush_lock);
+	for (arc_async_flush_t *af = list_head(&arc_async_flush_list);
+	    af != NULL; af = list_next(&arc_async_flush_list, af)) {
+		if (af->af_spa_guid == spa_guid) {
+			mutex_exit(&arc_async_flush_lock);
+			return (B_TRUE);
+		}
 	}
+	mutex_exit(&arc_async_flush_lock);
+	return (B_FALSE);
 }
 
 uint64_t
@@ -7801,6 +7872,9 @@ arc_init(void)
 	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
 	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
+	list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
+	    offsetof(arc_async_flush_t, af_node));
+	mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
 	arc_flush_taskq = taskq_create("arc_flush", 75, defclsyspri,
 	    1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
@@ -7884,6 +7958,9 @@ arc_fini(void)
 	taskq_wait(arc_prune_taskq);
 	taskq_destroy(arc_prune_taskq);
 
+	list_destroy(&arc_async_flush_list);
+	mutex_destroy(&arc_async_flush_lock);
+
 	mutex_enter(&arc_prune_mtx);
 	while ((p = list_remove_head(&arc_prune_list)) != NULL) {
 		(void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
@@ -9795,6 +9872,8 @@ typedef struct {
 	l2arc_dev_t	*rva_l2arc_dev;
 	uint64_t	rva_spa_gid;
 	uint64_t	rva_vdev_gid;
+	boolean_t	rva_async;
+
 } remove_vdev_args_t;
 
 static void
@@ -9825,6 +9904,9 @@ l2arc_device_teardown(void *arg)
 		    (u_longlong_t)rva->rva_vdev_gid, (u_longlong_t)elaspsed);
 	}
+
+	if (rva->rva_async)
+		arc_async_flush_remove(rva->rva_spa_gid, 2);
 	kmem_free(rva, sizeof (remove_vdev_args_t));
 }
@@ -9850,7 +9932,7 @@ l2arc_remove_vdev(vdev_t *vd)
 	remove_vdev_args_t *rva = kmem_alloc(sizeof (remove_vdev_args_t),
 	    KM_SLEEP);
 	rva->rva_l2arc_dev = remdev;
-	rva->rva_spa_gid = spa_guid(remdev->l2ad_spa);
+	rva->rva_spa_gid = spa_load_guid(spa);
 	rva->rva_vdev_gid = remdev->l2ad_vdev->vdev_guid;
 
 	/*
@@ -9866,6 +9948,7 @@ l2arc_remove_vdev(vdev_t *vd)
 		asynchronous = B_FALSE;
 	}
 	mutex_exit(&l2arc_rebuild_thr_lock);
+	rva->rva_async = asynchronous;
 
 	/*
 	 * Remove device from global list
@@ -9883,13 +9966,17 @@ l2arc_remove_vdev(vdev_t *vd)
 	}
 	mutex_exit(&l2arc_dev_mtx);
 
-	/*
-	 * If possible, the teardown is completed asynchronously
-	 */
-	if (!asynchronous || taskq_dispatch(arc_flush_taskq,
-	    l2arc_device_teardown, rva, TQ_SLEEP) == TASKQID_INVALID) {
+	if (!asynchronous) {
 		l2arc_device_teardown(rva);
+		return;
 	}
+
+	arc_async_flush_t *af = arc_async_flush_add(rva->rva_spa_gid, 2);
+
+	mutex_enter(&arc_async_flush_lock);
+	taskq_dispatch_ent(arc_flush_taskq, l2arc_device_teardown, rva,
+	    TQ_SLEEP, &af->af_tqent);
+	mutex_exit(&arc_async_flush_lock);
+}
 
 void
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index f486513fcaf9..6a4f71174f34 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1588,6 +1588,34 @@ spa_generate_guid(spa_t *spa)
 	return (guid);
 }
 
+static boolean_t
+spa_load_guid_exists(uint64_t guid)
+{
+	avl_tree_t *t = &spa_namespace_avl;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	for (spa_t *spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
+		if (spa_load_guid(spa) == guid)
+			return (B_TRUE);
+	}
+
+	return (arc_async_flush_guid_inuse(guid));
+}
+
+uint64_t
+spa_generate_load_guid(void)
+{
+	uint64_t guid;
+
+	do {
+		(void) random_get_pseudo_bytes((void *)&guid,
+		    sizeof (guid));
+	} while (guid == 0 || spa_load_guid_exists(guid));
+
+	return (guid);
+}
+
 void
 snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
 {
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index bcab46c63bfa..f8d58a0c1c33 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -647,7 +647,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	if (spa->spa_root_vdev == NULL) {
 		ASSERT(ops == &vdev_root_ops);
 		spa->spa_root_vdev = vd;
-		spa->spa_load_guid = spa_generate_guid(NULL);
+		spa->spa_load_guid = spa_generate_load_guid();
 	}
 
 	if (guid == 0 && ops != &vdev_hole_ops) {
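
For illustration only, the sketch below is a minimal, self-contained userspace model of the
"generate until unique" loop that spa_generate_load_guid() implements in the patch. It is not
ZFS code: the hypothetical xorshift64() stands in for random_get_pseudo_bytes(), and the
hypothetical guid_in_use() stands in for the spa_namespace_avl walk plus the
arc_async_flush_guid_inuse() check.

/*
 * Simplified model of spa_generate_load_guid(): keep drawing 64-bit
 * values until one is nonzero and not already in use.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in for GUIDs held by imported pools or pending async flush tasks. */
static const uint64_t in_use[] = { 0xdeadbeefULL, 0xfeedfaceULL };

/* Stand-in PRNG for random_get_pseudo_bytes(); any nonzero seed works. */
static uint64_t
xorshift64(uint64_t *state)
{
	uint64_t x = *state;

	x ^= x << 13;
	x ^= x >> 7;
	x ^= x << 17;
	return (*state = x);
}

/* Stand-in for spa_load_guid_exists() + arc_async_flush_guid_inuse(). */
static int
guid_in_use(uint64_t guid)
{
	for (size_t i = 0; i < sizeof (in_use) / sizeof (in_use[0]); i++) {
		if (in_use[i] == guid)
			return (1);
	}
	return (0);
}

static uint64_t
generate_load_guid(uint64_t *state)
{
	uint64_t guid;

	/* Zero is reserved; a colliding value is simply regenerated. */
	do {
		guid = xorshift64(state);
	} while (guid == 0 || guid_in_use(guid));

	return (guid);
}

int
main(void)
{
	uint64_t state = 0x1234abcdULL;	/* arbitrary nonzero seed */

	printf("unique load guid: 0x%016llx\n",
	    (unsigned long long)generate_load_guid(&state));
	return (0);
}

Because a collision between a fresh 64-bit value and the handful of values already in use is
astronomically unlikely, the loop effectively runs once; the retry exists only to make the
uniqueness guarantee unconditional.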