From 2e6a0174682de346430396062f465cec6671f656 Mon Sep 17 00:00:00 2001 From: buzzingwires Date: Fri, 12 Sep 2025 06:48:41 -0400 Subject: [PATCH 1/2] Refactor `zhack label repair` and fix `-c` regression on nonzero TXG This commit fixes a likely regression introduced by 64db435 where the checksum repair functionality (`-c` or default behavior) will perform checks and access data associated with the newer undetach (`-u`) functionality, resulting in a failure when an uberblock's TXG is not 0 as required by `-u` but not `-c` Additionally, code is refactored for better separation of tasks. Signed-off-by: buzzingwires --- cmd/zhack.c | 134 ++++++++++-------- .../functional/cli_root/zhack/library.kshlib | 29 +++- .../cli_root/zhack/zhack_label_repair_001.ksh | 15 +- 3 files changed, 110 insertions(+), 68 deletions(-) diff --git a/cmd/zhack.c b/cmd/zhack.c index edf9dfa2cece..8ffbf91ffb30 100644 --- a/cmd/zhack.c +++ b/cmd/zhack.c @@ -714,6 +714,23 @@ zhack_repair_read_label(const int fd, vdev_label_t *vl, return (0); } +static int +zhack_repair_get_byteswap(const zio_eck_t *vdev_eck, const int l, int *byteswap) +{ + if (vdev_eck->zec_magic == ZEC_MAGIC) { + *byteswap = B_FALSE; + } else if (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC)) { + *byteswap = B_TRUE; + } else { + (void) fprintf(stderr, "error: label %d: " + "Expected the nvlist checksum magic number but instead got " + "0x%" PRIx64 "\n", + l, vdev_eck->zec_magic); + return (1); + } + return (0); +} + static void zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset, const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum) @@ -740,33 +757,10 @@ zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset, } static int -zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys, - const size_t cfg_keys_len, nvlist_t *cfg, nvlist_t *vdev_tree_cfg, - uint64_t *ashift) +zhack_repair_get_ashift(nvlist_t *cfg, const int l, uint64_t *ashift) { int err; - - if (ub->ub_txg != 0) { - (void) fprintf(stderr, - "error: label %d: UB TXG of 0 expected, but got %" - PRIu64 "\n", - l, ub->ub_txg); - (void) fprintf(stderr, "It would appear the device was not " - "properly removed.\n"); - return (1); - } - - for (int i = 0; i < cfg_keys_len; i++) { - uint64_t val; - err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val); - if (err) { - (void) fprintf(stderr, - "error: label %d, %d: " - "cannot find nvlist key %s\n", - l, i, cfg_keys[i]); - return (err); - } - } + nvlist_t *vdev_tree_cfg; err = nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree_cfg); @@ -790,7 +784,7 @@ zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys, (void) fprintf(stderr, "error: label %d: nvlist key %s is zero\n", l, ZPOOL_CONFIG_ASHIFT); - return (err); + return (1); } return (0); @@ -805,30 +799,35 @@ zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l) */ if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) { const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp); + int err; + ub->ub_txg = txg; - if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) { + err = nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG); + if (err) { (void) fprintf(stderr, "error: label %d: " "Failed to remove pool creation TXG\n", l); - return (1); + return (err); } - if (nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG) != 0) { + err = nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG); + if (err) { (void) fprintf(stderr, "error: label %d: Failed to remove pool TXG to " "be replaced.\n", l); - return (1); + return (err); } - if (nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg) != 0) { + err = nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg); + if (err) { (void) fprintf(stderr, "error: label %d: " "Failed to add pool TXG of %" PRIu64 "\n", l, txg); - return (1); + return (err); } } @@ -922,6 +921,7 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data, BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; const uint64_t actual_magic = vdev_eck->zec_magic; int err = 0; + if (actual_magic != expected_magic) { (void) fprintf(stderr, "error: label %d: " "Expected " @@ -943,6 +943,36 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data, return (err); } +static int +zhack_repair_unpack_cfg(vdev_label_t *vl, const int l, nvlist_t **cfg) +{ + const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION, + ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID }; + int err; + + err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, + VDEV_PHYS_SIZE - sizeof (zio_eck_t), cfg, 0); + if (err) { + (void) fprintf(stderr, + "error: cannot unpack nvlist label %d\n", l); + return (err); + } + + for (int i = 0; i < ARRAY_SIZE(cfg_keys); i++) { + uint64_t val; + err = nvlist_lookup_uint64(*cfg, cfg_keys[i], &val); + if (err) { + (void) fprintf(stderr, + "error: label %d, %d: " + "cannot find nvlist key %s\n", + l, i, cfg_keys[i]); + return (err); + } + } + + return (0); +} + static void zhack_repair_one_label(const zhack_repair_op_t op, const int fd, vdev_label_t *vl, const uint64_t label_offset, const int l, @@ -956,10 +986,7 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, (zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1; const uint64_t vdev_phys_offset = label_offset + offsetof(vdev_label_t, vl_vdev_phys); - const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION, - ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID }; nvlist_t *cfg; - nvlist_t *vdev_tree_cfg = NULL; uint64_t ashift; int byteswap; @@ -967,18 +994,9 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, if (err) return; - if (vdev_eck->zec_magic == 0) { - (void) fprintf(stderr, "error: label %d: " - "Expected the nvlist checksum magic number to not be zero" - "\n", - l); - (void) fprintf(stderr, "There should already be a checksum " - "for the label.\n"); + err = zhack_repair_get_byteswap(vdev_eck, l, &byteswap); + if (err) return; - } - - byteswap = - (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC)); if (byteswap) { byteswap_uint64_array(&vdev_eck->zec_cksum, @@ -994,16 +1012,7 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, return; } - err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, - VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0); - if (err) { - (void) fprintf(stderr, - "error: cannot unpack nvlist label %d\n", l); - return; - } - - err = zhack_repair_check_label(ub, - l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift); + err = zhack_repair_unpack_cfg(vl, l, &cfg); if (err) return; @@ -1011,6 +1020,19 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, char *buf; size_t buflen; + if (ub->ub_txg != 0) { + (void) fprintf(stderr, + "error: label %d: UB TXG of 0 expected, but got %" + PRIu64 "\n", l, ub->ub_txg); + (void) fprintf(stderr, "It would appear the device was " + "not properly detached.\n"); + return; + } + + err = zhack_repair_get_ashift(cfg, l, &ashift); + if (err) + return; + err = zhack_repair_undetach(ub, cfg, l); if (err) return; diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib b/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib index 0f5f6198daf2..13918a38b083 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib @@ -33,13 +33,16 @@ # Test one: # # 1. Create pool on a loopback device with some test data -# 2. Export the pool. -# 3. Corrupt all label checksums in the pool -# 4. Check that pool cannot be imported -# 5. Verify that it cannot be imported after using zhack label repair -u +# 2. Checksum repair should work with a valid TXG. Repeatedly write and +# sync the pool so there are enough transactions for every uberblock +# to have a TXG +# 3. Export the pool. +# 4. Corrupt all label checksums in the pool +# 5. Check that pool cannot be imported +# 6. Verify that it cannot be imported after using zhack label repair -u # to ensure that the -u option will quit on corrupted checksums. -# 6. Use zhack label repair -c on device -# 7. Check that pool can be imported and that data is intact +# 7. Use zhack label repair -c on device +# 8. Check that pool can be imported and that data is intact # # Test two: # @@ -170,6 +173,17 @@ function setup_dataset check_dataset true } +function force_transactions +{ + L_TIMES="$1" + for ((i=0; i < L_TIMES; i++)) + do + touch "$TESTDIR"/"test" || return $? + zpool sync -f "$TESTPOOL" || return $? + done + return 0 +} + function get_practical_size { L_SIZE="$1" @@ -257,6 +271,9 @@ function run_test_one setup_dataset + # Force 256 extra transactions to ensure all uberblocks are assigned a TXG + log_must force_transactions 256 + log_must zpool export "$TESTPOOL" corrupt_labels "$L_SIZE" "$VIRTUAL_DISK" diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh index ce159b555d20..b5b24322f882 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh @@ -18,13 +18,16 @@ # Strategy: # # 1. Create pool on a loopback device with some test data -# 2. Export the pool. -# 3. Corrupt all label checksums in the pool -# 4. Check that pool cannot be imported -# 5. Verify that it cannot be imported after using zhack label repair -u +# 2. Checksum repair should work with a valid TXG. Repeatedly write and +# sync the pool so there are enough transactions for every uberblock +# to have a TXG +# 3. Export the pool. +# 4. Corrupt all label checksums in the pool +# 5. Check that pool cannot be imported +# 6. Verify that it cannot be imported after using zhack label repair -u # to ensure that the -u option will quit on corrupted checksums. -# 6. Use zhack label repair -c on device -# 7. Check that pool can be imported and that data is intact +# 7. Use zhack label repair -c on device +# 8. Check that pool can be imported and that data is intact . "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib From b334d129eadc857b8a383a365692f3d74d974a66 Mon Sep 17 00:00:00 2001 From: buzzingwires Date: Mon, 15 Sep 2025 15:03:01 -0400 Subject: [PATCH 2/2] Add `typeset`s to `zhack label repair` test scripts As a quality assurance measure, `typeset` is added to local variable declarations to actually enforce their intended scope. Signed-off-by: buzzingwires --- .../functional/cli_root/zhack/library.kshlib | 53 +++++++++++-------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib b/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib index 13918a38b083..0d07b1fd1952 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib @@ -100,7 +100,7 @@ VIRTUAL_MIRROR_DEVICE= function cleanup_lo { - L_DEVICE="$1" + typeset L_DEVICE="$1" if [[ -e $L_DEVICE ]]; then if is_linux; then @@ -136,9 +136,9 @@ function get_devsize function pick_logop { - L_SHOULD_SUCCEED="$1" + typeset L_SHOULD_SUCCEED="$1" - l_logop="log_mustnot" + typeset l_logop="log_mustnot" if [ "$L_SHOULD_SUCCEED" == true ]; then l_logop="log_must" fi @@ -148,7 +148,9 @@ function pick_logop function check_dataset { - L_SHOULD_SUCCEED="$1" + typeset L_SHOULD_SUCCEED="$1" + + typeset L_LOGOP= L_LOGOP="$(pick_logop "$L_SHOULD_SUCCEED")" "$L_LOGOP" mounted "$TESTPOOL"/"$TESTFS" @@ -175,7 +177,8 @@ function setup_dataset function force_transactions { - L_TIMES="$1" + typeset L_TIMES="$1" + typeset i= for ((i=0; i < L_TIMES; i++)) do touch "$TESTDIR"/"test" || return $? @@ -186,7 +189,7 @@ function force_transactions function get_practical_size { - L_SIZE="$1" + typeset L_SIZE="$1" if [ "$((L_SIZE % LABEL_SIZE))" -ne 0 ]; then echo "$(((L_SIZE / LABEL_SIZE) * LABEL_SIZE))" @@ -197,10 +200,11 @@ function get_practical_size function corrupt_sized_label_checksum { - L_SIZE="$1" - L_LABEL="$2" - L_DEVICE="$3" + typeset L_SIZE="$1" + typeset L_LABEL="$2" + typeset L_DEVICE="$3" + typeset L_PRACTICAL_SIZE= L_PRACTICAL_SIZE="$(get_practical_size "$L_SIZE")" typeset -a L_OFFSETS=("$LABEL_CKSUM_START" \ @@ -215,8 +219,8 @@ function corrupt_sized_label_checksum function corrupt_labels { - L_SIZE="$1" - L_DISK="$2" + typeset L_SIZE="$1" + typeset L_DISK="$2" corrupt_sized_label_checksum "$L_SIZE" 0 "$L_DISK" corrupt_sized_label_checksum "$L_SIZE" 1 "$L_DISK" @@ -226,11 +230,14 @@ function corrupt_labels function try_import_and_repair { - L_REPAIR_SHOULD_SUCCEED="$1" - L_IMPORT_SHOULD_SUCCEED="$2" - L_OP="$3" - L_POOLDISK="$4" + typeset L_REPAIR_SHOULD_SUCCEED="$1" + typeset L_IMPORT_SHOULD_SUCCEED="$2" + typeset L_OP="$3" + typeset L_POOLDISK="$4" + + typeset L_REPAIR_LOGOP= L_REPAIR_LOGOP="$(pick_logop "$L_REPAIR_SHOULD_SUCCEED")" + typeset L_IMPORT_LOGOP= L_IMPORT_LOGOP="$(pick_logop "$L_IMPORT_SHOULD_SUCCEED")" log_mustnot zpool import "$TESTPOOL" -d "$L_POOLDISK" @@ -244,10 +251,10 @@ function try_import_and_repair function prepare_vdev { - L_SIZE="$1" - L_BACKFILE="$2" + typeset L_SIZE="$1" + typeset L_BACKFILE="$2" - l_devname= + typeset l_devname= if truncate -s "$L_SIZE" "$L_BACKFILE"; then if is_linux; then l_devname="$(losetup -f "$L_BACKFILE" --show)" @@ -262,7 +269,7 @@ function prepare_vdev function run_test_one { - L_SIZE="$1" + typeset L_SIZE="$1" VIRTUAL_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_DISK")" log_must test -e "$VIRTUAL_DEVICE" @@ -289,7 +296,7 @@ function run_test_one function make_mirrored_pool { - L_SIZE="$1" + typeset L_SIZE="$1" VIRTUAL_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_DISK")" log_must test -e "$VIRTUAL_DEVICE" @@ -313,7 +320,7 @@ function export_and_cleanup_vdisk function run_test_two { - L_SIZE="$1" + typeset L_SIZE="$1" make_mirrored_pool "$L_SIZE" @@ -334,7 +341,7 @@ function run_test_two function run_test_three { - L_SIZE="$1" + typeset L_SIZE="$1" make_mirrored_pool "$L_SIZE" @@ -359,7 +366,7 @@ function run_test_three function run_test_four { - L_SIZE="$1" + typeset L_SIZE="$1" make_mirrored_pool "$L_SIZE"