author     Chris Lu <chrislusf@users.noreply.github.com>    2025-10-08 20:52:20 -0700
committer  GitHub <noreply@github.com>    2025-10-08 20:52:20 -0700
commit     e90809521b0df68de523d23949cd0ea4f90d9850 (patch)
tree       e437a237ac30c6dacfed1f93fa8dcde65932cbba
parent     d0a338684c97cdd3042bc6c158ff35ab5ac16c50 (diff)
Fix #7307: Prevent infinite loop in volume.check.disk (#7308)
The volume.check.disk command could get stuck in an infinite loop when syncing replicas with persistent discrepancies that cannot be resolved. This happened because the sync loop had no maximum iteration limit and no detection of when progress stopped being made.

Issues fixed:
1. Infinite loop: added a maxIterations limit (5) to prevent endless looping
2. Progress detection: detect when the hasChanges state does not change between iterations, indicating the sync is stuck
3. Return value bug: fixed a naked return statement that was returning zero values instead of the actual hasChanges value, causing incorrect loop termination logic

Changes:
- Added a maximum iteration limit with clear error messages
- Added progress detection to identify stuck sync situations
- Fixed the return statement to properly return hasChanges and the error
- Added verbose logging for sync iterations

The fix ensures that:
- Sync terminates after at most 5 iterations
- Users get clear messages about why sync stopped
- The hasChanges logic properly reflects deletion sync results

Fixes #7307
-rw-r--r--    weed/shell/command_volume_check_disk.go | 28
1 file changed, 25 insertions(+), 3 deletions(-)
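For reference, the loop-termination pattern described in the commit message, as a minimal self-contained sketch. The syncWithLimit function and its syncOnce callback are hypothetical stand-ins for syncTwoReplicas and checkBoth; the actual change is the diff below.

// Minimal sketch of the bounded sync loop with progress detection.
// syncOnce is a hypothetical callback standing in for one checkBoth pass;
// this is illustrative only, not the SeaweedFS code.
package main

import (
    "errors"
    "fmt"
)

func syncWithLimit(syncOnce func() (aChanged, bChanged bool, err error)) error {
    const maxIterations = 5
    aChanged, bChanged := true, true
    for iteration := 1; (aChanged || bChanged) && iteration <= maxIterations; iteration++ {
        prevA, prevB := aChanged, bChanged
        var err error
        if aChanged, bChanged, err = syncOnce(); err != nil {
            return err
        }
        // Stuck: the change flags did not move and the replicas are still dirty.
        if iteration > 1 && prevA == aChanged && prevB == bChanged && (aChanged || bChanged) {
            return fmt.Errorf("sync not making progress after %d iterations", iteration)
        }
    }
    if aChanged || bChanged {
        return errors.New("reached maximum sync iterations")
    }
    return nil
}

func main() {
    // A syncOnce that always reports pending changes never converges,
    // so the loop gives up instead of spinning forever.
    err := syncWithLimit(func() (bool, bool, error) { return true, true, nil })
    fmt.Println(err) // sync not making progress after 2 iterations
}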
diff --git a/weed/shell/command_volume_check_disk.go b/weed/shell/command_volume_check_disk.go
index 2f3ccfdc6..4d246e26c 100644
--- a/weed/shell/command_volume_check_disk.go
+++ b/weed/shell/command_volume_check_disk.go
@@ -183,11 +183,34 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write
func (c *commandVolumeCheckDisk) syncTwoReplicas(a *VolumeReplica, b *VolumeReplica, applyChanges bool, doSyncDeletions bool, nonRepairThreshold float64, verbose bool) (err error) {
    aHasChanges, bHasChanges := true, true
-   for aHasChanges || bHasChanges {
+   const maxIterations = 5
+   iteration := 0
+
+   for (aHasChanges || bHasChanges) && iteration < maxIterations {
+       iteration++
+       if verbose {
+           fmt.Fprintf(c.writer, "sync iteration %d for volume %d\n", iteration, a.info.Id)
+       }
+
+       prevAHasChanges, prevBHasChanges := aHasChanges, bHasChanges
        if aHasChanges, bHasChanges, err = c.checkBoth(a, b, applyChanges, doSyncDeletions, nonRepairThreshold, verbose); err != nil {
            return err
        }
+
+       // Detect if we're stuck in a loop with no progress
+       if iteration > 1 && prevAHasChanges == aHasChanges && prevBHasChanges == bHasChanges && (aHasChanges || bHasChanges) {
+           fmt.Fprintf(c.writer, "volume %d sync is not making progress between %s and %s after iteration %d, stopping to prevent infinite loop\n",
+               a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, iteration)
+           return fmt.Errorf("sync not making progress after %d iterations", iteration)
+       }
    }
+
+   if iteration >= maxIterations && (aHasChanges || bHasChanges) {
+       fmt.Fprintf(c.writer, "volume %d sync reached maximum iterations (%d) between %s and %s, may need manual intervention\n",
+           a.info.Id, maxIterations, a.location.dataNode.Id, b.location.dataNode.Id)
+       return fmt.Errorf("reached maximum sync iterations (%d)", maxIterations)
+   }
+
    return nil
}
@@ -307,11 +330,10 @@ func doVolumeCheckDisk(minuend, subtrahend *needle_map.MemDb, source, target *Vo
        for _, deleteResult := range deleteResults {
            if deleteResult.Status == http.StatusAccepted && deleteResult.Size > 0 {
                hasChanges = true
-               return
            }
        }
    }
-   return
+   return hasChanges, nil
}
func readSourceNeedleBlob(grpcDialOption grpc.DialOption, sourceVolumeServer pb.ServerAddress, volumeId uint32, needleValue needle_map.NeedleValue) (needleBlob []byte, err error) {
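A side note on the return-value bug listed in the commit message: the second hunk replaces a naked return with an explicit return hasChanges, nil. One common way a naked return ends up yielding zero values in Go is when a short variable declaration shadows the named result; whether that was the exact mechanism here is not visible in the hunk, but the snippet below is a generic illustration of the pitfall (hypothetical helpers, not the SeaweedFS code) and of why the explicit return removes the ambiguity.

// Generic illustration of the naked-return pitfall with shadowed named results.
package main

import "fmt"

func buggy(items []int) (hasChanges bool, err error) {
    for range items {
        hasChanges := true // ':=' declares a new variable, shadowing the named result
        _ = hasChanges
    }
    return // returns false, nil: the named result was never assigned
}

func fixed(items []int) (hasChanges bool, err error) {
    for range items {
        hasChanges = true // assigns the named result
    }
    return hasChanges, nil // explicit return makes the intent unambiguous
}

func main() {
    b, _ := buggy([]int{1, 2})
    f, _ := fixed([]int{1, 2})
    fmt.Println(b, f) // false true
}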