diff options
| author | Chris Lu <chrislusf@users.noreply.github.com> | 2025-10-08 20:52:20 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-10-08 20:52:20 -0700 |
| commit | e90809521b0df68de523d23949cd0ea4f90d9850 (patch) | |
| tree | e437a237ac30c6dacfed1f93fa8dcde65932cbba | |
| parent | d0a338684c97cdd3042bc6c158ff35ab5ac16c50 (diff) | |
| download | seaweedfs-e90809521b0df68de523d23949cd0ea4f90d9850.tar.xz seaweedfs-e90809521b0df68de523d23949cd0ea4f90d9850.zip | |
Fix #7307: Prevent infinite loop in volume.check.disk (#7308)
The volume.check.disk command could get stuck in an infinite loop when
syncing replicas that have persistent discrepancies that cannot be
resolved. This happened because the sync loop had no maximum iteration
limit and no detection for when progress stopped being made.
Issues fixed:
1. Infinite loop: Added maxIterations limit (5) to prevent endless looping
2. Progress detection: Detect when hasChanges state doesn't change between
iterations, indicating sync is stuck
3. Return value bug: Fixed naked return statement that was returning zero
values instead of the actual hasChanges value, causing incorrect loop
termination logic
Changes:
- Added maximum iteration limit with clear error messages
- Added progress detection to identify stuck sync situations
- Fixed return statement to properly return hasChanges and error
- Added verbose logging for sync iterations
The fix ensures that:
- Sync will terminate after 5 iterations maximum
- Users get clear messages about why sync stopped
- The hasChanges logic properly reflects deletion sync results
Fixes #7307
| -rw-r--r-- | weed/shell/command_volume_check_disk.go | 28 |
1 files changed, 25 insertions, 3 deletions
diff --git a/weed/shell/command_volume_check_disk.go b/weed/shell/command_volume_check_disk.go
index 2f3ccfdc6..4d246e26c 100644
--- a/weed/shell/command_volume_check_disk.go
+++ b/weed/shell/command_volume_check_disk.go
@@ -183,11 +183,34 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write
 
 func (c *commandVolumeCheckDisk) syncTwoReplicas(a *VolumeReplica, b *VolumeReplica, applyChanges bool, doSyncDeletions bool, nonRepairThreshold float64, verbose bool) (err error) {
 	aHasChanges, bHasChanges := true, true
-	for aHasChanges || bHasChanges {
+	const maxIterations = 5
+	iteration := 0
+
+	for (aHasChanges || bHasChanges) && iteration < maxIterations {
+		iteration++
+		if verbose {
+			fmt.Fprintf(c.writer, "sync iteration %d for volume %d\n", iteration, a.info.Id)
+		}
+
+		prevAHasChanges, prevBHasChanges := aHasChanges, bHasChanges
 		if aHasChanges, bHasChanges, err = c.checkBoth(a, b, applyChanges, doSyncDeletions, nonRepairThreshold, verbose); err != nil {
 			return err
 		}
+
+		// Detect if we're stuck in a loop with no progress
+		if iteration > 1 && prevAHasChanges == aHasChanges && prevBHasChanges == bHasChanges && (aHasChanges || bHasChanges) {
+			fmt.Fprintf(c.writer, "volume %d sync is not making progress between %s and %s after iteration %d, stopping to prevent infinite loop\n",
+				a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, iteration)
+			return fmt.Errorf("sync not making progress after %d iterations", iteration)
+		}
 	}
+
+	if iteration >= maxIterations && (aHasChanges || bHasChanges) {
+		fmt.Fprintf(c.writer, "volume %d sync reached maximum iterations (%d) between %s and %s, may need manual intervention\n",
+			a.info.Id, maxIterations, a.location.dataNode.Id, b.location.dataNode.Id)
+		return fmt.Errorf("reached maximum sync iterations (%d)", maxIterations)
+	}
+
 	return nil
 }
 
@@ -307,11 +330,10 @@ func doVolumeCheckDisk(minuend, subtrahend *needle_map.MemDb, source, target *Vo
 		for _, deleteResult := range deleteResults {
 			if deleteResult.Status == http.StatusAccepted && deleteResult.Size > 0 {
 				hasChanges = true
-				return
 			}
 		}
 	}
-	return
+	return hasChanges, nil
 }
 
 func readSourceNeedleBlob(grpcDialOption grpc.DialOption, sourceVolumeServer pb.ServerAddress, volumeId uint32, needleValue needle_map.NeedleValue) (needleBlob []byte, err error) {
