diff options
| author | chrislusf <chris.lu@gmail.com> | 2025-12-03 19:31:12 -0800 |
|---|---|---|
| committer | chrislusf <chris.lu@gmail.com> | 2025-12-03 19:31:12 -0800 |
| commit | 454b9651450a8feecf5896ce3b2526a442a73e33 (patch) | |
| tree | 34a9dd4579cc72d01c4400adcedf7c56f445fc65 /pkg/driver/volume.go | |
| parent | a6742a3ec78ac67f4371d59e4ecd3d15878b940a (diff) | |
| download | seaweedfs-csi-driver-454b9651450a8feecf5896ce3b2526a442a73e33.tar.xz seaweedfs-csi-driver-454b9651450a8feecf5896ce3b2526a442a73e33.zip | |
fix: add self-healing for volume mount failures after driver restart
This addresses issue #203 - CSI Driver Self-Healing for Volume Mount Failures.
Problem:
When the CSI node driver restarts, the in-memory volume cache is lost.
Kubelet then directly calls NodePublishVolume (skipping NodeStageVolume),
which fails with a 'volume hasn't been staged yet' error.
Solution:
1. Added isStagingPathHealthy() to detect healthy vs stale/corrupted mounts
2. Added cleanupStaleStagingPath() to clean up stale mount points
3. Enhanced NodeStageVolume to clean up stale mounts before staging
4. Implemented self-healing in NodePublishVolume:
- If staging path is healthy: rebuild volume cache from existing mount
- If staging path is stale: clean up and re-stage automatically
5. Updated Volume.Unstage to handle rebuilt volumes that have no unmounter
Benefits:
- Automatic recovery after CSI driver restarts
- No manual intervention required (no kubelet/pod restarts needed)
- Handles both live and dead FUSE mount scenarios
- Backward compatible with normal operations
Fixes #203
Diffstat (limited to 'pkg/driver/volume.go')
| -rw-r--r-- | pkg/driver/volume.go | 23 |
1 files changed, 18 insertions, 5 deletions
diff --git a/pkg/driver/volume.go b/pkg/driver/volume.go index ab0dcd4..8abf88b 100644 --- a/pkg/driver/volume.go +++ b/pkg/driver/volume.go @@ -114,13 +114,26 @@ func (vol *Volume) Unpublish(targetPath string) error { func (vol *Volume) Unstage(stagingTargetPath string) error { glog.V(0).Infof("unmounting volume %s from %s", vol.VolumeId, stagingTargetPath) - if vol.unmounter == nil { - glog.Errorf("volume is not mounted: %s, path: %s", vol.VolumeId, stagingTargetPath) - return nil + if stagingTargetPath != vol.StagedPath && vol.StagedPath != "" { + glog.Warningf("staging path %s differs for volume %s at %s", stagingTargetPath, vol.VolumeId, vol.StagedPath) } - if stagingTargetPath != vol.StagedPath { - glog.Warningf("staging path %s differs for volume %s at %s", stagingTargetPath, vol.VolumeId, vol.StagedPath) + if vol.unmounter == nil { + // This can happen when the volume was rebuilt from an existing staging path + // after a CSI driver restart. In this case, we need to force unmount. + glog.Infof("volume %s has no unmounter (rebuilt from existing mount), using force unmount", vol.VolumeId) + + // Try to unmount the staging path + if err := mountutil.Unmount(stagingTargetPath); err != nil { + glog.Warningf("error force unmounting volume %s: %v", vol.VolumeId, err) + } + + // Clean up using mount utilities + if err := mount.CleanupMountPoint(stagingTargetPath, mountutil, true); err != nil { + glog.Warningf("error cleaning up mount point for volume %s: %v", vol.VolumeId, err) + } + + return nil } if err := vol.unmounter.Unmount(); err != nil { |
