diff options
| author | chrislusf <chris.lu@gmail.com> | 2025-12-03 19:31:12 -0800 |
|---|---|---|
| committer | chrislusf <chris.lu@gmail.com> | 2025-12-03 19:31:12 -0800 |
| commit | 454b9651450a8feecf5896ce3b2526a442a73e33 (patch) | |
| tree | 34a9dd4579cc72d01c4400adcedf7c56f445fc65 /pkg/driver/mount_util.go | |
| parent | a6742a3ec78ac67f4371d59e4ecd3d15878b940a (diff) | |
| download | seaweedfs-csi-driver-454b9651450a8feecf5896ce3b2526a442a73e33.tar.xz seaweedfs-csi-driver-454b9651450a8feecf5896ce3b2526a442a73e33.zip | |
fix: add self-healing for volume mount failures after driver restart
This addresses issue #203 - CSI Driver Self-Healing for Volume Mount Failures.
Problem:
When the CSI node driver restarts, the in-memory volume cache is lost.
Kubelet then directly calls NodePublishVolume (skipping NodeStageVolume),
which fails with 'volume hasn't been staged yet' error.
Solution:
1. Added isStagingPathHealthy() to detect healthy vs stale/corrupted mounts
2. Added cleanupStaleStagingPath() to clean up stale mount points
3. Enhanced NodeStageVolume to clean up stale mounts before staging
4. Implemented self-healing in NodePublishVolume:
- If staging path is healthy: rebuild volume cache from existing mount
- If staging path is stale: clean up and re-stage automatically
5. Updated Volume.Unstage to handle rebuilt volumes without unmounter
Benefits:
- Automatic recovery after CSI driver restarts
- No manual intervention required (no kubelet/pod restarts needed)
- Handles both live and dead FUSE mount scenarios
- Backward compatible with normal operations
Fixes #203
Diffstat (limited to 'pkg/driver/mount_util.go')
| -rw-r--r-- | pkg/driver/mount_util.go | 85 |
1 files changed, 85 insertions, 0 deletions
diff --git a/pkg/driver/mount_util.go b/pkg/driver/mount_util.go index b62a9e7..049d4dc 100644 --- a/pkg/driver/mount_util.go +++ b/pkg/driver/mount_util.go @@ -2,13 +2,98 @@ package driver import ( "errors" + "os" "time" + "github.com/seaweedfs/seaweedfs/weed/glog" "k8s.io/mount-utils" ) var mountutil = mount.New("")

// isStagingPathHealthy reports whether stagingPath currently looks like a
// usable mount.
//
// It returns true only when all of the following hold, checked in this order:
//  1. os.Stat on the path succeeds,
//  2. the path is a directory,
//  3. mountutil.IsMountPoint reports it as a mount point without error,
//  4. os.ReadDir on the path succeeds.
//
// Any failure is logged and yields false. Errors recognised by
// mount.IsCorruptedMnt, a non-directory path and a failed directory read are
// logged as warnings; the remaining cases are logged at verbosity 4.
func isStagingPathHealthy(stagingPath string) bool {
	// Step 1: the path has to be stat-able at all.
	fi, statErr := os.Stat(stagingPath)
	switch {
	case statErr == nil:
		// Path exists; keep checking below.
	case os.IsNotExist(statErr):
		glog.V(4).Infof("staging path %s does not exist", stagingPath)
		return false
	case mount.IsCorruptedMnt(statErr):
		// "Transport endpoint is not connected" or similar FUSE errors.
		glog.Warningf("staging path %s has corrupted mount: %v", stagingPath, statErr)
		return false
	default:
		glog.V(4).Infof("staging path %s stat error: %v", stagingPath, statErr)
		return false
	}

	// Step 2: a staging path must be a directory.
	if !fi.IsDir() {
		glog.Warningf("staging path %s is not a directory", stagingPath)
		return false
	}

	// Step 3: something must actually be mounted there.
	mounted, mntErr := mountutil.IsMountPoint(stagingPath)
	switch {
	case mntErr != nil && mount.IsCorruptedMnt(mntErr):
		glog.Warningf("staging path %s has corrupted mount point: %v", stagingPath, mntErr)
		return false
	case mntErr != nil:
		glog.V(4).Infof("staging path %s mount point check error: %v", stagingPath, mntErr)
		return false
	case !mounted:
		glog.V(4).Infof("staging path %s is not a mount point", stagingPath)
		return false
	}

	// Step 4: list the directory to verify the mount answers requests
	// (a dead FUSE daemon is expected to fail here).
	if _, readErr := os.ReadDir(stagingPath); readErr != nil {
		glog.Warningf("staging path %s is not readable (FUSE may be dead): %v", stagingPath, readErr)
		return false
	}

	glog.V(4).Infof("staging path %s is healthy", stagingPath)
	return true
}

// cleanupStaleStagingPath cleans up a stale or corrupted staging mount point.
+// It attempts to unmount and remove the directory. +func cleanupStaleStagingPath(stagingPath string) error { + glog.Infof("cleaning up stale staging path %s", stagingPath) + + // Try to unmount first (handles corrupted mounts) + if err := mountutil.Unmount(stagingPath); err != nil { + glog.V(4).Infof("unmount staging path %s (may already be unmounted): %v", stagingPath, err) + } + + // Check if directory still exists and remove it + if _, err := os.Stat(stagingPath); err == nil { + if err := os.Remove(stagingPath); err != nil { + glog.Warningf("failed to remove staging path %s: %v", stagingPath, err) + return err + } + } else if !os.IsNotExist(err) { + // If stat fails with a different error (like corrupted mount), try force cleanup + if mount.IsCorruptedMnt(err) { + // Force unmount for corrupted mounts + if err := mount.CleanupMountPoint(stagingPath, mountutil, true); err != nil { + glog.Warningf("failed to cleanup corrupted mount point %s: %v", stagingPath, err) + return err + } + } + } + + glog.Infof("successfully cleaned up staging path %s", stagingPath) + return nil +} + func waitForMount(path string, timeout time.Duration) error { var elapsed time.Duration var interval = 10 * time.Millisecond |
