fix: add self-healing for volume mount failures after driver restart

This addresses issue #203 - CSI Driver Self-Healing for Volume Mount Failures.

Problem: When the CSI node driver restarts, the in-memory volume cache is lost. Kubelet then calls NodePublishVolume directly (skipping NodeStageVolume), which fails with a "volume hasn't been staged yet" error.

Solution:
1. Added isStagingPathHealthy() to distinguish healthy mounts from stale or corrupted ones.
2. Added cleanupStaleStagingPath() to clean up stale mount points.
3. Enhanced NodeStageVolume to clean up stale mounts before staging.
4. Implemented self-healing in NodePublishVolume:
   - If the staging path is healthy: rebuild the volume cache from the existing mount.
   - If the staging path is stale: clean up and re-stage automatically.
5. Updated Volume.Unstage to handle rebuilt volumes that have no unmounter.

Benefits:
- Automatic recovery after CSI driver restarts
- No manual intervention required (no kubelet/pod restarts needed)
- Handles both live and dead FUSE mount scenarios
- Backward compatible with normal operations

Fixes #203
author: chrislusf <chris.lu@gmail.com> 2025-12-03 19:31:12 -0800
committer: chrislusf <chris.lu@gmail.com> 2025-12-03 19:31:12 -0800
commit: 454b9651450a8feecf5896ce3b2526a442a73e33 (patch)
tree: 34a9dd4579cc72d01c4400adcedf7c56f445fc65 /pkg/driver/volume.go
parent: a6742a3ec78ac67f4371d59e4ecd3d15878b940a (diff)
download: seaweedfs-csi-driver-454b9651450a8feecf5896ce3b2526a442a73e33.tar.xz
seaweedfs-csi-driver-454b9651450a8feecf5896ce3b2526a442a73e33.zip
1 files changed, 18 insertions, 5 deletions
diff --git a/pkg/driver/volume.go b/pkg/driver/volume.go
index ab0dcd4..8abf88b 100644
--- a/pkg/driver/volume.go
+++ b/pkg/driver/volume.go
@@ -114,13 +114,26 @@ func (vol *Volume) Unpublish(targetPath string) error {
 func (vol *Volume) Unstage(stagingTargetPath string) error {
 	glog.V(0).Infof("unmounting volume %s from %s", vol.VolumeId, stagingTargetPath)
 
-	if vol.unmounter == nil {
-		glog.Errorf("volume is not mounted: %s, path: %s", vol.VolumeId, stagingTargetPath)
-		return nil
+	if stagingTargetPath != vol.StagedPath && vol.StagedPath != "" {
+		glog.Warningf("staging path %s differs for volume %s at %s", stagingTargetPath, vol.VolumeId, vol.StagedPath)
 	}
 
-	if stagingTargetPath != vol.StagedPath {
-		glog.Warningf("staging path %s differs for volume %s at %s", stagingTargetPath, vol.VolumeId, vol.StagedPath)
+	if vol.unmounter == nil {
+		// This can happen when the volume was rebuilt from an existing staging path
+		// after a CSI driver restart. In this case, we need to force unmount.
+		glog.Infof("volume %s has no unmounter (rebuilt from existing mount), using force unmount", vol.VolumeId)
+
+		// Try to unmount the staging path
+		if err := mountutil.Unmount(stagingTargetPath); err != nil {
+			glog.Warningf("error force unmounting volume %s: %v", vol.VolumeId, err)
+		}
+
+		// Clean up using mount utilities
+		if err := mount.CleanupMountPoint(stagingTargetPath, mountutil, true); err != nil {
+			glog.Warningf("error cleaning up mount point for volume %s: %v", vol.VolumeId, err)
+		}
+
+		return nil
 	}
 
 	if err := vol.unmounter.Unmount(); err != nil {
author	chrislusf <chris.lu@gmail.com>	2025-12-03 19:31:12 -0800
committer	chrislusf <chris.lu@gmail.com>	2025-12-03 19:31:12 -0800
commit	454b9651450a8feecf5896ce3b2526a442a73e33 (patch)
tree	34a9dd4579cc72d01c4400adcedf7c56f445fc65 /pkg/driver/volume.go
parent	a6742a3ec78ac67f4371d59e4ecd3d15878b940a (diff)
download	seaweedfs-csi-driver-454b9651450a8feecf5896ce3b2526a442a73e33.tar.xz seaweedfs-csi-driver-454b9651450a8feecf5896ce3b2526a442a73e33.zip