aboutsummaryrefslogtreecommitdiff
path: root/pkg/driver/mount_util.go
diff options
context:
space:
mode:
authorchrislusf <chris.lu@gmail.com>2025-12-03 19:31:12 -0800
committerChris Lu <chrislusf@users.noreply.github.com>2025-12-03 20:52:27 -0800
commite76bd693e2022ac71f857548b0919155ebb04ca9 (patch)
tree9ecb087da5b777f5119f77cc0991ce1bb2124b95 /pkg/driver/mount_util.go
parent56b4ae6398fda983484c77b5c549a8c6bceab339 (diff)
downloadseaweedfs-csi-driver-e76bd693e2022ac71f857548b0919155ebb04ca9.tar.xz
seaweedfs-csi-driver-e76bd693e2022ac71f857548b0919155ebb04ca9.zip
fix: add self-healing for volume mount failures after driver restart
This addresses issue #203 - CSI Driver Self-Healing for Volume Mount Failures. Problem: When the CSI node driver restarts, the in-memory volume cache is lost. Kubelet then directly calls NodePublishVolume (skipping NodeStageVolume), which fails with 'volume hasn't been staged yet' error. Solution: 1. Added isStagingPathHealthy() to detect healthy vs stale/corrupted mounts 2. Added cleanupStaleStagingPath() to clean up stale mount points 3. Enhanced NodeStageVolume to clean up stale mounts before staging 4. Implemented self-healing in NodePublishVolume: - If staging path is healthy: rebuild volume cache from existing mount - If staging path is stale: clean up and re-stage automatically 5. Updated Volume.Unstage to handle rebuilt volumes without unmounter Benefits: - Automatic recovery after CSI driver restarts - No manual intervention required (no kubelet/pod restarts needed) - Handles both live and dead FUSE mount scenarios - Backward compatible with normal operations Fixes #203
Diffstat (limited to 'pkg/driver/mount_util.go')
-rw-r--r--pkg/driver/mount_util.go85
1 files changed, 85 insertions, 0 deletions
diff --git a/pkg/driver/mount_util.go b/pkg/driver/mount_util.go
index b62a9e7..049d4dc 100644
--- a/pkg/driver/mount_util.go
+++ b/pkg/driver/mount_util.go
@@ -2,13 +2,98 @@ package driver
import (
"errors"
+ "os"
"time"
+ "github.com/seaweedfs/seaweedfs/weed/glog"
"k8s.io/mount-utils"
)
var mountutil = mount.New("")
+// isStagingPathHealthy reports whether stagingPath currently looks like a
+// usable mount: it must exist, be a directory, be reported as a mount point
+// by mountutil, and be listable. Any failed check is logged and yields false.
+func isStagingPathHealthy(stagingPath string) bool {
+	// Step 1: the path must be stat-able. The three failure cases differ only
+	// in how loudly they are logged.
+	fi, statErr := os.Stat(stagingPath)
+	switch {
+	case statErr == nil:
+		// Path exists; continue with the remaining checks below.
+	case os.IsNotExist(statErr):
+		glog.V(4).Infof("staging path %s does not exist", stagingPath)
+		return false
+	case mount.IsCorruptedMnt(statErr):
+		// e.g. "Transport endpoint is not connected" from a dead FUSE daemon.
+		glog.Warningf("staging path %s has corrupted mount: %v", stagingPath, statErr)
+		return false
+	default:
+		glog.V(4).Infof("staging path %s stat error: %v", stagingPath, statErr)
+		return false
+	}
+
+	// Step 2: a staging path is always expected to be a directory.
+	if !fi.IsDir() {
+		glog.Warningf("staging path %s is not a directory", stagingPath)
+		return false
+	}
+
+	// Step 3: something must actually be mounted there.
+	mounted, mntErr := mountutil.IsMountPoint(stagingPath)
+	if mntErr != nil {
+		if mount.IsCorruptedMnt(mntErr) {
+			glog.Warningf("staging path %s has corrupted mount point: %v", stagingPath, mntErr)
+		} else {
+			glog.V(4).Infof("staging path %s mount point check error: %v", stagingPath, mntErr)
+		}
+		return false
+	}
+	if !mounted {
+		glog.V(4).Infof("staging path %s is not a mount point", stagingPath)
+		return false
+	}
+
+	// Step 4: listing the directory forces a round trip through the mounted
+	// filesystem, which exposes a mount whose backing process no longer answers.
+	if _, readErr := os.ReadDir(stagingPath); readErr != nil {
+		glog.Warningf("staging path %s is not readable (FUSE may be dead): %v", stagingPath, readErr)
+		return false
+	}
+
+	glog.V(4).Infof("staging path %s is healthy", stagingPath)
+	return true
+}
+
+// cleanupStaleStagingPath cleans up a stale or corrupted staging mount point.
+//
+// It first attempts a best-effort unmount of stagingPath, then removes the
+// directory. If the path can no longer be stat-ed because the mount is
+// corrupted, it delegates to mount.CleanupMountPoint instead.
+//
+// It returns nil when the path is gone (or was already absent) and a non-nil
+// error when the directory could not be removed, the corrupted mount could not
+// be cleaned up, or the path could not be inspected at all.
+func cleanupStaleStagingPath(stagingPath string) error {
+	glog.Infof("cleaning up stale staging path %s", stagingPath)
+
+	// Best-effort unmount: a failure here is expected when nothing is mounted
+	// any more, so it is only logged and the checks below decide the outcome.
+	if err := mountutil.Unmount(stagingPath); err != nil {
+		glog.V(4).Infof("unmount staging path %s (may already be unmounted): %v", stagingPath, err)
+	}
+
+	_, statErr := os.Stat(stagingPath)
+	switch {
+	case statErr == nil:
+		// Directory is still present; remove it. os.Remove fails on a
+		// non-empty or still-busy directory and that failure is reported.
+		if err := os.Remove(stagingPath); err != nil {
+			glog.Warningf("failed to remove staging path %s: %v", stagingPath, err)
+			return err
+		}
+	case os.IsNotExist(statErr):
+		// Nothing left to remove.
+	case mount.IsCorruptedMnt(statErr):
+		// The plain unmount above did not clear the corrupted mount; let
+		// mount-utils clean up the mount point (extensive mount point check
+		// enabled via the final argument).
+		if err := mount.CleanupMountPoint(stagingPath, mountutil, true); err != nil {
+			glog.Warningf("failed to cleanup corrupted mount point %s: %v", stagingPath, err)
+			return err
+		}
+	default:
+		// The path could not be inspected for a reason this function does not
+		// know how to repair. Report it rather than claiming the cleanup
+		// succeeded, so the caller does not proceed on a path in unknown state.
+		glog.Warningf("failed to stat staging path %s during cleanup: %v", stagingPath, statErr)
+		return statErr
+	}
+
+	glog.Infof("successfully cleaned up staging path %s", stagingPath)
+	return nil
+}
+
func waitForMount(path string, timeout time.Duration) error {
var elapsed time.Duration
var interval = 10 * time.Millisecond