aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--weed/storage/disk_location_ec.go39
-rw-r--r--weed/storage/disk_location_ec_test.go62
2 files changed, 92 insertions, 9 deletions
diff --git a/weed/storage/disk_location_ec.go b/weed/storage/disk_location_ec.go
index c8dc3c444..bb71236c6 100644
--- a/weed/storage/disk_location_ec.go
+++ b/weed/storage/disk_location_ec.go
@@ -42,6 +42,18 @@ func (l *DiskLocation) DestroyEcVolume(vid needle.VolumeId) {
}
}
+// unloadEcVolume removes an EC volume from memory without deleting its files on disk.
+// This is useful for distributed EC volumes where shards may be on other servers.
+func (l *DiskLocation) unloadEcVolume(vid needle.VolumeId) {
+ l.ecVolumesLock.Lock()
+ defer l.ecVolumesLock.Unlock()
+
+ if ecVolume, found := l.ecVolumes[vid]; found {
+ ecVolume.Close()
+ delete(l.ecVolumes, vid)
+ }
+}
+
func (l *DiskLocation) CollectEcShards(vid needle.VolumeId, shardFileNames []string) (ecVolume *erasure_coding.EcVolume, found bool) {
l.ecVolumesLock.RLock()
defer l.ecVolumesLock.RUnlock()
@@ -202,9 +214,9 @@ func (l *DiskLocation) loadAllEcShards() (err error) {
if datExists && len(sameVolumeShards) < erasure_coding.DataShardsCount {
glog.Warningf("Incomplete EC encoding for volume %d: .dat exists but only %d shards found (need at least %d), cleaning up EC files...",
volumeId, len(sameVolumeShards), erasure_coding.DataShardsCount)
- // Clean up any in-memory state before removing files
- l.DestroyEcVolume(volumeId)
l.removeEcVolumeFiles(collection, volumeId)
+ // Clean up any in-memory state. This does not delete files (already deleted by removeEcVolumeFiles).
+ l.unloadEcVolume(volumeId)
sameVolumeShards = nil
prevVolumeId = 0
continue
@@ -215,14 +227,12 @@ func (l *DiskLocation) loadAllEcShards() (err error) {
// If .dat is gone, log error but don't clean up (may be waiting for shards from other servers)
if datExists {
glog.Warningf("Failed to load EC shards for volume %d and .dat exists: %v, cleaning up EC files to use .dat...", volumeId, err)
- // Clean up any partially loaded in-memory state before removing files
- l.DestroyEcVolume(volumeId)
l.removeEcVolumeFiles(collection, volumeId)
} else {
glog.Warningf("Failed to load EC shards for volume %d: %v (this may be normal for distributed EC volumes)", volumeId, err)
- // Clean up any partially loaded in-memory state even if we don't remove files
- l.DestroyEcVolume(volumeId)
}
+ // Clean up any partially loaded in-memory state. This does not delete files.
+ l.unloadEcVolume(volumeId)
sameVolumeShards = nil
prevVolumeId = 0
continue
@@ -248,9 +258,9 @@ func (l *DiskLocation) loadAllEcShards() (err error) {
if util.FileExists(datFileName) {
glog.Warningf("Found %d EC shards without .ecx file for volume %d (incomplete encoding interrupted before .ecx creation), cleaning up...",
len(sameVolumeShards), volumeId)
- // Clean up any in-memory state before removing files
- l.DestroyEcVolume(volumeId)
l.removeEcVolumeFiles(collection, volumeId)
+ // Clean up any in-memory state. This does not delete files (already deleted by removeEcVolumeFiles).
+ l.unloadEcVolume(volumeId)
}
}
}
@@ -300,6 +310,7 @@ func (l *DiskLocation) EcShardCount() int {
// validateEcVolume checks if EC volume has enough shards to be functional
// For distributed EC volumes (where .dat is deleted), any number of shards is valid
// For incomplete EC encoding (where .dat still exists), we need at least DataShardsCount shards
+// Also validates that all shards have the same size (required for Reed-Solomon EC)
func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId) bool {
baseFileName := erasure_coding.EcShardFileName(collection, l.Directory, int(vid))
datFileName := baseFileName + ".dat"
@@ -312,13 +323,23 @@ func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId)
return true
}
- // .dat file exists, so we need to validate shard count for local EC
+ // .dat file exists, so we need to validate shard count and size for local EC
shardCount := 0
+ var expectedShardSize int64 = -1
+
for i := 0; i < erasure_coding.TotalShardsCount; i++ {
shardFileName := baseFileName + erasure_coding.ToExt(i)
if fi, err := os.Stat(shardFileName); err == nil {
// Check if file has non-zero size
if fi.Size() > 0 {
+ // Validate all shards are the same size (required for Reed-Solomon EC)
+ if expectedShardSize == -1 {
+ expectedShardSize = fi.Size()
+ } else if fi.Size() != expectedShardSize {
+ glog.V(0).Infof("EC volume %d shard %d has size %d, expected %d (all EC shards must be same size)",
+ vid, i, fi.Size(), expectedShardSize)
+ return false
+ }
shardCount++
}
} else if !os.IsNotExist(err) {
diff --git a/weed/storage/disk_location_ec_test.go b/weed/storage/disk_location_ec_test.go
index ec125bd30..875bd5bb3 100644
--- a/weed/storage/disk_location_ec_test.go
+++ b/weed/storage/disk_location_ec_test.go
@@ -426,3 +426,65 @@ func TestEcCleanupWithSeparateIdxDirectory(t *testing.T) {
t.Errorf(".dat file should remain but was deleted")
}
}
+
+// TestDistributedEcVolumeNoFileDeletion verifies that distributed EC volumes
+// (where .dat is deleted) do NOT have their shard files deleted when load fails
+// This tests the critical bug fix where DestroyEcVolume was incorrectly deleting files
+func TestDistributedEcVolumeNoFileDeletion(t *testing.T) {
+ tempDir := t.TempDir()
+
+ minFreeSpace := util.MinFreeSpace{Type: util.AsPercent, Percent: 1, Raw: "1"}
+ diskLocation := &DiskLocation{
+ Directory: tempDir,
+ DirectoryUuid: "test-uuid",
+ IdxDirectory: tempDir,
+ DiskType: types.HddType,
+ MinFreeSpace: minFreeSpace,
+ ecVolumes: make(map[needle.VolumeId]*erasure_coding.EcVolume),
+ }
+
+ collection := ""
+ volumeId := needle.VolumeId(500)
+ baseFileName := erasure_coding.EcShardFileName(collection, tempDir, int(volumeId))
+
+ // Create EC shards (only 5 shards - not enough to load, but this is a distributed EC volume)
+ for i := 0; i < 5; i++ {
+ shardFile, err := os.Create(baseFileName + erasure_coding.ToExt(i))
+ if err != nil {
+ t.Fatalf("Failed to create shard file: %v", err)
+ }
+ shardFile.WriteString("dummy shard data")
+ shardFile.Close()
+ }
+
+ // Create .ecx file to trigger EC loading
+ ecxFile, err := os.Create(baseFileName + ".ecx")
+ if err != nil {
+ t.Fatalf("Failed to create .ecx file: %v", err)
+ }
+ ecxFile.WriteString("dummy ecx data")
+ ecxFile.Close()
+
+ // NO .dat file - this is a distributed EC volume
+
+ // Run loadAllEcShards - this should fail but NOT delete shard files
+ loadErr := diskLocation.loadAllEcShards()
+ if loadErr != nil {
+ t.Logf("loadAllEcShards returned error (expected): %v", loadErr)
+ }
+
+ // CRITICAL CHECK: Verify shard files still exist (should NOT be deleted)
+ for i := 0; i < 5; i++ {
+ shardFile := baseFileName + erasure_coding.ToExt(i)
+ if !util.FileExists(shardFile) {
+ t.Errorf("CRITICAL BUG: Shard file %s was deleted for distributed EC volume!", shardFile)
+ }
+ }
+
+ // Verify .ecx file still exists (should NOT be deleted for distributed EC)
+ if !util.FileExists(baseFileName + ".ecx") {
+ t.Errorf("CRITICAL BUG: .ecx file was deleted for distributed EC volume!")
+ }
+
+ t.Logf("SUCCESS: Distributed EC volume files preserved (not deleted)")
+}