-rw-r--r--  .github/workflows/container_release_unified.yml  |   1
-rw-r--r--  .github/workflows/ec-integration-tests.yml  |  41
-rw-r--r--  .github/workflows/helm_ci.yml  |  74
-rw-r--r--  .github/workflows/s3-go-tests.yml  |  54
-rw-r--r--  .github/workflows/sftp-tests.yml  |  93
-rw-r--r--  k8s/charts/seaweedfs/Chart.yaml  |   2
-rw-r--r--  k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml  |   4
-rw-r--r--  k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml  |   4
-rw-r--r--  k8s/charts/seaweedfs/templates/master/master-statefulset.yaml  |   4
-rw-r--r--  k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml  |   4
-rw-r--r--  k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml  |   4
-rw-r--r--  k8s/charts/seaweedfs/values.yaml  |   4
-rw-r--r--  test/erasure_coding/ec_integration_test.go  | 438
-rw-r--r--  test/s3/sse/s3_sse_integration_test.go  |  72
-rw-r--r--  test/s3/tagging/Makefile  | 321
-rw-r--r--  test/s3/tagging/README.md  |  53
-rw-r--r--  test/s3/tagging/s3_tagging_test.go  | 446
-rw-r--r--  test/sftp/Makefile  |  41
-rw-r--r--  test/sftp/README.md  |  92
-rw-r--r--  test/sftp/basic_test.go  | 652
-rw-r--r--  test/sftp/framework.go  | 423
-rw-r--r--  test/sftp/go.mod  |  17
-rw-r--r--  test/sftp/go.sum  |  64
-rw-r--r--  test/sftp/testdata/userstore.json  |  37
-rw-r--r--  weed/command/scaffold/filer.toml  |  18
-rw-r--r--  weed/command/server.go  |   1
-rw-r--r--  weed/command/volume.go  |   7
-rw-r--r--  weed/filer/empty_folder_cleanup/cleanup_queue.go  | 207
-rw-r--r--  weed/filer/empty_folder_cleanup/cleanup_queue_test.go  | 371
-rw-r--r--  weed/filer/empty_folder_cleanup/empty_folder_cleaner.go  | 436
-rw-r--r--  weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go  | 569
-rw-r--r--  weed/filer/filer.go  |   8
-rw-r--r--  weed/filer/filer_notify.go  |  39
-rw-r--r--  weed/filer/filer_on_meta_event.go  |  39
-rw-r--r--  weed/filer/filer_search.go  |  13
-rw-r--r--  weed/filer/redis2/redis_cluster_store.go  |   6
-rw-r--r--  weed/filer/redis2/redis_sentinel_store.go  |   4
-rw-r--r--  weed/filer/redis2/redis_store.go  |  26
-rw-r--r--  weed/filer/redis2/universal_redis_store.go  |  30
-rw-r--r--  weed/filer/redis_lua/redis_cluster_store.go  |   6
-rw-r--r--  weed/filer/redis_lua/redis_sentinel_store.go  |   4
-rw-r--r--  weed/filer/redis_lua/redis_store.go  |   6
-rw-r--r--  weed/filer/redis_lua/universal_redis_store.go  |  18
-rw-r--r--  weed/pb/master.proto  |   2
-rw-r--r--  weed/pb/master_pb/master.pb.go  |  26
-rw-r--r--  weed/pb/server_address.go  |  12
-rw-r--r--  weed/s3api/chunked_reader_v4.go  |  77
-rw-r--r--  weed/s3api/s3api_bucket_config.go  |   4
-rw-r--r--  weed/s3api/s3api_object_handlers_copy.go  | 119
-rw-r--r--  weed/s3api/s3api_object_handlers_copy_unified.go  | 108
-rw-r--r--  weed/s3api/s3api_object_handlers_delete.go  |  57
-rw-r--r--  weed/s3api/s3api_object_handlers_put.go  |  27
-rw-r--r--  weed/s3api/s3api_streaming_copy.go  |  60
-rw-r--r--  weed/server/master_grpc_server.go  |   4
-rw-r--r--  weed/server/master_grpc_server_volume.go  |   2
-rw-r--r--  weed/server/volume_server.go  |   4
-rw-r--r--  weed/server/volume_server_handlers_admin.go  |  31
-rw-r--r--  weed/sftpd/sftp_file_writer.go  |   5
-rw-r--r--  weed/sftpd/sftp_filer.go  |  82
-rw-r--r--  weed/sftpd/sftp_server.go  |  24
-rw-r--r--  weed/sftpd/sftp_server_test.go  | 103
-rw-r--r--  weed/sftpd/sftp_service.go  |   4
-rw-r--r--  weed/sftpd/user/filestore.go  |   5
-rw-r--r--  weed/shell/command_ec_common.go  | 190
-rw-r--r--  weed/shell/command_volume_check_disk.go  | 131
-rw-r--r--  weed/shell/command_volume_check_disk_test.go  |   6
-rw-r--r--  weed/shell/command_volume_server_evacuate.go  |  10
-rw-r--r--  weed/shell/common.go  |   8
-rw-r--r--  weed/storage/erasure_coding/placement/placement.go  | 420
-rw-r--r--  weed/storage/erasure_coding/placement/placement_test.go  | 517
-rw-r--r--  weed/storage/store.go  |  12
-rw-r--r--  weed/storage/store_ec_delete.go  |  17
-rw-r--r--  weed/storage/store_load_balancing_test.go  |   2
-rw-r--r--  weed/topology/data_node.go  |   1
-rw-r--r--  weed/topology/rack.go  |  69
-rw-r--r--  weed/topology/topology_test.go  | 119
-rw-r--r--  weed/util/network.go  |  11
-rw-r--r--  weed/worker/tasks/erasure_coding/detection.go  | 130
78 files changed, 6684 insertions, 468 deletions
diff --git a/.github/workflows/container_release_unified.yml b/.github/workflows/container_release_unified.yml
index eb8df9834..c7aa648fd 100644
--- a/.github/workflows/container_release_unified.yml
+++ b/.github/workflows/container_release_unified.yml
@@ -223,3 +223,4 @@ jobs:
echo "✓ Successfully copied ${{ matrix.variant }} to Docker Hub"
+
diff --git a/.github/workflows/ec-integration-tests.yml b/.github/workflows/ec-integration-tests.yml
new file mode 100644
index 000000000..ea476b77c
--- /dev/null
+++ b/.github/workflows/ec-integration-tests.yml
@@ -0,0 +1,41 @@
+name: "EC Integration Tests"
+
+on:
+ push:
+ branches: [ master ]
+ pull_request:
+ branches: [ master ]
+
+permissions:
+ contents: read
+
+jobs:
+ ec-integration-tests:
+ name: EC Integration Tests
+ runs-on: ubuntu-22.04
+ timeout-minutes: 30
+ steps:
+ - name: Set up Go 1.x
+ uses: actions/setup-go@v6
+ with:
+ go-version: ^1.24
+ id: go
+
+ - name: Check out code into the Go module directory
+ uses: actions/checkout@v4
+
+ - name: Build weed binary
+ run: |
+ cd weed && go build -o weed .
+
+ - name: Run EC Integration Tests
+ working-directory: test/erasure_coding
+ run: |
+ go test -v
+
+ - name: Archive logs
+ if: failure()
+ uses: actions/upload-artifact@v4
+ with:
+ name: ec-integration-test-logs
+ path: test/erasure_coding
\ No newline at end of file
diff --git a/.github/workflows/helm_ci.yml b/.github/workflows/helm_ci.yml
index f936ff445..ea971aec1 100644
--- a/.github/workflows/helm_ci.yml
+++ b/.github/workflows/helm_ci.yml
@@ -44,6 +44,80 @@ jobs:
- name: Run chart-testing (lint)
run: ct lint --target-branch ${{ github.event.repository.default_branch }} --all --validate-maintainers=false --chart-dirs k8s/charts
+ - name: Verify template rendering
+ run: |
+ set -e
+ CHART_DIR="k8s/charts/seaweedfs"
+
+ echo "=== Testing default configuration ==="
+ helm template test $CHART_DIR > /tmp/default.yaml
+ echo "✓ Default configuration renders successfully"
+
+ echo "=== Testing with S3 enabled ==="
+ helm template test $CHART_DIR --set s3.enabled=true > /tmp/s3.yaml
+ grep -q "kind: Deployment" /tmp/s3.yaml && grep -q "seaweedfs-s3" /tmp/s3.yaml
+ echo "✓ S3 deployment renders correctly"
+
+ echo "=== Testing with all-in-one mode ==="
+ helm template test $CHART_DIR --set allInOne.enabled=true > /tmp/allinone.yaml
+ grep -q "seaweedfs-all-in-one" /tmp/allinone.yaml
+ echo "✓ All-in-one deployment renders correctly"
+
+ echo "=== Testing with security enabled ==="
+ helm template test $CHART_DIR --set global.enableSecurity=true > /tmp/security.yaml
+ grep -q "security-config" /tmp/security.yaml
+ echo "✓ Security configuration renders correctly"
+
+ echo "=== Testing with monitoring enabled ==="
+ helm template test $CHART_DIR \
+ --set global.monitoring.enabled=true \
+ --set global.monitoring.gatewayHost=prometheus \
+ --set global.monitoring.gatewayPort=9091 > /tmp/monitoring.yaml
+ echo "✓ Monitoring configuration renders correctly"
+
+ echo "=== Testing with PVC storage ==="
+ helm template test $CHART_DIR \
+ --set master.data.type=persistentVolumeClaim \
+ --set master.data.size=10Gi \
+ --set master.data.storageClass=standard > /tmp/pvc.yaml
+ grep -q "PersistentVolumeClaim" /tmp/pvc.yaml
+ echo "✓ PVC configuration renders correctly"
+
+ echo "=== Testing with custom replicas ==="
+ helm template test $CHART_DIR \
+ --set master.replicas=3 \
+ --set filer.replicas=2 \
+ --set volume.replicas=3 > /tmp/replicas.yaml
+ echo "✓ Custom replicas configuration renders correctly"
+
+ echo "=== Testing filer with S3 gateway ==="
+ helm template test $CHART_DIR \
+ --set filer.s3.enabled=true \
+ --set filer.s3.enableAuth=true > /tmp/filer-s3.yaml
+ echo "✓ Filer S3 gateway renders correctly"
+
+ echo "=== Testing SFTP enabled ==="
+ helm template test $CHART_DIR --set sftp.enabled=true > /tmp/sftp.yaml
+ grep -q "seaweedfs-sftp" /tmp/sftp.yaml
+ echo "✓ SFTP deployment renders correctly"
+
+ echo "=== Testing ingress configurations ==="
+ helm template test $CHART_DIR \
+ --set master.ingress.enabled=true \
+ --set filer.ingress.enabled=true \
+ --set s3.enabled=true \
+ --set s3.ingress.enabled=true > /tmp/ingress.yaml
+ grep -q "kind: Ingress" /tmp/ingress.yaml
+ echo "✓ Ingress configurations render correctly"
+
+ echo "=== Testing COSI driver ==="
+ helm template test $CHART_DIR --set cosi.enabled=true > /tmp/cosi.yaml
+ grep -q "seaweedfs-cosi" /tmp/cosi.yaml
+ echo "✓ COSI driver renders correctly"
+
+ echo ""
+ echo "✅ All template rendering tests passed!"
+
- name: Create kind cluster
uses: helm/kind-action@v1.13.0
diff --git a/.github/workflows/s3-go-tests.yml b/.github/workflows/s3-go-tests.yml
index 6e04a8d44..78387d4f1 100644
--- a/.github/workflows/s3-go-tests.yml
+++ b/.github/workflows/s3-go-tests.yml
@@ -411,4 +411,58 @@ jobs:
path: test/s3/versioning/weed-test*.log
retention-days: 7
+ s3-tagging-tests:
+ # CI job for S3 object tagging tests
+ name: S3 Tagging Tests
+ runs-on: ubuntu-22.04
+ timeout-minutes: 20
+
+ steps:
+ - name: Check out code
+ uses: actions/checkout@v6
+
+ - name: Set up Go
+ uses: actions/setup-go@v6
+ with:
+ go-version-file: 'go.mod'
+ id: go
+
+ - name: Install SeaweedFS
+ run: |
+ go install -buildvcs=false
+
+ - name: Run S3 Tagging Tests
+ timeout-minutes: 15
+ working-directory: test/s3/tagging
+ run: |
+ set -x
+ echo "=== System Information ==="
+ uname -a
+ free -h
+
+ # Set environment variables for the test
+ export S3_ENDPOINT="http://localhost:8006"
+ export S3_ACCESS_KEY="0555b35654ad1656d804"
+ export S3_SECRET_KEY="h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q=="
+
+ # Run the specific test that is equivalent to AWS S3 tagging behavior
+ make test-with-server || {
+ echo "❌ Test failed, checking logs..."
+ if [ -f weed-test.log ]; then
+ echo "=== Server logs ==="
+ tail -100 weed-test.log
+ fi
+ echo "=== Process information ==="
+ ps aux | grep -E "(weed|test)" || true
+ exit 1
+ }
+
+ - name: Upload test logs on failure
+ if: failure()
+ uses: actions/upload-artifact@v5
+ with:
+ name: s3-tagging-test-logs
+ path: test/s3/tagging/weed-test*.log
+ retention-days: 3
+
# Removed SSE-C integration tests and compatibility job
\ No newline at end of file
diff --git a/.github/workflows/sftp-tests.yml b/.github/workflows/sftp-tests.yml
new file mode 100644
index 000000000..80a1b9929
--- /dev/null
+++ b/.github/workflows/sftp-tests.yml
@@ -0,0 +1,93 @@
+name: "SFTP Integration Tests"
+
+on:
+ push:
+ branches: [ master, main ]
+ paths:
+ - 'weed/sftpd/**'
+ - 'weed/command/sftp.go'
+ - 'test/sftp/**'
+ - '.github/workflows/sftp-tests.yml'
+ pull_request:
+ branches: [ master, main ]
+ paths:
+ - 'weed/sftpd/**'
+ - 'weed/command/sftp.go'
+ - 'test/sftp/**'
+ - '.github/workflows/sftp-tests.yml'
+
+concurrency:
+ group: ${{ github.head_ref }}/sftp-tests
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+
+env:
+ GO_VERSION: '1.24'
+ TEST_TIMEOUT: '15m'
+
+jobs:
+ sftp-integration:
+ name: SFTP Integration Testing
+ runs-on: ubuntu-22.04
+ timeout-minutes: 20
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Set up Go ${{ env.GO_VERSION }}
+ uses: actions/setup-go@v5
+ with:
+ go-version: ${{ env.GO_VERSION }}
+
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y openssh-client
+
+ - name: Build SeaweedFS
+ run: |
+ cd weed
+ go build -o weed .
+ chmod +x weed
+ ./weed version
+
+ - name: Run SFTP Integration Tests
+ run: |
+ cd test/sftp
+
+ echo "🧪 Running SFTP integration tests..."
+ echo "============================================"
+
+ # Install test dependencies
+ go mod download
+
+ # Run all SFTP tests
+ go test -v -timeout=${{ env.TEST_TIMEOUT }} ./...
+
+ echo "============================================"
+ echo "✅ SFTP integration tests completed"
+
+ - name: Test Summary
+ if: always()
+ run: |
+ echo "## 🔐 SFTP Integration Test Summary" >> $GITHUB_STEP_SUMMARY
+ echo "" >> $GITHUB_STEP_SUMMARY
+ echo "### Test Coverage" >> $GITHUB_STEP_SUMMARY
+ echo "- ✅ **HomeDir Path Translation**: User home directory mapping (fixes #7470)" >> $GITHUB_STEP_SUMMARY
+ echo "- ✅ **File Operations**: Upload, download, delete" >> $GITHUB_STEP_SUMMARY
+ echo "- ✅ **Directory Operations**: Create, list, remove" >> $GITHUB_STEP_SUMMARY
+ echo "- ✅ **Large File Handling**: 1MB+ file support" >> $GITHUB_STEP_SUMMARY
+ echo "- ✅ **Path Edge Cases**: Unicode, trailing slashes, .. paths" >> $GITHUB_STEP_SUMMARY
+ echo "- ✅ **Admin Access**: Root user verification" >> $GITHUB_STEP_SUMMARY
+ echo "" >> $GITHUB_STEP_SUMMARY
+ echo "### Test Configuration" >> $GITHUB_STEP_SUMMARY
+ echo "| User | HomeDir | Permissions |" >> $GITHUB_STEP_SUMMARY
+ echo "|------|---------|-------------|" >> $GITHUB_STEP_SUMMARY
+ echo "| admin | / | Full access |" >> $GITHUB_STEP_SUMMARY
+ echo "| testuser | /sftp/testuser | Home directory only |" >> $GITHUB_STEP_SUMMARY
+ echo "| readonly | /public | Read-only |" >> $GITHUB_STEP_SUMMARY
+
+
diff --git a/k8s/charts/seaweedfs/Chart.yaml b/k8s/charts/seaweedfs/Chart.yaml
index 379f67890..421b85175 100644
--- a/k8s/charts/seaweedfs/Chart.yaml
+++ b/k8s/charts/seaweedfs/Chart.yaml
@@ -3,4 +3,4 @@ description: SeaweedFS
name: seaweedfs
appVersion: "4.01"
# Dev note: Trigger a helm chart release by `git tag -a helm-<version>`
-version: 4.0.401
\ No newline at end of file
+version: 4.0.401
diff --git a/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml b/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml
index 8700a8a69..6f176ae19 100644
--- a/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml
+++ b/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml
@@ -352,7 +352,7 @@ spec:
httpGet:
path: {{ .Values.allInOne.readinessProbe.httpGet.path }}
port: {{ .Values.master.port }}
- scheme: {{ .Values.allInOne.readinessProbe.scheme }}
+ scheme: {{ .Values.allInOne.readinessProbe.httpGet.scheme }}
initialDelaySeconds: {{ .Values.allInOne.readinessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.allInOne.readinessProbe.periodSeconds }}
successThreshold: {{ .Values.allInOne.readinessProbe.successThreshold }}
@@ -364,7 +364,7 @@ spec:
httpGet:
path: {{ .Values.allInOne.livenessProbe.httpGet.path }}
port: {{ .Values.master.port }}
- scheme: {{ .Values.allInOne.livenessProbe.scheme }}
+ scheme: {{ .Values.allInOne.livenessProbe.httpGet.scheme }}
initialDelaySeconds: {{ .Values.allInOne.livenessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.allInOne.livenessProbe.periodSeconds }}
successThreshold: {{ .Values.allInOne.livenessProbe.successThreshold }}
diff --git a/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml b/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml
index 5aeccfa02..af82bd5e0 100644
--- a/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml
+++ b/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml
@@ -289,7 +289,7 @@ spec:
httpGet:
path: {{ .Values.filer.readinessProbe.httpGet.path }}
port: {{ .Values.filer.port }}
- scheme: {{ .Values.filer.readinessProbe.scheme }}
+ scheme: {{ .Values.filer.readinessProbe.httpGet.scheme }}
initialDelaySeconds: {{ .Values.filer.readinessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.filer.readinessProbe.periodSeconds }}
successThreshold: {{ .Values.filer.readinessProbe.successThreshold }}
@@ -301,7 +301,7 @@ spec:
httpGet:
path: {{ .Values.filer.livenessProbe.httpGet.path }}
port: {{ .Values.filer.port }}
- scheme: {{ .Values.filer.livenessProbe.scheme }}
+ scheme: {{ .Values.filer.livenessProbe.httpGet.scheme }}
initialDelaySeconds: {{ .Values.filer.livenessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.filer.livenessProbe.periodSeconds }}
successThreshold: {{ .Values.filer.livenessProbe.successThreshold }}
diff --git a/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml b/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml
index 704a33b80..a70673454 100644
--- a/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml
+++ b/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml
@@ -235,7 +235,7 @@ spec:
httpGet:
path: {{ .Values.master.readinessProbe.httpGet.path }}
port: {{ .Values.master.port }}
- scheme: {{ .Values.master.readinessProbe.scheme }}
+ scheme: {{ .Values.master.readinessProbe.httpGet.scheme }}
initialDelaySeconds: {{ .Values.master.readinessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.master.readinessProbe.periodSeconds }}
successThreshold: {{ .Values.master.readinessProbe.successThreshold }}
@@ -247,7 +247,7 @@ spec:
httpGet:
path: {{ .Values.master.livenessProbe.httpGet.path }}
port: {{ .Values.master.port }}
- scheme: {{ .Values.master.livenessProbe.scheme }}
+ scheme: {{ .Values.master.livenessProbe.httpGet.scheme }}
initialDelaySeconds: {{ .Values.master.livenessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.master.livenessProbe.periodSeconds }}
successThreshold: {{ .Values.master.livenessProbe.successThreshold }}
diff --git a/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml b/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml
index 0c6d52c3e..830e1d787 100644
--- a/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml
+++ b/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml
@@ -204,7 +204,7 @@ spec:
httpGet:
path: {{ .Values.s3.readinessProbe.httpGet.path }}
port: {{ .Values.s3.port }}
- scheme: {{ .Values.s3.readinessProbe.scheme }}
+ scheme: {{ .Values.s3.readinessProbe.httpGet.scheme }}
initialDelaySeconds: {{ .Values.s3.readinessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.s3.readinessProbe.periodSeconds }}
successThreshold: {{ .Values.s3.readinessProbe.successThreshold }}
@@ -216,7 +216,7 @@ spec:
httpGet:
path: {{ .Values.s3.livenessProbe.httpGet.path }}
port: {{ .Values.s3.port }}
- scheme: {{ .Values.s3.livenessProbe.scheme }}
+ scheme: {{ .Values.s3.livenessProbe.httpGet.scheme }}
initialDelaySeconds: {{ .Values.s3.livenessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.s3.livenessProbe.periodSeconds }}
successThreshold: {{ .Values.s3.livenessProbe.successThreshold }}
diff --git a/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml b/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml
index 29a035a2b..1a8964a55 100644
--- a/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml
+++ b/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml
@@ -251,7 +251,7 @@ spec:
httpGet:
path: {{ $volume.readinessProbe.httpGet.path }}
port: {{ $volume.port }}
- scheme: {{ $volume.readinessProbe.scheme }}
+ scheme: {{ $volume.readinessProbe.httpGet.scheme }}
initialDelaySeconds: {{ $volume.readinessProbe.initialDelaySeconds }}
periodSeconds: {{ $volume.readinessProbe.periodSeconds }}
successThreshold: {{ $volume.readinessProbe.successThreshold }}
@@ -263,7 +263,7 @@ spec:
httpGet:
path: {{ $volume.livenessProbe.httpGet.path }}
port: {{ $volume.port }}
- scheme: {{ $volume.livenessProbe.scheme }}
+ scheme: {{ $volume.livenessProbe.httpGet.scheme }}
initialDelaySeconds: {{ $volume.livenessProbe.initialDelaySeconds }}
periodSeconds: {{ $volume.livenessProbe.periodSeconds }}
successThreshold: {{ $volume.livenessProbe.successThreshold }}
diff --git a/k8s/charts/seaweedfs/values.yaml b/k8s/charts/seaweedfs/values.yaml
index 547b05479..520323dce 100644
--- a/k8s/charts/seaweedfs/values.yaml
+++ b/k8s/charts/seaweedfs/values.yaml
@@ -1133,7 +1133,7 @@ allInOne:
httpGet:
path: /cluster/status
port: 9333
- scheme: HTTP
+ scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 15
successThreshold: 1
@@ -1145,7 +1145,7 @@ allInOne:
httpGet:
path: /cluster/status
port: 9333
- scheme: HTTP
+ scheme: HTTP
initialDelaySeconds: 20
periodSeconds: 30
successThreshold: 1
diff --git a/test/erasure_coding/ec_integration_test.go b/test/erasure_coding/ec_integration_test.go
index 87b9b40ba..67f8eed04 100644
--- a/test/erasure_coding/ec_integration_test.go
+++ b/test/erasure_coding/ec_integration_test.go
@@ -5,9 +5,11 @@ import (
"context"
"fmt"
"io"
+ "math"
"os"
"os/exec"
"path/filepath"
+ "strings"
"testing"
"time"
@@ -139,6 +141,15 @@ func TestECEncodingVolumeLocationTimingBug(t *testing.T) {
t.Logf("EC encoding completed successfully")
}
+ // Add detailed logging for EC encoding command
+ t.Logf("Debug: Executing EC encoding command for volume %d", volumeId)
+ t.Logf("Debug: Command arguments: %v", args)
+ if err != nil {
+ t.Logf("Debug: EC encoding command failed with error: %v", err)
+ } else {
+ t.Logf("Debug: EC encoding command completed successfully")
+ }
+
// The key test: check if the fix prevents the timing issue
if contains(outputStr, "Collecting volume locations") && contains(outputStr, "before EC encoding") {
t.Logf("FIX DETECTED: Volume locations collected BEFORE EC encoding (timing bug prevented)")
@@ -526,7 +537,8 @@ func uploadTestData(data []byte, masterAddress string) (needle.VolumeId, error)
func getVolumeLocations(commandEnv *shell.CommandEnv, volumeId needle.VolumeId) ([]string, error) {
// Retry mechanism to handle timing issues with volume registration
- for i := 0; i < 10; i++ {
+ // Increase retry attempts for volume location retrieval
+ for i := 0; i < 20; i++ { // Increased from 10 to 20 retries
locations, ok := commandEnv.MasterClient.GetLocationsClone(uint32(volumeId))
if ok {
var result []string
@@ -646,3 +658,427 @@ func TestECEncodingRegressionPrevention(t *testing.T) {
t.Log("Timing pattern regression test passed")
})
}
+
+// TestDiskAwareECRebalancing tests EC shard placement across multiple disks per server
+// This verifies the disk-aware EC rebalancing feature works correctly
+func TestDiskAwareECRebalancing(t *testing.T) {
+ if testing.Short() {
+ t.Skip("Skipping disk-aware integration test in short mode")
+ }
+
+ testDir, err := os.MkdirTemp("", "seaweedfs_disk_aware_ec_test_")
+ require.NoError(t, err)
+ defer os.RemoveAll(testDir)
+
+ ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
+ defer cancel()
+
+ // Start cluster with MULTIPLE DISKS per volume server
+ cluster, err := startMultiDiskCluster(ctx, testDir)
+ require.NoError(t, err)
+ defer cluster.Stop()
+
+ // Wait for servers to be ready
+ require.NoError(t, waitForServer("127.0.0.1:9334", 30*time.Second))
+ for i := 0; i < 3; i++ {
+ require.NoError(t, waitForServer(fmt.Sprintf("127.0.0.1:809%d", i), 30*time.Second))
+ }
+
+ // Wait longer for volume servers to register with master and create volumes
+ t.Log("Waiting for volume servers to register with master...")
+ time.Sleep(10 * time.Second)
+
+ // Create command environment
+ options := &shell.ShellOptions{
+ Masters: stringPtr("127.0.0.1:9334"),
+ GrpcDialOption: grpc.WithInsecure(),
+ FilerGroup: stringPtr("default"),
+ }
+ commandEnv := shell.NewCommandEnv(options)
+
+ // Connect to master with longer timeout
+ ctx2, cancel2 := context.WithTimeout(context.Background(), 60*time.Second)
+ defer cancel2()
+ go commandEnv.MasterClient.KeepConnectedToMaster(ctx2)
+ commandEnv.MasterClient.WaitUntilConnected(ctx2)
+
+ // Wait for master client to fully sync
+ time.Sleep(5 * time.Second)
+
+ // Upload test data to create a volume - retry if volumes not ready
+ var volumeId needle.VolumeId
+ testData := []byte("Disk-aware EC rebalancing test data - this needs to be large enough to create a volume")
+ for retry := 0; retry < 5; retry++ {
+ volumeId, err = uploadTestDataToMaster(testData, "127.0.0.1:9334")
+ if err == nil {
+ break
+ }
+ t.Logf("Upload attempt %d failed: %v, retrying...", retry+1, err)
+ time.Sleep(3 * time.Second)
+ }
+ require.NoError(t, err, "Failed to upload test data after retries")
+ t.Logf("Created volume %d for disk-aware EC test", volumeId)
+
+ // Wait for volume to be registered
+ time.Sleep(3 * time.Second)
+
+ t.Run("verify_multi_disk_setup", func(t *testing.T) {
+ // Verify that each server has multiple disk directories
+ for server := 0; server < 3; server++ {
+ diskCount := 0
+ for disk := 0; disk < 4; disk++ {
+ diskDir := filepath.Join(testDir, fmt.Sprintf("server%d_disk%d", server, disk))
+ if _, err := os.Stat(diskDir); err == nil {
+ diskCount++
+ }
+ }
+ assert.Equal(t, 4, diskCount, "Server %d should have 4 disk directories", server)
+ t.Logf("Server %d has %d disk directories", server, diskCount)
+ }
+ })
+
+ t.Run("ec_encode_with_disk_awareness", func(t *testing.T) {
+ // Get lock first
+ lockCmd := shell.Commands[findCommandIndex("lock")]
+ var lockOutput bytes.Buffer
+ err := lockCmd.Do([]string{}, commandEnv, &lockOutput)
+ if err != nil {
+ t.Logf("Lock command failed: %v", err)
+ }
+
+ // Execute EC encoding
+ var output bytes.Buffer
+ ecEncodeCmd := shell.Commands[findCommandIndex("ec.encode")]
+ args := []string{"-volumeId", fmt.Sprintf("%d", volumeId), "-collection", "test", "-force"}
+
+ // Capture output
+ oldStdout := os.Stdout
+ oldStderr := os.Stderr
+ r, w, _ := os.Pipe()
+ os.Stdout = w
+ os.Stderr = w
+
+ err = ecEncodeCmd.Do(args, commandEnv, &output)
+
+ w.Close()
+ os.Stdout = oldStdout
+ os.Stderr = oldStderr
+
+ capturedOutput, _ := io.ReadAll(r)
+ outputStr := string(capturedOutput) + output.String()
+
+ t.Logf("EC encode output:\n%s", outputStr)
+
+ if err != nil {
+ t.Logf("EC encoding completed with error: %v", err)
+ } else {
+ t.Logf("EC encoding completed successfully")
+ }
+ })
+
+ t.Run("verify_disk_level_shard_distribution", func(t *testing.T) {
+ // Wait for shards to be distributed
+ time.Sleep(2 * time.Second)
+
+ // Count shards on each disk of each server
+ diskDistribution := countShardsPerDisk(testDir, uint32(volumeId))
+
+ totalShards := 0
+ disksWithShards := 0
+ maxShardsOnSingleDisk := 0
+
+ t.Logf("Disk-level shard distribution for volume %d:", volumeId)
+ for server, disks := range diskDistribution {
+ for diskId, shardCount := range disks {
+ if shardCount > 0 {
+ t.Logf(" %s disk %d: %d shards", server, diskId, shardCount)
+ totalShards += shardCount
+ disksWithShards++
+ if shardCount > maxShardsOnSingleDisk {
+ maxShardsOnSingleDisk = shardCount
+ }
+ }
+ }
+ }
+
+ t.Logf("Summary: %d total shards across %d disks (max %d on single disk)",
+ totalShards, disksWithShards, maxShardsOnSingleDisk)
+
+ // EC creates 14 shards (10 data + 4 parity), plus .ecx and .ecj files
+ // We should see shards distributed across multiple disks
+ if disksWithShards > 1 {
+ t.Logf("PASS: Shards distributed across %d disks", disksWithShards)
+ } else {
+ t.Logf("INFO: Shards on %d disk(s) - may be expected if volume was on single disk", disksWithShards)
+ }
+ })
+
+ t.Run("test_ec_balance_disk_awareness", func(t *testing.T) {
+ // Calculate initial disk balance variance
+ initialDistribution := countShardsPerDisk(testDir, uint32(volumeId))
+ initialVariance := calculateDiskShardVariance(initialDistribution)
+ t.Logf("Initial disk shard variance: %.2f", initialVariance)
+
+ // Run ec.balance command
+ var output bytes.Buffer
+ ecBalanceCmd := shell.Commands[findCommandIndex("ec.balance")]
+
+ oldStdout := os.Stdout
+ oldStderr := os.Stderr
+ r, w, _ := os.Pipe()
+ os.Stdout = w
+ os.Stderr = w
+
+ err := ecBalanceCmd.Do([]string{"-force"}, commandEnv, &output)
+
+ w.Close()
+ os.Stdout = oldStdout
+ os.Stderr = oldStderr
+
+ capturedOutput, _ := io.ReadAll(r)
+ outputStr := string(capturedOutput) + output.String()
+
+ if err != nil {
+ t.Logf("ec.balance error: %v", err)
+ }
+ t.Logf("ec.balance output:\n%s", outputStr)
+
+ // Wait for balance to complete
+ time.Sleep(2 * time.Second)
+
+ // Calculate final disk balance variance
+ finalDistribution := countShardsPerDisk(testDir, uint32(volumeId))
+ finalVariance := calculateDiskShardVariance(finalDistribution)
+ t.Logf("Final disk shard variance: %.2f", finalVariance)
+
+ t.Logf("Variance change: %.2f -> %.2f", initialVariance, finalVariance)
+ })
+
+ t.Run("verify_no_disk_overload", func(t *testing.T) {
+ // Verify that no single disk has too many shards of the same volume
+ diskDistribution := countShardsPerDisk(testDir, uint32(volumeId))
+
+ for server, disks := range diskDistribution {
+ for diskId, shardCount := range disks {
+ // With 14 EC shards and 12 disks (3 servers x 4 disks), ideally ~1-2 shards per disk
+ // Allow up to 4 shards per disk as a reasonable threshold
+ if shardCount > 4 {
+ t.Logf("WARNING: %s disk %d has %d shards (may indicate imbalance)",
+ server, diskId, shardCount)
+ }
+ }
+ }
+ })
+}
+
+// MultiDiskCluster represents a test cluster with multiple disks per volume server
+type MultiDiskCluster struct {
+ masterCmd *exec.Cmd
+ volumeServers []*exec.Cmd
+ testDir string
+}
+
+func (c *MultiDiskCluster) Stop() {
+ // Stop volume servers first
+ for _, cmd := range c.volumeServers {
+ if cmd != nil && cmd.Process != nil {
+ cmd.Process.Kill()
+ cmd.Wait()
+ }
+ }
+
+ // Stop master server
+ if c.masterCmd != nil && c.masterCmd.Process != nil {
+ c.masterCmd.Process.Kill()
+ c.masterCmd.Wait()
+ }
+}
+
+// startMultiDiskCluster starts a SeaweedFS cluster with multiple disks per volume server
+func startMultiDiskCluster(ctx context.Context, dataDir string) (*MultiDiskCluster, error) {
+ weedBinary := findWeedBinary()
+ if weedBinary == "" {
+ return nil, fmt.Errorf("weed binary not found")
+ }
+
+ cluster := &MultiDiskCluster{testDir: dataDir}
+
+ // Create master directory
+ masterDir := filepath.Join(dataDir, "master")
+ os.MkdirAll(masterDir, 0755)
+
+ // Start master server on a different port to avoid conflict
+ masterCmd := exec.CommandContext(ctx, weedBinary, "master",
+ "-port", "9334",
+ "-mdir", masterDir,
+ "-volumeSizeLimitMB", "10",
+ "-ip", "127.0.0.1",
+ )
+
+ masterLogFile, err := os.Create(filepath.Join(masterDir, "master.log"))
+ if err != nil {
+ return nil, fmt.Errorf("failed to create master log file: %v", err)
+ }
+ masterCmd.Stdout = masterLogFile
+ masterCmd.Stderr = masterLogFile
+
+ if err := masterCmd.Start(); err != nil {
+ return nil, fmt.Errorf("failed to start master server: %v", err)
+ }
+ cluster.masterCmd = masterCmd
+
+ // Wait for master to be ready
+ time.Sleep(2 * time.Second)
+
+ // Start 3 volume servers, each with 4 disks
+ const numServers = 3
+ const disksPerServer = 4
+
+ for i := 0; i < numServers; i++ {
+ // Create 4 disk directories per server
+ var diskDirs []string
+ var maxVolumes []string
+
+ for d := 0; d < disksPerServer; d++ {
+ diskDir := filepath.Join(dataDir, fmt.Sprintf("server%d_disk%d", i, d))
+ if err := os.MkdirAll(diskDir, 0755); err != nil {
+ cluster.Stop()
+ return nil, fmt.Errorf("failed to create disk dir: %v", err)
+ }
+ diskDirs = append(diskDirs, diskDir)
+ maxVolumes = append(maxVolumes, "5")
+ }
+
+ port := fmt.Sprintf("809%d", i)
+ rack := fmt.Sprintf("rack%d", i)
+
+ volumeCmd := exec.CommandContext(ctx, weedBinary, "volume",
+ "-port", port,
+ "-dir", strings.Join(diskDirs, ","),
+ "-max", strings.Join(maxVolumes, ","),
+ "-mserver", "127.0.0.1:9334",
+ "-ip", "127.0.0.1",
+ "-dataCenter", "dc1",
+ "-rack", rack,
+ )
+
+ // Create log file for this volume server
+ logDir := filepath.Join(dataDir, fmt.Sprintf("server%d_logs", i))
+ os.MkdirAll(logDir, 0755)
+ volumeLogFile, err := os.Create(filepath.Join(logDir, "volume.log"))
+ if err != nil {
+ cluster.Stop()
+ return nil, fmt.Errorf("failed to create volume log file: %v", err)
+ }
+ volumeCmd.Stdout = volumeLogFile
+ volumeCmd.Stderr = volumeLogFile
+
+ if err := volumeCmd.Start(); err != nil {
+ cluster.Stop()
+ return nil, fmt.Errorf("failed to start volume server %d: %v", i, err)
+ }
+ cluster.volumeServers = append(cluster.volumeServers, volumeCmd)
+ }
+
+ // Wait for volume servers to register with master
+ // Multi-disk servers may take longer to initialize
+ time.Sleep(8 * time.Second)
+
+ return cluster, nil
+}
+
+// uploadTestDataToMaster uploads test data to a specific master address
+func uploadTestDataToMaster(data []byte, masterAddress string) (needle.VolumeId, error) {
+ assignResult, err := operation.Assign(context.Background(), func(ctx context.Context) pb.ServerAddress {
+ return pb.ServerAddress(masterAddress)
+ }, grpc.WithInsecure(), &operation.VolumeAssignRequest{
+ Count: 1,
+ Collection: "test",
+ Replication: "000",
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ uploader, err := operation.NewUploader()
+ if err != nil {
+ return 0, err
+ }
+
+ uploadResult, err, _ := uploader.Upload(context.Background(), bytes.NewReader(data), &operation.UploadOption{
+ UploadUrl: "http://" + assignResult.Url + "/" + assignResult.Fid,
+ Filename: "testfile.txt",
+ MimeType: "text/plain",
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ if uploadResult.Error != "" {
+ return 0, fmt.Errorf("upload error: %s", uploadResult.Error)
+ }
+
+ fid, err := needle.ParseFileIdFromString(assignResult.Fid)
+ if err != nil {
+ return 0, err
+ }
+
+ return fid.VolumeId, nil
+}
+
+// countShardsPerDisk counts EC shards on each disk of each server
+// Returns map: "serverN" -> map[diskId]shardCount
+func countShardsPerDisk(testDir string, volumeId uint32) map[string]map[int]int {
+ result := make(map[string]map[int]int)
+
+ const numServers = 3
+ const disksPerServer = 4
+
+ for server := 0; server < numServers; server++ {
+ serverKey := fmt.Sprintf("server%d", server)
+ result[serverKey] = make(map[int]int)
+
+ for disk := 0; disk < disksPerServer; disk++ {
+ diskDir := filepath.Join(testDir, fmt.Sprintf("server%d_disk%d", server, disk))
+ count, err := countECShardFiles(diskDir, volumeId)
+ if err == nil && count > 0 {
+ result[serverKey][disk] = count
+ }
+ }
+ }
+
+ return result
+}
+
+// calculateDiskShardVariance measures how evenly shards are distributed across disks
+// Lower variance means better distribution
+func calculateDiskShardVariance(distribution map[string]map[int]int) float64 {
+ var counts []float64
+
+ for _, disks := range distribution {
+ for _, count := range disks {
+ if count > 0 {
+ counts = append(counts, float64(count))
+ }
+ }
+ }
+
+ if len(counts) == 0 {
+ return 0
+ }
+
+ // Calculate mean
+ mean := 0.0
+ for _, c := range counts {
+ mean += c
+ }
+ mean /= float64(len(counts))
+
+ // Calculate variance
+ variance := 0.0
+ for _, c := range counts {
+ variance += (c - mean) * (c - mean)
+ }
+
+ return math.Sqrt(variance / float64(len(counts)))
+}
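A quick note on the spread metric added above: despite its name, `calculateDiskShardVariance` returns the standard deviation (the square root of the variance) of the per-disk shard counts, so smaller values mean a more even layout. The following standalone sketch, with made-up per-disk counts purely for illustration, shows how the number separates an even placement from a skewed one; `spread` here is a hypothetical copy of the test helper, not code from the commit.

```go
package main

import (
	"fmt"
	"math"
)

// spread mirrors calculateDiskShardVariance: the standard deviation of
// per-disk shard counts, where lower means a more even distribution.
func spread(counts []float64) float64 {
	if len(counts) == 0 {
		return 0
	}
	mean := 0.0
	for _, c := range counts {
		mean += c
	}
	mean /= float64(len(counts))
	variance := 0.0
	for _, c := range counts {
		variance += (c - mean) * (c - mean)
	}
	return math.Sqrt(variance / float64(len(counts)))
}

func main() {
	// 14 EC shards over 12 disks, spread evenly, versus a layout where two
	// disks hoard most of the shards. The numbers are illustrative only.
	even := []float64{2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2}
	skewed := []float64{6, 4, 1, 1, 1, 1}
	fmt.Printf("even layout spread:   %.2f\n", spread(even))   // ~0.37
	fmt.Printf("skewed layout spread: %.2f\n", spread(skewed)) // ~1.97
}
```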
diff --git a/test/s3/sse/s3_sse_integration_test.go b/test/s3/sse/s3_sse_integration_test.go
index 7b939ea76..4b7eb0ddc 100644
--- a/test/s3/sse/s3_sse_integration_test.go
+++ b/test/s3/sse/s3_sse_integration_test.go
@@ -2082,6 +2082,78 @@ func TestCopyToBucketDefaultEncryptedRegression(t *testing.T) {
require.NoError(t, err, "Failed to read object")
assertDataEqual(t, testData, data, "Data mismatch")
})
+
+ t.Run("LargeFileCopyEncrypted_ToTemp_ToEncrypted", func(t *testing.T) {
+ // Test with large file (1MB) to exercise chunk-by-chunk copy path
+ // This verifies consistent behavior with SSE-C and SSE-KMS
+ largeTestData := generateTestData(1024 * 1024) // 1MB
+ objectKey := "large-file-test.bin"
+
+ // Step 1: Upload large object to source bucket (will be automatically encrypted)
+ _, err = client.PutObject(ctx, &s3.PutObjectInput{
+ Bucket: aws.String(srcBucket),
+ Key: aws.String(objectKey),
+ Body: bytes.NewReader(largeTestData),
+ })
+ require.NoError(t, err, "Failed to upload large file to source bucket")
+
+ // Verify source object is encrypted
+ srcHead, err := client.HeadObject(ctx, &s3.HeadObjectInput{
+ Bucket: aws.String(srcBucket),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Failed to HEAD source object")
+ assert.Equal(t, types.ServerSideEncryptionAes256, srcHead.ServerSideEncryption,
+ "Source object should be SSE-S3 encrypted")
+
+ // Step 2: Copy to temp bucket (unencrypted) - exercises chunk-by-chunk decrypt
+ _, err = client.CopyObject(ctx, &s3.CopyObjectInput{
+ Bucket: aws.String(tempBucket),
+ Key: aws.String(objectKey),
+ CopySource: aws.String(fmt.Sprintf("%s/%s", srcBucket, objectKey)),
+ })
+ require.NoError(t, err, "Failed to copy large file to temp bucket")
+
+ // Verify temp object is unencrypted and data is correct
+ tempGet, err := client.GetObject(ctx, &s3.GetObjectInput{
+ Bucket: aws.String(tempBucket),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Failed to GET temp object")
+ tempData, err := io.ReadAll(tempGet.Body)
+ tempGet.Body.Close()
+ require.NoError(t, err, "Failed to read temp object")
+ assertDataEqual(t, largeTestData, tempData, "Temp object data mismatch after decrypt")
+
+ // Step 3: Copy from temp bucket to dest bucket (with default encryption)
+ // This exercises chunk-by-chunk encrypt copy
+ _, err = client.CopyObject(ctx, &s3.CopyObjectInput{
+ Bucket: aws.String(dstBucket),
+ Key: aws.String(objectKey),
+ CopySource: aws.String(fmt.Sprintf("%s/%s", tempBucket, objectKey)),
+ })
+ require.NoError(t, err, "Failed to copy large file to destination bucket")
+
+ // Verify destination object is encrypted
+ dstHead, err := client.HeadObject(ctx, &s3.HeadObjectInput{
+ Bucket: aws.String(dstBucket),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Failed to HEAD destination object")
+ assert.Equal(t, types.ServerSideEncryptionAes256, dstHead.ServerSideEncryption,
+ "Destination object should be SSE-S3 encrypted via bucket default")
+
+ // Verify destination object content is correct after re-encryption
+ dstGet, err := client.GetObject(ctx, &s3.GetObjectInput{
+ Bucket: aws.String(dstBucket),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Failed to GET destination object")
+ dstData, err := io.ReadAll(dstGet.Body)
+ dstGet.Body.Close()
+ require.NoError(t, err, "Failed to read destination object")
+ assertDataEqual(t, largeTestData, dstData, "Large file data mismatch after re-encryption")
+ })
}
// REGRESSION TESTS FOR CRITICAL BUGS FIXED
diff --git a/test/s3/tagging/Makefile b/test/s3/tagging/Makefile
new file mode 100644
index 000000000..aa2f18f7c
--- /dev/null
+++ b/test/s3/tagging/Makefile
@@ -0,0 +1,321 @@
+# S3 Object Tagging Tests
+# Tests for GitHub issue #7589: S3 object Tags query comes back empty
+
+.PHONY: help build-weed setup-server start-server stop-server test-tagging test-tagging-quick test-tagging-comprehensive test-all clean logs check-deps test-with-server
+
+# Configuration
+WEED_BINARY := ../../../weed/weed_binary
+S3_PORT := 8006
+MASTER_PORT := 9338
+VOLUME_PORT := 8085
+FILER_PORT := 8893
+TEST_TIMEOUT := 10m
+TEST_PATTERN := TestObjectTaggingOnUpload|TestPutObjectTaggingAPI|TestDeleteObjectTagging|TestTag
+
+# Default target
+help:
+ @echo "S3 Object Tagging Tests Makefile"
+ @echo ""
+ @echo "Available targets:"
+ @echo " help - Show this help message"
+ @echo " build-weed - Build the SeaweedFS binary"
+ @echo " check-deps - Check dependencies and build binary if needed"
+ @echo " start-server - Start SeaweedFS server for testing"
+ @echo " stop-server - Stop SeaweedFS server"
+ @echo " test-tagging - Run all tagging tests"
+ @echo " test-tagging-quick - Run core tagging tests only"
+ @echo " test-tagging-comprehensive - Run comprehensive tagging tests"
+ @echo " test-with-server - Start server, run tests, stop server"
+ @echo " logs - Show server logs"
+ @echo " clean - Clean up test artifacts and stop server"
+ @echo " health-check - Check if server is accessible"
+ @echo ""
+ @echo "Configuration:"
+ @echo " S3_PORT=${S3_PORT}"
+ @echo " TEST_TIMEOUT=${TEST_TIMEOUT}"
+
+# Build the SeaweedFS binary
+build-weed:
+ @echo "Building SeaweedFS binary..."
+ @cd ../../../weed && go build -o weed_binary .
+ @chmod +x $(WEED_BINARY)
+ @echo "✅ SeaweedFS binary built at $(WEED_BINARY)"
+
+check-deps: build-weed
+ @echo "Checking dependencies..."
+ @echo "🔍 DEBUG: Checking Go installation..."
+ @command -v go >/dev/null 2>&1 || (echo "Go is required but not installed" && exit 1)
+ @echo "🔍 DEBUG: Go version: $$(go version)"
+ @echo "🔍 DEBUG: Checking binary at $(WEED_BINARY)..."
+ @test -f $(WEED_BINARY) || (echo "SeaweedFS binary not found at $(WEED_BINARY)" && exit 1)
+ @echo "🔍 DEBUG: Binary size: $$(ls -lh $(WEED_BINARY) | awk '{print $$5}')"
+ @echo "🔍 DEBUG: Binary permissions: $$(ls -la $(WEED_BINARY) | awk '{print $$1}')"
+ @echo "🔍 DEBUG: Checking Go module dependencies..."
+ @go list -m github.com/aws/aws-sdk-go-v2 >/dev/null 2>&1 || (echo "AWS SDK Go v2 not found. Run 'go mod tidy'." && exit 1)
+ @go list -m github.com/stretchr/testify >/dev/null 2>&1 || (echo "Testify not found. Run 'go mod tidy'." && exit 1)
+ @echo "✅ All dependencies are available"
+
+# Start SeaweedFS server for testing
+start-server: check-deps
+ @echo "Starting SeaweedFS server..."
+ @echo "🔍 DEBUG: Current working directory: $$(pwd)"
+ @echo "🔍 DEBUG: Checking for existing weed processes..."
+ @ps aux | grep weed | grep -v grep || echo "No existing weed processes found"
+ @echo "🔍 DEBUG: Cleaning up any existing PID file..."
+ @rm -f weed-server.pid
+ @echo "🔍 DEBUG: Checking for port conflicts..."
+ @if netstat -tlnp 2>/dev/null | grep $(S3_PORT) >/dev/null; then \
+ echo "⚠️ Port $(S3_PORT) is already in use, trying to find the process..."; \
+ netstat -tlnp 2>/dev/null | grep $(S3_PORT) || true; \
+ else \
+ echo "✅ Port $(S3_PORT) is available"; \
+ fi
+ @echo "🔍 DEBUG: Checking binary at $(WEED_BINARY)"
+ @ls -la $(WEED_BINARY) || (echo "❌ Binary not found!" && exit 1)
+ @echo "🔍 DEBUG: Checking config file at ../../../docker/compose/s3.json"
+ @ls -la ../../../docker/compose/s3.json || echo "⚠️ Config file not found, continuing without it"
+ @echo "🔍 DEBUG: Creating volume directory..."
+ @mkdir -p ./test-volume-data
+ @echo "🔍 DEBUG: Launching SeaweedFS server in background..."
+ @echo "🔍 DEBUG: Command: $(WEED_BINARY) server -filer -filer.maxMB=64 -s3 -ip.bind 0.0.0.0 -dir=./test-volume-data -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 -volume.max=100 -volume.preStopSeconds=1 -master.port=$(MASTER_PORT) -volume.port=$(VOLUME_PORT) -filer.port=$(FILER_PORT) -s3.port=$(S3_PORT) -metricsPort=9329 -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config=../../../docker/compose/s3.json -master.peers=none"
+ @$(WEED_BINARY) server \
+ -filer \
+ -filer.maxMB=64 \
+ -s3 \
+ -ip.bind 0.0.0.0 \
+ -dir=./test-volume-data \
+ -master.raftHashicorp \
+ -master.electionTimeout 1s \
+ -master.volumeSizeLimitMB=100 \
+ -volume.max=100 \
+ -volume.preStopSeconds=1 \
+ -master.port=$(MASTER_PORT) \
+ -volume.port=$(VOLUME_PORT) \
+ -filer.port=$(FILER_PORT) \
+ -s3.port=$(S3_PORT) \
+ -metricsPort=9329 \
+ -s3.allowEmptyFolder=false \
+ -s3.allowDeleteBucketNotEmpty=true \
+ -s3.config=../../../docker/compose/s3.json \
+ -master.peers=none \
+ > weed-test.log 2>&1 & echo $$! > weed-server.pid
+ @echo "🔍 DEBUG: Server PID: $$(cat weed-server.pid 2>/dev/null || echo 'PID file not found')"
+ @echo "🔍 DEBUG: Checking if PID is still running..."
+ @sleep 2
+ @if [ -f weed-server.pid ]; then \
+ SERVER_PID=$$(cat weed-server.pid); \
+ ps -p $$SERVER_PID || echo "⚠️ Server PID $$SERVER_PID not found after 2 seconds"; \
+ else \
+ echo "⚠️ PID file not found"; \
+ fi
+ @echo "🔍 DEBUG: Waiting for server to start (up to 90 seconds)..."
+ @for i in $$(seq 1 90); do \
+ echo "🔍 DEBUG: Attempt $$i/90 - checking port $(S3_PORT)"; \
+ if curl -s http://localhost:$(S3_PORT) >/dev/null 2>&1; then \
+ echo "✅ SeaweedFS server started successfully on port $(S3_PORT) after $$i seconds"; \
+ exit 0; \
+ fi; \
+ if [ $$i -eq 5 ]; then \
+ echo "🔍 DEBUG: After 5 seconds, checking process and logs..."; \
+ ps aux | grep weed | grep -v grep || echo "No weed processes found"; \
+ if [ -f weed-test.log ]; then \
+ echo "=== First server logs ==="; \
+ head -20 weed-test.log; \
+ fi; \
+ fi; \
+ if [ $$i -eq 15 ]; then \
+ echo "🔍 DEBUG: After 15 seconds, checking port bindings..."; \
+ netstat -tlnp 2>/dev/null | grep $(S3_PORT) || echo "Port $(S3_PORT) not bound"; \
+ netstat -tlnp 2>/dev/null | grep $(MASTER_PORT) || echo "Port $(MASTER_PORT) not bound"; \
+ netstat -tlnp 2>/dev/null | grep $(VOLUME_PORT) || echo "Port $(VOLUME_PORT) not bound"; \
+ fi; \
+ if [ $$i -eq 30 ]; then \
+ echo "⚠️ Server taking longer than expected (30s), checking logs..."; \
+ if [ -f weed-test.log ]; then \
+ echo "=== Recent server logs ==="; \
+ tail -20 weed-test.log; \
+ fi; \
+ fi; \
+ sleep 1; \
+ done; \
+ echo "❌ Server failed to start within 90 seconds"; \
+ echo "🔍 DEBUG: Final process check:"; \
+ ps aux | grep weed | grep -v grep || echo "No weed processes found"; \
+ echo "🔍 DEBUG: Final port check:"; \
+ netstat -tlnp 2>/dev/null | grep -E "($(S3_PORT)|$(MASTER_PORT)|$(VOLUME_PORT))" || echo "No ports bound"; \
+ echo "=== Full server logs ==="; \
+ if [ -f weed-test.log ]; then \
+ cat weed-test.log; \
+ else \
+ echo "No log file found"; \
+ fi; \
+ exit 1
+
+# Stop SeaweedFS server
+stop-server:
+ @echo "Stopping SeaweedFS server..."
+ @if [ -f weed-server.pid ]; then \
+ SERVER_PID=$$(cat weed-server.pid); \
+ echo "Killing server PID $$SERVER_PID"; \
+ if ps -p $$SERVER_PID >/dev/null 2>&1; then \
+ kill -TERM $$SERVER_PID 2>/dev/null || true; \
+ sleep 2; \
+ if ps -p $$SERVER_PID >/dev/null 2>&1; then \
+ echo "Process still running, sending KILL signal..."; \
+ kill -KILL $$SERVER_PID 2>/dev/null || true; \
+ sleep 1; \
+ fi; \
+ else \
+ echo "Process $$SERVER_PID not found (already stopped)"; \
+ fi; \
+ rm -f weed-server.pid; \
+ else \
+ echo "No PID file found, checking for running processes..."; \
+ echo "⚠️ Skipping automatic process cleanup to avoid CI issues"; \
+ echo "Note: Any remaining weed processes should be cleaned up by the CI environment"; \
+ fi
+ @echo "✅ SeaweedFS server stopped"
+
+# Show server logs
+logs:
+ @if test -f weed-test.log; then \
+ echo "=== SeaweedFS Server Logs ==="; \
+ tail -f weed-test.log; \
+ else \
+ echo "No log file found. Server may not be running."; \
+ fi
+
+# Core tagging tests (basic functionality)
+test-tagging-quick: check-deps
+ @echo "Running core tagging tests..."
+ @go test -v -timeout=$(TEST_TIMEOUT) -run "TestObjectTaggingOnUpload|TestPutObjectTaggingAPI" .
+ @echo "✅ Core tagging tests completed"
+
+# All tagging tests (comprehensive)
+test-tagging: check-deps
+ @echo "Running all tagging tests..."
+ @go test -v -timeout=$(TEST_TIMEOUT) -run "$(TEST_PATTERN)" .
+ @echo "✅ All tagging tests completed"
+
+# Comprehensive tagging tests (all features)
+test-tagging-comprehensive: check-deps
+ @echo "Running comprehensive tagging tests..."
+ @go test -v -timeout=$(TEST_TIMEOUT) .
+ @echo "✅ Comprehensive tagging tests completed"
+
+# All tests without server management
+test-tagging-simple: check-deps
+ @echo "Running tagging tests (assuming server is already running)..."
+ @go test -v -timeout=$(TEST_TIMEOUT) .
+ @echo "✅ All tagging tests completed"
+
+# Start server, run tests, stop server
+test-with-server: start-server
+ @echo "Running tagging tests with managed server..."
+ @sleep 5 # Give server time to fully start
+ @make test-tagging-comprehensive || (echo "Tests failed, stopping server..." && make stop-server && exit 1)
+ @make stop-server
+ @echo "✅ All tests completed with managed server"
+
+# Health check
+health-check:
+ @echo "Checking server health..."
+ @if curl -s http://localhost:$(S3_PORT) >/dev/null 2>&1; then \
+ echo "✅ Server is accessible on port $(S3_PORT)"; \
+ else \
+ echo "❌ Server is not accessible on port $(S3_PORT)"; \
+ exit 1; \
+ fi
+
+# Clean up
+clean:
+ @echo "Cleaning up test artifacts..."
+ @make stop-server
+ @rm -f weed-test.log
+ @rm -f weed-server.pid
+ @rm -rf ./test-volume-data
+ @rm -f tagging.test
+ @go clean -testcache
+ @echo "✅ Cleanup completed"
+
+# Individual test targets for specific functionality
+test-upload:
+ @echo "Running upload tagging tests..."
+ @go test -v -timeout=$(TEST_TIMEOUT) -run "TestObjectTaggingOnUpload" .
+
+test-special-chars:
+ @echo "Running special characters tagging tests..."
+ @go test -v -timeout=$(TEST_TIMEOUT) -run "TestObjectTaggingOnUploadWithSpecialCharacters" .
+
+test-api:
+ @echo "Running API tagging tests..."
+ @go test -v -timeout=$(TEST_TIMEOUT) -run "TestPutObjectTaggingAPI" .
+
+test-get:
+ @echo "Running get tagging tests..."
+ @go test -v -timeout=$(TEST_TIMEOUT) -run "TestGetObjectTaggingAPI" .
+
+test-delete:
+ @echo "Running delete tagging tests..."
+ @go test -v -timeout=$(TEST_TIMEOUT) -run "TestDeleteObjectTaggingAPI" .
+
+# Development targets
+dev-start: start-server
+ @echo "Development server started. Access S3 API at http://localhost:$(S3_PORT)"
+ @echo "To stop: make stop-server"
+
+dev-test: check-deps
+ @echo "Running tests in development mode..."
+ @go test -v -timeout=$(TEST_TIMEOUT) -run "TestObjectTaggingOnUpload" .
+
+# CI targets
+ci-test: check-deps
+ @echo "Running tests in CI mode..."
+ @go test -v -timeout=$(TEST_TIMEOUT) -race .
+
+# All targets
+test-all: test-tagging test-tagging-comprehensive
+ @echo "✅ All tagging tests completed"
+
+# Benchmark targets
+benchmark-tagging:
+ @echo "Running tagging performance benchmarks..."
+ @go test -v -timeout=$(TEST_TIMEOUT) -bench=. -benchmem .
+
+# Coverage targets
+coverage:
+ @echo "Running tests with coverage..."
+ @go test -v -timeout=$(TEST_TIMEOUT) -coverprofile=coverage.out .
+ @go tool cover -html=coverage.out -o coverage.html
+ @echo "Coverage report generated: coverage.html"
+
+# Format and lint
+fmt:
+ @echo "Formatting Go code..."
+ @go fmt .
+
+lint:
+ @echo "Running linter..."
+ @golint . || echo "golint not available, skipping..."
+
+# Install dependencies for development
+install-deps:
+ @echo "Installing Go dependencies..."
+ @go mod tidy
+ @go mod download
+
+# Show current configuration
+show-config:
+ @echo "Current configuration:"
+ @echo " WEED_BINARY: $(WEED_BINARY)"
+ @echo " S3_PORT: $(S3_PORT)"
+ @echo " TEST_TIMEOUT: $(TEST_TIMEOUT)"
+ @echo " TEST_PATTERN: $(TEST_PATTERN)"
+
+# Legacy targets for backward compatibility
+test: test-with-server
+test-verbose: test-tagging-comprehensive
+test-single: test-upload
+test-clean: clean
+build: check-deps
+setup: check-deps
diff --git a/test/s3/tagging/README.md b/test/s3/tagging/README.md
new file mode 100644
index 000000000..73f2a46c4
--- /dev/null
+++ b/test/s3/tagging/README.md
@@ -0,0 +1,53 @@
+# S3 Object Tagging Tests
+
+This directory contains tests for S3 object tagging functionality.
+
+## Issue Reference
+
+These tests were created to verify the fix for [GitHub Issue #7589](https://github.com/seaweedfs/seaweedfs/issues/7589):
+**S3 object Tags query comes back empty**
+
+## Problem Description
+
+When uploading an object with tags using the `X-Amz-Tagging` header, the tags were not being stored.
+When querying the object tagging with `GetObjectTagging`, the response was empty.
+
+This was a regression between SeaweedFS 4.00 and 4.01.
+
+## Root Cause
+
+The `putToFiler` function in `s3api_object_handlers_put.go` was not parsing the `X-Amz-Tagging` header
+and storing the tags in the entry's Extended metadata. The code was only copying user metadata
+(headers starting with `X-Amz-Meta-`) but not object tags.
+
+## Fix
+
+Added tag parsing logic to `putToFiler` that:
+1. Reads the `X-Amz-Tagging` header
+2. Parses it using `url.ParseQuery()` for proper URL decoding
+3. Stores each tag with the prefix `X-Amz-Tagging-` in the entry's Extended metadata
+
+## Running Tests
+
+```bash
+# Run all tagging tests
+cd test/s3/tagging
+make test
+
+# Run specific test
+make test-upload
+
+# Or using go test directly
+go test -v ./...
+```
+
+## Test Cases
+
+1. **TestObjectTaggingOnUpload** - Basic test for tags sent during object upload
+2. **TestObjectTaggingOnUploadWithSpecialCharacters** - Tests URL-encoded tag values
+3. **TestObjectTaggingOnUploadWithEmptyValue** - Tests tags with empty values
+4. **TestPutObjectTaggingAPI** - Tests the PutObjectTagging API separately
+5. **TestDeleteObjectTagging** - Tests tag deletion
+6. **TestTagsNotPreservedAfterObjectOverwrite** - Verifies AWS S3 behavior on overwrite
+7. **TestMaximumNumberOfTags** - Tests storing the maximum 10 tags
+8. **TestTagCountHeader** - Tests the x-amz-tagging-count header in HeadObject
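For reference, the fix this README describes amounts to parsing the `X-Amz-Tagging` header with `url.ParseQuery()` and storing each pair under an `X-Amz-Tagging-` prefixed key in the entry's Extended metadata. The sketch below is a minimal, self-contained illustration of that parsing step only; `parseTaggingHeader` and the `map[string][]byte` return shape are hypothetical stand-ins, not the actual `putToFiler` code from this commit.

```go
package main

import (
	"fmt"
	"net/url"
)

// xAmzTaggingPrefix follows the prefix described in the README for keys
// stored in the entry's Extended metadata; the constant name is illustrative.
const xAmzTaggingPrefix = "X-Amz-Tagging-"

// parseTaggingHeader turns an X-Amz-Tagging header value such as
// "env=production&team=platform" into Extended-metadata style entries.
func parseTaggingHeader(header string) (map[string][]byte, error) {
	extended := make(map[string][]byte)
	if header == "" {
		return extended, nil
	}
	// url.ParseQuery performs the URL decoding, e.g. "2025-07-16+14%3A40%3A39".
	tags, err := url.ParseQuery(header)
	if err != nil {
		return nil, fmt.Errorf("invalid X-Amz-Tagging header: %w", err)
	}
	for key, values := range tags {
		value := ""
		if len(values) > 0 {
			value = values[0]
		}
		extended[xAmzTaggingPrefix+key] = []byte(value)
	}
	return extended, nil
}

func main() {
	extended, err := parseTaggingHeader("env=production&team=platform&marker=")
	if err != nil {
		panic(err)
	}
	for k, v := range extended {
		fmt.Printf("%s = %q\n", k, string(v))
	}
}
```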
diff --git a/test/s3/tagging/s3_tagging_test.go b/test/s3/tagging/s3_tagging_test.go
new file mode 100644
index 000000000..c490ca1aa
--- /dev/null
+++ b/test/s3/tagging/s3_tagging_test.go
@@ -0,0 +1,446 @@
+package tagging
+
+import (
+ "context"
+ "fmt"
+ "os"
+ "strings"
+ "testing"
+ "time"
+
+ "github.com/aws/aws-sdk-go-v2/aws"
+ "github.com/aws/aws-sdk-go-v2/config"
+ "github.com/aws/aws-sdk-go-v2/credentials"
+ "github.com/aws/aws-sdk-go-v2/service/s3"
+ "github.com/aws/aws-sdk-go-v2/service/s3/types"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// S3TestConfig holds configuration for S3 tests
+type S3TestConfig struct {
+ Endpoint string
+ AccessKey string
+ SecretKey string
+ Region string
+ BucketPrefix string
+ UseSSL bool
+ SkipVerifySSL bool
+}
+
+// getDefaultConfig returns a fresh instance of the default test configuration
+func getDefaultConfig() *S3TestConfig {
+ endpoint := os.Getenv("S3_ENDPOINT")
+ if endpoint == "" {
+ endpoint = "http://localhost:8333" // Default SeaweedFS S3 port
+ }
+ accessKey := os.Getenv("S3_ACCESS_KEY")
+ if accessKey == "" {
+ accessKey = "some_access_key1"
+ }
+ secretKey := os.Getenv("S3_SECRET_KEY")
+ if secretKey == "" {
+ secretKey = "some_secret_key1"
+ }
+ return &S3TestConfig{
+ Endpoint: endpoint,
+ AccessKey: accessKey,
+ SecretKey: secretKey,
+ Region: "us-east-1",
+ BucketPrefix: "test-tagging-",
+ UseSSL: false,
+ SkipVerifySSL: true,
+ }
+}
+
+// getS3Client creates an AWS S3 client for testing
+func getS3Client(t *testing.T) *s3.Client {
+ defaultConfig := getDefaultConfig()
+ cfg, err := config.LoadDefaultConfig(context.TODO(),
+ config.WithRegion(defaultConfig.Region),
+ config.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(
+ defaultConfig.AccessKey,
+ defaultConfig.SecretKey,
+ "",
+ )),
+ config.WithEndpointResolverWithOptions(aws.EndpointResolverWithOptionsFunc(
+ func(service, region string, options ...interface{}) (aws.Endpoint, error) {
+ return aws.Endpoint{
+ URL: defaultConfig.Endpoint,
+ SigningRegion: defaultConfig.Region,
+ }, nil
+ })),
+ )
+ require.NoError(t, err)
+
+ client := s3.NewFromConfig(cfg, func(o *s3.Options) {
+ o.UsePathStyle = true
+ })
+ return client
+}
+
+// createTestBucket creates a test bucket with a unique name
+func createTestBucket(t *testing.T, client *s3.Client) string {
+ defaultConfig := getDefaultConfig()
+ bucketName := fmt.Sprintf("%s%d", defaultConfig.BucketPrefix, time.Now().UnixNano())
+
+ _, err := client.CreateBucket(context.TODO(), &s3.CreateBucketInput{
+ Bucket: aws.String(bucketName),
+ })
+ require.NoError(t, err)
+
+ // Wait for bucket metadata to be fully processed
+ time.Sleep(50 * time.Millisecond)
+
+ return bucketName
+}
+
+// cleanupTestBucket removes the test bucket and all its contents
+func cleanupTestBucket(t *testing.T, client *s3.Client, bucketName string) {
+ // First, delete all objects in the bucket
+ listResp, err := client.ListObjectsV2(context.TODO(), &s3.ListObjectsV2Input{
+ Bucket: aws.String(bucketName),
+ })
+ if err == nil {
+ for _, obj := range listResp.Contents {
+ _, err := client.DeleteObject(context.TODO(), &s3.DeleteObjectInput{
+ Bucket: aws.String(bucketName),
+ Key: obj.Key,
+ })
+ if err != nil {
+ t.Logf("Warning: failed to delete object %s: %v", *obj.Key, err)
+ }
+ }
+ }
+
+ // Then delete the bucket
+ _, err = client.DeleteBucket(context.TODO(), &s3.DeleteBucketInput{
+ Bucket: aws.String(bucketName),
+ })
+ if err != nil {
+ t.Logf("Warning: failed to delete bucket %s: %v", bucketName, err)
+ }
+}
+
+// TestObjectTaggingOnUpload tests that tags sent during object upload (via X-Amz-Tagging header)
+// are properly stored and can be retrieved. This is the fix for GitHub issue #7589.
+func TestObjectTaggingOnUpload(t *testing.T) {
+ client := getS3Client(t)
+ bucketName := createTestBucket(t, client)
+ defer cleanupTestBucket(t, client, bucketName)
+
+ objectKey := "test-object-with-tags"
+ objectContent := "Hello, World!"
+
+ // Put object with tags using the Tagging parameter (X-Amz-Tagging header)
+ _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ Body: strings.NewReader(objectContent),
+ Tagging: aws.String("env=production&team=platform"),
+ })
+ require.NoError(t, err, "Should be able to put object with tags")
+
+ // Get the tags back
+ tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to get object tags")
+
+ // Verify tags were stored correctly
+ require.Len(t, tagResp.TagSet, 2, "Should have 2 tags")
+
+ // Build a map for easier assertion
+ tagMap := make(map[string]string)
+ for _, tag := range tagResp.TagSet {
+ tagMap[*tag.Key] = *tag.Value
+ }
+
+ assert.Equal(t, "production", tagMap["env"], "env tag should be 'production'")
+ assert.Equal(t, "platform", tagMap["team"], "team tag should be 'platform'")
+}
+
+// TestObjectTaggingOnUploadWithSpecialCharacters tests tags whose values contain characters that require URL encoding
+func TestObjectTaggingOnUploadWithSpecialCharacters(t *testing.T) {
+ client := getS3Client(t)
+ bucketName := createTestBucket(t, client)
+ defer cleanupTestBucket(t, client, bucketName)
+
+ objectKey := "test-object-with-special-tags"
+ objectContent := "Hello, World!"
+
+ // Put object with tags containing special characters
+ // AWS SDK will URL-encode these automatically
+ _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ Body: strings.NewReader(objectContent),
+ Tagging: aws.String("timestamp=2025-07-16 14:40:39&path=/tmp/file.txt"),
+ })
+ require.NoError(t, err, "Should be able to put object with special character tags")
+
+ // Get the tags back
+ tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to get object tags")
+
+ // Verify tags were stored and URL-decoded correctly
+ require.Len(t, tagResp.TagSet, 2, "Should have 2 tags")
+
+ tagMap := make(map[string]string)
+ for _, tag := range tagResp.TagSet {
+ tagMap[*tag.Key] = *tag.Value
+ }
+
+ assert.Equal(t, "2025-07-16 14:40:39", tagMap["timestamp"], "timestamp tag should be decoded correctly")
+ assert.Equal(t, "/tmp/file.txt", tagMap["path"], "path tag should be decoded correctly")
+}
+
+// TestObjectTaggingOnUploadWithEmptyValue tests tags with empty values
+func TestObjectTaggingOnUploadWithEmptyValue(t *testing.T) {
+ client := getS3Client(t)
+ bucketName := createTestBucket(t, client)
+ defer cleanupTestBucket(t, client, bucketName)
+
+ objectKey := "test-object-with-empty-tag"
+ objectContent := "Hello, World!"
+
+ // Put object with a tag that has an empty value
+ _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ Body: strings.NewReader(objectContent),
+ Tagging: aws.String("marker=&env=dev"),
+ })
+ require.NoError(t, err, "Should be able to put object with empty tag value")
+
+ // Get the tags back
+ tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to get object tags")
+
+ // Verify tags were stored correctly
+ require.Len(t, tagResp.TagSet, 2, "Should have 2 tags")
+
+ tagMap := make(map[string]string)
+ for _, tag := range tagResp.TagSet {
+ tagMap[*tag.Key] = *tag.Value
+ }
+
+ assert.Equal(t, "", tagMap["marker"], "marker tag should have empty value")
+ assert.Equal(t, "dev", tagMap["env"], "env tag should be 'dev'")
+}
+
+// TestPutObjectTaggingAPI tests the PutObjectTagging API separately from upload
+func TestPutObjectTaggingAPI(t *testing.T) {
+ client := getS3Client(t)
+ bucketName := createTestBucket(t, client)
+ defer cleanupTestBucket(t, client, bucketName)
+
+ objectKey := "test-object-for-tagging-api"
+ objectContent := "Hello, World!"
+
+ // First, put object without tags
+ _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ Body: strings.NewReader(objectContent),
+ })
+ require.NoError(t, err, "Should be able to put object without tags")
+
+ // Get tags - should be empty
+ tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to get object tags")
+ assert.Len(t, tagResp.TagSet, 0, "Should have no tags initially")
+
+ // Now add tags using PutObjectTagging API
+ _, err = client.PutObjectTagging(context.TODO(), &s3.PutObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ Tagging: &types.Tagging{
+ TagSet: []types.Tag{
+ {Key: aws.String("env"), Value: aws.String("staging")},
+ {Key: aws.String("version"), Value: aws.String("1.0")},
+ },
+ },
+ })
+ require.NoError(t, err, "Should be able to put object tags via API")
+
+ // Get tags - should now have the tags
+ tagResp, err = client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to get object tags after PutObjectTagging")
+ require.Len(t, tagResp.TagSet, 2, "Should have 2 tags")
+
+ tagMap := make(map[string]string)
+ for _, tag := range tagResp.TagSet {
+ tagMap[*tag.Key] = *tag.Value
+ }
+
+ assert.Equal(t, "staging", tagMap["env"], "env tag should be 'staging'")
+ assert.Equal(t, "1.0", tagMap["version"], "version tag should be '1.0'")
+}
+
+// TestDeleteObjectTagging tests the DeleteObjectTagging API
+func TestDeleteObjectTagging(t *testing.T) {
+ client := getS3Client(t)
+ bucketName := createTestBucket(t, client)
+ defer cleanupTestBucket(t, client, bucketName)
+
+ objectKey := "test-object-for-delete-tags"
+ objectContent := "Hello, World!"
+
+ // Put object with tags
+ _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ Body: strings.NewReader(objectContent),
+ Tagging: aws.String("env=production"),
+ })
+ require.NoError(t, err, "Should be able to put object with tags")
+
+ // Verify tags exist
+ tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to get object tags")
+ require.Len(t, tagResp.TagSet, 1, "Should have 1 tag")
+
+ // Delete tags
+ _, err = client.DeleteObjectTagging(context.TODO(), &s3.DeleteObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to delete object tags")
+
+ // Verify tags are deleted
+ tagResp, err = client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to get object tags after deletion")
+ assert.Len(t, tagResp.TagSet, 0, "Should have no tags after deletion")
+}
+
+// TestTagsNotPreservedAfterObjectOverwrite tests that tags are NOT preserved when an object is overwritten.
+// This matches AWS S3 behavior, where overwriting an object replaces all metadata, including tags.
+func TestTagsNotPreservedAfterObjectOverwrite(t *testing.T) {
+ client := getS3Client(t)
+ bucketName := createTestBucket(t, client)
+ defer cleanupTestBucket(t, client, bucketName)
+
+ objectKey := "test-object-overwrite-tags"
+ objectContent := "Original content"
+
+ // Put object with tags
+ _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ Body: strings.NewReader(objectContent),
+ Tagging: aws.String("original=true"),
+ })
+ require.NoError(t, err, "Should be able to put object with tags")
+
+ // Verify original tags exist
+ tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to get object tags")
+ require.Len(t, tagResp.TagSet, 1, "Should have 1 tag")
+ assert.Equal(t, "original", *tagResp.TagSet[0].Key)
+
+ // Overwrite the object WITHOUT tags
+ _, err = client.PutObject(context.TODO(), &s3.PutObjectInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ Body: strings.NewReader("New content"),
+ })
+ require.NoError(t, err, "Should be able to overwrite object")
+
+ // Tags should be gone after overwrite (matches AWS S3 behavior)
+ tagResp, err = client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to get object tags after overwrite")
+ assert.Len(t, tagResp.TagSet, 0, "Tags should be cleared after object overwrite")
+}
+
+// TestMaximumNumberOfTags tests that we can store the maximum 10 tags per object
+func TestMaximumNumberOfTags(t *testing.T) {
+ client := getS3Client(t)
+ bucketName := createTestBucket(t, client)
+ defer cleanupTestBucket(t, client, bucketName)
+
+ objectKey := "test-object-max-tags"
+ objectContent := "Hello, World!"
+
+ // Build 10 tags (S3 max)
+ tags := []string{}
+ for i := 1; i <= 10; i++ {
+ tags = append(tags, fmt.Sprintf("key%d=value%d", i, i))
+ }
+ tagging := strings.Join(tags, "&")
+
+ // Put object with 10 tags
+ _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ Body: strings.NewReader(objectContent),
+ Tagging: aws.String(tagging),
+ })
+ require.NoError(t, err, "Should be able to put object with 10 tags")
+
+ // Get the tags back
+ tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to get object tags")
+ assert.Len(t, tagResp.TagSet, 10, "Should have 10 tags")
+}
+
+// TestTagCountHeader tests that the x-amz-tagging-count header is returned in HeadObject
+func TestTagCountHeader(t *testing.T) {
+ client := getS3Client(t)
+ bucketName := createTestBucket(t, client)
+ defer cleanupTestBucket(t, client, bucketName)
+
+ objectKey := "test-object-tag-count"
+ objectContent := "Hello, World!"
+
+ // Put object with tags
+ _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ Body: strings.NewReader(objectContent),
+ Tagging: aws.String("env=prod&team=backend&version=2.0"),
+ })
+ require.NoError(t, err, "Should be able to put object with tags")
+
+ // Head object to get tag count
+ headResp, err := client.HeadObject(context.TODO(), &s3.HeadObjectInput{
+ Bucket: aws.String(bucketName),
+ Key: aws.String(objectKey),
+ })
+ require.NoError(t, err, "Should be able to head object")
+
+ // Check tag count header
+ if headResp.TagCount != nil {
+ assert.Equal(t, int32(3), *headResp.TagCount, "Tag count should be 3")
+ } else {
+ t.Log("Warning: TagCount header not returned - this may be expected depending on implementation")
+ }
+}
diff --git a/test/sftp/Makefile b/test/sftp/Makefile
new file mode 100644
index 000000000..bc46dd3ce
--- /dev/null
+++ b/test/sftp/Makefile
@@ -0,0 +1,41 @@
+.PHONY: all build test test-verbose test-short test-homedir test-debug clean deps tidy
+
+all: build test
+
+# Build the weed binary first
+build:
+ cd ../../weed && go build -o weed .
+
+# Install test dependencies
+deps:
+ go mod download
+
+# Run all tests
+test: build deps
+ go test -timeout 5m ./...
+
+# Run tests with verbose output
+test-verbose: build deps
+ go test -v -timeout 5m ./...
+
+# Run quick tests only (skip integration tests)
+test-short: deps
+ go test -short -v ./...
+
+# Run specific test
+test-homedir: build deps
+ go test -v -timeout 5m -run TestHomeDirPathTranslation ./...
+
+# Run tests with debug output from SeaweedFS
+test-debug: build deps
+ go test -v -timeout 5m ./... 2>&1 | tee test.log
+
+# Clean up test artifacts
+clean:
+ rm -f test.log
+ go clean -testcache
+
+# Update go.sum
+tidy:
+ go mod tidy
+
diff --git a/test/sftp/README.md b/test/sftp/README.md
new file mode 100644
index 000000000..17b5e67c7
--- /dev/null
+++ b/test/sftp/README.md
@@ -0,0 +1,92 @@
+# SeaweedFS SFTP Integration Tests
+
+This directory contains integration tests for the SeaweedFS SFTP server.
+
+## Prerequisites
+
+1. Build the SeaweedFS binary:
+ ```bash
+ cd ../../weed
+ go build -o weed .
+ ```
+
+2. Ensure `ssh-keygen` is available (for generating test SSH host keys)
+
+## Running Tests
+
+### Run all tests
+```bash
+make test
+```
+
+### Run tests with verbose output
+```bash
+make test-verbose
+```
+
+### Run a specific test
+```bash
+go test -v -run TestHomeDirPathTranslation
+```
+
+### Skip long-running tests
+```bash
+go test -short ./...
+```
+
+## Test Structure
+
+- `framework.go` - Test framework that starts SeaweedFS cluster with SFTP
+- `basic_test.go` - Basic SFTP operation tests including:
+ - HomeDir path translation (fixes issue #7470)
+ - File upload/download
+ - Directory operations
+ - Large file handling
+ - Edge cases
+
+## Test Configuration
+
+Tests use `testdata/userstore.json` which defines test users:
+
+| Username | Password | HomeDir | Permissions |
+|----------|----------|---------|-------------|
+| admin | adminpassword | / | Full access |
+| testuser | testuserpassword | /sftp/testuser | Full access to home |
+| readonly | readonlypassword | /public | Read-only |
+
+## Key Tests
+
+### TestHomeDirPathTranslation
+
+Tests the fix for [issue #7470](https://github.com/seaweedfs/seaweedfs/issues/7470) where
+users with a non-root HomeDir (e.g., `/sftp/testuser`) could not upload files to `/`
+because the path wasn't being translated to their home directory.
+
+The test verifies:
+- Uploading to `/` correctly maps to the user's HomeDir
+- Creating directories at `/` works
+- Listing `/` shows the user's home directory contents
+- All path operations respect the HomeDir translation
+
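+The translation itself amounts to resolving the client-visible path against the
+user's HomeDir. The snippet below is only a minimal sketch of that idea for
+readers of this test suite; `resolveUserPath` is a hypothetical helper, not a
+function in the SeaweedFS code base:
+
+```go
+package main
+
+import (
+    "fmt"
+    "path"
+)
+
+// resolveUserPath sketches how a client-visible path maps onto a user's HomeDir.
+func resolveUserPath(homeDir, clientPath string) string {
+    cleaned := path.Clean("/" + clientPath) // ".." cannot climb above "/"
+    if homeDir == "/" {
+        return cleaned
+    }
+    return path.Join(homeDir, cleaned)
+}
+
+func main() {
+    fmt.Println(resolveUserPath("/sftp/testuser", "/test_upload.txt")) // /sftp/testuser/test_upload.txt
+    fmt.Println(resolveUserPath("/sftp/testuser", "/../escape.txt"))   // /sftp/testuser/escape.txt
+    fmt.Println(resolveUserPath("/", "/test_upload.txt"))              // /test_upload.txt
+}
+```
+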
+## Debugging
+
+To debug test failures:
+
+1. Enable verbose output:
+ ```bash
+ go test -v -run TestName
+ ```
+
+2. Keep test artifacts (don't cleanup):
+ ```go
+ config := DefaultTestConfig()
+ config.SkipCleanup = true
+ ```
+
+3. Enable debug logging:
+ ```go
+ config := DefaultTestConfig()
+ config.EnableDebug = true
+ ```
+
+
diff --git a/test/sftp/basic_test.go b/test/sftp/basic_test.go
new file mode 100644
index 000000000..e5ffe90d1
--- /dev/null
+++ b/test/sftp/basic_test.go
@@ -0,0 +1,652 @@
+package sftp
+
+import (
+ "bytes"
+ "io"
+ "path"
+ "testing"
+
+ "github.com/stretchr/testify/require"
+)
+
+// TestHomeDirPathTranslation tests that SFTP operations correctly translate
+// paths relative to the user's HomeDir.
+// This is the fix for https://github.com/seaweedfs/seaweedfs/issues/7470
+func TestHomeDirPathTranslation(t *testing.T) {
+ if testing.Short() {
+ t.Skip("skipping integration test in short mode")
+ }
+
+ config := DefaultTestConfig()
+ config.EnableDebug = testing.Verbose()
+
+ fw := NewSftpTestFramework(t, config)
+ err := fw.Setup(config)
+ require.NoError(t, err, "failed to setup test framework")
+ defer fw.Cleanup()
+
+ // Test with user "testuser" who has HomeDir="/sftp/testuser"
+ // When they upload to "/", it should actually go to "/sftp/testuser"
+ sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword")
+ require.NoError(t, err, "failed to connect as testuser")
+ defer sshConn.Close()
+ defer sftpClient.Close()
+
+ // Test 1: Upload file to "/" (should map to /sftp/testuser/)
+ t.Run("UploadToRoot", func(t *testing.T) {
+ testContent := []byte("Hello from SFTP test!")
+ filename := "test_upload.txt"
+
+ // Create file at "/" from user's perspective
+ file, err := sftpClient.Create("/" + filename)
+ require.NoError(t, err, "should be able to create file at /")
+
+ _, err = file.Write(testContent)
+ require.NoError(t, err, "should be able to write to file")
+ err = file.Close()
+ require.NoError(t, err, "should be able to close file")
+
+ // Verify file exists and has correct content
+ readFile, err := sftpClient.Open("/" + filename)
+ require.NoError(t, err, "should be able to open file")
+ defer readFile.Close()
+
+ content, err := io.ReadAll(readFile)
+ require.NoError(t, err, "should be able to read file")
+ require.Equal(t, testContent, content, "file content should match")
+
+ // Clean up
+ err = sftpClient.Remove("/" + filename)
+ require.NoError(t, err, "should be able to remove file")
+ })
+
+ // Test 2: Create directory at "/" (should map to /sftp/testuser/)
+ t.Run("CreateDirAtRoot", func(t *testing.T) {
+ dirname := "test_dir"
+
+ err := sftpClient.Mkdir("/" + dirname)
+ require.NoError(t, err, "should be able to create directory at /")
+
+ // Verify directory exists
+ info, err := sftpClient.Stat("/" + dirname)
+ require.NoError(t, err, "should be able to stat directory")
+ require.True(t, info.IsDir(), "should be a directory")
+
+ // Clean up
+ err = sftpClient.RemoveDirectory("/" + dirname)
+ require.NoError(t, err, "should be able to remove directory")
+ })
+
+ // Test 3: List directory at "/" (should list /sftp/testuser/)
+ t.Run("ListRoot", func(t *testing.T) {
+ // Create a test file first
+ testContent := []byte("list test content")
+ filename := "list_test.txt"
+
+ file, err := sftpClient.Create("/" + filename)
+ require.NoError(t, err)
+ _, err = file.Write(testContent)
+ require.NoError(t, err)
+ file.Close()
+
+ // List root directory
+ files, err := sftpClient.ReadDir("/")
+ require.NoError(t, err, "should be able to list root directory")
+
+ // Should find our test file
+ found := false
+ for _, f := range files {
+ if f.Name() == filename {
+ found = true
+ break
+ }
+ }
+ require.True(t, found, "should find test file in listing")
+
+ // Clean up
+ err = sftpClient.Remove("/" + filename)
+ require.NoError(t, err)
+ })
+
+ // Test 4: Nested directory operations
+ t.Run("NestedOperations", func(t *testing.T) {
+ // Create nested directory structure
+ err := sftpClient.MkdirAll("/nested/dir/structure")
+ require.NoError(t, err, "should be able to create nested directories")
+
+ // Create file in nested directory
+ testContent := []byte("nested file content")
+ file, err := sftpClient.Create("/nested/dir/structure/file.txt")
+ require.NoError(t, err)
+ _, err = file.Write(testContent)
+ require.NoError(t, err)
+ file.Close()
+
+ // Verify file exists
+ readFile, err := sftpClient.Open("/nested/dir/structure/file.txt")
+ require.NoError(t, err)
+ content, err := io.ReadAll(readFile)
+ require.NoError(t, err)
+ readFile.Close()
+ require.Equal(t, testContent, content)
+
+ // Clean up
+ err = sftpClient.Remove("/nested/dir/structure/file.txt")
+ require.NoError(t, err)
+ err = sftpClient.RemoveDirectory("/nested/dir/structure")
+ require.NoError(t, err)
+ err = sftpClient.RemoveDirectory("/nested/dir")
+ require.NoError(t, err)
+ err = sftpClient.RemoveDirectory("/nested")
+ require.NoError(t, err)
+ })
+
+ // Test 5: Rename operation
+ t.Run("RenameFile", func(t *testing.T) {
+ testContent := []byte("rename test content")
+
+ file, err := sftpClient.Create("/original.txt")
+ require.NoError(t, err)
+ _, err = file.Write(testContent)
+ require.NoError(t, err)
+ file.Close()
+
+ // Rename file
+ err = sftpClient.Rename("/original.txt", "/renamed.txt")
+ require.NoError(t, err, "should be able to rename file")
+
+ // Verify old file doesn't exist
+ _, err = sftpClient.Stat("/original.txt")
+ require.Error(t, err, "original file should not exist")
+
+ // Verify new file exists with correct content
+ readFile, err := sftpClient.Open("/renamed.txt")
+ require.NoError(t, err, "renamed file should exist")
+ content, err := io.ReadAll(readFile)
+ require.NoError(t, err)
+ readFile.Close()
+ require.Equal(t, testContent, content)
+
+ // Clean up
+ err = sftpClient.Remove("/renamed.txt")
+ require.NoError(t, err)
+ })
+}
+
+// TestAdminRootAccess tests that admin user with HomeDir="/" can access everything
+func TestAdminRootAccess(t *testing.T) {
+ if testing.Short() {
+ t.Skip("skipping integration test in short mode")
+ }
+
+ config := DefaultTestConfig()
+ config.EnableDebug = testing.Verbose()
+
+ fw := NewSftpTestFramework(t, config)
+ err := fw.Setup(config)
+ require.NoError(t, err, "failed to setup test framework")
+ defer fw.Cleanup()
+
+ // Connect as admin with HomeDir="/"
+ sftpClient, sshConn, err := fw.ConnectSFTP("admin", "adminpassword")
+ require.NoError(t, err, "failed to connect as admin")
+ defer sshConn.Close()
+ defer sftpClient.Close()
+
+ // Admin should be able to create directories anywhere
+ t.Run("CreateAnyDirectory", func(t *testing.T) {
+ // Create the user's home directory structure
+ err := sftpClient.MkdirAll("/sftp/testuser")
+ require.NoError(t, err, "admin should be able to create any directory")
+
+ // Create file in that directory
+ testContent := []byte("admin created this")
+ file, err := sftpClient.Create("/sftp/testuser/admin_file.txt")
+ require.NoError(t, err)
+ _, err = file.Write(testContent)
+ require.NoError(t, err)
+ file.Close()
+
+ // Verify file exists
+ info, err := sftpClient.Stat("/sftp/testuser/admin_file.txt")
+ require.NoError(t, err)
+ require.False(t, info.IsDir())
+
+ // Clean up
+ err = sftpClient.Remove("/sftp/testuser/admin_file.txt")
+ require.NoError(t, err)
+ })
+}
+
+// TestLargeFileUpload tests uploading larger files through SFTP
+func TestLargeFileUpload(t *testing.T) {
+ if testing.Short() {
+ t.Skip("skipping integration test in short mode")
+ }
+
+ config := DefaultTestConfig()
+ config.EnableDebug = testing.Verbose()
+
+ fw := NewSftpTestFramework(t, config)
+ err := fw.Setup(config)
+ require.NoError(t, err, "failed to setup test framework")
+ defer fw.Cleanup()
+
+ sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword")
+ require.NoError(t, err, "failed to connect as testuser")
+ defer sshConn.Close()
+ defer sftpClient.Close()
+
+ // Create a 1MB file
+ t.Run("Upload1MB", func(t *testing.T) {
+ size := 1024 * 1024 // 1MB
+ testData := bytes.Repeat([]byte("A"), size)
+
+ file, err := sftpClient.Create("/large_file.bin")
+ require.NoError(t, err)
+ n, err := file.Write(testData)
+ require.NoError(t, err)
+ require.Equal(t, size, n)
+ file.Close()
+
+ // Verify file size
+ info, err := sftpClient.Stat("/large_file.bin")
+ require.NoError(t, err)
+ require.Equal(t, int64(size), info.Size())
+
+ // Verify content
+ readFile, err := sftpClient.Open("/large_file.bin")
+ require.NoError(t, err)
+ content, err := io.ReadAll(readFile)
+ require.NoError(t, err)
+ readFile.Close()
+ require.Equal(t, testData, content)
+
+ // Clean up
+ err = sftpClient.Remove("/large_file.bin")
+ require.NoError(t, err)
+ })
+}
+
+// TestStatOperations tests Stat and Lstat operations
+func TestStatOperations(t *testing.T) {
+ if testing.Short() {
+ t.Skip("skipping integration test in short mode")
+ }
+
+ config := DefaultTestConfig()
+ config.EnableDebug = testing.Verbose()
+
+ fw := NewSftpTestFramework(t, config)
+ err := fw.Setup(config)
+ require.NoError(t, err, "failed to setup test framework")
+ defer fw.Cleanup()
+
+ sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword")
+ require.NoError(t, err, "failed to connect as testuser")
+ defer sshConn.Close()
+ defer sftpClient.Close()
+
+ // Create a test file
+ testContent := []byte("stat test content")
+ file, err := sftpClient.Create("/stat_test.txt")
+ require.NoError(t, err)
+ _, err = file.Write(testContent)
+ require.NoError(t, err)
+ file.Close()
+
+ t.Run("StatFile", func(t *testing.T) {
+ info, err := sftpClient.Stat("/stat_test.txt")
+ require.NoError(t, err)
+ require.Equal(t, "stat_test.txt", info.Name())
+ require.Equal(t, int64(len(testContent)), info.Size())
+ require.False(t, info.IsDir())
+ })
+
+ t.Run("StatDirectory", func(t *testing.T) {
+ err := sftpClient.Mkdir("/stat_dir")
+ require.NoError(t, err)
+
+ info, err := sftpClient.Stat("/stat_dir")
+ require.NoError(t, err)
+ require.Equal(t, "stat_dir", info.Name())
+ require.True(t, info.IsDir())
+
+ // Clean up
+ err = sftpClient.RemoveDirectory("/stat_dir")
+ require.NoError(t, err)
+ })
+
+ t.Run("StatRoot", func(t *testing.T) {
+ // Should be able to stat "/" which maps to user's home directory
+ info, err := sftpClient.Stat("/")
+ require.NoError(t, err, "should be able to stat root (home) directory")
+ require.True(t, info.IsDir(), "root should be a directory")
+ })
+
+ // Clean up
+ err = sftpClient.Remove("/stat_test.txt")
+ require.NoError(t, err)
+}
+
+// TestWalk tests walking directory trees
+func TestWalk(t *testing.T) {
+ if testing.Short() {
+ t.Skip("skipping integration test in short mode")
+ }
+
+ config := DefaultTestConfig()
+ config.EnableDebug = testing.Verbose()
+
+ fw := NewSftpTestFramework(t, config)
+ err := fw.Setup(config)
+ require.NoError(t, err, "failed to setup test framework")
+ defer fw.Cleanup()
+
+ sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword")
+ require.NoError(t, err, "failed to connect as testuser")
+ defer sshConn.Close()
+ defer sftpClient.Close()
+
+ // Create directory structure
+ err = sftpClient.MkdirAll("/walk/a/b")
+ require.NoError(t, err)
+ err = sftpClient.MkdirAll("/walk/c")
+ require.NoError(t, err)
+
+ // Create files
+ for _, p := range []string{"/walk/file1.txt", "/walk/a/file2.txt", "/walk/a/b/file3.txt", "/walk/c/file4.txt"} {
+ file, err := sftpClient.Create(p)
+ require.NoError(t, err)
+ file.Write([]byte("test"))
+ file.Close()
+ }
+
+ t.Run("WalkEntireTree", func(t *testing.T) {
+ var paths []string
+ walker := sftpClient.Walk("/walk")
+ for walker.Step() {
+ if walker.Err() != nil {
+ continue
+ }
+ paths = append(paths, walker.Path())
+ }
+
+ // Should find all directories and files
+ require.Contains(t, paths, "/walk")
+ require.Contains(t, paths, "/walk/a")
+ require.Contains(t, paths, "/walk/a/b")
+ require.Contains(t, paths, "/walk/c")
+ })
+
+ // Clean up
+ for _, p := range []string{"/walk/file1.txt", "/walk/a/file2.txt", "/walk/a/b/file3.txt", "/walk/c/file4.txt"} {
+ require.NoError(t, sftpClient.Remove(p))
+ }
+ for _, p := range []string{"/walk/a/b", "/walk/a", "/walk/c", "/walk"} {
+ require.NoError(t, sftpClient.RemoveDirectory(p))
+ }
+}
+
+// TestCurrentWorkingDirectory tests that Getwd reports the expected working directory
+// (pkg/sftp does not expose Chdir, so operations are exercised with absolute paths)
+func TestCurrentWorkingDirectory(t *testing.T) {
+ if testing.Short() {
+ t.Skip("skipping integration test in short mode")
+ }
+
+ config := DefaultTestConfig()
+ config.EnableDebug = testing.Verbose()
+
+ fw := NewSftpTestFramework(t, config)
+ err := fw.Setup(config)
+ require.NoError(t, err, "failed to setup test framework")
+ defer fw.Cleanup()
+
+ sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword")
+ require.NoError(t, err, "failed to connect as testuser")
+ defer sshConn.Close()
+ defer sftpClient.Close()
+
+ // Create test directory
+ err = sftpClient.Mkdir("/cwd_test")
+ require.NoError(t, err)
+
+ t.Run("GetCurrentDir", func(t *testing.T) {
+ cwd, err := sftpClient.Getwd()
+ require.NoError(t, err)
+ // The initial working directory should be the user's home directory
+ // which from the user's perspective is "/"
+ require.Equal(t, "/", cwd, "initial working directory should be the virtual root")
+ })
+
+ t.Run("ChangeAndCreate", func(t *testing.T) {
+ // pkg/sftp does not support Chdir, so instead of switching into the subdirectory
+ // and using a relative path, create the file in it with an absolute path
+ file, err := sftpClient.Create("/cwd_test/relative_file.txt")
+ require.NoError(t, err)
+ file.Write([]byte("test"))
+ file.Close()
+
+ // Verify using absolute path
+ _, err = sftpClient.Stat("/cwd_test/relative_file.txt")
+ require.NoError(t, err)
+
+ // Clean up
+ sftpClient.Remove("/cwd_test/relative_file.txt")
+ })
+
+ // Clean up
+ err = sftpClient.RemoveDirectory("/cwd_test")
+ require.NoError(t, err)
+}
+
+// TestPathEdgeCases tests various edge cases in path handling
+func TestPathEdgeCases(t *testing.T) {
+ if testing.Short() {
+ t.Skip("skipping integration test in short mode")
+ }
+
+ config := DefaultTestConfig()
+ config.EnableDebug = testing.Verbose()
+
+ fw := NewSftpTestFramework(t, config)
+ err := fw.Setup(config)
+ require.NoError(t, err, "failed to setup test framework")
+ defer fw.Cleanup()
+
+ sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword")
+ require.NoError(t, err, "failed to connect as testuser")
+ defer sshConn.Close()
+ defer sftpClient.Close()
+
+ t.Run("PathWithDotDot", func(t *testing.T) {
+ // Create directory structure
+ err := sftpClient.MkdirAll("/edge/subdir")
+ require.NoError(t, err)
+
+ // Create file using path with ..
+ file, err := sftpClient.Create("/edge/subdir/../file.txt")
+ require.NoError(t, err)
+ file.Write([]byte("test"))
+ file.Close()
+
+ // Verify file was created in /edge
+ _, err = sftpClient.Stat("/edge/file.txt")
+ require.NoError(t, err, "file should be created in parent directory")
+
+ // Clean up
+ sftpClient.Remove("/edge/file.txt")
+ sftpClient.RemoveDirectory("/edge/subdir")
+ sftpClient.RemoveDirectory("/edge")
+ })
+
+ t.Run("PathWithTrailingSlash", func(t *testing.T) {
+ err := sftpClient.Mkdir("/trailing")
+ require.NoError(t, err)
+
+ // Stat with trailing slash
+ info, err := sftpClient.Stat("/trailing/")
+ require.NoError(t, err)
+ require.True(t, info.IsDir())
+
+ // Clean up
+ sftpClient.RemoveDirectory("/trailing")
+ })
+
+ t.Run("CreateFileAtRootPath", func(t *testing.T) {
+ // This is the exact scenario from issue #7470
+ // User with HomeDir="/sftp/testuser" uploads to "/"
+ file, err := sftpClient.Create("/issue7470.txt")
+ require.NoError(t, err, "should be able to create file at / (issue #7470)")
+ file.Write([]byte("This tests the fix for issue #7470"))
+ file.Close()
+
+ // Verify
+ _, err = sftpClient.Stat("/issue7470.txt")
+ require.NoError(t, err)
+
+ // Clean up
+ sftpClient.Remove("/issue7470.txt")
+ })
+
+ // Security test: path traversal attacks should be blocked
+ t.Run("PathTraversalPrevention", func(t *testing.T) {
+ // User's HomeDir is "/sftp/testuser"
+ // Attempting to escape via "../.." should NOT create files outside home directory
+
+ // First, create a valid file to ensure we can write
+ validFile, err := sftpClient.Create("/valid.txt")
+ require.NoError(t, err)
+ validFile.Write([]byte("valid"))
+ validFile.Close()
+
+ // Try various path traversal attempts
+ // These should either:
+ // 1. Be blocked (error returned), OR
+ // 2. Be safely resolved to stay within home directory
+
+ traversalPaths := []string{
+ "/../escape.txt",
+ "/../../escape.txt",
+ "/../../../escape.txt",
+ "/subdir/../../escape.txt",
+ "/./../../escape.txt",
+ }
+
+ for _, traversalPath := range traversalPaths {
+ t.Run(traversalPath, func(t *testing.T) {
+ // Note: The pkg/sftp client sanitizes paths locally before sending them to the server.
+ // So "/../escape.txt" becomes "/escape.txt" on the wire.
+ // Therefore, we cannot trigger the server-side path traversal block with this client.
+ // Instead, we verify that the file is created successfully within the jail (contained).
+ // The server-side protection logic is verified in unit tests (sftpd/sftp_server_test.go).
+
+ file, err := sftpClient.Create(traversalPath)
+ require.NoError(t, err, "creation should succeed because client sanitizes path")
+ file.Close()
+
+ // Clean up
+ err = sftpClient.Remove(traversalPath)
+ require.NoError(t, err)
+ })
+ }
+
+ // Clean up
+ sftpClient.Remove("/valid.txt")
+ })
+}
+
+// TestFileContent tests reading and writing file content correctly
+func TestFileContent(t *testing.T) {
+ if testing.Short() {
+ t.Skip("skipping integration test in short mode")
+ }
+
+ config := DefaultTestConfig()
+ config.EnableDebug = testing.Verbose()
+
+ fw := NewSftpTestFramework(t, config)
+ err := fw.Setup(config)
+ require.NoError(t, err, "failed to setup test framework")
+ defer fw.Cleanup()
+
+ sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword")
+ require.NoError(t, err, "failed to connect as testuser")
+ defer sshConn.Close()
+ defer sftpClient.Close()
+
+ t.Run("BinaryContent", func(t *testing.T) {
+ // Create binary data with all byte values
+ data := make([]byte, 256)
+ for i := 0; i < 256; i++ {
+ data[i] = byte(i)
+ }
+
+ file, err := sftpClient.Create("/binary.bin")
+ require.NoError(t, err)
+ n, err := file.Write(data)
+ require.NoError(t, err)
+ require.Equal(t, 256, n)
+ file.Close()
+
+ // Read back
+ readFile, err := sftpClient.Open("/binary.bin")
+ require.NoError(t, err)
+ content, err := io.ReadAll(readFile)
+ require.NoError(t, err)
+ readFile.Close()
+
+ require.Equal(t, data, content, "binary content should match")
+
+ // Clean up
+ sftpClient.Remove("/binary.bin")
+ })
+
+ t.Run("EmptyFile", func(t *testing.T) {
+ file, err := sftpClient.Create("/empty.txt")
+ require.NoError(t, err)
+ file.Close()
+
+ info, err := sftpClient.Stat("/empty.txt")
+ require.NoError(t, err)
+ require.Equal(t, int64(0), info.Size())
+
+ // Clean up
+ sftpClient.Remove("/empty.txt")
+ })
+
+ t.Run("UnicodeFilename", func(t *testing.T) {
+ filename := "/文件名.txt"
+ content := []byte("Unicode content: 你好世界")
+
+ file, err := sftpClient.Create(filename)
+ require.NoError(t, err)
+ file.Write(content)
+ file.Close()
+
+ // Read back
+ readFile, err := sftpClient.Open(filename)
+ require.NoError(t, err)
+ readContent, err := io.ReadAll(readFile)
+ require.NoError(t, err)
+ readFile.Close()
+
+ require.Equal(t, content, readContent)
+
+ // Verify in listing
+ files, err := sftpClient.ReadDir("/")
+ require.NoError(t, err)
+ found := false
+ for _, f := range files {
+ if f.Name() == path.Base(filename) {
+ found = true
+ break
+ }
+ }
+ require.True(t, found, "should find unicode filename in listing")
+
+ // Clean up
+ sftpClient.Remove(filename)
+ })
+}
+
diff --git a/test/sftp/framework.go b/test/sftp/framework.go
new file mode 100644
index 000000000..5572eac28
--- /dev/null
+++ b/test/sftp/framework.go
@@ -0,0 +1,423 @@
+package sftp
+
+import (
+ "fmt"
+ "net"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "runtime"
+ "syscall"
+ "testing"
+ "time"
+
+ "github.com/pkg/sftp"
+ "github.com/stretchr/testify/require"
+ "golang.org/x/crypto/ssh"
+)
+
+// SftpTestFramework provides utilities for SFTP integration testing
+type SftpTestFramework struct {
+ t *testing.T
+ tempDir string
+ dataDir string
+ masterProcess *os.Process
+ volumeProcess *os.Process
+ filerProcess *os.Process
+ sftpProcess *os.Process
+ masterAddr string
+ volumeAddr string
+ filerAddr string
+ sftpAddr string
+ weedBinary string
+ userStoreFile string
+ hostKeyFile string
+ isSetup bool
+ skipCleanup bool
+}
+
+// TestConfig holds configuration for SFTP tests
+type TestConfig struct {
+ NumVolumes int
+ EnableDebug bool
+ SkipCleanup bool // for debugging failed tests
+ UserStoreFile string
+}
+
+// DefaultTestConfig returns a default configuration for SFTP tests
+func DefaultTestConfig() *TestConfig {
+ return &TestConfig{
+ NumVolumes: 3,
+ EnableDebug: false,
+ SkipCleanup: false,
+ UserStoreFile: "",
+ }
+}
+
+// NewSftpTestFramework creates a new SFTP testing framework
+func NewSftpTestFramework(t *testing.T, config *TestConfig) *SftpTestFramework {
+ if config == nil {
+ config = DefaultTestConfig()
+ }
+
+ tempDir, err := os.MkdirTemp("", "seaweedfs_sftp_test_")
+ require.NoError(t, err)
+
+ // Generate SSH host key for SFTP server
+ hostKeyFile := filepath.Join(tempDir, "ssh_host_key")
+ cmd := exec.Command("ssh-keygen", "-t", "ed25519", "-f", hostKeyFile, "-N", "")
+ err = cmd.Run()
+ require.NoError(t, err, "failed to generate SSH host key")
+
+ // Use provided userstore or copy the test one
+ userStoreFile := config.UserStoreFile
+ if userStoreFile == "" {
+ // Copy test userstore to temp dir
+ userStoreFile = filepath.Join(tempDir, "userstore.json")
+ testDataPath := findTestDataPath()
+ input, err := os.ReadFile(filepath.Join(testDataPath, "userstore.json"))
+ require.NoError(t, err, "failed to read test userstore.json")
+ err = os.WriteFile(userStoreFile, input, 0644)
+ require.NoError(t, err, "failed to write userstore.json")
+ }
+
+ return &SftpTestFramework{
+ t: t,
+ tempDir: tempDir,
+ dataDir: filepath.Join(tempDir, "data"),
+ masterAddr: "127.0.0.1:19333",
+ volumeAddr: "127.0.0.1:18080",
+ filerAddr: "127.0.0.1:18888",
+ sftpAddr: "127.0.0.1:12022",
+ weedBinary: findWeedBinary(),
+ userStoreFile: userStoreFile,
+ hostKeyFile: hostKeyFile,
+ isSetup: false,
+ }
+}
+
+// Setup starts SeaweedFS cluster with SFTP server
+func (f *SftpTestFramework) Setup(config *TestConfig) error {
+ if f.isSetup {
+ return fmt.Errorf("framework already setup")
+ }
+
+ // Create all data directories
+ dirs := []string{
+ f.dataDir,
+ filepath.Join(f.dataDir, "master"),
+ filepath.Join(f.dataDir, "volume"),
+ }
+ for _, dir := range dirs {
+ if err := os.MkdirAll(dir, 0755); err != nil {
+ return fmt.Errorf("failed to create directory %s: %v", dir, err)
+ }
+ }
+
+ // Start master
+ if err := f.startMaster(config); err != nil {
+ return fmt.Errorf("failed to start master: %v", err)
+ }
+
+ // Wait for master to be ready
+ if err := f.waitForService(f.masterAddr, 30*time.Second); err != nil {
+ return fmt.Errorf("master not ready: %v", err)
+ }
+
+ // Start volume server
+ if err := f.startVolumeServer(config); err != nil {
+ return fmt.Errorf("failed to start volume server: %v", err)
+ }
+
+ // Wait for volume server to be ready
+ if err := f.waitForService(f.volumeAddr, 30*time.Second); err != nil {
+ return fmt.Errorf("volume server not ready: %v", err)
+ }
+
+ // Start filer
+ if err := f.startFiler(config); err != nil {
+ return fmt.Errorf("failed to start filer: %v", err)
+ }
+
+ // Wait for filer to be ready
+ if err := f.waitForService(f.filerAddr, 30*time.Second); err != nil {
+ return fmt.Errorf("filer not ready: %v", err)
+ }
+
+ // Start SFTP server
+ if err := f.startSftpServer(config); err != nil {
+ return fmt.Errorf("failed to start SFTP server: %v", err)
+ }
+
+ // Wait for SFTP server to be ready
+ if err := f.waitForService(f.sftpAddr, 30*time.Second); err != nil {
+ return fmt.Errorf("SFTP server not ready: %v", err)
+ }
+
+ // Additional wait for all services to stabilize (gRPC endpoints)
+ time.Sleep(500 * time.Millisecond)
+
+ f.skipCleanup = config.SkipCleanup
+ f.isSetup = true
+ return nil
+}
+
+// Cleanup stops all processes and removes temporary files
+func (f *SftpTestFramework) Cleanup() {
+ // Stop processes in reverse order
+ processes := []*os.Process{f.sftpProcess, f.filerProcess, f.volumeProcess, f.masterProcess}
+ for _, proc := range processes {
+ if proc != nil {
+ proc.Signal(syscall.SIGTERM)
+ proc.Wait()
+ }
+ }
+
+ // Remove temp directory
+ if !f.skipCleanup {
+ os.RemoveAll(f.tempDir)
+ }
+}
+
+// GetSftpAddr returns the SFTP server address
+func (f *SftpTestFramework) GetSftpAddr() string {
+ return f.sftpAddr
+}
+
+// GetFilerAddr returns the filer address
+func (f *SftpTestFramework) GetFilerAddr() string {
+ return f.filerAddr
+}
+
+// ConnectSFTP creates an SFTP client connection with the given credentials
+func (f *SftpTestFramework) ConnectSFTP(username, password string) (*sftp.Client, *ssh.Client, error) {
+ // Load the known host public key for verification
+ hostKeyCallback, err := f.getHostKeyCallback()
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to get host key callback: %v", err)
+ }
+
+ config := &ssh.ClientConfig{
+ User: username,
+ Auth: []ssh.AuthMethod{
+ ssh.Password(password),
+ },
+ HostKeyCallback: hostKeyCallback,
+ Timeout: 5 * time.Second,
+ }
+
+ sshConn, err := ssh.Dial("tcp", f.sftpAddr, config)
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to connect SSH: %v", err)
+ }
+
+ sftpClient, err := sftp.NewClient(sshConn)
+ if err != nil {
+ sshConn.Close()
+ return nil, nil, fmt.Errorf("failed to create SFTP client: %v", err)
+ }
+
+ return sftpClient, sshConn, nil
+}
+
+// getHostKeyCallback returns a callback that verifies the server's host key
+// matches the known test server key we generated
+func (f *SftpTestFramework) getHostKeyCallback() (ssh.HostKeyCallback, error) {
+ // Read the public key file generated alongside the private key
+ pubKeyFile := f.hostKeyFile + ".pub"
+ pubKeyBytes, err := os.ReadFile(pubKeyFile)
+ if err != nil {
+ return nil, fmt.Errorf("failed to read host public key: %v", err)
+ }
+
+ // Parse the public key
+ pubKey, _, _, _, err := ssh.ParseAuthorizedKey(pubKeyBytes)
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse host public key: %v", err)
+ }
+
+ // Return a callback that verifies the server key matches our known key
+ return ssh.FixedHostKey(pubKey), nil
+}
+
+// startMaster starts the SeaweedFS master server
+func (f *SftpTestFramework) startMaster(config *TestConfig) error {
+ args := []string{
+ "master",
+ "-ip=127.0.0.1",
+ "-port=19333",
+ "-mdir=" + filepath.Join(f.dataDir, "master"),
+ "-raftBootstrap",
+ "-peers=none",
+ }
+
+ cmd := exec.Command(f.weedBinary, args...)
+ cmd.Dir = f.tempDir
+ if config.EnableDebug {
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ }
+ if err := cmd.Start(); err != nil {
+ return err
+ }
+ f.masterProcess = cmd.Process
+ return nil
+}
+
+// startVolumeServer starts the SeaweedFS volume server
+func (f *SftpTestFramework) startVolumeServer(config *TestConfig) error {
+ args := []string{
+ "volume",
+ "-mserver=" + f.masterAddr,
+ "-ip=127.0.0.1",
+ "-port=18080",
+ "-dir=" + filepath.Join(f.dataDir, "volume"),
+ fmt.Sprintf("-max=%d", config.NumVolumes),
+ }
+
+ cmd := exec.Command(f.weedBinary, args...)
+ cmd.Dir = f.tempDir
+ if config.EnableDebug {
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ }
+ if err := cmd.Start(); err != nil {
+ return err
+ }
+ f.volumeProcess = cmd.Process
+ return nil
+}
+
+// startFiler starts the SeaweedFS filer server
+func (f *SftpTestFramework) startFiler(config *TestConfig) error {
+ args := []string{
+ "filer",
+ "-master=" + f.masterAddr,
+ "-ip=127.0.0.1",
+ "-port=18888",
+ }
+
+ cmd := exec.Command(f.weedBinary, args...)
+ cmd.Dir = f.tempDir
+ if config.EnableDebug {
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ }
+ if err := cmd.Start(); err != nil {
+ return err
+ }
+ f.filerProcess = cmd.Process
+ return nil
+}
+
+// startSftpServer starts the SeaweedFS SFTP server
+func (f *SftpTestFramework) startSftpServer(config *TestConfig) error {
+ args := []string{
+ "sftp",
+ "-filer=" + f.filerAddr,
+ "-ip.bind=127.0.0.1",
+ "-port=12022",
+ "-sshPrivateKey=" + f.hostKeyFile,
+ "-userStoreFile=" + f.userStoreFile,
+ }
+
+ cmd := exec.Command(f.weedBinary, args...)
+ cmd.Dir = f.tempDir
+ if config.EnableDebug {
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ }
+ if err := cmd.Start(); err != nil {
+ return err
+ }
+ f.sftpProcess = cmd.Process
+ return nil
+}
+
+// waitForService waits for a service to be available
+func (f *SftpTestFramework) waitForService(addr string, timeout time.Duration) error {
+ deadline := time.Now().Add(timeout)
+ for time.Now().Before(deadline) {
+ conn, err := net.DialTimeout("tcp", addr, 1*time.Second)
+ if err == nil {
+ conn.Close()
+ return nil
+ }
+ time.Sleep(100 * time.Millisecond)
+ }
+ return fmt.Errorf("service at %s not ready within timeout", addr)
+}
+
+// findWeedBinary locates the weed binary
+// Prefers local build over system-installed weed to ensure we test the latest code
+func findWeedBinary() string {
+ // Get the directory where this source file is located
+ // This ensures we find the locally built weed binary first
+ _, thisFile, _, ok := runtime.Caller(0)
+ if ok {
+ thisDir := filepath.Dir(thisFile)
+ // From test/sftp/, the weed binary should be at ../../weed/weed
+ candidates := []string{
+ filepath.Join(thisDir, "../../weed/weed"),
+ filepath.Join(thisDir, "../weed/weed"),
+ }
+ for _, candidate := range candidates {
+ if _, err := os.Stat(candidate); err == nil {
+ abs, _ := filepath.Abs(candidate)
+ return abs
+ }
+ }
+ }
+
+ // Try relative paths from current working directory
+ cwd, _ := os.Getwd()
+ candidates := []string{
+ filepath.Join(cwd, "../../weed/weed"),
+ filepath.Join(cwd, "../weed/weed"),
+ filepath.Join(cwd, "./weed"),
+ }
+ for _, candidate := range candidates {
+ if _, err := os.Stat(candidate); err == nil {
+ abs, _ := filepath.Abs(candidate)
+ return abs
+ }
+ }
+
+ // Fallback to PATH only if local build not found
+ if path, err := exec.LookPath("weed"); err == nil {
+ return path
+ }
+
+ // Default fallback
+ return "weed"
+}
+
+// findTestDataPath locates the testdata directory
+func findTestDataPath() string {
+ // Get the directory where this source file is located
+ _, thisFile, _, ok := runtime.Caller(0)
+ if ok {
+ thisDir := filepath.Dir(thisFile)
+ testDataPath := filepath.Join(thisDir, "testdata")
+ if _, err := os.Stat(testDataPath); err == nil {
+ return testDataPath
+ }
+ }
+
+ // Try relative paths from current working directory
+ cwd, _ := os.Getwd()
+ candidates := []string{
+ filepath.Join(cwd, "testdata"),
+ filepath.Join(cwd, "../sftp/testdata"),
+ filepath.Join(cwd, "test/sftp/testdata"),
+ }
+
+ for _, candidate := range candidates {
+ if _, err := os.Stat(candidate); err == nil {
+ return candidate
+ }
+ }
+
+ return "./testdata"
+}
+
diff --git a/test/sftp/go.mod b/test/sftp/go.mod
new file mode 100644
index 000000000..34d9053a8
--- /dev/null
+++ b/test/sftp/go.mod
@@ -0,0 +1,17 @@
+module seaweedfs-sftp-tests
+
+go 1.24.0
+
+require (
+ github.com/pkg/sftp v1.13.7
+ github.com/stretchr/testify v1.10.0
+ golang.org/x/crypto v0.45.0
+)
+
+require (
+ github.com/davecgh/go-spew v1.1.1 // indirect
+ github.com/kr/fs v0.1.0 // indirect
+ github.com/pmezard/go-difflib v1.0.0 // indirect
+ golang.org/x/sys v0.38.0 // indirect
+ gopkg.in/yaml.v3 v3.0.1 // indirect
+)
diff --git a/test/sftp/go.sum b/test/sftp/go.sum
new file mode 100644
index 000000000..112e6f88a
--- /dev/null
+++ b/test/sftp/go.sum
@@ -0,0 +1,64 @@
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/kr/fs v0.1.0 h1:Jskdu9ieNAYnjxsi0LbQp1ulIKZV1LAFgK1tWhpZgl8=
+github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg=
+github.com/pkg/sftp v1.13.7 h1:uv+I3nNJvlKZIQGSr8JVQLNHFU9YhhNpvC14Y6KgmSM=
+github.com/pkg/sftp v1.13.7/go.mod h1:KMKI0t3T6hfA+lTR/ssZdunHo+uwq7ghoN09/FSu3DY=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
+github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4=
+golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q=
+golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
+golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
+golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0=
+golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU=
+golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/test/sftp/testdata/userstore.json b/test/sftp/testdata/userstore.json
new file mode 100644
index 000000000..66d78dd1d
--- /dev/null
+++ b/test/sftp/testdata/userstore.json
@@ -0,0 +1,37 @@
+[
+ {
+ "Username": "admin",
+ "Password": "adminpassword",
+ "PublicKeys": [],
+ "HomeDir": "/",
+ "Permissions": {
+ "/": ["*"]
+ },
+ "Uid": 0,
+ "Gid": 0
+ },
+ {
+ "Username": "testuser",
+ "Password": "testuserpassword",
+ "PublicKeys": [],
+ "HomeDir": "/sftp/testuser",
+ "Permissions": {
+ "/sftp/testuser": ["*"]
+ },
+ "Uid": 1001,
+ "Gid": 1001
+ },
+ {
+ "Username": "readonly",
+ "Password": "readonlypassword",
+ "PublicKeys": [],
+ "HomeDir": "/public",
+ "Permissions": {
+ "/public": ["read", "list"]
+ },
+ "Uid": 1002,
+ "Gid": 1002
+ }
+]
+
+
diff --git a/weed/command/scaffold/filer.toml b/weed/command/scaffold/filer.toml
index 61a7ced6d..9261591c4 100644
--- a/weed/command/scaffold/filer.toml
+++ b/weed/command/scaffold/filer.toml
@@ -201,8 +201,11 @@ table = "seaweedfs"
[redis2]
enabled = false
address = "localhost:6379"
+username = ""
password = ""
database = 0
+# prefix for filer redis keys
+keyPrefix = ""
enable_tls = false
ca_cert_path = ""
client_cert_path = ""
@@ -217,6 +220,8 @@ masterName = "master"
username = ""
password = ""
database = 0
+# prefix for filer redis keys
+keyPrefix = ""
enable_tls = false
ca_cert_path = ""
client_cert_path = ""
@@ -232,7 +237,10 @@ addresses = [
"localhost:30005",
"localhost:30006",
]
+username = ""
password = ""
+# prefix for filer redis keys
+keyPrefix = ""
enable_tls = false
ca_cert_path = ""
client_cert_path = ""
@@ -248,8 +256,11 @@ superLargeDirectories = []
[redis_lua]
enabled = false
address = "localhost:6379"
+username = ""
password = ""
database = 0
+# prefix for filer redis keys
+keyPrefix = ""
enable_tls = false
ca_cert_path = ""
client_cert_path = ""
@@ -264,6 +275,8 @@ masterName = "master"
username = ""
password = ""
database = 0
+# prefix for filer redis keys
+keyPrefix = ""
enable_tls = false
ca_cert_path = ""
client_cert_path = ""
@@ -279,7 +292,10 @@ addresses = [
"localhost:30005",
"localhost:30006",
]
+username = ""
password = ""
+# prefix for filer redis keys
+keyPrefix = ""
enable_tls = false
ca_cert_path = ""
client_cert_path = ""
@@ -373,8 +389,10 @@ dialTimeOut = 10
enabled = false
location = "/tmp/"
address = "localhost:6379"
+username = ""
password = ""
database = 1
+keyPrefix = ""
[tikv]
enabled = false
diff --git a/weed/command/server.go b/weed/command/server.go
index 52f47ec32..ebd9f359a 100644
--- a/weed/command/server.go
+++ b/weed/command/server.go
@@ -134,6 +134,7 @@ func init() {
serverOptions.v.port = cmdServer.Flag.Int("volume.port", 8080, "volume server http listen port")
serverOptions.v.portGrpc = cmdServer.Flag.Int("volume.port.grpc", 0, "volume server grpc listen port")
serverOptions.v.publicPort = cmdServer.Flag.Int("volume.port.public", 0, "volume server public port")
+ serverOptions.v.id = cmdServer.Flag.String("volume.id", "", "volume server id. If empty, default to ip:port")
serverOptions.v.indexType = cmdServer.Flag.String("volume.index", "memory", "Choose [memory|leveldb|leveldbMedium|leveldbLarge] mode for memory~performance balance.")
serverOptions.v.diskType = cmdServer.Flag.String("volume.disk", "", "[hdd|ssd|<tag>] hard drive or solid state drive or any tag")
serverOptions.v.fixJpgOrientation = cmdServer.Flag.Bool("volume.images.fix.orientation", false, "Adjust jpg orientation when uploading.")
diff --git a/weed/command/volume.go b/weed/command/volume.go
index e21437e9a..514553172 100644
--- a/weed/command/volume.go
+++ b/weed/command/volume.go
@@ -41,6 +41,7 @@ type VolumeServerOptions struct {
folderMaxLimits []int32
idxFolder *string
ip *string
+ id *string
publicUrl *string
bindIp *string
mastersString *string
@@ -78,6 +79,7 @@ func init() {
v.portGrpc = cmdVolume.Flag.Int("port.grpc", 0, "grpc listen port")
v.publicPort = cmdVolume.Flag.Int("port.public", 0, "port opened to public")
v.ip = cmdVolume.Flag.String("ip", util.DetectedHostAddress(), "ip or server name, also used as identifier")
+ v.id = cmdVolume.Flag.String("id", "", "volume server id. If empty, default to ip:port")
v.publicUrl = cmdVolume.Flag.String("publicUrl", "", "Publicly accessible address")
v.bindIp = cmdVolume.Flag.String("ip.bind", "", "ip address to bind to. If empty, default to same as -ip option.")
v.mastersString = cmdVolume.Flag.String("master", "localhost:9333", "comma-separated master servers")
@@ -253,8 +255,11 @@ func (v VolumeServerOptions) startVolumeServer(volumeFolders, maxVolumeCounts, v
volumeNeedleMapKind = storage.NeedleMapLevelDbLarge
}
+ // Determine volume server ID: if not specified, use ip:port
+ volumeServerId := util.GetVolumeServerId(*v.id, *v.ip, *v.port)
+
volumeServer := weed_server.NewVolumeServer(volumeMux, publicVolumeMux,
- *v.ip, *v.port, *v.portGrpc, *v.publicUrl,
+ *v.ip, *v.port, *v.portGrpc, *v.publicUrl, volumeServerId,
v.folders, v.folderMaxLimits, minFreeSpaces, diskTypes,
*v.idxFolder,
volumeNeedleMapKind,
diff --git a/weed/filer/empty_folder_cleanup/cleanup_queue.go b/weed/filer/empty_folder_cleanup/cleanup_queue.go
new file mode 100644
index 000000000..f92af389d
--- /dev/null
+++ b/weed/filer/empty_folder_cleanup/cleanup_queue.go
@@ -0,0 +1,207 @@
+package empty_folder_cleanup
+
+import (
+ "container/list"
+ "sync"
+ "time"
+)
+
+// CleanupQueue manages a deduplicated queue of folders pending cleanup.
+// It uses a doubly-linked list ordered by event time (oldest at front) and a map for O(1) deduplication.
+// Processing is triggered when:
+// - Queue size reaches maxSize, OR
+// - Oldest item exceeds maxAge
+type CleanupQueue struct {
+ mu sync.Mutex
+ items *list.List // Linked list of *queueItem ordered by time (front = oldest)
+ itemsMap map[string]*list.Element // folder -> list element for O(1) lookup
+ maxSize int // Max queue size before triggering cleanup
+ maxAge time.Duration // Max age before triggering cleanup
+}
+
+// queueItem represents an item in the cleanup queue
+type queueItem struct {
+ folder string
+ queueTime time.Time
+}
+
+// NewCleanupQueue creates a new CleanupQueue with the specified limits
+func NewCleanupQueue(maxSize int, maxAge time.Duration) *CleanupQueue {
+ return &CleanupQueue{
+ items: list.New(),
+ itemsMap: make(map[string]*list.Element),
+ maxSize: maxSize,
+ maxAge: maxAge,
+ }
+}
+
+// Add adds a folder to the queue with the specified event time.
+// The item is inserted in time-sorted order (oldest at front) to handle out-of-order events.
+// If the folder already exists and the new event time is later, its time and queue position are updated.
+// Returns true if the folder was newly added, false if it already existed.
+func (q *CleanupQueue) Add(folder string, eventTime time.Time) bool {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ // Check if folder already exists
+ if elem, exists := q.itemsMap[folder]; exists {
+ existingItem := elem.Value.(*queueItem)
+ // Only update if new event is later
+ if eventTime.After(existingItem.queueTime) {
+ // Remove from current position
+ q.items.Remove(elem)
+ // Re-insert with new time in sorted position
+ newElem := q.insertSorted(folder, eventTime)
+ q.itemsMap[folder] = newElem
+ }
+ return false
+ }
+
+ // Insert new folder in sorted position
+ elem := q.insertSorted(folder, eventTime)
+ q.itemsMap[folder] = elem
+ return true
+}
+
+// insertSorted inserts an item in the correct position to maintain time ordering (oldest at front)
+func (q *CleanupQueue) insertSorted(folder string, eventTime time.Time) *list.Element {
+ item := &queueItem{
+ folder: folder,
+ queueTime: eventTime,
+ }
+
+ // Walk from the back and insert after the last element whose time is not later than eventTime (keeps oldest at front)
+ for elem := q.items.Back(); elem != nil; elem = elem.Prev() {
+ existingItem := elem.Value.(*queueItem)
+ if !eventTime.Before(existingItem.queueTime) {
+ // Insert after this element
+ return q.items.InsertAfter(item, elem)
+ }
+ }
+
+ // This item is the oldest, insert at front
+ return q.items.PushFront(item)
+}
+
+// Remove removes a specific folder from the queue (e.g., when a file is created).
+// Returns true if the folder was found and removed.
+func (q *CleanupQueue) Remove(folder string) bool {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ elem, exists := q.itemsMap[folder]
+ if !exists {
+ return false
+ }
+
+ q.items.Remove(elem)
+ delete(q.itemsMap, folder)
+ return true
+}
+
+// ShouldProcess returns true if the queue should be processed.
+// This is true when:
+// - Queue size >= maxSize, OR
+// - Oldest item age > maxAge
+func (q *CleanupQueue) ShouldProcess() bool {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ return q.shouldProcessLocked()
+}
+
+// shouldProcessLocked checks if processing is needed (caller must hold lock)
+func (q *CleanupQueue) shouldProcessLocked() bool {
+ if q.items.Len() == 0 {
+ return false
+ }
+
+ // Check if queue is full
+ if q.items.Len() >= q.maxSize {
+ return true
+ }
+
+ // Check if oldest item exceeds max age
+ front := q.items.Front()
+ if front != nil {
+ item := front.Value.(*queueItem)
+ if time.Since(item.queueTime) > q.maxAge {
+ return true
+ }
+ }
+
+ return false
+}
+
+// Pop removes and returns the oldest folder from the queue.
+// Returns the folder and true if an item was available, or empty string and false if queue is empty.
+func (q *CleanupQueue) Pop() (string, bool) {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ front := q.items.Front()
+ if front == nil {
+ return "", false
+ }
+
+ item := front.Value.(*queueItem)
+ q.items.Remove(front)
+ delete(q.itemsMap, item.folder)
+
+ return item.folder, true
+}
+
+// Peek returns the oldest folder without removing it.
+// Returns the folder and queue time if available, or empty values if queue is empty.
+func (q *CleanupQueue) Peek() (folder string, queueTime time.Time, ok bool) {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ front := q.items.Front()
+ if front == nil {
+ return "", time.Time{}, false
+ }
+
+ item := front.Value.(*queueItem)
+ return item.folder, item.queueTime, true
+}
+
+// Len returns the current queue size.
+func (q *CleanupQueue) Len() int {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+ return q.items.Len()
+}
+
+// Contains checks if a folder is in the queue.
+func (q *CleanupQueue) Contains(folder string) bool {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+ _, exists := q.itemsMap[folder]
+ return exists
+}
+
+// Clear removes all items from the queue.
+func (q *CleanupQueue) Clear() {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ q.items.Init()
+ q.itemsMap = make(map[string]*list.Element)
+}
+
+// OldestAge returns the age of the oldest item in the queue, or 0 if empty.
+func (q *CleanupQueue) OldestAge() time.Duration {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ front := q.items.Front()
+ if front == nil {
+ return 0
+ }
+
+ item := front.Value.(*queueItem)
+ return time.Since(item.queueTime)
+}
+
+
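Note: a short, hypothetical usage sketch of the CleanupQueue API above; the real consumer is the EmptyFolderCleaner added later in this patch.

package main

import (
	"fmt"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/filer/empty_folder_cleanup"
)

func main() {
	q := empty_folder_cleanup.NewCleanupQueue(1000, 10*time.Minute)

	// Delete events enqueue candidate folders; duplicates are coalesced,
	// and a later event time only moves the entry toward the back.
	q.Add("/buckets/b1/photos", time.Now())
	q.Add("/buckets/b1/photos", time.Now().Add(time.Second))

	// A create event cancels the pending cleanup for that folder.
	q.Remove("/buckets/b1/photos")

	// An entry older than maxAge makes ShouldProcess return true.
	q.Add("/buckets/b1/tmp", time.Now().Add(-11*time.Minute))

	if q.ShouldProcess() {
		for folder, ok := q.Pop(); ok; folder, ok = q.Pop() {
			fmt.Println("candidate for cleanup:", folder)
		}
	}
}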
diff --git a/weed/filer/empty_folder_cleanup/cleanup_queue_test.go b/weed/filer/empty_folder_cleanup/cleanup_queue_test.go
new file mode 100644
index 000000000..2effa3138
--- /dev/null
+++ b/weed/filer/empty_folder_cleanup/cleanup_queue_test.go
@@ -0,0 +1,371 @@
+package empty_folder_cleanup
+
+import (
+ "testing"
+ "time"
+)
+
+func TestCleanupQueue_Add(t *testing.T) {
+ q := NewCleanupQueue(100, 10*time.Minute)
+ now := time.Now()
+
+ // Add first item
+ if !q.Add("/buckets/b1/folder1", now) {
+ t.Error("expected Add to return true for new item")
+ }
+ if q.Len() != 1 {
+ t.Errorf("expected len 1, got %d", q.Len())
+ }
+
+ // Add second item with later time
+ if !q.Add("/buckets/b1/folder2", now.Add(1*time.Second)) {
+ t.Error("expected Add to return true for new item")
+ }
+ if q.Len() != 2 {
+ t.Errorf("expected len 2, got %d", q.Len())
+ }
+
+ // Add duplicate with newer time - should update and reposition
+ if q.Add("/buckets/b1/folder1", now.Add(2*time.Second)) {
+ t.Error("expected Add to return false for existing item")
+ }
+ if q.Len() != 2 {
+ t.Errorf("expected len 2 after duplicate, got %d", q.Len())
+ }
+
+ // folder1 should now be at the back (newer time) - verify by popping
+ folder1, _ := q.Pop()
+ folder2, _ := q.Pop()
+ if folder1 != "/buckets/b1/folder2" || folder2 != "/buckets/b1/folder1" {
+ t.Errorf("expected folder1 to be moved to back, got %s, %s", folder1, folder2)
+ }
+}
+
+func TestCleanupQueue_Add_OutOfOrder(t *testing.T) {
+ q := NewCleanupQueue(100, 10*time.Minute)
+ baseTime := time.Now()
+
+ // Add items out of order
+ q.Add("/buckets/b1/folder3", baseTime.Add(3*time.Second))
+ q.Add("/buckets/b1/folder1", baseTime.Add(1*time.Second))
+ q.Add("/buckets/b1/folder2", baseTime.Add(2*time.Second))
+
+ // Items should be in time order (oldest first) - verify by popping
+ expected := []string{"/buckets/b1/folder1", "/buckets/b1/folder2", "/buckets/b1/folder3"}
+ for i, exp := range expected {
+ folder, ok := q.Pop()
+ if !ok || folder != exp {
+ t.Errorf("at index %d: expected %s, got %s", i, exp, folder)
+ }
+ }
+}
+
+func TestCleanupQueue_Add_DuplicateWithOlderTime(t *testing.T) {
+ q := NewCleanupQueue(100, 10*time.Minute)
+ baseTime := time.Now()
+
+ // Add folder at t=5
+ q.Add("/buckets/b1/folder1", baseTime.Add(5*time.Second))
+
+ // Try to add same folder with older time - should NOT update
+ q.Add("/buckets/b1/folder1", baseTime.Add(2*time.Second))
+
+ // Time should remain at t=5
+ _, queueTime, _ := q.Peek()
+ if queueTime != baseTime.Add(5*time.Second) {
+ t.Errorf("expected time to remain unchanged, got %v", queueTime)
+ }
+}
+
+func TestCleanupQueue_Remove(t *testing.T) {
+ q := NewCleanupQueue(100, 10*time.Minute)
+ now := time.Now()
+
+ q.Add("/buckets/b1/folder1", now)
+ q.Add("/buckets/b1/folder2", now.Add(1*time.Second))
+ q.Add("/buckets/b1/folder3", now.Add(2*time.Second))
+
+ // Remove middle item
+ if !q.Remove("/buckets/b1/folder2") {
+ t.Error("expected Remove to return true for existing item")
+ }
+ if q.Len() != 2 {
+ t.Errorf("expected len 2, got %d", q.Len())
+ }
+ if q.Contains("/buckets/b1/folder2") {
+ t.Error("removed item should not be in queue")
+ }
+
+ // Remove non-existent item
+ if q.Remove("/buckets/b1/nonexistent") {
+ t.Error("expected Remove to return false for non-existent item")
+ }
+
+ // Verify order is preserved by popping
+ folder1, _ := q.Pop()
+ folder3, _ := q.Pop()
+ if folder1 != "/buckets/b1/folder1" || folder3 != "/buckets/b1/folder3" {
+ t.Errorf("unexpected order: %s, %s", folder1, folder3)
+ }
+}
+
+func TestCleanupQueue_Pop(t *testing.T) {
+ q := NewCleanupQueue(100, 10*time.Minute)
+ now := time.Now()
+
+ // Pop from empty queue
+ folder, ok := q.Pop()
+ if ok {
+ t.Error("expected Pop to return false for empty queue")
+ }
+ if folder != "" {
+ t.Errorf("expected empty folder, got %s", folder)
+ }
+
+ // Add items and pop in order
+ q.Add("/buckets/b1/folder1", now)
+ q.Add("/buckets/b1/folder2", now.Add(1*time.Second))
+ q.Add("/buckets/b1/folder3", now.Add(2*time.Second))
+
+ folder, ok = q.Pop()
+ if !ok || folder != "/buckets/b1/folder1" {
+ t.Errorf("expected folder1, got %s (ok=%v)", folder, ok)
+ }
+
+ folder, ok = q.Pop()
+ if !ok || folder != "/buckets/b1/folder2" {
+ t.Errorf("expected folder2, got %s (ok=%v)", folder, ok)
+ }
+
+ folder, ok = q.Pop()
+ if !ok || folder != "/buckets/b1/folder3" {
+ t.Errorf("expected folder3, got %s (ok=%v)", folder, ok)
+ }
+
+ // Queue should be empty now
+ if q.Len() != 0 {
+ t.Errorf("expected empty queue, got len %d", q.Len())
+ }
+}
+
+func TestCleanupQueue_Peek(t *testing.T) {
+ q := NewCleanupQueue(100, 10*time.Minute)
+ now := time.Now()
+
+ // Peek empty queue
+ folder, _, ok := q.Peek()
+ if ok {
+ t.Error("expected Peek to return false for empty queue")
+ }
+
+ // Add item and peek
+ q.Add("/buckets/b1/folder1", now)
+ folder, queueTime, ok := q.Peek()
+ if !ok || folder != "/buckets/b1/folder1" {
+ t.Errorf("expected folder1, got %s (ok=%v)", folder, ok)
+ }
+ if queueTime != now {
+ t.Errorf("expected queue time %v, got %v", now, queueTime)
+ }
+
+ // Peek should not remove item
+ if q.Len() != 1 {
+ t.Errorf("Peek should not remove item, len=%d", q.Len())
+ }
+}
+
+func TestCleanupQueue_Contains(t *testing.T) {
+ q := NewCleanupQueue(100, 10*time.Minute)
+ now := time.Now()
+
+ q.Add("/buckets/b1/folder1", now)
+
+ if !q.Contains("/buckets/b1/folder1") {
+ t.Error("expected Contains to return true")
+ }
+ if q.Contains("/buckets/b1/folder2") {
+ t.Error("expected Contains to return false for non-existent")
+ }
+}
+
+func TestCleanupQueue_ShouldProcess_MaxSize(t *testing.T) {
+ q := NewCleanupQueue(3, 10*time.Minute)
+ now := time.Now()
+
+ // Empty queue
+ if q.ShouldProcess() {
+ t.Error("empty queue should not need processing")
+ }
+
+ // Add items below max
+ q.Add("/buckets/b1/folder1", now)
+ q.Add("/buckets/b1/folder2", now.Add(1*time.Second))
+ if q.ShouldProcess() {
+ t.Error("queue below max should not need processing")
+ }
+
+ // Add item to reach max
+ q.Add("/buckets/b1/folder3", now.Add(2*time.Second))
+ if !q.ShouldProcess() {
+ t.Error("queue at max should need processing")
+ }
+}
+
+func TestCleanupQueue_ShouldProcess_MaxAge(t *testing.T) {
+ q := NewCleanupQueue(100, 100*time.Millisecond) // Short max age for testing
+
+ // Add item with old event time
+ oldTime := time.Now().Add(-1 * time.Second) // 1 second ago
+ q.Add("/buckets/b1/folder1", oldTime)
+
+ // Item is older than maxAge, should need processing
+ if !q.ShouldProcess() {
+ t.Error("old item should trigger processing")
+ }
+
+ // Clear and add fresh item
+ q.Clear()
+ q.Add("/buckets/b1/folder2", time.Now())
+
+ // Fresh item should not trigger processing
+ if q.ShouldProcess() {
+ t.Error("fresh item should not trigger processing")
+ }
+}
+
+func TestCleanupQueue_Clear(t *testing.T) {
+ q := NewCleanupQueue(100, 10*time.Minute)
+ now := time.Now()
+
+ q.Add("/buckets/b1/folder1", now)
+ q.Add("/buckets/b1/folder2", now.Add(1*time.Second))
+ q.Add("/buckets/b1/folder3", now.Add(2*time.Second))
+
+ q.Clear()
+
+ if q.Len() != 0 {
+ t.Errorf("expected empty queue after Clear, got len %d", q.Len())
+ }
+ if q.Contains("/buckets/b1/folder1") {
+ t.Error("queue should not contain items after Clear")
+ }
+}
+
+func TestCleanupQueue_OldestAge(t *testing.T) {
+ q := NewCleanupQueue(100, 10*time.Minute)
+
+ // Empty queue
+ if q.OldestAge() != 0 {
+ t.Error("empty queue should have zero oldest age")
+ }
+
+ // Add item with time in the past
+ oldTime := time.Now().Add(-5 * time.Minute)
+ q.Add("/buckets/b1/folder1", oldTime)
+
+ // Age should be approximately 5 minutes
+ age := q.OldestAge()
+ if age < 4*time.Minute || age > 6*time.Minute {
+ t.Errorf("expected ~5m age, got %v", age)
+ }
+}
+
+func TestCleanupQueue_TimeOrder(t *testing.T) {
+ q := NewCleanupQueue(100, 10*time.Minute)
+ baseTime := time.Now()
+
+ // Add items in order
+ items := []string{
+ "/buckets/b1/a",
+ "/buckets/b1/b",
+ "/buckets/b1/c",
+ "/buckets/b1/d",
+ "/buckets/b1/e",
+ }
+ for i, item := range items {
+ q.Add(item, baseTime.Add(time.Duration(i)*time.Second))
+ }
+
+ // Pop should return in time order
+ for i, expected := range items {
+ got, ok := q.Pop()
+ if !ok {
+ t.Errorf("Pop %d: expected item, got empty", i)
+ }
+ if got != expected {
+ t.Errorf("Pop %d: expected %s, got %s", i, expected, got)
+ }
+ }
+}
+
+func TestCleanupQueue_DuplicateWithNewerTime(t *testing.T) {
+ q := NewCleanupQueue(100, 10*time.Minute)
+ baseTime := time.Now()
+
+ // Add items
+ q.Add("/buckets/b1/folder1", baseTime)
+ q.Add("/buckets/b1/folder2", baseTime.Add(1*time.Second))
+ q.Add("/buckets/b1/folder3", baseTime.Add(2*time.Second))
+
+ // Add duplicate with newer time - should update and reposition
+ q.Add("/buckets/b1/folder1", baseTime.Add(3*time.Second))
+
+ // folder1 should now be at the back (newest time) - verify by popping
+ expected := []string{"/buckets/b1/folder2", "/buckets/b1/folder3", "/buckets/b1/folder1"}
+ for i, exp := range expected {
+ folder, ok := q.Pop()
+ if !ok || folder != exp {
+ t.Errorf("at index %d: expected %s, got %s", i, exp, folder)
+ }
+ }
+}
+
+func TestCleanupQueue_Concurrent(t *testing.T) {
+ q := NewCleanupQueue(1000, 10*time.Minute)
+ done := make(chan bool)
+ now := time.Now()
+
+ // Concurrent adds
+ go func() {
+ for i := 0; i < 100; i++ {
+ q.Add("/buckets/b1/folder"+string(rune('A'+i%26)), now.Add(time.Duration(i)*time.Millisecond))
+ }
+ done <- true
+ }()
+
+ // Concurrent removes
+ go func() {
+ for i := 0; i < 50; i++ {
+ q.Remove("/buckets/b1/folder" + string(rune('A'+i%26)))
+ }
+ done <- true
+ }()
+
+ // Concurrent pops
+ go func() {
+ for i := 0; i < 30; i++ {
+ q.Pop()
+ }
+ done <- true
+ }()
+
+ // Concurrent reads
+ go func() {
+ for i := 0; i < 100; i++ {
+ q.Len()
+ q.Contains("/buckets/b1/folderA")
+ q.ShouldProcess()
+ }
+ done <- true
+ }()
+
+ // Wait for all goroutines
+ for i := 0; i < 4; i++ {
+ <-done
+ }
+
+ // Just verify no panic occurred and queue is in consistent state
+ _ = q.Len()
+}
+
+
diff --git a/weed/filer/empty_folder_cleanup/empty_folder_cleaner.go b/weed/filer/empty_folder_cleanup/empty_folder_cleaner.go
new file mode 100644
index 000000000..70856aaf1
--- /dev/null
+++ b/weed/filer/empty_folder_cleanup/empty_folder_cleaner.go
@@ -0,0 +1,436 @@
+package empty_folder_cleanup
+
+import (
+ "context"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/seaweedfs/seaweedfs/weed/cluster/lock_manager"
+ "github.com/seaweedfs/seaweedfs/weed/glog"
+ "github.com/seaweedfs/seaweedfs/weed/pb"
+ "github.com/seaweedfs/seaweedfs/weed/util"
+)
+
+const (
+ DefaultMaxCountCheck = 1000
+ DefaultCacheExpiry = 5 * time.Minute
+ DefaultQueueMaxSize = 1000
+ DefaultQueueMaxAge = 10 * time.Minute
+ DefaultProcessorSleep = 10 * time.Second // How often to check queue
+)
+
+// FilerOperations defines the filer operations needed by EmptyFolderCleaner
+type FilerOperations interface {
+ CountDirectoryEntries(ctx context.Context, dirPath util.FullPath, limit int) (count int, err error)
+ DeleteEntryMetaAndData(ctx context.Context, p util.FullPath, isRecursive, ignoreRecursiveError, shouldDeleteChunks, isFromOtherCluster bool, signatures []int32, ifNotModifiedAfter int64) error
+}
+
+// folderState tracks the state of a folder for empty folder cleanup
+type folderState struct {
+ roughCount int // Cached rough count (up to maxCountCheck)
+ lastAddTime time.Time // Last time an item was added
+ lastDelTime time.Time // Last time an item was deleted
+ lastCheck time.Time // Last time we checked the actual count
+}
+
+// EmptyFolderCleaner handles asynchronous cleanup of empty folders
+// Each filer owns specific folders via consistent hashing based on the peer filer list
+type EmptyFolderCleaner struct {
+ filer FilerOperations
+ lockRing *lock_manager.LockRing
+ host pb.ServerAddress
+
+ // Folder state tracking
+ mu sync.RWMutex
+ folderCounts map[string]*folderState // Rough count cache
+
+ // Cleanup queue (thread-safe, has its own lock)
+ cleanupQueue *CleanupQueue
+
+ // Configuration
+ maxCountCheck int // Max items to count (1000)
+ cacheExpiry time.Duration // How long to keep cache entries
+ processorSleep time.Duration // How often processor checks queue
+ bucketPath string // e.g., "/buckets"
+
+ // Control
+ enabled bool
+ stopCh chan struct{}
+}
+
+// NewEmptyFolderCleaner creates a new EmptyFolderCleaner
+func NewEmptyFolderCleaner(filer FilerOperations, lockRing *lock_manager.LockRing, host pb.ServerAddress, bucketPath string) *EmptyFolderCleaner {
+ efc := &EmptyFolderCleaner{
+ filer: filer,
+ lockRing: lockRing,
+ host: host,
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(DefaultQueueMaxSize, DefaultQueueMaxAge),
+ maxCountCheck: DefaultMaxCountCheck,
+ cacheExpiry: DefaultCacheExpiry,
+ processorSleep: DefaultProcessorSleep,
+ bucketPath: bucketPath,
+ enabled: true,
+ stopCh: make(chan struct{}),
+ }
+ go efc.cacheEvictionLoop()
+ go efc.cleanupProcessor()
+ return efc
+}
+
+// SetEnabled enables or disables the cleaner
+func (efc *EmptyFolderCleaner) SetEnabled(enabled bool) {
+ efc.mu.Lock()
+ defer efc.mu.Unlock()
+ efc.enabled = enabled
+}
+
+// IsEnabled returns whether the cleaner is enabled
+func (efc *EmptyFolderCleaner) IsEnabled() bool {
+ efc.mu.RLock()
+ defer efc.mu.RUnlock()
+ return efc.enabled
+}
+
+// ownsFolder checks if this filer owns the folder via consistent hashing
+func (efc *EmptyFolderCleaner) ownsFolder(folder string) bool {
+ servers := efc.lockRing.GetSnapshot()
+ if len(servers) <= 1 {
+ return true // Single filer case
+ }
+ return efc.hashKeyToServer(folder, servers) == efc.host
+}
+
+// hashKeyToServer uses consistent hashing to map a folder to a server
+func (efc *EmptyFolderCleaner) hashKeyToServer(key string, servers []pb.ServerAddress) pb.ServerAddress {
+ if len(servers) == 0 {
+ return ""
+ }
+ x := util.HashStringToLong(key)
+ if x < 0 {
+ x = -x
+ }
+ x = x % int64(len(servers))
+ return servers[x]
+}
+
+// OnDeleteEvent is called when a file or directory is deleted
+// Both file and directory deletions count towards making the parent folder empty
+// eventTime is the time when the delete event occurred (for proper ordering)
+func (efc *EmptyFolderCleaner) OnDeleteEvent(directory string, entryName string, isDirectory bool, eventTime time.Time) {
+ // Skip if not under bucket path (must be at least /buckets/<bucket>/...)
+ if efc.bucketPath != "" && !isUnderBucketPath(directory, efc.bucketPath) {
+ return
+ }
+
+ // Check if we own this folder
+ if !efc.ownsFolder(directory) {
+ glog.V(4).Infof("EmptyFolderCleaner: not owner of %s, skipping", directory)
+ return
+ }
+
+ efc.mu.Lock()
+ defer efc.mu.Unlock()
+
+ // Check enabled inside lock to avoid race with Stop()
+ if !efc.enabled {
+ return
+ }
+
+ glog.V(3).Infof("EmptyFolderCleaner: delete event in %s/%s (isDir=%v)", directory, entryName, isDirectory)
+
+ // Update cached count (create entry if needed)
+ state, exists := efc.folderCounts[directory]
+ if !exists {
+ state = &folderState{}
+ efc.folderCounts[directory] = state
+ }
+ if state.roughCount > 0 {
+ state.roughCount--
+ }
+ state.lastDelTime = eventTime
+
+ // Only add to cleanup queue if roughCount suggests folder might be empty
+ if state.roughCount > 0 {
+ glog.V(3).Infof("EmptyFolderCleaner: skipping queue for %s, roughCount=%d", directory, state.roughCount)
+ return
+ }
+
+ // Add to cleanup queue with event time (handles out-of-order events)
+ if efc.cleanupQueue.Add(directory, eventTime) {
+ glog.V(3).Infof("EmptyFolderCleaner: queued %s for cleanup", directory)
+ }
+}
+
+// OnCreateEvent is called when a file or directory is created
+// Both file and directory creations cancel pending cleanup for the parent folder
+func (efc *EmptyFolderCleaner) OnCreateEvent(directory string, entryName string, isDirectory bool) {
+ // Skip if not under bucket path (must be at least /buckets/<bucket>/...)
+ if efc.bucketPath != "" && !isUnderBucketPath(directory, efc.bucketPath) {
+ return
+ }
+
+ efc.mu.Lock()
+ defer efc.mu.Unlock()
+
+ // Check enabled inside lock to avoid race with Stop()
+ if !efc.enabled {
+ return
+ }
+
+ // Update cached count only if already tracked (no need to track new folders)
+ if state, exists := efc.folderCounts[directory]; exists {
+ state.roughCount++
+ state.lastAddTime = time.Now()
+ }
+
+ // Remove from cleanup queue (cancel pending cleanup)
+ if efc.cleanupQueue.Remove(directory) {
+ glog.V(3).Infof("EmptyFolderCleaner: cancelled cleanup for %s due to new entry", directory)
+ }
+}
+
+// cleanupProcessor runs in background and processes the cleanup queue
+func (efc *EmptyFolderCleaner) cleanupProcessor() {
+ ticker := time.NewTicker(efc.processorSleep)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-efc.stopCh:
+ return
+ case <-ticker.C:
+ efc.processCleanupQueue()
+ }
+ }
+}
+
+// processCleanupQueue processes items from the cleanup queue
+func (efc *EmptyFolderCleaner) processCleanupQueue() {
+ // Check if we should process
+ if !efc.cleanupQueue.ShouldProcess() {
+ return
+ }
+
+ glog.V(3).Infof("EmptyFolderCleaner: processing cleanup queue (len=%d, age=%v)",
+ efc.cleanupQueue.Len(), efc.cleanupQueue.OldestAge())
+
+ // Process all items that are ready
+ for efc.cleanupQueue.Len() > 0 {
+ // Check if still enabled
+ if !efc.IsEnabled() {
+ return
+ }
+
+ // Pop the oldest item
+ folder, ok := efc.cleanupQueue.Pop()
+ if !ok {
+ break
+ }
+
+ // Execute cleanup for this folder
+ efc.executeCleanup(folder)
+
+ // If queue is no longer full and oldest item is not old enough, stop processing
+ if !efc.cleanupQueue.ShouldProcess() {
+ break
+ }
+ }
+}
+
+// executeCleanup performs the actual cleanup of an empty folder
+func (efc *EmptyFolderCleaner) executeCleanup(folder string) {
+ efc.mu.Lock()
+
+ // Quick check: if we have cached count and it's > 0, skip
+ if state, exists := efc.folderCounts[folder]; exists {
+ if state.roughCount > 0 {
+ glog.V(3).Infof("EmptyFolderCleaner: skipping %s, cached count=%d", folder, state.roughCount)
+ efc.mu.Unlock()
+ return
+ }
+ // If there was an add after our delete, skip
+ if !state.lastAddTime.IsZero() && state.lastAddTime.After(state.lastDelTime) {
+ glog.V(3).Infof("EmptyFolderCleaner: skipping %s, add happened after delete", folder)
+ efc.mu.Unlock()
+ return
+ }
+ }
+ efc.mu.Unlock()
+
+ // Re-check ownership (topology might have changed)
+ if !efc.ownsFolder(folder) {
+ glog.V(3).Infof("EmptyFolderCleaner: no longer owner of %s, skipping", folder)
+ return
+ }
+
+ // Check if folder is actually empty (count up to maxCountCheck)
+ ctx := context.Background()
+ count, err := efc.countItems(ctx, folder)
+ if err != nil {
+ glog.V(2).Infof("EmptyFolderCleaner: error counting items in %s: %v", folder, err)
+ return
+ }
+
+ efc.mu.Lock()
+ // Update cache
+ if _, exists := efc.folderCounts[folder]; !exists {
+ efc.folderCounts[folder] = &folderState{}
+ }
+ efc.folderCounts[folder].roughCount = count
+ efc.folderCounts[folder].lastCheck = time.Now()
+ efc.mu.Unlock()
+
+ if count > 0 {
+ glog.V(3).Infof("EmptyFolderCleaner: folder %s has %d items, not empty", folder, count)
+ return
+ }
+
+ // Delete the empty folder
+ glog.V(2).Infof("EmptyFolderCleaner: deleting empty folder %s", folder)
+ if err := efc.deleteFolder(ctx, folder); err != nil {
+ glog.V(2).Infof("EmptyFolderCleaner: failed to delete empty folder %s: %v", folder, err)
+ return
+ }
+
+ // Clean up cache entry
+ efc.mu.Lock()
+ delete(efc.folderCounts, folder)
+ efc.mu.Unlock()
+
+ // Note: No need to recursively check parent folder here.
+ // The deletion of this folder will generate a metadata event,
+ // which will trigger OnDeleteEvent for the parent folder.
+}
+
+// countItems counts items in a folder (up to maxCountCheck)
+func (efc *EmptyFolderCleaner) countItems(ctx context.Context, folder string) (int, error) {
+ return efc.filer.CountDirectoryEntries(ctx, util.FullPath(folder), efc.maxCountCheck)
+}
+
+// deleteFolder deletes an empty folder
+func (efc *EmptyFolderCleaner) deleteFolder(ctx context.Context, folder string) error {
+ return efc.filer.DeleteEntryMetaAndData(ctx, util.FullPath(folder), false, false, false, false, nil, 0)
+}
+
+// isUnderPath checks if child is under parent path
+func isUnderPath(child, parent string) bool {
+ if parent == "" || parent == "/" {
+ return true
+ }
+ // Ensure parent ends without slash for proper prefix matching
+ if len(parent) > 0 && parent[len(parent)-1] == '/' {
+ parent = parent[:len(parent)-1]
+ }
+ // Child must start with parent and then have a / or be exactly parent
+ if len(child) < len(parent) {
+ return false
+ }
+ if child[:len(parent)] != parent {
+ return false
+ }
+ if len(child) == len(parent) {
+ return true
+ }
+ return child[len(parent)] == '/'
+}
+
+// isUnderBucketPath checks if directory is inside a bucket (under /buckets/<bucket>/...)
+// This ensures we only clean up folders inside buckets, not the buckets themselves
+func isUnderBucketPath(directory, bucketPath string) bool {
+ if bucketPath == "" {
+ return true
+ }
+ // Ensure bucketPath ends without slash
+ if len(bucketPath) > 0 && bucketPath[len(bucketPath)-1] == '/' {
+ bucketPath = bucketPath[:len(bucketPath)-1]
+ }
+ // Directory must be under bucketPath
+ if !isUnderPath(directory, bucketPath) {
+ return false
+ }
+ // Directory must be at least /buckets/<bucket>/<something>
+ // i.e., depth must be at least bucketPath depth + 2
+ // For /buckets (depth 1), we need at least /buckets/mybucket/folder (depth 3)
+ bucketPathDepth := strings.Count(bucketPath, "/")
+ directoryDepth := strings.Count(directory, "/")
+ return directoryDepth >= bucketPathDepth+2
+}
+
+// cacheEvictionLoop periodically removes stale entries from folderCounts
+func (efc *EmptyFolderCleaner) cacheEvictionLoop() {
+ ticker := time.NewTicker(efc.cacheExpiry)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-efc.stopCh:
+ return
+ case <-ticker.C:
+ efc.evictStaleCacheEntries()
+ }
+ }
+}
+
+// evictStaleCacheEntries removes cache entries that haven't been accessed recently
+func (efc *EmptyFolderCleaner) evictStaleCacheEntries() {
+ efc.mu.Lock()
+ defer efc.mu.Unlock()
+
+ now := time.Now()
+ expiredCount := 0
+ for folder, state := range efc.folderCounts {
+ // Skip if folder is in cleanup queue
+ if efc.cleanupQueue.Contains(folder) {
+ continue
+ }
+
+ // Find the most recent activity time for this folder
+ lastActivity := state.lastCheck
+ if state.lastAddTime.After(lastActivity) {
+ lastActivity = state.lastAddTime
+ }
+ if state.lastDelTime.After(lastActivity) {
+ lastActivity = state.lastDelTime
+ }
+
+ // Evict if no activity within cache expiry period
+ if now.Sub(lastActivity) > efc.cacheExpiry {
+ delete(efc.folderCounts, folder)
+ expiredCount++
+ }
+ }
+
+ if expiredCount > 0 {
+ glog.V(3).Infof("EmptyFolderCleaner: evicted %d stale cache entries", expiredCount)
+ }
+}
+
+// Stop stops the cleaner and cancels all pending tasks
+func (efc *EmptyFolderCleaner) Stop() {
+ close(efc.stopCh)
+
+ efc.mu.Lock()
+ defer efc.mu.Unlock()
+
+ efc.enabled = false
+ efc.cleanupQueue.Clear()
+ efc.folderCounts = make(map[string]*folderState) // Clear cache on stop
+}
+
+// GetPendingCleanupCount returns the number of pending cleanup tasks (for testing)
+func (efc *EmptyFolderCleaner) GetPendingCleanupCount() int {
+ return efc.cleanupQueue.Len()
+}
+
+// GetCachedFolderCount returns the cached count for a folder (for testing)
+func (efc *EmptyFolderCleaner) GetCachedFolderCount(folder string) (int, bool) {
+ efc.mu.RLock()
+ defer efc.mu.RUnlock()
+ if state, exists := efc.folderCounts[folder]; exists {
+ return state.roughCount, true
+ }
+ return 0, false
+}
+
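Note: to make the depth rule in isUnderBucketPath concrete, here is a standalone sketch that re-implements the same check locally; the expected results match the table-driven tests in the next file, and insideBucket is an illustrative name, not part of the patch.

package main

import (
	"fmt"
	"strings"
)

// insideBucket reports whether directory is strictly inside a bucket,
// i.e. at least /buckets/<bucket>/<something>.
func insideBucket(directory, bucketPath string) bool {
	if bucketPath == "" {
		return true
	}
	bucketPath = strings.TrimSuffix(bucketPath, "/")
	if directory != bucketPath && !strings.HasPrefix(directory, bucketPath+"/") {
		return false
	}
	return strings.Count(directory, "/") >= strings.Count(bucketPath, "/")+2
}

func main() {
	fmt.Println(insideBucket("/buckets/mybucket", "/buckets"))        // false: buckets themselves are never removed
	fmt.Println(insideBucket("/buckets/mybucket/folder", "/buckets")) // true: depth 3 >= 1+2
	fmt.Println(insideBucket("/other/path", "/buckets"))              // false: outside the bucket root
}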
diff --git a/weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go b/weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go
new file mode 100644
index 000000000..fbc05ccf8
--- /dev/null
+++ b/weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go
@@ -0,0 +1,569 @@
+package empty_folder_cleanup
+
+import (
+ "testing"
+ "time"
+
+ "github.com/seaweedfs/seaweedfs/weed/cluster/lock_manager"
+ "github.com/seaweedfs/seaweedfs/weed/pb"
+)
+
+func Test_isUnderPath(t *testing.T) {
+ tests := []struct {
+ name string
+ child string
+ parent string
+ expected bool
+ }{
+ {"child under parent", "/buckets/mybucket/folder/file.txt", "/buckets", true},
+ {"child is parent", "/buckets", "/buckets", true},
+ {"child not under parent", "/other/path", "/buckets", false},
+ {"empty parent", "/any/path", "", true},
+ {"root parent", "/any/path", "/", true},
+ {"parent with trailing slash", "/buckets/mybucket", "/buckets/", true},
+ {"similar prefix but not under", "/buckets-other/file", "/buckets", false},
+ {"deeply nested", "/buckets/a/b/c/d/e/f", "/buckets/a/b", true},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ result := isUnderPath(tt.child, tt.parent)
+ if result != tt.expected {
+ t.Errorf("isUnderPath(%q, %q) = %v, want %v", tt.child, tt.parent, result, tt.expected)
+ }
+ })
+ }
+}
+
+func Test_isUnderBucketPath(t *testing.T) {
+ tests := []struct {
+ name string
+ directory string
+ bucketPath string
+ expected bool
+ }{
+ // Should NOT process - bucket path itself
+ {"bucket path itself", "/buckets", "/buckets", false},
+ // Should NOT process - bucket directory (immediate child)
+ {"bucket directory", "/buckets/mybucket", "/buckets", false},
+ // Should process - folder inside bucket
+ {"folder in bucket", "/buckets/mybucket/folder", "/buckets", true},
+ // Should process - nested folder
+ {"nested folder", "/buckets/mybucket/a/b/c", "/buckets", true},
+ // Should NOT process - outside buckets
+ {"outside buckets", "/other/path", "/buckets", false},
+ // Empty bucket path allows all
+ {"empty bucket path", "/any/path", "", true},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ result := isUnderBucketPath(tt.directory, tt.bucketPath)
+ if result != tt.expected {
+ t.Errorf("isUnderBucketPath(%q, %q) = %v, want %v", tt.directory, tt.bucketPath, result, tt.expected)
+ }
+ })
+ }
+}
+
+func TestEmptyFolderCleaner_ownsFolder(t *testing.T) {
+ // Create a LockRing with multiple servers
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+
+ servers := []pb.ServerAddress{
+ "filer1:8888",
+ "filer2:8888",
+ "filer3:8888",
+ }
+ lockRing.SetSnapshot(servers)
+
+ // Create cleaner for filer1
+ cleaner1 := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ }
+
+ // Create cleaner for filer2
+ cleaner2 := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer2:8888",
+ }
+
+ // Create cleaner for filer3
+ cleaner3 := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer3:8888",
+ }
+
+ // Test that exactly one filer owns each folder
+ testFolders := []string{
+ "/buckets/mybucket/folder1",
+ "/buckets/mybucket/folder2",
+ "/buckets/mybucket/folder3",
+ "/buckets/mybucket/a/b/c",
+ "/buckets/otherbucket/x",
+ }
+
+ for _, folder := range testFolders {
+ ownCount := 0
+ if cleaner1.ownsFolder(folder) {
+ ownCount++
+ }
+ if cleaner2.ownsFolder(folder) {
+ ownCount++
+ }
+ if cleaner3.ownsFolder(folder) {
+ ownCount++
+ }
+
+ if ownCount != 1 {
+ t.Errorf("folder %q owned by %d filers, expected exactly 1", folder, ownCount)
+ }
+ }
+}
+
+func TestEmptyFolderCleaner_ownsFolder_singleServer(t *testing.T) {
+ // Create a LockRing with a single server
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ }
+
+ // Single filer should own all folders
+ testFolders := []string{
+ "/buckets/mybucket/folder1",
+ "/buckets/mybucket/folder2",
+ "/buckets/otherbucket/x",
+ }
+
+ for _, folder := range testFolders {
+ if !cleaner.ownsFolder(folder) {
+ t.Errorf("single filer should own folder %q", folder)
+ }
+ }
+}
+
+func TestEmptyFolderCleaner_ownsFolder_emptyRing(t *testing.T) {
+ // Create an empty LockRing
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ }
+
+ // With empty ring, should own all folders
+ if !cleaner.ownsFolder("/buckets/mybucket/folder") {
+ t.Error("should own folder with empty ring")
+ }
+}
+
+func TestEmptyFolderCleaner_OnCreateEvent_cancelsCleanup(t *testing.T) {
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ bucketPath: "/buckets",
+ enabled: true,
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+ stopCh: make(chan struct{}),
+ }
+
+ folder := "/buckets/mybucket/testfolder"
+ now := time.Now()
+
+ // Simulate delete event
+ cleaner.OnDeleteEvent(folder, "file.txt", false, now)
+
+ // Check that cleanup is queued
+ if cleaner.GetPendingCleanupCount() != 1 {
+ t.Errorf("expected 1 pending cleanup, got %d", cleaner.GetPendingCleanupCount())
+ }
+
+ // Simulate create event
+ cleaner.OnCreateEvent(folder, "newfile.txt", false)
+
+ // Check that cleanup is cancelled
+ if cleaner.GetPendingCleanupCount() != 0 {
+ t.Errorf("expected 0 pending cleanups after create, got %d", cleaner.GetPendingCleanupCount())
+ }
+
+ cleaner.Stop()
+}
+
+func TestEmptyFolderCleaner_OnDeleteEvent_deduplication(t *testing.T) {
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ bucketPath: "/buckets",
+ enabled: true,
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+ stopCh: make(chan struct{}),
+ }
+
+ folder := "/buckets/mybucket/testfolder"
+ now := time.Now()
+
+ // Simulate multiple delete events for same folder
+ for i := 0; i < 5; i++ {
+ cleaner.OnDeleteEvent(folder, "file"+string(rune('0'+i))+".txt", false, now.Add(time.Duration(i)*time.Second))
+ }
+
+ // Check that only 1 cleanup is queued (deduplicated)
+ if cleaner.GetPendingCleanupCount() != 1 {
+ t.Errorf("expected 1 pending cleanup after deduplication, got %d", cleaner.GetPendingCleanupCount())
+ }
+
+ cleaner.Stop()
+}
+
+func TestEmptyFolderCleaner_OnDeleteEvent_multipleFolders(t *testing.T) {
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ bucketPath: "/buckets",
+ enabled: true,
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+ stopCh: make(chan struct{}),
+ }
+
+ now := time.Now()
+
+ // Delete files in different folders
+ cleaner.OnDeleteEvent("/buckets/mybucket/folder1", "file.txt", false, now)
+ cleaner.OnDeleteEvent("/buckets/mybucket/folder2", "file.txt", false, now.Add(1*time.Second))
+ cleaner.OnDeleteEvent("/buckets/mybucket/folder3", "file.txt", false, now.Add(2*time.Second))
+
+ // Each folder should be queued
+ if cleaner.GetPendingCleanupCount() != 3 {
+ t.Errorf("expected 3 pending cleanups, got %d", cleaner.GetPendingCleanupCount())
+ }
+
+ cleaner.Stop()
+}
+
+func TestEmptyFolderCleaner_OnDeleteEvent_notOwner(t *testing.T) {
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888", "filer2:8888"})
+
+ // Create cleaner for filer that doesn't own the folder
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ bucketPath: "/buckets",
+ enabled: true,
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+ stopCh: make(chan struct{}),
+ }
+
+ now := time.Now()
+
+ // Try many folders, looking for one that filer1 doesn't own
+ foundNonOwned := false
+ for i := 0; i < 100; i++ {
+ folder := "/buckets/mybucket/folder" + string(rune('0'+i%10)) + string(rune('0'+i/10))
+ if !cleaner.ownsFolder(folder) {
+ // This folder is not owned by filer1
+ cleaner.OnDeleteEvent(folder, "file.txt", false, now)
+ if cleaner.GetPendingCleanupCount() != 0 {
+ t.Errorf("non-owner should not queue cleanup for folder %s", folder)
+ }
+ foundNonOwned = true
+ break
+ }
+ }
+
+ if !foundNonOwned {
+ t.Skip("could not find a folder not owned by filer1")
+ }
+
+ cleaner.Stop()
+}
+
+func TestEmptyFolderCleaner_OnDeleteEvent_disabled(t *testing.T) {
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ bucketPath: "/buckets",
+ enabled: false, // Disabled
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+ stopCh: make(chan struct{}),
+ }
+
+ folder := "/buckets/mybucket/testfolder"
+ now := time.Now()
+
+ // Simulate delete event
+ cleaner.OnDeleteEvent(folder, "file.txt", false, now)
+
+ // Check that no cleanup is queued when disabled
+ if cleaner.GetPendingCleanupCount() != 0 {
+ t.Errorf("disabled cleaner should not queue cleanup, got %d", cleaner.GetPendingCleanupCount())
+ }
+
+ cleaner.Stop()
+}
+
+func TestEmptyFolderCleaner_OnDeleteEvent_directoryDeletion(t *testing.T) {
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ bucketPath: "/buckets",
+ enabled: true,
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+ stopCh: make(chan struct{}),
+ }
+
+ folder := "/buckets/mybucket/testfolder"
+ now := time.Now()
+
+ // Simulate directory delete event - should trigger cleanup
+ // because subdirectory deletion also makes parent potentially empty
+ cleaner.OnDeleteEvent(folder, "subdir", true, now)
+
+ // Check that cleanup IS queued for directory deletion
+ if cleaner.GetPendingCleanupCount() != 1 {
+ t.Errorf("directory deletion should trigger cleanup, got %d", cleaner.GetPendingCleanupCount())
+ }
+
+ cleaner.Stop()
+}
+
+func TestEmptyFolderCleaner_cachedCounts(t *testing.T) {
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ bucketPath: "/buckets",
+ enabled: true,
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+ stopCh: make(chan struct{}),
+ }
+
+ folder := "/buckets/mybucket/testfolder"
+
+ // Initialize cached count
+ cleaner.folderCounts[folder] = &folderState{roughCount: 5}
+
+ // Simulate create events
+ cleaner.OnCreateEvent(folder, "newfile1.txt", false)
+ cleaner.OnCreateEvent(folder, "newfile2.txt", false)
+
+ // Check cached count increased
+ count, exists := cleaner.GetCachedFolderCount(folder)
+ if !exists {
+ t.Error("cached folder count should exist")
+ }
+ if count != 7 {
+ t.Errorf("expected cached count 7, got %d", count)
+ }
+
+ // Simulate delete events
+ now := time.Now()
+ cleaner.OnDeleteEvent(folder, "file1.txt", false, now)
+ cleaner.OnDeleteEvent(folder, "file2.txt", false, now.Add(1*time.Second))
+
+ // Check cached count decreased
+ count, exists = cleaner.GetCachedFolderCount(folder)
+ if !exists {
+ t.Error("cached folder count should exist")
+ }
+ if count != 5 {
+ t.Errorf("expected cached count 5, got %d", count)
+ }
+
+ cleaner.Stop()
+}
+
+func TestEmptyFolderCleaner_Stop(t *testing.T) {
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ bucketPath: "/buckets",
+ enabled: true,
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+ stopCh: make(chan struct{}),
+ }
+
+ now := time.Now()
+
+ // Queue some cleanups
+ cleaner.OnDeleteEvent("/buckets/mybucket/folder1", "file1.txt", false, now)
+ cleaner.OnDeleteEvent("/buckets/mybucket/folder2", "file2.txt", false, now.Add(1*time.Second))
+ cleaner.OnDeleteEvent("/buckets/mybucket/folder3", "file3.txt", false, now.Add(2*time.Second))
+
+ // Verify cleanups are queued
+ if cleaner.GetPendingCleanupCount() < 1 {
+ t.Error("expected at least 1 pending cleanup before stop")
+ }
+
+ // Stop the cleaner
+ cleaner.Stop()
+
+ // Verify all cleanups are cancelled
+ if cleaner.GetPendingCleanupCount() != 0 {
+ t.Errorf("expected 0 pending cleanups after stop, got %d", cleaner.GetPendingCleanupCount())
+ }
+}
+
+func TestEmptyFolderCleaner_cacheEviction(t *testing.T) {
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ bucketPath: "/buckets",
+ enabled: true,
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+ cacheExpiry: 100 * time.Millisecond, // Short expiry for testing
+ stopCh: make(chan struct{}),
+ }
+
+ folder1 := "/buckets/mybucket/folder1"
+ folder2 := "/buckets/mybucket/folder2"
+ folder3 := "/buckets/mybucket/folder3"
+
+ // Add some cache entries with old timestamps
+ oldTime := time.Now().Add(-1 * time.Hour)
+ cleaner.folderCounts[folder1] = &folderState{roughCount: 5, lastCheck: oldTime}
+ cleaner.folderCounts[folder2] = &folderState{roughCount: 3, lastCheck: oldTime}
+ // folder3 has recent activity
+ cleaner.folderCounts[folder3] = &folderState{roughCount: 2, lastCheck: time.Now()}
+
+ // Verify all entries exist
+ if len(cleaner.folderCounts) != 3 {
+ t.Errorf("expected 3 cache entries, got %d", len(cleaner.folderCounts))
+ }
+
+ // Run eviction
+ cleaner.evictStaleCacheEntries()
+
+ // Verify stale entries are evicted
+ if len(cleaner.folderCounts) != 1 {
+ t.Errorf("expected 1 cache entry after eviction, got %d", len(cleaner.folderCounts))
+ }
+
+ // Verify the recent entry still exists
+ if _, exists := cleaner.folderCounts[folder3]; !exists {
+ t.Error("expected folder3 to still exist in cache")
+ }
+
+ // Verify stale entries are removed
+ if _, exists := cleaner.folderCounts[folder1]; exists {
+ t.Error("expected folder1 to be evicted")
+ }
+ if _, exists := cleaner.folderCounts[folder2]; exists {
+ t.Error("expected folder2 to be evicted")
+ }
+
+ cleaner.Stop()
+}
+
+func TestEmptyFolderCleaner_cacheEviction_skipsEntriesInQueue(t *testing.T) {
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ bucketPath: "/buckets",
+ enabled: true,
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+ cacheExpiry: 100 * time.Millisecond,
+ stopCh: make(chan struct{}),
+ }
+
+ folder := "/buckets/mybucket/folder"
+ oldTime := time.Now().Add(-1 * time.Hour)
+
+ // Add a stale cache entry
+ cleaner.folderCounts[folder] = &folderState{roughCount: 0, lastCheck: oldTime}
+ // Also add to cleanup queue
+ cleaner.cleanupQueue.Add(folder, time.Now())
+
+ // Run eviction
+ cleaner.evictStaleCacheEntries()
+
+ // Verify entry is NOT evicted because it's in cleanup queue
+ if _, exists := cleaner.folderCounts[folder]; !exists {
+ t.Error("expected folder to still exist in cache (is in cleanup queue)")
+ }
+
+ cleaner.Stop()
+}
+
+func TestEmptyFolderCleaner_queueFIFOOrder(t *testing.T) {
+ lockRing := lock_manager.NewLockRing(5 * time.Second)
+ lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+ cleaner := &EmptyFolderCleaner{
+ lockRing: lockRing,
+ host: "filer1:8888",
+ bucketPath: "/buckets",
+ enabled: true,
+ folderCounts: make(map[string]*folderState),
+ cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+ stopCh: make(chan struct{}),
+ }
+
+ now := time.Now()
+
+ // Add folders in order
+ folders := []string{
+ "/buckets/mybucket/folder1",
+ "/buckets/mybucket/folder2",
+ "/buckets/mybucket/folder3",
+ }
+ for i, folder := range folders {
+ cleaner.OnDeleteEvent(folder, "file.txt", false, now.Add(time.Duration(i)*time.Second))
+ }
+
+ // Verify queue length
+ if cleaner.GetPendingCleanupCount() != 3 {
+ t.Errorf("expected 3 queued folders, got %d", cleaner.GetPendingCleanupCount())
+ }
+
+ // Verify time-sorted order by popping
+ for i, expected := range folders {
+ folder, ok := cleaner.cleanupQueue.Pop()
+ if !ok || folder != expected {
+ t.Errorf("expected folder %s at index %d, got %s", expected, i, folder)
+ }
+ }
+
+ cleaner.Stop()
+}
+
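Note: for intuition on the ownership tests above, a self-contained sketch of hash-based folder ownership. The patch hashes with util.HashStringToLong over the LockRing snapshot; fnv below is only a stand-in so the example runs on its own.

package main

import (
	"fmt"
	"hash/fnv"
)

// owner maps a folder to exactly one peer, so only that filer runs cleanup for it.
func owner(folder string, peers []string) string {
	if len(peers) == 0 {
		return ""
	}
	h := fnv.New64a()
	h.Write([]byte(folder))
	return peers[h.Sum64()%uint64(len(peers))]
}

func main() {
	peers := []string{"filer1:8888", "filer2:8888", "filer3:8888"}
	for _, f := range []string{"/buckets/b1/a", "/buckets/b1/b", "/buckets/b2/x"} {
		fmt.Printf("%-16s -> %s\n", f, owner(f, peers))
	}
}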
diff --git a/weed/filer/filer.go b/weed/filer/filer.go
index f9f3d4fb2..382eb644f 100644
--- a/weed/filer/filer.go
+++ b/weed/filer/filer.go
@@ -11,6 +11,7 @@ import (
"github.com/seaweedfs/seaweedfs/weed/s3api/s3bucket"
"github.com/seaweedfs/seaweedfs/weed/cluster/lock_manager"
+ "github.com/seaweedfs/seaweedfs/weed/filer/empty_folder_cleanup"
"github.com/seaweedfs/seaweedfs/weed/cluster"
"github.com/seaweedfs/seaweedfs/weed/pb"
@@ -56,6 +57,7 @@ type Filer struct {
MaxFilenameLength uint32
deletionQuit chan struct{}
DeletionRetryQueue *DeletionRetryQueue
+ EmptyFolderCleaner *empty_folder_cleanup.EmptyFolderCleaner
}
func NewFiler(masters pb.ServerDiscovery, grpcDialOption grpc.DialOption, filerHost pb.ServerAddress, filerGroup string, collection string, replication string, dataCenter string, maxFilenameLength uint32, notifyFn func()) *Filer {
@@ -116,6 +118,9 @@ func (f *Filer) AggregateFromPeers(self pb.ServerAddress, existingNodes []*maste
f.Dlm.LockRing.SetSnapshot(snapshot)
glog.V(0).Infof("%s aggregate from peers %+v", self, snapshot)
+ // Initialize the empty folder cleaner using the same LockRing as Dlm for consistent hashing
+ f.EmptyFolderCleaner = empty_folder_cleanup.NewEmptyFolderCleaner(f, f.Dlm.LockRing, self, f.DirBucketsPath)
+
f.MetaAggregator = NewMetaAggregator(f, self, f.GrpcDialOption)
f.MasterClient.SetOnPeerUpdateFn(func(update *master_pb.ClusterNodeUpdate, startFrom time.Time) {
if update.NodeType != cluster.FilerType {
@@ -506,6 +511,9 @@ func (f *Filer) IsDirectoryEmpty(ctx context.Context, dirPath util.FullPath) (bo
func (f *Filer) Shutdown() {
close(f.deletionQuit)
+ if f.EmptyFolderCleaner != nil {
+ f.EmptyFolderCleaner.Stop()
+ }
f.LocalMetaLogBuffer.ShutdownLogBuffer()
f.Store.Shutdown()
}
diff --git a/weed/filer/filer_notify.go b/weed/filer/filer_notify.go
index 845a0678e..45c9b070f 100644
--- a/weed/filer/filer_notify.go
+++ b/weed/filer/filer_notify.go
@@ -66,6 +66,10 @@ func (f *Filer) NotifyUpdateEvent(ctx context.Context, oldEntry, newEntry *Entry
f.logMetaEvent(ctx, fullpath, eventNotification)
+ // Trigger empty folder cleanup for local events
+ // Remote events are handled via MetaAggregator.onMetadataChangeEvent
+ f.triggerLocalEmptyFolderCleanup(oldEntry, newEntry)
+
}
func (f *Filer) logMetaEvent(ctx context.Context, fullpath string, eventNotification *filer_pb.EventNotification) {
@@ -89,6 +93,41 @@ func (f *Filer) logMetaEvent(ctx context.Context, fullpath string, eventNotifica
}
+// triggerLocalEmptyFolderCleanup triggers empty folder cleanup for local events
+// This is needed because onMetadataChangeEvent is only called for remote peer events
+func (f *Filer) triggerLocalEmptyFolderCleanup(oldEntry, newEntry *Entry) {
+ if f.EmptyFolderCleaner == nil || !f.EmptyFolderCleaner.IsEnabled() {
+ return
+ }
+
+ eventTime := time.Now()
+
+ // Handle delete events (oldEntry exists, newEntry is nil)
+ if oldEntry != nil && newEntry == nil {
+ dir, name := oldEntry.FullPath.DirAndName()
+ f.EmptyFolderCleaner.OnDeleteEvent(dir, name, oldEntry.IsDirectory(), eventTime)
+ }
+
+ // Handle create events (oldEntry is nil, newEntry exists)
+ if oldEntry == nil && newEntry != nil {
+ dir, name := newEntry.FullPath.DirAndName()
+ f.EmptyFolderCleaner.OnCreateEvent(dir, name, newEntry.IsDirectory())
+ }
+
+ // Handle rename/move events (both exist but paths differ)
+ if oldEntry != nil && newEntry != nil {
+ oldDir, oldName := oldEntry.FullPath.DirAndName()
+ newDir, newName := newEntry.FullPath.DirAndName()
+
+ if oldDir != newDir || oldName != newName {
+ // Treat old location as delete
+ f.EmptyFolderCleaner.OnDeleteEvent(oldDir, oldName, oldEntry.IsDirectory(), eventTime)
+ // Treat new location as create
+ f.EmptyFolderCleaner.OnCreateEvent(newDir, newName, newEntry.IsDirectory())
+ }
+ }
+}
+
func (f *Filer) logFlushFunc(logBuffer *log_buffer.LogBuffer, startTime, stopTime time.Time, buf []byte, minOffset, maxOffset int64) {
if len(buf) == 0 {
diff --git a/weed/filer/filer_on_meta_event.go b/weed/filer/filer_on_meta_event.go
index acbf4aa47..4ee80b3a6 100644
--- a/weed/filer/filer_on_meta_event.go
+++ b/weed/filer/filer_on_meta_event.go
@@ -2,6 +2,7 @@ package filer
import (
"bytes"
+ "time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
@@ -13,6 +14,7 @@ func (f *Filer) onMetadataChangeEvent(event *filer_pb.SubscribeMetadataResponse)
f.maybeReloadFilerConfiguration(event)
f.maybeReloadRemoteStorageConfigurationAndMapping(event)
f.onBucketEvents(event)
+ f.onEmptyFolderCleanupEvents(event)
}
func (f *Filer) onBucketEvents(event *filer_pb.SubscribeMetadataResponse) {
@@ -32,6 +34,43 @@ func (f *Filer) onBucketEvents(event *filer_pb.SubscribeMetadataResponse) {
}
}
+// onEmptyFolderCleanupEvents handles create/delete events for empty folder cleanup
+func (f *Filer) onEmptyFolderCleanupEvents(event *filer_pb.SubscribeMetadataResponse) {
+ if f.EmptyFolderCleaner == nil || !f.EmptyFolderCleaner.IsEnabled() {
+ return
+ }
+
+ message := event.EventNotification
+ directory := event.Directory
+ eventTime := time.Unix(0, event.TsNs)
+
+ // Handle delete events - trigger folder cleanup check
+ if filer_pb.IsDelete(event) && message.OldEntry != nil {
+ f.EmptyFolderCleaner.OnDeleteEvent(directory, message.OldEntry.Name, message.OldEntry.IsDirectory, eventTime)
+ }
+
+ // Handle create events - cancel pending cleanup for the folder
+ if filer_pb.IsCreate(event) && message.NewEntry != nil {
+ f.EmptyFolderCleaner.OnCreateEvent(directory, message.NewEntry.Name, message.NewEntry.IsDirectory)
+ }
+
+ // Handle rename/move events
+ if filer_pb.IsRename(event) {
+ // Treat the old location as a delete
+ if message.OldEntry != nil {
+ f.EmptyFolderCleaner.OnDeleteEvent(directory, message.OldEntry.Name, message.OldEntry.IsDirectory, eventTime)
+ }
+ // Treat the new location as a create
+ if message.NewEntry != nil {
+ newDir := message.NewParentPath
+ if newDir == "" {
+ newDir = directory
+ }
+ f.EmptyFolderCleaner.OnCreateEvent(newDir, message.NewEntry.Name, message.NewEntry.IsDirectory)
+ }
+ }
+}
+
func (f *Filer) maybeReloadFilerConfiguration(event *filer_pb.SubscribeMetadataResponse) {
if DirectoryEtcSeaweedFS != event.Directory {
if DirectoryEtcSeaweedFS != event.EventNotification.NewParentPath {
diff --git a/weed/filer/filer_search.go b/weed/filer/filer_search.go
index 294fc0e7f..e6366e82f 100644
--- a/weed/filer/filer_search.go
+++ b/weed/filer/filer_search.go
@@ -41,6 +41,19 @@ func (f *Filer) ListDirectoryEntries(ctx context.Context, p util.FullPath, start
return entries, hasMore, err
}
+// CountDirectoryEntries counts entries in a directory up to limit
+func (f *Filer) CountDirectoryEntries(ctx context.Context, p util.FullPath, limit int) (count int, err error) {
+ entries, hasMore, err := f.ListDirectoryEntries(ctx, p, "", false, int64(limit), "", "", "")
+ if err != nil {
+ return 0, err
+ }
+ count = len(entries)
+ if hasMore {
+ count = limit // At least this many
+ }
+ return count, nil
+}
+
// For now, prefix and namePattern are mutually exclusive
func (f *Filer) StreamListDirectoryEntries(ctx context.Context, p util.FullPath, startFileName string, inclusive bool, limit int64, prefix string, namePattern string, namePatternExclude string, eachEntryFunc ListEachEntryFunc) (lastFileName string, err error) {
if strings.HasSuffix(string(p), "/") && len(p) > 1 {
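Note: a minimal sketch of how the bounded count above feeds the emptiness decision in the cleaner; fakeFiler is a hypothetical stand-in for *filer.Filer, which satisfies the cleanup package's FilerOperations interface through this new method.

package main

import (
	"context"
	"fmt"

	"github.com/seaweedfs/seaweedfs/weed/util"
)

type fakeFiler struct{ entries map[string]int }

// CountDirectoryEntries returns the number of entries, capped at limit,
// mirroring the "at least this many" behavior of the real method.
func (f *fakeFiler) CountDirectoryEntries(ctx context.Context, dir util.FullPath, limit int) (int, error) {
	n := f.entries[string(dir)]
	if n > limit {
		n = limit
	}
	return n, nil
}

func main() {
	f := &fakeFiler{entries: map[string]int{"/buckets/b1/full": 3}}
	for _, dir := range []string{"/buckets/b1/full", "/buckets/b1/empty"} {
		n, _ := f.CountDirectoryEntries(context.Background(), util.FullPath(dir), 1000)
		fmt.Printf("%s -> %d entries, delete? %v\n", dir, n, n == 0)
	}
}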
diff --git a/weed/filer/redis2/redis_cluster_store.go b/weed/filer/redis2/redis_cluster_store.go
index 6e4f11d22..5e1593e9e 100644
--- a/weed/filer/redis2/redis_cluster_store.go
+++ b/weed/filer/redis2/redis_cluster_store.go
@@ -25,20 +25,24 @@ func (store *RedisCluster2Store) Initialize(configuration util.Configuration, pr
return store.initialize(
configuration.GetStringSlice(prefix+"addresses"),
+ configuration.GetString(prefix+"username"),
configuration.GetString(prefix+"password"),
+ configuration.GetString(prefix+"keyPrefix"),
configuration.GetBool(prefix+"useReadOnly"),
configuration.GetBool(prefix+"routeByLatency"),
configuration.GetStringSlice(prefix+"superLargeDirectories"),
)
}
-func (store *RedisCluster2Store) initialize(addresses []string, password string, readOnly, routeByLatency bool, superLargeDirectories []string) (err error) {
+func (store *RedisCluster2Store) initialize(addresses []string, username string, password string, keyPrefix string, readOnly, routeByLatency bool, superLargeDirectories []string) (err error) {
store.Client = redis.NewClusterClient(&redis.ClusterOptions{
Addrs: addresses,
+ Username: username,
Password: password,
ReadOnly: readOnly,
RouteByLatency: routeByLatency,
})
+ store.keyPrefix = keyPrefix
store.loadSuperLargeDirectories(superLargeDirectories)
return
}
diff --git a/weed/filer/redis2/redis_sentinel_store.go b/weed/filer/redis2/redis_sentinel_store.go
index 5fc368fc7..dc15285bd 100644
--- a/weed/filer/redis2/redis_sentinel_store.go
+++ b/weed/filer/redis2/redis_sentinel_store.go
@@ -26,10 +26,11 @@ func (store *Redis2SentinelStore) Initialize(configuration util.Configuration, p
configuration.GetString(prefix+"username"),
configuration.GetString(prefix+"password"),
configuration.GetInt(prefix+"database"),
+ configuration.GetString(prefix+"keyPrefix"),
)
}
-func (store *Redis2SentinelStore) initialize(addresses []string, masterName string, username string, password string, database int) (err error) {
+func (store *Redis2SentinelStore) initialize(addresses []string, masterName string, username string, password string, database int, keyPrefix string) (err error) {
store.Client = redis.NewFailoverClient(&redis.FailoverOptions{
MasterName: masterName,
SentinelAddrs: addresses,
@@ -41,5 +42,6 @@ func (store *Redis2SentinelStore) initialize(addresses []string, masterName stri
ReadTimeout: time.Second * 30,
WriteTimeout: time.Second * 5,
})
+ store.keyPrefix = keyPrefix
return
}
diff --git a/weed/filer/redis2/redis_store.go b/weed/filer/redis2/redis_store.go
index f9322be42..7193699f9 100644
--- a/weed/filer/redis2/redis_store.go
+++ b/weed/filer/redis2/redis_store.go
@@ -27,8 +27,10 @@ func (store *Redis2Store) GetName() string {
func (store *Redis2Store) Initialize(configuration util.Configuration, prefix string) (err error) {
return store.initialize(
configuration.GetString(prefix+"address"),
+ configuration.GetString(prefix+"username"),
configuration.GetString(prefix+"password"),
configuration.GetInt(prefix+"database"),
+ configuration.GetString(prefix+"keyPrefix"),
configuration.GetStringSlice(prefix+"superLargeDirectories"),
configuration.GetBool(prefix+"enable_mtls"),
configuration.GetString(prefix+"ca_cert_path"),
@@ -37,7 +39,13 @@ func (store *Redis2Store) Initialize(configuration util.Configuration, prefix st
)
}
-func (store *Redis2Store) initialize(hostPort string, password string, database int, superLargeDirectories []string, enableMtls bool, caCertPath string, clientCertPath string, clientKeyPath string) (err error) {
+func (store *Redis2Store) initialize(hostPort string, username string, password string, database int, keyPrefix string, superLargeDirectories []string, enableMtls bool, caCertPath string, clientCertPath string, clientKeyPath string) (err error) {
+ opt := &redis.Options{
+ Addr: hostPort,
+ Username: username,
+ Password: password,
+ DB: database,
+ }
if enableMtls {
clientCert, err := tls.LoadX509KeyPair(clientCertPath, clientKeyPath)
if err != nil {
@@ -59,25 +67,15 @@ func (store *Redis2Store) initialize(hostPort string, password string, database
glog.Fatalf("Error parsing redis host and port from %s: %v", hostPort, err)
}
- tlsConfig := &tls.Config{
+ opt.TLSConfig = &tls.Config{
Certificates: []tls.Certificate{clientCert},
RootCAs: caCertPool,
ServerName: redisHost,
MinVersion: tls.VersionTLS12,
}
- store.Client = redis.NewClient(&redis.Options{
- Addr: hostPort,
- Password: password,
- DB: database,
- TLSConfig: tlsConfig,
- })
- } else {
- store.Client = redis.NewClient(&redis.Options{
- Addr: hostPort,
- Password: password,
- DB: database,
- })
}
+ store.Client = redis.NewClient(opt)
+ store.keyPrefix = keyPrefix
store.loadSuperLargeDirectories(superLargeDirectories)
return
}
diff --git a/weed/filer/redis2/universal_redis_store.go b/weed/filer/redis2/universal_redis_store.go
index 0dbf7a72a..a8429a764 100644
--- a/weed/filer/redis2/universal_redis_store.go
+++ b/weed/filer/redis2/universal_redis_store.go
@@ -19,6 +19,7 @@ const (
type UniversalRedis2Store struct {
Client redis.UniversalClient
+ keyPrefix string
superLargeDirectoryHash map[string]bool
}
@@ -35,6 +36,13 @@ func (store *UniversalRedis2Store) loadSuperLargeDirectories(superLargeDirectori
}
}
+func (store *UniversalRedis2Store) getKey(key string) string {
+ if store.keyPrefix == "" {
+ return key
+ }
+ return store.keyPrefix + key
+}
+
func (store *UniversalRedis2Store) BeginTransaction(ctx context.Context) (context.Context, error) {
return ctx, nil
}
@@ -57,7 +65,7 @@ func (store *UniversalRedis2Store) InsertEntry(ctx context.Context, entry *filer
}
if name != "" {
- if err = store.Client.ZAddNX(ctx, genDirectoryListKey(dir), redis.Z{Score: 0, Member: name}).Err(); err != nil {
+ if err = store.Client.ZAddNX(ctx, store.getKey(genDirectoryListKey(dir)), redis.Z{Score: 0, Member: name}).Err(); err != nil {
return fmt.Errorf("persisting %s in parent dir: %v", entry.FullPath, err)
}
}
@@ -75,7 +83,7 @@ func (store *UniversalRedis2Store) doInsertEntry(ctx context.Context, entry *fil
value = util.MaybeGzipData(value)
}
- if err = store.Client.Set(ctx, string(entry.FullPath), value, time.Duration(entry.TtlSec)*time.Second).Err(); err != nil {
+ if err = store.Client.Set(ctx, store.getKey(string(entry.FullPath)), value, time.Duration(entry.TtlSec)*time.Second).Err(); err != nil {
return fmt.Errorf("persisting %s : %v", entry.FullPath, err)
}
return nil
@@ -88,7 +96,7 @@ func (store *UniversalRedis2Store) UpdateEntry(ctx context.Context, entry *filer
func (store *UniversalRedis2Store) FindEntry(ctx context.Context, fullpath util.FullPath) (entry *filer.Entry, err error) {
- data, err := store.Client.Get(ctx, string(fullpath)).Result()
+ data, err := store.Client.Get(ctx, store.getKey(string(fullpath))).Result()
if err == redis.Nil {
return nil, filer_pb.ErrNotFound
}
@@ -110,12 +118,12 @@ func (store *UniversalRedis2Store) FindEntry(ctx context.Context, fullpath util.
func (store *UniversalRedis2Store) DeleteEntry(ctx context.Context, fullpath util.FullPath) (err error) {
- _, err = store.Client.Del(ctx, genDirectoryListKey(string(fullpath))).Result()
+ _, err = store.Client.Del(ctx, store.getKey(genDirectoryListKey(string(fullpath)))).Result()
if err != nil {
return fmt.Errorf("delete dir list %s : %v", fullpath, err)
}
- _, err = store.Client.Del(ctx, string(fullpath)).Result()
+ _, err = store.Client.Del(ctx, store.getKey(string(fullpath))).Result()
if err != nil {
return fmt.Errorf("delete %s : %v", fullpath, err)
}
@@ -125,7 +133,7 @@ func (store *UniversalRedis2Store) DeleteEntry(ctx context.Context, fullpath uti
return nil
}
if name != "" {
- _, err = store.Client.ZRem(ctx, genDirectoryListKey(dir), name).Result()
+ _, err = store.Client.ZRem(ctx, store.getKey(genDirectoryListKey(dir)), name).Result()
if err != nil {
return fmt.Errorf("DeleteEntry %s in parent dir: %v", fullpath, err)
}
@@ -140,7 +148,7 @@ func (store *UniversalRedis2Store) DeleteFolderChildren(ctx context.Context, ful
return nil
}
- members, err := store.Client.ZRangeByLex(ctx, genDirectoryListKey(string(fullpath)), &redis.ZRangeBy{
+ members, err := store.Client.ZRangeByLex(ctx, store.getKey(genDirectoryListKey(string(fullpath))), &redis.ZRangeBy{
Min: "-",
Max: "+",
}).Result()
@@ -150,12 +158,12 @@ func (store *UniversalRedis2Store) DeleteFolderChildren(ctx context.Context, ful
for _, fileName := range members {
path := util.NewFullPath(string(fullpath), fileName)
- _, err = store.Client.Del(ctx, string(path)).Result()
+ _, err = store.Client.Del(ctx, store.getKey(string(path))).Result()
if err != nil {
return fmt.Errorf("DeleteFolderChildren %s in parent dir: %v", fullpath, err)
}
// not efficient, but need to remove if it is a directory
- store.Client.Del(ctx, genDirectoryListKey(string(path)))
+ store.Client.Del(ctx, store.getKey(genDirectoryListKey(string(path))))
}
return nil
@@ -167,7 +175,7 @@ func (store *UniversalRedis2Store) ListDirectoryPrefixedEntries(ctx context.Cont
func (store *UniversalRedis2Store) ListDirectoryEntries(ctx context.Context, dirPath util.FullPath, startFileName string, includeStartFile bool, limit int64, eachEntryFunc filer.ListEachEntryFunc) (lastFileName string, err error) {
- dirListKey := genDirectoryListKey(string(dirPath))
+ dirListKey := store.getKey(genDirectoryListKey(string(dirPath)))
min := "-"
if startFileName != "" {
@@ -201,7 +209,7 @@ func (store *UniversalRedis2Store) ListDirectoryEntries(ctx context.Context, dir
} else {
if entry.TtlSec > 0 {
if entry.Attr.Crtime.Add(time.Duration(entry.TtlSec) * time.Second).Before(time.Now()) {
- store.Client.Del(ctx, string(path)).Result()
+ store.Client.Del(ctx, store.getKey(string(path))).Result()
store.Client.ZRem(ctx, dirListKey, fileName).Result()
continue
}
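
Editor's note: a minimal standalone sketch of the key-prefix behavior introduced in the redis2 store above (struct and helper names are simplified here; only the getKey semantics are taken from the patch). An empty prefix is a no-op, so existing deployments keep their current Redis key layout, while a non-empty prefix namespaces every entry key and directory-list key.

package main

import "fmt"

// keyed mirrors the getKey helper added above: an empty keyPrefix leaves keys
// untouched, so upgrading without setting the new option moves no Redis keys.
func keyed(keyPrefix, key string) string {
	if keyPrefix == "" {
		return key
	}
	return keyPrefix + key
}

func main() {
	fmt.Println(keyed("", "/buckets/b1/photo.jpg"))      // /buckets/b1/photo.jpg
	fmt.Println(keyed("swfs:", "/buckets/b1/photo.jpg")) // swfs:/buckets/b1/photo.jpg
}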
diff --git a/weed/filer/redis_lua/redis_cluster_store.go b/weed/filer/redis_lua/redis_cluster_store.go
index 251aadbcd..b64342fc2 100644
--- a/weed/filer/redis_lua/redis_cluster_store.go
+++ b/weed/filer/redis_lua/redis_cluster_store.go
@@ -25,20 +25,24 @@ func (store *RedisLuaClusterStore) Initialize(configuration util.Configuration,
return store.initialize(
configuration.GetStringSlice(prefix+"addresses"),
+ configuration.GetString(prefix+"username"),
configuration.GetString(prefix+"password"),
+ configuration.GetString(prefix+"keyPrefix"),
configuration.GetBool(prefix+"useReadOnly"),
configuration.GetBool(prefix+"routeByLatency"),
configuration.GetStringSlice(prefix+"superLargeDirectories"),
)
}
-func (store *RedisLuaClusterStore) initialize(addresses []string, password string, readOnly, routeByLatency bool, superLargeDirectories []string) (err error) {
+func (store *RedisLuaClusterStore) initialize(addresses []string, username string, password string, keyPrefix string, readOnly, routeByLatency bool, superLargeDirectories []string) (err error) {
store.Client = redis.NewClusterClient(&redis.ClusterOptions{
Addrs: addresses,
+ Username: username,
Password: password,
ReadOnly: readOnly,
RouteByLatency: routeByLatency,
})
+ store.keyPrefix = keyPrefix
store.loadSuperLargeDirectories(superLargeDirectories)
return
}
diff --git a/weed/filer/redis_lua/redis_sentinel_store.go b/weed/filer/redis_lua/redis_sentinel_store.go
index f22a7fa66..12a582ac3 100644
--- a/weed/filer/redis_lua/redis_sentinel_store.go
+++ b/weed/filer/redis_lua/redis_sentinel_store.go
@@ -26,10 +26,11 @@ func (store *RedisLuaSentinelStore) Initialize(configuration util.Configuration,
configuration.GetString(prefix+"username"),
configuration.GetString(prefix+"password"),
configuration.GetInt(prefix+"database"),
+ configuration.GetString(prefix+"keyPrefix"),
)
}
-func (store *RedisLuaSentinelStore) initialize(addresses []string, masterName string, username string, password string, database int) (err error) {
+func (store *RedisLuaSentinelStore) initialize(addresses []string, masterName string, username string, password string, database int, keyPrefix string) (err error) {
store.Client = redis.NewFailoverClient(&redis.FailoverOptions{
MasterName: masterName,
SentinelAddrs: addresses,
@@ -41,5 +42,6 @@ func (store *RedisLuaSentinelStore) initialize(addresses []string, masterName st
ReadTimeout: time.Second * 30,
WriteTimeout: time.Second * 5,
})
+ store.keyPrefix = keyPrefix
return
}
diff --git a/weed/filer/redis_lua/redis_store.go b/weed/filer/redis_lua/redis_store.go
index 8574baa09..4f6354e96 100644
--- a/weed/filer/redis_lua/redis_store.go
+++ b/weed/filer/redis_lua/redis_store.go
@@ -21,18 +21,22 @@ func (store *RedisLuaStore) GetName() string {
func (store *RedisLuaStore) Initialize(configuration util.Configuration, prefix string) (err error) {
return store.initialize(
configuration.GetString(prefix+"address"),
+ configuration.GetString(prefix+"username"),
configuration.GetString(prefix+"password"),
configuration.GetInt(prefix+"database"),
+ configuration.GetString(prefix+"keyPrefix"),
configuration.GetStringSlice(prefix+"superLargeDirectories"),
)
}
-func (store *RedisLuaStore) initialize(hostPort string, password string, database int, superLargeDirectories []string) (err error) {
+func (store *RedisLuaStore) initialize(hostPort string, username string, password string, database int, keyPrefix string, superLargeDirectories []string) (err error) {
store.Client = redis.NewClient(&redis.Options{
Addr: hostPort,
+ Username: username,
Password: password,
DB: database,
})
+ store.keyPrefix = keyPrefix
store.loadSuperLargeDirectories(superLargeDirectories)
return
}
diff --git a/weed/filer/redis_lua/universal_redis_store.go b/weed/filer/redis_lua/universal_redis_store.go
index 35f6d4991..0a02a0730 100644
--- a/weed/filer/redis_lua/universal_redis_store.go
+++ b/weed/filer/redis_lua/universal_redis_store.go
@@ -20,6 +20,7 @@ const (
type UniversalRedisLuaStore struct {
Client redis.UniversalClient
+ keyPrefix string
superLargeDirectoryHash map[string]bool
}
@@ -36,6 +37,13 @@ func (store *UniversalRedisLuaStore) loadSuperLargeDirectories(superLargeDirecto
}
}
+func (store *UniversalRedisLuaStore) getKey(key string) string {
+ if store.keyPrefix == "" {
+ return key
+ }
+ return store.keyPrefix + key
+}
+
func (store *UniversalRedisLuaStore) BeginTransaction(ctx context.Context) (context.Context, error) {
return ctx, nil
}
@@ -60,7 +68,7 @@ func (store *UniversalRedisLuaStore) InsertEntry(ctx context.Context, entry *fil
dir, name := entry.FullPath.DirAndName()
err = stored_procedure.InsertEntryScript.Run(ctx, store.Client,
- []string{string(entry.FullPath), genDirectoryListKey(dir)},
+ []string{store.getKey(string(entry.FullPath)), store.getKey(genDirectoryListKey(dir))},
value, entry.TtlSec,
store.isSuperLargeDirectory(dir), 0, name).Err()
@@ -78,7 +86,7 @@ func (store *UniversalRedisLuaStore) UpdateEntry(ctx context.Context, entry *fil
func (store *UniversalRedisLuaStore) FindEntry(ctx context.Context, fullpath util.FullPath) (entry *filer.Entry, err error) {
- data, err := store.Client.Get(ctx, string(fullpath)).Result()
+ data, err := store.Client.Get(ctx, store.getKey(string(fullpath))).Result()
if err == redis.Nil {
return nil, filer_pb.ErrNotFound
}
@@ -103,7 +111,7 @@ func (store *UniversalRedisLuaStore) DeleteEntry(ctx context.Context, fullpath u
dir, name := fullpath.DirAndName()
err = stored_procedure.DeleteEntryScript.Run(ctx, store.Client,
- []string{string(fullpath), genDirectoryListKey(string(fullpath)), genDirectoryListKey(dir)},
+ []string{store.getKey(string(fullpath)), store.getKey(genDirectoryListKey(string(fullpath))), store.getKey(genDirectoryListKey(dir))},
store.isSuperLargeDirectory(dir), name).Err()
if err != nil {
@@ -120,7 +128,7 @@ func (store *UniversalRedisLuaStore) DeleteFolderChildren(ctx context.Context, f
}
err = stored_procedure.DeleteFolderChildrenScript.Run(ctx, store.Client,
- []string{string(fullpath)}).Err()
+ []string{store.getKey(string(fullpath))}).Err()
if err != nil {
return fmt.Errorf("DeleteFolderChildren %s : %v", fullpath, err)
@@ -135,7 +143,7 @@ func (store *UniversalRedisLuaStore) ListDirectoryPrefixedEntries(ctx context.Co
func (store *UniversalRedisLuaStore) ListDirectoryEntries(ctx context.Context, dirPath util.FullPath, startFileName string, includeStartFile bool, limit int64, eachEntryFunc filer.ListEachEntryFunc) (lastFileName string, err error) {
- dirListKey := genDirectoryListKey(string(dirPath))
+ dirListKey := store.getKey(genDirectoryListKey(string(dirPath)))
min := "-"
if startFileName != "" {
diff --git a/weed/pb/master.proto b/weed/pb/master.proto
index f8049c466..afbf31de9 100644
--- a/weed/pb/master.proto
+++ b/weed/pb/master.proto
@@ -81,6 +81,7 @@ message Heartbeat {
map<string, uint32> max_volume_counts = 4;
uint32 grpc_port = 20;
repeated string location_uuids = 21;
+ string id = 22; // volume server id, independent of ip:port for stable identification
}
message HeartbeatResponse {
@@ -289,6 +290,7 @@ message DataNodeInfo {
string id = 1;
map<string, DiskInfo> diskInfos = 2;
uint32 grpc_port = 3;
+ string address = 4; // ip:port for connecting to the volume server
}
message RackInfo {
string id = 1;
diff --git a/weed/pb/master_pb/master.pb.go b/weed/pb/master_pb/master.pb.go
index 19df43d71..41d46fad1 100644
--- a/weed/pb/master_pb/master.pb.go
+++ b/weed/pb/master_pb/master.pb.go
@@ -44,6 +44,7 @@ type Heartbeat struct {
MaxVolumeCounts map[string]uint32 `protobuf:"bytes,4,rep,name=max_volume_counts,json=maxVolumeCounts,proto3" json:"max_volume_counts,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"varint,2,opt,name=value"`
GrpcPort uint32 `protobuf:"varint,20,opt,name=grpc_port,json=grpcPort,proto3" json:"grpc_port,omitempty"`
LocationUuids []string `protobuf:"bytes,21,rep,name=location_uuids,json=locationUuids,proto3" json:"location_uuids,omitempty"`
+ Id string `protobuf:"bytes,22,opt,name=id,proto3" json:"id,omitempty"` // volume server id, independent of ip:port for stable identification
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
@@ -204,6 +205,13 @@ func (x *Heartbeat) GetLocationUuids() []string {
return nil
}
+func (x *Heartbeat) GetId() string {
+ if x != nil {
+ return x.Id
+ }
+ return ""
+}
+
type HeartbeatResponse struct {
state protoimpl.MessageState `protogen:"open.v1"`
VolumeSizeLimit uint64 `protobuf:"varint,1,opt,name=volume_size_limit,json=volumeSizeLimit,proto3" json:"volume_size_limit,omitempty"`
@@ -2039,6 +2047,7 @@ type DataNodeInfo struct {
Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"`
DiskInfos map[string]*DiskInfo `protobuf:"bytes,2,rep,name=diskInfos,proto3" json:"diskInfos,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"`
GrpcPort uint32 `protobuf:"varint,3,opt,name=grpc_port,json=grpcPort,proto3" json:"grpc_port,omitempty"`
+ Address string `protobuf:"bytes,4,opt,name=address,proto3" json:"address,omitempty"` // ip:port for connecting to the volume server
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
@@ -2094,6 +2103,13 @@ func (x *DataNodeInfo) GetGrpcPort() uint32 {
return 0
}
+func (x *DataNodeInfo) GetAddress() string {
+ if x != nil {
+ return x.Address
+ }
+ return ""
+}
+
type RackInfo struct {
state protoimpl.MessageState `protogen:"open.v1"`
Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"`
@@ -4038,7 +4054,7 @@ var File_master_proto protoreflect.FileDescriptor
const file_master_proto_rawDesc = "" +
"\n" +
- "\fmaster.proto\x12\tmaster_pb\"\xc0\a\n" +
+ "\fmaster.proto\x12\tmaster_pb\"\xd0\a\n" +
"\tHeartbeat\x12\x0e\n" +
"\x02ip\x18\x01 \x01(\tR\x02ip\x12\x12\n" +
"\x04port\x18\x02 \x01(\rR\x04port\x12\x1d\n" +
@@ -4063,7 +4079,8 @@ const file_master_proto_rawDesc = "" +
"\x10has_no_ec_shards\x18\x13 \x01(\bR\rhasNoEcShards\x12U\n" +
"\x11max_volume_counts\x18\x04 \x03(\v2).master_pb.Heartbeat.MaxVolumeCountsEntryR\x0fmaxVolumeCounts\x12\x1b\n" +
"\tgrpc_port\x18\x14 \x01(\rR\bgrpcPort\x12%\n" +
- "\x0elocation_uuids\x18\x15 \x03(\tR\rlocationUuids\x1aB\n" +
+ "\x0elocation_uuids\x18\x15 \x03(\tR\rlocationUuids\x12\x0e\n" +
+ "\x02id\x18\x16 \x01(\tR\x02id\x1aB\n" +
"\x14MaxVolumeCountsEntry\x12\x10\n" +
"\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" +
"\x05value\x18\x02 \x01(\rR\x05value:\x028\x01\"\xcd\x02\n" +
@@ -4254,11 +4271,12 @@ const file_master_proto_rawDesc = "" +
"\fvolume_infos\x18\x06 \x03(\v2#.master_pb.VolumeInformationMessageR\vvolumeInfos\x12P\n" +
"\x0eec_shard_infos\x18\a \x03(\v2*.master_pb.VolumeEcShardInformationMessageR\fecShardInfos\x12.\n" +
"\x13remote_volume_count\x18\b \x01(\x03R\x11remoteVolumeCount\x12\x17\n" +
- "\adisk_id\x18\t \x01(\rR\x06diskId\"\xd4\x01\n" +
+ "\adisk_id\x18\t \x01(\rR\x06diskId\"\xee\x01\n" +
"\fDataNodeInfo\x12\x0e\n" +
"\x02id\x18\x01 \x01(\tR\x02id\x12D\n" +
"\tdiskInfos\x18\x02 \x03(\v2&.master_pb.DataNodeInfo.DiskInfosEntryR\tdiskInfos\x12\x1b\n" +
- "\tgrpc_port\x18\x03 \x01(\rR\bgrpcPort\x1aQ\n" +
+ "\tgrpc_port\x18\x03 \x01(\rR\bgrpcPort\x12\x18\n" +
+ "\aaddress\x18\x04 \x01(\tR\aaddress\x1aQ\n" +
"\x0eDiskInfosEntry\x12\x10\n" +
"\x03key\x18\x01 \x01(\tR\x03key\x12)\n" +
"\x05value\x18\x02 \x01(\v2\x13.master_pb.DiskInfoR\x05value:\x028\x01\"\xf0\x01\n" +
diff --git a/weed/pb/server_address.go b/weed/pb/server_address.go
index a0aa79ae4..943b85519 100644
--- a/weed/pb/server_address.go
+++ b/weed/pb/server_address.go
@@ -2,11 +2,12 @@ package pb
import (
"fmt"
- "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
- "github.com/seaweedfs/seaweedfs/weed/util"
"net"
"strconv"
"strings"
+
+ "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
+ "github.com/seaweedfs/seaweedfs/weed/util"
)
type ServerAddress string
@@ -32,7 +33,12 @@ func NewServerAddressWithGrpcPort(address string, grpcPort int) ServerAddress {
}
func NewServerAddressFromDataNode(dn *master_pb.DataNodeInfo) ServerAddress {
- return NewServerAddressWithGrpcPort(dn.Id, int(dn.GrpcPort))
+ // Use Address field if available (new behavior), fall back to Id for backward compatibility
+ addr := dn.Address
+ if addr == "" {
+ addr = dn.Id // backward compatibility: old nodes use ip:port as id
+ }
+ return NewServerAddressWithGrpcPort(addr, int(dn.GrpcPort))
}
func NewServerAddressFromLocation(dn *master_pb.Location) ServerAddress {
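
Editor's note: a hedged illustration of the fallback above, using only the fields visible in this patch. An older volume server leaves Address empty and still reports the historical ip:port in Id, so both shapes resolve to the same dialable address.

package main

import (
	"fmt"

	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
)

func main() {
	// Old-style node: Id carries ip:port, Address is empty.
	oldNode := &master_pb.DataNodeInfo{Id: "10.0.0.5:8080", GrpcPort: 18080}
	// New-style node: Id is a stable identifier, Address carries ip:port.
	newNode := &master_pb.DataNodeInfo{Id: "vol-1", Address: "10.0.0.5:8080", GrpcPort: 18080}

	a := pb.NewServerAddressFromDataNode(oldNode)
	b := pb.NewServerAddressFromDataNode(newNode)
	fmt.Println(a == b) // true: both resolve to the same dialable address
}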
diff --git a/weed/s3api/chunked_reader_v4.go b/weed/s3api/chunked_reader_v4.go
index c21b57009..f841c3e1e 100644
--- a/weed/s3api/chunked_reader_v4.go
+++ b/weed/s3api/chunked_reader_v4.go
@@ -116,6 +116,7 @@ func (iam *IdentityAccessManagement) newChunkedReader(req *http.Request) (io.Rea
}
checkSumWriter := getCheckSumWriter(checksumAlgorithm)
+ hasTrailer := amzTrailerHeader != ""
return &s3ChunkedReader{
cred: credential,
@@ -129,6 +130,7 @@ func (iam *IdentityAccessManagement) newChunkedReader(req *http.Request) (io.Rea
checkSumWriter: checkSumWriter,
state: readChunkHeader,
iam: iam,
+ hasTrailer: hasTrailer,
}, s3err.ErrNone
}
@@ -170,6 +172,7 @@ type s3ChunkedReader struct {
n uint64 // Unread bytes in chunk
err error
iam *IdentityAccessManagement
+ hasTrailer bool
}
// Read chunk reads the chunk token signature portion.
@@ -281,10 +284,10 @@ func (cr *s3ChunkedReader) Read(buf []byte) (n int, err error) {
}
// If we're using unsigned streaming upload, there is no signature to verify at each chunk.
- if cr.chunkSignature != "" {
- cr.state = verifyChunk
- } else if cr.lastChunk {
+ if cr.lastChunk && cr.hasTrailer {
cr.state = readTrailerChunk
+ } else if cr.chunkSignature != "" {
+ cr.state = verifyChunk
} else {
cr.state = readChunkHeader
}
@@ -304,7 +307,11 @@ func (cr *s3ChunkedReader) Read(buf []byte) (n int, err error) {
// This implementation currently only supports the first case.
// TODO: Implement the second case (signed upload with additional checksum computation for each chunk)
- extractedCheckSumAlgorithm, extractedChecksum := parseChunkChecksum(cr.reader)
+ extractedCheckSumAlgorithm, extractedChecksum, err := parseChunkChecksum(cr.reader)
+ if err != nil {
+ cr.err = err
+ return 0, err
+ }
if extractedCheckSumAlgorithm.String() != cr.checkSumAlgorithm {
errorMessage := fmt.Sprintf("checksum algorithm in trailer '%s' does not match the one advertised in the header '%s'", extractedCheckSumAlgorithm.String(), cr.checkSumAlgorithm)
@@ -313,6 +320,7 @@ func (cr *s3ChunkedReader) Read(buf []byte) (n int, err error) {
return 0, cr.err
}
+ // Validate checksum for data integrity (required for both signed and unsigned streaming with trailers)
computedChecksum := cr.checkSumWriter.Sum(nil)
base64Checksum := base64.StdEncoding.EncodeToString(computedChecksum)
if string(extractedChecksum) != base64Checksum {
@@ -324,11 +332,6 @@ func (cr *s3ChunkedReader) Read(buf []byte) (n int, err error) {
// TODO: Extract signature from trailer chunk and verify it.
// For now, we just read the trailer chunk and discard it.
- // Reading remaining CRLF.
- for i := 0; i < 2; i++ {
- cr.err = readCRLF(cr.reader)
- }
-
cr.state = eofChunk
case readChunk:
@@ -506,41 +509,37 @@ func parseS3ChunkExtension(buf []byte) ([]byte, []byte) {
return buf[:semi], parseChunkSignature(buf[semi:])
}
-func parseChunkChecksum(b *bufio.Reader) (ChecksumAlgorithm, []byte) {
- // When using unsigned upload, this would be the raw contents of the trailer chunk:
- //
- // x-amz-checksum-crc32:YABb/g==\n\r\n\r\n // Trailer chunk (note optional \n character)
- // \r\n // CRLF
- //
- // When using signed upload with an additional checksum algorithm, this would be the raw contents of the trailer chunk:
- //
- // x-amz-checksum-crc32:YABb/g==\n\r\n // Trailer chunk (note optional \n character)
- // trailer-signature\r\n
- // \r\n // CRLF
- //
-
- // x-amz-checksum-crc32:YABb/g==\n
- bytesRead, err := readChunkLine(b)
- if err != nil {
- return ChecksumAlgorithmNone, nil
- }
+func parseChunkChecksum(b *bufio.Reader) (ChecksumAlgorithm, []byte, error) {
+ // Read trailer lines until empty line
+ var checksumAlgorithm ChecksumAlgorithm
+ var checksum []byte
- // Split on ':'
- parts := bytes.SplitN(bytesRead, []byte(":"), 2)
- checksumKey := string(parts[0])
- checksumValue := parts[1]
+ for {
+ bytesRead, err := readChunkLine(b)
+ if err != nil {
+ return ChecksumAlgorithmNone, nil, err
+ }
- // Discard all trailing whitespace characters
- checksumValue = trimTrailingWhitespace(checksumValue)
+ if len(bytesRead) == 0 {
+ break
+ }
- // If the checksum key is not a supported checksum algorithm, return an error.
- // TODO: Bubble that error up to the caller
- extractedAlgorithm, err := extractChecksumAlgorithm(checksumKey)
- if err != nil {
- return ChecksumAlgorithmNone, nil
+ parts := bytes.SplitN(bytesRead, []byte(":"), 2)
+ if len(parts) == 2 {
+ key := string(bytes.TrimSpace(parts[0]))
+ value := bytes.TrimSpace(parts[1])
+ if alg, err := extractChecksumAlgorithm(key); err == nil {
+ if checksumAlgorithm != ChecksumAlgorithmNone {
+ glog.V(3).Infof("multiple checksum headers found in trailer, using last: %s", key)
+ }
+ checksumAlgorithm = alg
+ checksum = value
+ }
+ // Ignore other trailer headers like x-amz-trailer-signature
+ }
}
- return extractedAlgorithm, checksumValue
+ return checksumAlgorithm, checksum, nil
}
func parseChunkSignature(chunk []byte) []byte {
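
Editor's note: the rewritten parseChunkChecksum above loops over trailer lines instead of assuming a single one. Below is a standalone sketch of the assumed wire format and the same loop shape, using plain bufio; the real code goes through readChunkLine and extractChecksumAlgorithm, so treat this only as an illustration.

package main

import (
	"bufio"
	"bytes"
	"fmt"
	"strings"
)

// parseTrailer reads "key:value" trailer lines until a blank line, keeping the
// x-amz-checksum-* header and skipping others such as x-amz-trailer-signature.
func parseTrailer(r *bufio.Reader) (string, []byte, error) {
	var alg string
	var sum []byte
	for {
		line, err := r.ReadBytes('\n')
		if err != nil {
			return "", nil, err
		}
		line = bytes.TrimRight(line, "\r\n")
		if len(line) == 0 {
			return alg, sum, nil
		}
		parts := bytes.SplitN(line, []byte(":"), 2)
		if len(parts) != 2 {
			continue
		}
		key := strings.TrimSpace(string(parts[0]))
		if strings.HasPrefix(key, "x-amz-checksum-") {
			alg = strings.TrimPrefix(key, "x-amz-checksum-")
			sum = bytes.TrimSpace(parts[1])
		}
	}
}

func main() {
	raw := "x-amz-checksum-crc32:YABb/g==\r\nx-amz-trailer-signature:deadbeef\r\n\r\n"
	alg, sum, _ := parseTrailer(bufio.NewReader(strings.NewReader(raw)))
	fmt.Println(alg, string(sum)) // crc32 YABb/g==
}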
diff --git a/weed/s3api/s3api_bucket_config.go b/weed/s3api/s3api_bucket_config.go
index 00449d80a..a10374339 100644
--- a/weed/s3api/s3api_bucket_config.go
+++ b/weed/s3api/s3api_bucket_config.go
@@ -519,7 +519,9 @@ func (s3a *S3ApiServer) getVersioningState(bucket string) (string, error) {
config, errCode := s3a.getBucketConfig(bucket)
if errCode != s3err.ErrNone {
if errCode == s3err.ErrNoSuchBucket {
- return "", nil
+ // Signal to callers that the bucket does not exist so they can
+ // decide whether to auto-create it (e.g., in PUT handlers).
+ return "", filer_pb.ErrNotFound
}
glog.Errorf("getVersioningState: failed to get bucket config for %s: %v", bucket, errCode)
return "", fmt.Errorf("failed to get bucket config: %v", errCode)
diff --git a/weed/s3api/s3api_object_handlers_copy.go b/weed/s3api/s3api_object_handlers_copy.go
index 4e465919c..66d4ded80 100644
--- a/weed/s3api/s3api_object_handlers_copy.go
+++ b/weed/s3api/s3api_object_handlers_copy.go
@@ -199,7 +199,9 @@ func (s3a *S3ApiServer) CopyObjectHandler(w http.ResponseWriter, r *http.Request
}
// Process metadata and tags and apply to destination
- processedMetadata, tagErr := processMetadataBytes(r.Header, entry.Extended, replaceMeta, replaceTagging)
+ // Use dstEntry.Extended (already filtered) as the source, not entry.Extended,
+ // to preserve the encryption header filtering. Fixes GitHub #7562.
+ processedMetadata, tagErr := processMetadataBytes(r.Header, dstEntry.Extended, replaceMeta, replaceTagging)
if tagErr != nil {
s3err.WriteErrorResponse(w, r, s3err.ErrInvalidCopySource)
return
@@ -1543,7 +1545,7 @@ func (s3a *S3ApiServer) copyMultipartSSECChunk(chunk *filer_pb.FileChunk, copySo
}
// copyMultipartCrossEncryption handles all cross-encryption and decrypt-only copy scenarios
-// This unified function supports: SSE-C↔SSE-KMS, SSE-C→Plain, SSE-KMS→Plain
+// This unified function supports: SSE-C↔SSE-KMS↔SSE-S3, and any→Plain
func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *http.Request, state *EncryptionState, dstBucket, dstPath string) ([]*filer_pb.FileChunk, map[string][]byte, error) {
var dstChunks []*filer_pb.FileChunk
@@ -1552,6 +1554,7 @@ func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *h
var destKMSKeyID string
var destKMSEncryptionContext map[string]string
var destKMSBucketKeyEnabled bool
+ var destSSES3Key *SSES3Key
if state.DstSSEC {
var err error
@@ -1565,7 +1568,13 @@ func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *h
if err != nil {
return nil, nil, fmt.Errorf("failed to parse destination SSE-KMS headers: %w", err)
}
- } else {
+ } else if state.DstSSES3 {
+ // Generate SSE-S3 key for destination
+ var err error
+ destSSES3Key, err = GenerateSSES3Key()
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to generate SSE-S3 key: %w", err)
+ }
}
// Parse source encryption parameters
@@ -1584,12 +1593,18 @@ func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *h
var err error
if chunk.GetSseType() == filer_pb.SSEType_SSE_C {
- copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, sourceSSECKey, destSSECKey, destKMSKeyID, destKMSEncryptionContext, destKMSBucketKeyEnabled, dstPath, dstBucket, state)
+ copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, sourceSSECKey, destSSECKey, destKMSKeyID, destKMSEncryptionContext, destKMSBucketKeyEnabled, destSSES3Key, dstPath, dstBucket, state)
} else if chunk.GetSseType() == filer_pb.SSEType_SSE_KMS {
- copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, nil, destSSECKey, destKMSKeyID, destKMSEncryptionContext, destKMSBucketKeyEnabled, dstPath, dstBucket, state)
+ copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, nil, destSSECKey, destKMSKeyID, destKMSEncryptionContext, destKMSBucketKeyEnabled, destSSES3Key, dstPath, dstBucket, state)
+ } else if chunk.GetSseType() == filer_pb.SSEType_SSE_S3 {
+ copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, nil, destSSECKey, destKMSKeyID, destKMSEncryptionContext, destKMSBucketKeyEnabled, destSSES3Key, dstPath, dstBucket, state)
} else {
- // Unencrypted chunk, copy directly
- copiedChunk, err = s3a.copySingleChunk(chunk, dstPath)
+ // Unencrypted chunk - may need encryption if destination requires it
+ if state.DstSSEC || state.DstSSEKMS || state.DstSSES3 {
+ copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, nil, destSSECKey, destKMSKeyID, destKMSEncryptionContext, destKMSBucketKeyEnabled, destSSES3Key, dstPath, dstBucket, state)
+ } else {
+ copiedChunk, err = s3a.copySingleChunk(chunk, dstPath)
+ }
}
if err != nil {
@@ -1640,6 +1655,40 @@ func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *h
} else {
glog.Errorf("Failed to serialize SSE-KMS metadata: %v", serErr)
}
+ } else if state.DstSSES3 && destSSES3Key != nil {
+ // For SSE-S3 destination, create object-level metadata
+ var sses3Metadata *SSES3Key
+ if len(dstChunks) == 0 {
+ // Handle 0-byte files - generate IV for metadata even though there's no content to encrypt
+ if entry.Attributes.FileSize != 0 {
+ return nil, nil, fmt.Errorf("internal error: no chunks created for non-empty SSE-S3 destination object")
+ }
+ // Generate IV for 0-byte object metadata
+ iv := make([]byte, s3_constants.AESBlockSize)
+ if _, err := io.ReadFull(rand.Reader, iv); err != nil {
+ return nil, nil, fmt.Errorf("generate IV for 0-byte object: %w", err)
+ }
+ destSSES3Key.IV = iv
+ sses3Metadata = destSSES3Key
+ } else {
+ // For non-empty objects, use the first chunk's metadata
+ if dstChunks[0].GetSseType() != filer_pb.SSEType_SSE_S3 || len(dstChunks[0].GetSseMetadata()) == 0 {
+ return nil, nil, fmt.Errorf("internal error: first chunk is missing expected SSE-S3 metadata for destination object")
+ }
+ keyManager := GetSSES3KeyManager()
+ var err error
+ sses3Metadata, err = DeserializeSSES3Metadata(dstChunks[0].GetSseMetadata(), keyManager)
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to deserialize SSE-S3 metadata from first chunk: %w", err)
+ }
+ }
+ // Use the derived key with its IV for object-level metadata
+ keyData, serErr := SerializeSSES3Metadata(sses3Metadata)
+ if serErr != nil {
+ return nil, nil, fmt.Errorf("failed to serialize SSE-S3 metadata: %w", serErr)
+ }
+ dstMetadata[s3_constants.SeaweedFSSSES3Key] = keyData
+ dstMetadata[s3_constants.AmzServerSideEncryption] = []byte("AES256")
}
// For unencrypted destination, no metadata needed (dstMetadata remains empty)
@@ -1647,7 +1696,7 @@ func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *h
}
// copyCrossEncryptionChunk handles copying a single chunk with cross-encryption support
-func (s3a *S3ApiServer) copyCrossEncryptionChunk(chunk *filer_pb.FileChunk, sourceSSECKey *SSECustomerKey, destSSECKey *SSECustomerKey, destKMSKeyID string, destKMSEncryptionContext map[string]string, destKMSBucketKeyEnabled bool, dstPath, dstBucket string, state *EncryptionState) (*filer_pb.FileChunk, error) {
+func (s3a *S3ApiServer) copyCrossEncryptionChunk(chunk *filer_pb.FileChunk, sourceSSECKey *SSECustomerKey, destSSECKey *SSECustomerKey, destKMSKeyID string, destKMSEncryptionContext map[string]string, destKMSBucketKeyEnabled bool, destSSES3Key *SSES3Key, dstPath, dstBucket string, state *EncryptionState) (*filer_pb.FileChunk, error) {
// Create destination chunk
dstChunk := s3a.createDestinationChunk(chunk, chunk.Offset, chunk.Size)
@@ -1747,6 +1796,30 @@ func (s3a *S3ApiServer) copyCrossEncryptionChunk(chunk *filer_pb.FileChunk, sour
previewLen = len(finalData)
}
+ } else if chunk.GetSseType() == filer_pb.SSEType_SSE_S3 {
+ // Decrypt SSE-S3 source
+ if len(chunk.GetSseMetadata()) == 0 {
+ return nil, fmt.Errorf("SSE-S3 chunk missing per-chunk metadata")
+ }
+
+ keyManager := GetSSES3KeyManager()
+ sourceSSEKey, err := DeserializeSSES3Metadata(chunk.GetSseMetadata(), keyManager)
+ if err != nil {
+ return nil, fmt.Errorf("failed to deserialize SSE-S3 metadata: %w", err)
+ }
+
+ decryptedReader, decErr := CreateSSES3DecryptedReader(bytes.NewReader(encryptedData), sourceSSEKey, sourceSSEKey.IV)
+ if decErr != nil {
+ return nil, fmt.Errorf("create SSE-S3 decrypted reader: %w", decErr)
+ }
+
+ decryptedData, readErr := io.ReadAll(decryptedReader)
+ if readErr != nil {
+ return nil, fmt.Errorf("decrypt SSE-S3 chunk data: %w", readErr)
+ }
+ finalData = decryptedData
+ glog.V(4).Infof("Decrypted SSE-S3 chunk, size: %d", len(finalData))
+
} else {
// Source is unencrypted
finalData = encryptedData
@@ -1808,6 +1881,36 @@ func (s3a *S3ApiServer) copyCrossEncryptionChunk(chunk *filer_pb.FileChunk, sour
dstChunk.SseMetadata = kmsMetadata
glog.V(4).Infof("Re-encrypted chunk with SSE-KMS")
+
+ } else if state.DstSSES3 && destSSES3Key != nil {
+ // Encrypt with SSE-S3
+ encryptedReader, iv, encErr := CreateSSES3EncryptedReader(bytes.NewReader(finalData), destSSES3Key)
+ if encErr != nil {
+ return nil, fmt.Errorf("create SSE-S3 encrypted reader: %w", encErr)
+ }
+
+ reencryptedData, readErr := io.ReadAll(encryptedReader)
+ if readErr != nil {
+ return nil, fmt.Errorf("re-encrypt with SSE-S3: %w", readErr)
+ }
+ finalData = reencryptedData
+
+ // Create per-chunk SSE-S3 metadata with chunk-specific IV
+ chunkSSEKey := &SSES3Key{
+ Key: destSSES3Key.Key,
+ KeyID: destSSES3Key.KeyID,
+ Algorithm: destSSES3Key.Algorithm,
+ IV: iv,
+ }
+ sses3Metadata, err := SerializeSSES3Metadata(chunkSSEKey)
+ if err != nil {
+ return nil, fmt.Errorf("serialize SSE-S3 metadata: %w", err)
+ }
+
+ dstChunk.SseType = filer_pb.SSEType_SSE_S3
+ dstChunk.SseMetadata = sses3Metadata
+
+ glog.V(4).Infof("Re-encrypted chunk with SSE-S3")
}
// For unencrypted destination, finalData remains as decrypted plaintext
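
Editor's note: to make the per-chunk pattern above concrete, here is a self-contained sketch of encrypting one chunk with a shared data key and a fresh per-chunk IV. It assumes an AES-CTR construction (the IV-offset arithmetic elsewhere in this patch suggests CTR); the real helpers are CreateSSES3EncryptedReader and SerializeSSES3Metadata, so this is illustrative only.

package main

import (
	"crypto/aes"
	"crypto/cipher"
	"crypto/rand"
	"fmt"
	"io"
)

// encryptChunk encrypts one chunk with a shared key and a freshly generated IV,
// so each destination chunk stays independently decryptable.
func encryptChunk(key, plaintext []byte) ([]byte, []byte, error) {
	block, err := aes.NewCipher(key)
	if err != nil {
		return nil, nil, err
	}
	iv := make([]byte, aes.BlockSize)
	if _, err := io.ReadFull(rand.Reader, iv); err != nil {
		return nil, nil, err
	}
	ciphertext := make([]byte, len(plaintext))
	cipher.NewCTR(block, iv).XORKeyStream(ciphertext, plaintext)
	return iv, ciphertext, nil
}

func main() {
	key := make([]byte, 32)
	if _, err := rand.Read(key); err != nil {
		panic(err)
	}
	iv, ct, err := encryptChunk(key, []byte("chunk data"))
	if err != nil {
		panic(err)
	}
	// Decrypt with the stored IV to verify the round trip.
	block, _ := aes.NewCipher(key)
	pt := make([]byte, len(ct))
	cipher.NewCTR(block, iv).XORKeyStream(pt, ct)
	fmt.Println(string(pt)) // chunk data
}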
diff --git a/weed/s3api/s3api_object_handlers_copy_unified.go b/weed/s3api/s3api_object_handlers_copy_unified.go
index 255c3eb2d..f1b4ff280 100644
--- a/weed/s3api/s3api_object_handlers_copy_unified.go
+++ b/weed/s3api/s3api_object_handlers_copy_unified.go
@@ -1,7 +1,6 @@
package s3api
import (
- "context"
"errors"
"fmt"
"net/http"
@@ -133,9 +132,9 @@ func (s3a *S3ApiServer) executeEncryptCopy(entry *filer_pb.Entry, r *http.Reques
}
if state.DstSSES3 {
- // Use streaming copy for SSE-S3 encryption
- chunks, err := s3a.executeStreamingReencryptCopy(entry, r, state, dstPath)
- return chunks, nil, err
+ // Use chunk-by-chunk copy for SSE-S3 encryption (consistent with SSE-C and SSE-KMS)
+ glog.V(2).Infof("Plain→SSE-S3 copy: using unified multipart encrypt copy")
+ return s3a.copyMultipartCrossEncryption(entry, r, state, dstBucket, dstPath)
}
return nil, nil, fmt.Errorf("unknown target encryption type")
@@ -143,30 +142,18 @@ func (s3a *S3ApiServer) executeEncryptCopy(entry *filer_pb.Entry, r *http.Reques
// executeDecryptCopy handles encrypted → plain copies
func (s3a *S3ApiServer) executeDecryptCopy(entry *filer_pb.Entry, r *http.Request, state *EncryptionState, dstPath string) ([]*filer_pb.FileChunk, map[string][]byte, error) {
- // Use unified multipart-aware decrypt copy for all encryption types
- if state.SrcSSEC || state.SrcSSEKMS {
+ // Use unified multipart-aware decrypt copy for all encryption types (consistent chunk-by-chunk)
+ if state.SrcSSEC || state.SrcSSEKMS || state.SrcSSES3 {
glog.V(2).Infof("Encrypted→Plain copy: using unified multipart decrypt copy")
return s3a.copyMultipartCrossEncryption(entry, r, state, "", dstPath)
}
- if state.SrcSSES3 {
- // Use streaming copy for SSE-S3 decryption
- chunks, err := s3a.executeStreamingReencryptCopy(entry, r, state, dstPath)
- return chunks, nil, err
- }
-
return nil, nil, fmt.Errorf("unknown source encryption type")
}
// executeReencryptCopy handles encrypted → encrypted copies with different keys/methods
func (s3a *S3ApiServer) executeReencryptCopy(entry *filer_pb.Entry, r *http.Request, state *EncryptionState, dstBucket, dstPath string) ([]*filer_pb.FileChunk, map[string][]byte, error) {
- // Check if we should use streaming copy for better performance
- if s3a.shouldUseStreamingCopy(entry, state) {
- chunks, err := s3a.executeStreamingReencryptCopy(entry, r, state, dstPath)
- return chunks, nil, err
- }
-
- // Fallback to chunk-by-chunk approach for compatibility
+ // Use chunk-by-chunk approach for all cross-encryption scenarios (consistent behavior)
if state.SrcSSEC && state.DstSSEC {
return s3a.copyChunksWithSSEC(entry, r)
}
@@ -177,83 +164,8 @@ func (s3a *S3ApiServer) executeReencryptCopy(entry *filer_pb.Entry, r *http.Requ
return chunks, dstMetadata, err
}
- if state.SrcSSEC && state.DstSSEKMS {
- // SSE-C → SSE-KMS: use unified multipart-aware cross-encryption copy
- glog.V(2).Infof("SSE-C→SSE-KMS cross-encryption copy: using unified multipart copy")
- return s3a.copyMultipartCrossEncryption(entry, r, state, dstBucket, dstPath)
- }
-
- if state.SrcSSEKMS && state.DstSSEC {
- // SSE-KMS → SSE-C: use unified multipart-aware cross-encryption copy
- glog.V(2).Infof("SSE-KMS→SSE-C cross-encryption copy: using unified multipart copy")
- return s3a.copyMultipartCrossEncryption(entry, r, state, dstBucket, dstPath)
- }
-
- // Handle SSE-S3 cross-encryption scenarios
- if state.SrcSSES3 || state.DstSSES3 {
- // Any scenario involving SSE-S3 uses streaming copy
- chunks, err := s3a.executeStreamingReencryptCopy(entry, r, state, dstPath)
- return chunks, nil, err
- }
-
- return nil, nil, fmt.Errorf("unsupported cross-encryption scenario")
-}
-
-// shouldUseStreamingCopy determines if streaming copy should be used
-func (s3a *S3ApiServer) shouldUseStreamingCopy(entry *filer_pb.Entry, state *EncryptionState) bool {
- // Use streaming copy for large files or when beneficial
- fileSize := entry.Attributes.FileSize
-
- // Use streaming for files larger than 10MB
- if fileSize > 10*1024*1024 {
- return true
- }
-
- // Check if this is a multipart encrypted object
- isMultipartEncrypted := false
- if state.IsSourceEncrypted() {
- encryptedChunks := 0
- for _, chunk := range entry.GetChunks() {
- if chunk.GetSseType() != filer_pb.SSEType_NONE {
- encryptedChunks++
- }
- }
- isMultipartEncrypted = encryptedChunks > 1
- }
-
- // For multipart encrypted objects, avoid streaming copy to use per-chunk metadata approach
- if isMultipartEncrypted {
- glog.V(3).Infof("Multipart encrypted object detected, using chunk-by-chunk approach")
- return false
- }
-
- // Use streaming for cross-encryption scenarios (for single-part objects only)
- if state.IsSourceEncrypted() && state.IsTargetEncrypted() {
- srcType := s3a.getEncryptionTypeString(state.SrcSSEC, state.SrcSSEKMS, state.SrcSSES3)
- dstType := s3a.getEncryptionTypeString(state.DstSSEC, state.DstSSEKMS, state.DstSSES3)
- if srcType != dstType {
- return true
- }
- }
-
- // Use streaming for compressed files
- if isCompressedEntry(entry) {
- return true
- }
-
- // Use streaming for SSE-S3 scenarios (always)
- if state.SrcSSES3 || state.DstSSES3 {
- return true
- }
-
- return false
-}
-
-// executeStreamingReencryptCopy performs streaming re-encryption copy
-func (s3a *S3ApiServer) executeStreamingReencryptCopy(entry *filer_pb.Entry, r *http.Request, state *EncryptionState, dstPath string) ([]*filer_pb.FileChunk, error) {
- // Create streaming copy manager
- streamingManager := NewStreamingCopyManager(s3a)
-
- // Execute streaming copy
- return streamingManager.ExecuteStreamingCopy(context.Background(), entry, r, dstPath, state)
+ // All other cross-encryption scenarios use unified multipart copy
+ // This includes: SSE-C↔SSE-KMS, SSE-C↔SSE-S3, SSE-KMS↔SSE-S3, SSE-S3↔SSE-S3
+ glog.V(2).Infof("Cross-encryption copy: using unified multipart copy")
+ return s3a.copyMultipartCrossEncryption(entry, r, state, dstBucket, dstPath)
}
diff --git a/weed/s3api/s3api_object_handlers_delete.go b/weed/s3api/s3api_object_handlers_delete.go
index f779a6edc..6e373bb4e 100644
--- a/weed/s3api/s3api_object_handlers_delete.go
+++ b/weed/s3api/s3api_object_handlers_delete.go
@@ -1,12 +1,10 @@
package s3api
import (
- "context"
"encoding/xml"
"fmt"
"io"
"net/http"
- "slices"
"strings"
"github.com/seaweedfs/seaweedfs/weed/filer"
@@ -127,22 +125,9 @@ func (s3a *S3ApiServer) DeleteObjectHandler(w http.ResponseWriter, r *http.Reque
dir, name := target.DirAndName()
err := s3a.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
- // Use operation context that won't be cancelled if request terminates
- // This ensures deletion completes atomically to avoid inconsistent state
- opCtx := context.WithoutCancel(r.Context())
-
- if err := doDeleteEntry(client, dir, name, true, false); err != nil {
- return err
- }
-
- // Cleanup empty directories
- if !s3a.option.AllowEmptyFolder && strings.LastIndex(object, "/") > 0 {
- bucketPath := fmt.Sprintf("%s/%s", s3a.option.BucketsPath, bucket)
- // Recursively delete empty parent directories, stop at bucket path
- filer_pb.DoDeleteEmptyParentDirectories(opCtx, client, util.FullPath(dir), util.FullPath(bucketPath), nil)
- }
-
- return nil
+ return doDeleteEntry(client, dir, name, true, false)
+ // Note: Empty folder cleanup is now handled asynchronously by EmptyFolderCleaner,
+ // which listens to metadata events and uses consistent hashing for coordination.
})
if err != nil {
s3err.WriteErrorResponse(w, r, s3err.ErrInternalError)
@@ -222,8 +207,6 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h
var deleteErrors []DeleteError
var auditLog *s3err.AccessLog
- directoriesWithDeletion := make(map[string]bool)
-
if s3err.Logger != nil {
auditLog = s3err.GetAccessLog(r, http.StatusNoContent, s3err.ErrNone)
}
@@ -245,10 +228,6 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h
versioningConfigured := (versioningState != "")
s3a.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
- // Use operation context that won't be cancelled if request terminates
- // This ensures batch deletion completes atomically to avoid inconsistent state
- opCtx := context.WithoutCancel(r.Context())
-
// delete file entries
for _, object := range deleteObjects.Objects {
if object.Key == "" {
@@ -357,10 +336,6 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h
err := doDeleteEntry(client, parentDirectoryPath, entryName, isDeleteData, isRecursive)
if err == nil {
- // Track directory for empty directory cleanup
- if !s3a.option.AllowEmptyFolder {
- directoriesWithDeletion[parentDirectoryPath] = true
- }
deletedObjects = append(deletedObjects, object)
} else if strings.Contains(err.Error(), filer.MsgFailDelNonEmptyFolder) {
deletedObjects = append(deletedObjects, object)
@@ -380,30 +355,8 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h
}
}
- // Cleanup empty directories - optimize by processing deepest first
- if !s3a.option.AllowEmptyFolder && len(directoriesWithDeletion) > 0 {
- bucketPath := fmt.Sprintf("%s/%s", s3a.option.BucketsPath, bucket)
-
- // Collect and sort directories by depth (deepest first) to avoid redundant checks
- var allDirs []string
- for dirPath := range directoriesWithDeletion {
- allDirs = append(allDirs, dirPath)
- }
- // Sort by depth (deeper directories first)
- slices.SortFunc(allDirs, func(a, b string) int {
- return strings.Count(b, "/") - strings.Count(a, "/")
- })
-
- // Track already-checked directories to avoid redundant work
- checked := make(map[string]bool)
- for _, dirPath := range allDirs {
- if !checked[dirPath] {
- // Recursively delete empty parent directories, stop at bucket path
- // Mark this directory and all its parents as checked during recursion
- filer_pb.DoDeleteEmptyParentDirectories(opCtx, client, util.FullPath(dirPath), util.FullPath(bucketPath), checked)
- }
- }
- }
+ // Note: Empty folder cleanup is now handled asynchronously by EmptyFolderCleaner,
+ // which listens to metadata events and uses consistent hashing for coordination.
return nil
})
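
Editor's note: the notes above defer cleanup to the new EmptyFolderCleaner (see weed/filer/empty_folder_cleanup in this change set). As an illustration only — the names below are hypothetical and a plain hash-mod stands in for the consistent hashing the comment mentions — the coordination idea is that every filer deterministically maps a directory to a single owner, so only one of them attempts the cleanup.

package main

import (
	"fmt"
	"hash/fnv"
	"sort"
)

// ownerOf maps a directory to one filer; all filers must agree on the ordering
// of the peer list for the assignment to be consistent.
func ownerOf(dir string, filers []string) string {
	sort.Strings(filers)
	h := fnv.New32a()
	h.Write([]byte(dir))
	return filers[int(h.Sum32()%uint32(len(filers)))]
}

func main() {
	filers := []string{"filer-a:8888", "filer-b:8888", "filer-c:8888"}
	fmt.Println(ownerOf("/buckets/b1/photos/2024", filers))
}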
diff --git a/weed/s3api/s3api_object_handlers_put.go b/weed/s3api/s3api_object_handlers_put.go
index 100796b2e..f848790de 100644
--- a/weed/s3api/s3api_object_handlers_put.go
+++ b/weed/s3api/s3api_object_handlers_put.go
@@ -8,6 +8,7 @@ import (
"fmt"
"io"
"net/http"
+ "net/url"
"path/filepath"
"strconv"
"strings"
@@ -548,6 +549,28 @@ func (s3a *S3ApiServer) putToFiler(r *http.Request, filePath string, dataReader
}
}
+ // Parse and store object tags from X-Amz-Tagging header
+ // Fix for GitHub issue #7589: Tags sent during object upload were not being stored
+ if tagging := r.Header.Get(s3_constants.AmzObjectTagging); tagging != "" {
+ parsedTags, err := url.ParseQuery(tagging)
+ if err != nil {
+ glog.Warningf("putToFiler: Invalid S3 tag format in header '%s': %v", tagging, err)
+ return "", s3err.ErrInvalidTag, SSEResponseMetadata{}
+ }
+ for key, values := range parsedTags {
+ if len(values) > 1 {
+ glog.Warningf("putToFiler: Duplicate tag key '%s' in header", key)
+ return "", s3err.ErrInvalidTag, SSEResponseMetadata{}
+ }
+ value := ""
+ if len(values) > 0 {
+ value = values[0]
+ }
+ entry.Extended[s3_constants.AmzObjectTagging+"-"+key] = []byte(value)
+ }
+ glog.V(3).Infof("putToFiler: stored %d tags from X-Amz-Tagging header", len(parsedTags))
+ }
+
// Set SSE-C metadata
if customerKey != nil && len(sseIV) > 0 {
// Store IV as RAW bytes (matches filer behavior - filer decodes base64 headers and stores raw bytes)
@@ -680,9 +703,9 @@ func filerErrorToS3Error(err error) s3err.ErrorCode {
if err == nil {
return s3err.ErrNone
}
-
+
errString := err.Error()
-
+
switch {
case errString == constants.ErrMsgBadDigest:
return s3err.ErrBadDigest
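
Editor's note: for reference, the X-Amz-Tagging header handled above is a URL-encoded query string. A quick standalone check of how url.ParseQuery splits it, mirroring the validation and the per-key extended-attribute naming used in the patch:

package main

import (
	"fmt"
	"net/url"
)

func main() {
	tags, err := url.ParseQuery("project=seaweedfs&env=prod")
	if err != nil {
		panic(err)
	}
	for key, values := range tags {
		// Mirrors the check above: more than one value per key is an invalid tag set.
		if len(values) > 1 {
			fmt.Println("duplicate tag key:", key)
			continue
		}
		fmt.Printf("X-Amz-Tagging-%s = %q\n", key, values[0])
	}
}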
diff --git a/weed/s3api/s3api_streaming_copy.go b/weed/s3api/s3api_streaming_copy.go
index 457986858..94729c003 100644
--- a/weed/s3api/s3api_streaming_copy.go
+++ b/weed/s3api/s3api_streaming_copy.go
@@ -59,18 +59,19 @@ func NewStreamingCopyManager(s3a *S3ApiServer) *StreamingCopyManager {
}
}
-// ExecuteStreamingCopy performs a streaming copy operation
-func (scm *StreamingCopyManager) ExecuteStreamingCopy(ctx context.Context, entry *filer_pb.Entry, r *http.Request, dstPath string, state *EncryptionState) ([]*filer_pb.FileChunk, error) {
+// ExecuteStreamingCopy performs a streaming copy operation and returns the encryption spec
+// The encryption spec is needed for SSE-S3 to properly set destination metadata (fixes GitHub #7562)
+func (scm *StreamingCopyManager) ExecuteStreamingCopy(ctx context.Context, entry *filer_pb.Entry, r *http.Request, dstPath string, state *EncryptionState) ([]*filer_pb.FileChunk, *EncryptionSpec, error) {
// Create streaming copy specification
spec, err := scm.createStreamingSpec(entry, r, state)
if err != nil {
- return nil, fmt.Errorf("create streaming spec: %w", err)
+ return nil, nil, fmt.Errorf("create streaming spec: %w", err)
}
// Create source reader from entry
sourceReader, err := scm.createSourceReader(entry)
if err != nil {
- return nil, fmt.Errorf("create source reader: %w", err)
+ return nil, nil, fmt.Errorf("create source reader: %w", err)
}
defer sourceReader.Close()
@@ -79,11 +80,16 @@ func (scm *StreamingCopyManager) ExecuteStreamingCopy(ctx context.Context, entry
// Create processing pipeline
processedReader, err := scm.createProcessingPipeline(spec)
if err != nil {
- return nil, fmt.Errorf("create processing pipeline: %w", err)
+ return nil, nil, fmt.Errorf("create processing pipeline: %w", err)
}
// Stream to destination
- return scm.streamToDestination(ctx, processedReader, spec, dstPath)
+ chunks, err := scm.streamToDestination(ctx, processedReader, spec, dstPath)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ return chunks, spec.EncryptionSpec, nil
}
// createStreamingSpec creates a streaming specification based on copy parameters
@@ -453,8 +459,8 @@ func (scm *StreamingCopyManager) streamToChunks(ctx context.Context, reader io.R
for {
n, err := reader.Read(buffer)
if n > 0 {
- // Create chunk for this data
- chunk, chunkErr := scm.createChunkFromData(buffer[:n], offset, dstPath)
+ // Create chunk for this data, setting SSE type and per-chunk metadata (including chunk-specific IVs for SSE-S3)
+ chunk, chunkErr := scm.createChunkFromData(buffer[:n], offset, dstPath, spec.EncryptionSpec)
if chunkErr != nil {
return nil, fmt.Errorf("create chunk from data: %w", chunkErr)
}
@@ -474,7 +480,7 @@ func (scm *StreamingCopyManager) streamToChunks(ctx context.Context, reader io.R
}
// createChunkFromData creates a chunk from streaming data
-func (scm *StreamingCopyManager) createChunkFromData(data []byte, offset int64, dstPath string) (*filer_pb.FileChunk, error) {
+func (scm *StreamingCopyManager) createChunkFromData(data []byte, offset int64, dstPath string, encSpec *EncryptionSpec) (*filer_pb.FileChunk, error) {
// Assign new volume
assignResult, err := scm.s3a.assignNewVolume(dstPath)
if err != nil {
@@ -487,6 +493,42 @@ func (scm *StreamingCopyManager) createChunkFromData(data []byte, offset int64,
Size: uint64(len(data)),
}
+ // Set SSE type and metadata on chunk if destination is encrypted
+ // This is critical for GetObject to know to decrypt the data - fixes GitHub #7562
+ if encSpec != nil && encSpec.NeedsEncryption {
+ switch encSpec.DestinationType {
+ case EncryptionTypeSSEC:
+ chunk.SseType = filer_pb.SSEType_SSE_C
+ // SSE-C metadata is handled at object level, not per-chunk for streaming copy
+ case EncryptionTypeSSEKMS:
+ chunk.SseType = filer_pb.SSEType_SSE_KMS
+ // SSE-KMS metadata is handled at object level, not per-chunk for streaming copy
+ case EncryptionTypeSSES3:
+ chunk.SseType = filer_pb.SSEType_SSE_S3
+ // Create per-chunk SSE-S3 metadata with chunk-specific IV
+ if sseKey, ok := encSpec.DestinationKey.(*SSES3Key); ok {
+ // Calculate chunk-specific IV using base IV and chunk offset
+ baseIV := encSpec.DestinationIV
+ if len(baseIV) == 0 {
+ return nil, fmt.Errorf("SSE-S3 encryption requires DestinationIV to be set for chunk at offset %d", offset)
+ }
+ chunkIV, _ := calculateIVWithOffset(baseIV, offset)
+ // Create chunk key with the chunk-specific IV
+ chunkSSEKey := &SSES3Key{
+ Key: sseKey.Key,
+ KeyID: sseKey.KeyID,
+ Algorithm: sseKey.Algorithm,
+ IV: chunkIV,
+ }
+ chunkMetadata, serErr := SerializeSSES3Metadata(chunkSSEKey)
+ if serErr != nil {
+ return nil, fmt.Errorf("failed to serialize chunk SSE-S3 metadata: %w", serErr)
+ }
+ chunk.SseMetadata = chunkMetadata
+ }
+ }
+ }
+
// Set file ID
if err := scm.s3a.setChunkFileId(chunk, assignResult); err != nil {
return nil, err
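
Editor's note: calculateIVWithOffset above is assumed to advance a CTR counter so that each chunk can be decrypted independently at its byte offset. Below is a standalone sketch of that arithmetic — a big-endian add of offset/16 blocks to the base IV; the real helper may differ in details such as error handling.

package main

import (
	"crypto/aes"
	"encoding/hex"
	"fmt"
)

// ivWithOffset adds the number of full AES blocks covered by offset to the
// big-endian counter held in the base IV, as a CTR-mode offset helper would.
func ivWithOffset(baseIV []byte, offset int64) []byte {
	iv := make([]byte, len(baseIV))
	copy(iv, baseIV)
	carry := uint64(offset) / aes.BlockSize
	for i := len(iv) - 1; i >= 0 && carry > 0; i-- {
		sum := uint64(iv[i]) + (carry & 0xff)
		iv[i] = byte(sum)
		carry = (carry >> 8) + (sum >> 8)
	}
	return iv
}

func main() {
	base := make([]byte, aes.BlockSize) // all-zero IV for illustration
	fmt.Println(hex.EncodeToString(ivWithOffset(base, 0)))           // unchanged
	fmt.Println(hex.EncodeToString(ivWithOffset(base, 4*1024*1024))) // counter advanced by 262144 blocks
}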
diff --git a/weed/server/master_grpc_server.go b/weed/server/master_grpc_server.go
index dcf279e1d..e053d9ea7 100644
--- a/weed/server/master_grpc_server.go
+++ b/weed/server/master_grpc_server.go
@@ -137,8 +137,8 @@ func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServ
dcName, rackName := ms.Topo.Configuration.Locate(heartbeat.Ip, heartbeat.DataCenter, heartbeat.Rack)
dc := ms.Topo.GetOrCreateDataCenter(dcName)
rack := dc.GetOrCreateRack(rackName)
- dn = rack.GetOrCreateDataNode(heartbeat.Ip, int(heartbeat.Port), int(heartbeat.GrpcPort), heartbeat.PublicUrl, heartbeat.MaxVolumeCounts)
- glog.V(0).Infof("added volume server %d: %v:%d %v", dn.Counter, heartbeat.GetIp(), heartbeat.GetPort(), heartbeat.LocationUuids)
+ dn = rack.GetOrCreateDataNode(heartbeat.Ip, int(heartbeat.Port), int(heartbeat.GrpcPort), heartbeat.PublicUrl, heartbeat.Id, heartbeat.MaxVolumeCounts)
+ glog.V(0).Infof("added volume server %d: %v (id=%s, ip=%v:%d) %v", dn.Counter, dn.Id(), heartbeat.Id, heartbeat.GetIp(), heartbeat.GetPort(), heartbeat.LocationUuids)
uuidlist, err := ms.RegisterUuids(heartbeat)
if err != nil {
if stream_err := stream.Send(&master_pb.HeartbeatResponse{
diff --git a/weed/server/master_grpc_server_volume.go b/weed/server/master_grpc_server_volume.go
index a7ef8e7e9..d00cb5df4 100644
--- a/weed/server/master_grpc_server_volume.go
+++ b/weed/server/master_grpc_server_volume.go
@@ -253,7 +253,7 @@ func (ms *MasterServer) LookupEcVolume(ctx context.Context, req *master_pb.Looku
var locations []*master_pb.Location
for _, dn := range shardLocations {
locations = append(locations, &master_pb.Location{
- Url: string(dn.Id()),
+ Url: dn.Url(),
PublicUrl: dn.PublicUrl,
DataCenter: dn.GetDataCenterId(),
})
diff --git a/weed/server/volume_server.go b/weed/server/volume_server.go
index 4f8a7fb0d..65909996a 100644
--- a/weed/server/volume_server.go
+++ b/weed/server/volume_server.go
@@ -55,7 +55,7 @@ type VolumeServer struct {
}
func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string,
- port int, grpcPort int, publicUrl string,
+ port int, grpcPort int, publicUrl string, id string,
folders []string, maxCounts []int32, minFreeSpaces []util.MinFreeSpace, diskTypes []types.DiskType,
idxFolder string,
needleMapKind storage.NeedleMapKind,
@@ -114,7 +114,7 @@ func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string,
vs.checkWithMaster()
- vs.store = storage.NewStore(vs.grpcDialOption, ip, port, grpcPort, publicUrl, folders, maxCounts, minFreeSpaces, idxFolder, vs.needleMapKind, diskTypes, ldbTimeout)
+ vs.store = storage.NewStore(vs.grpcDialOption, ip, port, grpcPort, publicUrl, id, folders, maxCounts, minFreeSpaces, idxFolder, vs.needleMapKind, diskTypes, ldbTimeout)
vs.guard = security.NewGuard(whiteList, signingKey, expiresAfterSec, readSigningKey, readExpiresAfterSec)
handleStaticResources(adminMux)
diff --git a/weed/server/volume_server_handlers_admin.go b/weed/server/volume_server_handlers_admin.go
index ec6490662..a54369277 100644
--- a/weed/server/volume_server_handlers_admin.go
+++ b/weed/server/volume_server_handlers_admin.go
@@ -4,28 +4,33 @@ import (
"net/http"
"path/filepath"
- "github.com/seaweedfs/seaweedfs/weed/topology"
"github.com/seaweedfs/seaweedfs/weed/util/version"
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
"github.com/seaweedfs/seaweedfs/weed/stats"
)
+// healthzHandler checks the local health of the volume server.
+// It only checks local conditions to avoid cascading failures when remote
+// volume servers go down. Previously, this handler checked if all replicated
+// volumes could reach their remote replicas, which caused healthy volume
+// servers to fail health checks when a peer went down.
+// See https://github.com/seaweedfs/seaweedfs/issues/6823
func (vs *VolumeServer) healthzHandler(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Server", "SeaweedFS Volume "+version.VERSION)
- volumeInfos := vs.store.VolumeInfos()
- for _, vinfo := range volumeInfos {
- if len(vinfo.Collection) == 0 {
- continue
- }
- if vinfo.ReplicaPlacement.GetCopyCount() > 1 {
- _, err := topology.GetWritableRemoteReplications(vs.store, vs.grpcDialOption, vinfo.Id, vs.GetMaster)
- if err != nil {
- w.WriteHeader(http.StatusServiceUnavailable)
- return
- }
- }
+
+ // Check if the server is shutting down
+ if vs.store.IsStopping() {
+ w.WriteHeader(http.StatusServiceUnavailable)
+ return
}
+
+ // Check if we can communicate with master
+ if !vs.isHeartbeating {
+ w.WriteHeader(http.StatusServiceUnavailable)
+ return
+ }
+
w.WriteHeader(http.StatusOK)
}
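
Editor's note: a small hedged example of what the relaxed semantics above mean for probes, assuming the handler stays mounted at /healthz on the default volume server port. A 200 now only asserts that this volume server process is up and heartbeating to a master; a 503 no longer implies anything about remote replicas.

package main

import (
	"fmt"
	"net/http"
	"time"
)

func main() {
	client := &http.Client{Timeout: 2 * time.Second}
	resp, err := client.Get("http://localhost:8080/healthz") // adjust host/port as needed
	if err != nil {
		fmt.Println("volume server unreachable:", err)
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusOK {
		fmt.Println("healthy: process up and heartbeating to master")
	} else {
		fmt.Println("unhealthy:", resp.Status) // 503: stopping or lost master heartbeat
	}
}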
diff --git a/weed/sftpd/sftp_file_writer.go b/weed/sftpd/sftp_file_writer.go
index 0a662d021..fed60eec0 100644
--- a/weed/sftpd/sftp_file_writer.go
+++ b/weed/sftpd/sftp_file_writer.go
@@ -72,6 +72,7 @@ func (l listerat) ListAt(ls []os.FileInfo, offset int64) (int, error) {
type SeaweedSftpFileWriter struct {
fs SftpServer
req *sftp.Request
+ absPath string // Absolute path after HomeDir translation
mu sync.Mutex
tmpFile *os.File
permissions os.FileMode
@@ -105,6 +106,6 @@ func (w *SeaweedSftpFileWriter) Close() error {
return err
}
- // Stream the file instead of loading it
- return w.fs.putFile(w.req.Filepath, w.tmpFile, w.fs.user)
+ // Stream the file to the absolute path (after HomeDir translation)
+ return w.fs.putFile(w.absPath, w.tmpFile, w.fs.user)
}
diff --git a/weed/sftpd/sftp_filer.go b/weed/sftpd/sftp_filer.go
index 9baaf41d7..eb196cc28 100644
--- a/weed/sftpd/sftp_filer.go
+++ b/weed/sftpd/sftp_filer.go
@@ -100,18 +100,26 @@ func (fs *SftpServer) withTimeoutContext(fn func(ctx context.Context) error) err
// ==================== Command Dispatcher ====================
func (fs *SftpServer) dispatchCmd(r *sftp.Request) error {
- glog.V(0).Infof("Dispatch: %s %s", r.Method, r.Filepath)
+ absPath, err := fs.toAbsolutePath(r.Filepath)
+ if err != nil {
+ return err
+ }
+ glog.V(1).Infof("Dispatch: %s %s (absolute: %s)", r.Method, r.Filepath, absPath)
switch r.Method {
case "Remove":
- return fs.removeEntry(r)
+ return fs.removeEntry(absPath)
case "Rename":
- return fs.renameEntry(r)
+ absTarget, err := fs.toAbsolutePath(r.Target)
+ if err != nil {
+ return err
+ }
+ return fs.renameEntry(absPath, absTarget)
case "Mkdir":
- return fs.makeDir(r)
+ return fs.makeDir(absPath)
case "Rmdir":
- return fs.removeDir(r)
+ return fs.removeDir(absPath)
case "Setstat":
- return fs.setFileStat(r)
+ return fs.setFileStatWithRequest(absPath, r)
default:
return fmt.Errorf("unsupported: %s", r.Method)
}
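
Editor's note: toAbsolutePath itself is not shown in this hunk. As a hedged sketch of the HomeDir translation it is expected to perform (names and the exact escape check are assumptions), client-relative SFTP paths are joined onto the user's home directory and prevented from escaping it.

package main

import (
	"fmt"
	"path"
	"strings"
)

// toAbs joins a client-supplied SFTP path onto the user's home directory and
// rejects paths that would land outside it after cleaning.
func toAbs(homeDir, sftpPath string) (string, error) {
	abs := path.Join(homeDir, sftpPath) // Join also cleans the result
	if abs != homeDir && !strings.HasPrefix(abs, homeDir+"/") {
		return "", fmt.Errorf("path escapes home directory: %s", sftpPath)
	}
	return abs, nil
}

func main() {
	fmt.Println(toAbs("/buckets/home/alice", "photos/cat.jpg")) // /buckets/home/alice/photos/cat.jpg <nil>
	fmt.Println(toAbs("/buckets/home/alice", "../bob/secret"))  // "" path escapes home directory: ../bob/secret
}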
@@ -120,10 +128,14 @@ func (fs *SftpServer) dispatchCmd(r *sftp.Request) error {
// ==================== File Operations ====================
func (fs *SftpServer) readFile(r *sftp.Request) (io.ReaderAt, error) {
- if err := fs.checkFilePermission(r.Filepath, "read"); err != nil {
+ absPath, err := fs.toAbsolutePath(r.Filepath)
+ if err != nil {
+ return nil, err
+ }
+ if err := fs.checkFilePermission(absPath, "read"); err != nil {
return nil, err
}
- entry, err := fs.getEntry(r.Filepath)
+ entry, err := fs.getEntry(absPath)
if err != nil {
return nil, err
}
@@ -131,7 +143,11 @@ func (fs *SftpServer) readFile(r *sftp.Request) (io.ReaderAt, error) {
}
func (fs *SftpServer) newFileWriter(r *sftp.Request) (io.WriterAt, error) {
- dir, _ := util.FullPath(r.Filepath).DirAndName()
+ absPath, err := fs.toAbsolutePath(r.Filepath)
+ if err != nil {
+ return nil, err
+ }
+ dir, _ := util.FullPath(absPath).DirAndName()
if err := fs.checkFilePermission(dir, "write"); err != nil {
glog.Errorf("Permission denied for %s", dir)
return nil, err
@@ -145,6 +161,7 @@ func (fs *SftpServer) newFileWriter(r *sftp.Request) (io.WriterAt, error) {
return &SeaweedSftpFileWriter{
fs: *fs,
req: r,
+ absPath: absPath,
tmpFile: tmpFile,
permissions: 0644,
uid: fs.user.Uid,
@@ -153,16 +170,20 @@ func (fs *SftpServer) newFileWriter(r *sftp.Request) (io.WriterAt, error) {
}, nil
}
-func (fs *SftpServer) removeEntry(r *sftp.Request) error {
- return fs.deleteEntry(r.Filepath, false)
+func (fs *SftpServer) removeEntry(absPath string) error {
+ return fs.deleteEntry(absPath, false)
}
-func (fs *SftpServer) renameEntry(r *sftp.Request) error {
- if err := fs.checkFilePermission(r.Filepath, "rename"); err != nil {
+func (fs *SftpServer) renameEntry(absPath, absTarget string) error {
+ if err := fs.checkFilePermission(absPath, "rename"); err != nil {
+ return err
+ }
+ targetDir, _ := util.FullPath(absTarget).DirAndName()
+ if err := fs.checkFilePermission(targetDir, "write"); err != nil {
return err
}
- oldDir, oldName := util.FullPath(r.Filepath).DirAndName()
- newDir, newName := util.FullPath(r.Target).DirAndName()
+ oldDir, oldName := util.FullPath(absPath).DirAndName()
+ newDir, newName := util.FullPath(absTarget).DirAndName()
return fs.callWithClient(false, func(ctx context.Context, client filer_pb.SeaweedFilerClient) error {
_, err := client.AtomicRenameEntry(ctx, &filer_pb.AtomicRenameEntryRequest{
OldDirectory: oldDir, OldName: oldName,
@@ -172,15 +193,15 @@ func (fs *SftpServer) renameEntry(r *sftp.Request) error {
})
}
-func (fs *SftpServer) setFileStat(r *sftp.Request) error {
- if err := fs.checkFilePermission(r.Filepath, "write"); err != nil {
+func (fs *SftpServer) setFileStatWithRequest(absPath string, r *sftp.Request) error {
+ if err := fs.checkFilePermission(absPath, "write"); err != nil {
return err
}
- entry, err := fs.getEntry(r.Filepath)
+ entry, err := fs.getEntry(absPath)
if err != nil {
return err
}
- dir, _ := util.FullPath(r.Filepath).DirAndName()
+ dir, _ := util.FullPath(absPath).DirAndName()
// apply attrs
if r.AttrFlags().Permissions {
entry.Attributes.FileMode = uint32(r.Attributes().FileMode())
@@ -201,18 +222,22 @@ func (fs *SftpServer) setFileStat(r *sftp.Request) error {
// ==================== Directory Operations ====================
func (fs *SftpServer) listDir(r *sftp.Request) (sftp.ListerAt, error) {
- if err := fs.checkFilePermission(r.Filepath, "list"); err != nil {
+ absPath, err := fs.toAbsolutePath(r.Filepath)
+ if err != nil {
+ return nil, err
+ }
+ if err := fs.checkFilePermission(absPath, "list"); err != nil {
return nil, err
}
if r.Method == "Stat" || r.Method == "Lstat" {
- entry, err := fs.getEntry(r.Filepath)
+ entry, err := fs.getEntry(absPath)
if err != nil {
return nil, err
}
fi := &EnhancedFileInfo{FileInfo: FileInfoFromEntry(entry), uid: entry.Attributes.Uid, gid: entry.Attributes.Gid}
return listerat([]os.FileInfo{fi}), nil
}
- return fs.listAllPages(r.Filepath)
+ return fs.listAllPages(absPath)
}
func (fs *SftpServer) listAllPages(dirPath string) (sftp.ListerAt, error) {
@@ -259,18 +284,19 @@ func (fs *SftpServer) fetchDirectoryPage(dirPath, start string) ([]os.FileInfo,
}
// makeDir creates a new directory with proper permissions.
-func (fs *SftpServer) makeDir(r *sftp.Request) error {
+func (fs *SftpServer) makeDir(absPath string) error {
if fs.user == nil {
return fmt.Errorf("cannot create directory: no user info")
}
- dir, name := util.FullPath(r.Filepath).DirAndName()
- if err := fs.checkFilePermission(r.Filepath, "mkdir"); err != nil {
+ dir, name := util.FullPath(absPath).DirAndName()
+ if err := fs.checkFilePermission(dir, "write"); err != nil {
return err
}
// default mode and ownership
err := filer_pb.Mkdir(context.Background(), fs, string(dir), name, func(entry *filer_pb.Entry) {
mode := uint32(0755 | os.ModeDir)
- if strings.HasPrefix(r.Filepath, fs.user.HomeDir) {
+ // Defensive check: all paths should be under HomeDir after toAbsolutePath translation
+ if absPath == fs.user.HomeDir || strings.HasPrefix(absPath, fs.user.HomeDir+"/") {
mode = uint32(0700 | os.ModeDir)
}
entry.Attributes.FileMode = mode
@@ -288,8 +314,8 @@ func (fs *SftpServer) makeDir(r *sftp.Request) error {
}
// removeDir deletes a directory.
-func (fs *SftpServer) removeDir(r *sftp.Request) error {
- return fs.deleteEntry(r.Filepath, false)
+func (fs *SftpServer) removeDir(absPath string) error {
+ return fs.deleteEntry(absPath, false)
}
func (fs *SftpServer) putFile(filepath string, reader io.Reader, user *user.User) error {
diff --git a/weed/sftpd/sftp_server.go b/weed/sftpd/sftp_server.go
index f158aeb64..e53098e6b 100644
--- a/weed/sftpd/sftp_server.go
+++ b/weed/sftpd/sftp_server.go
@@ -6,6 +6,8 @@ import (
"fmt"
"io"
"os"
+ "path"
+ "strings"
"time"
"github.com/pkg/sftp"
@@ -37,6 +39,28 @@ func NewSftpServer(filerAddr pb.ServerAddress, grpcDialOption grpc.DialOption, d
}
}
+// toAbsolutePath translates a user-relative path to an absolute filer path.
+// When a user has HomeDir="/sftp/user", their view of "/" maps to "/sftp/user".
+// This implements chroot-like behavior where the user's home directory
+// becomes their root.
+func (fs *SftpServer) toAbsolutePath(userPath string) (string, error) {
+ // If user has root as home directory, no translation needed
+ if fs.user.HomeDir == "" || fs.user.HomeDir == "/" {
+ return path.Clean(userPath), nil
+ }
+
+ // Concatenate home directory with user path, then clean to resolve any ".." components
+ p := path.Join(fs.user.HomeDir, strings.TrimPrefix(userPath, "/"))
+
+ // Security check: ensure the final path is within the home directory.
+ // This prevents path traversal attacks like `../..` that could escape the chroot jail.
+ if !strings.HasPrefix(p, fs.user.HomeDir+"/") && p != fs.user.HomeDir {
+ return "", fmt.Errorf("path traversal attempt: %s resolves to %s which is outside home dir %s", userPath, p, fs.user.HomeDir)
+ }
+
+ return p, nil
+}
+
// Fileread is invoked for “get” requests.
func (fs *SftpServer) Fileread(req *sftp.Request) (io.ReaderAt, error) {
return fs.readFile(req)
diff --git a/weed/sftpd/sftp_server_test.go b/weed/sftpd/sftp_server_test.go
new file mode 100644
index 000000000..0af94ca14
--- /dev/null
+++ b/weed/sftpd/sftp_server_test.go
@@ -0,0 +1,103 @@
+package sftpd
+
+import (
+ "testing"
+
+ "github.com/seaweedfs/seaweedfs/weed/sftpd/user"
+ "github.com/stretchr/testify/assert"
+)
+
+func stringPtr(s string) *string {
+ return &s
+}
+
+func TestToAbsolutePath(t *testing.T) {
+ tests := []struct {
+ name string
+ homeDir *string // Use pointer to distinguish between unset and empty
+ userPath string
+ expected string
+ expectError bool
+ }{
+ {
+ name: "normal path",
+ userPath: "/foo.txt",
+ expected: "/sftp/testuser/foo.txt",
+ },
+ {
+ name: "root path",
+ userPath: "/",
+ expected: "/sftp/testuser",
+ },
+ {
+ name: "path with dot",
+ userPath: "/./foo.txt",
+ expected: "/sftp/testuser/foo.txt",
+ },
+ {
+ name: "path traversal attempts",
+ userPath: "/../foo.txt",
+ expectError: true,
+ },
+ {
+ name: "path traversal attempts 2",
+ userPath: "../../foo.txt",
+ expectError: true,
+ },
+ {
+ name: "path traversal attempts 3",
+ userPath: "/subdir/../../foo.txt",
+ expectError: true,
+ },
+ {
+ name: "empty path",
+ userPath: "",
+ expected: "/sftp/testuser",
+ },
+ {
+ name: "multiple slashes",
+ userPath: "//foo.txt",
+ expected: "/sftp/testuser/foo.txt",
+ },
+ {
+ name: "trailing slash",
+ userPath: "/foo/",
+ expected: "/sftp/testuser/foo",
+ },
+ {
+ name: "empty HomeDir passthrough",
+ homeDir: stringPtr(""),
+ userPath: "/foo.txt",
+ expected: "/foo.txt",
+ },
+ {
+ name: "root HomeDir passthrough",
+ homeDir: stringPtr("/"),
+ userPath: "/foo.txt",
+ expected: "/foo.txt",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ homeDir := "/sftp/testuser" // default
+ if tt.homeDir != nil {
+ homeDir = *tt.homeDir
+ }
+
+ fs := &SftpServer{
+ user: &user.User{
+ HomeDir: homeDir,
+ },
+ }
+
+ got, err := fs.toAbsolutePath(tt.userPath)
+ if tt.expectError {
+ assert.Error(t, err)
+ } else {
+ assert.NoError(t, err)
+ assert.Equal(t, tt.expected, got)
+ }
+ })
+ }
+}
diff --git a/weed/sftpd/sftp_service.go b/weed/sftpd/sftp_service.go
index e50bd87ba..4d21815a9 100644
--- a/weed/sftpd/sftp_service.go
+++ b/weed/sftpd/sftp_service.go
@@ -284,8 +284,8 @@ func (s *SFTPService) handleChannel(newChannel ssh.NewChannel, fs *SftpServer) {
// handleSFTP starts the SFTP server on the SSH channel.
func (s *SFTPService) handleSFTP(channel ssh.Channel, fs *SftpServer) {
- // Create server options with initial working directory set to user's home
- serverOptions := sftp.WithStartDirectory(fs.user.HomeDir)
+ // Start at virtual root "/" - toAbsolutePath translates this to the user's HomeDir
+ serverOptions := sftp.WithStartDirectory("/")
server := sftp.NewRequestServer(channel, sftp.Handlers{
FileGet: fs,
FilePut: fs,
diff --git a/weed/sftpd/user/filestore.go b/weed/sftpd/user/filestore.go
index c522a388a..4c372aa76 100644
--- a/weed/sftpd/user/filestore.go
+++ b/weed/sftpd/user/filestore.go
@@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"os"
+ "path"
"sync"
"golang.org/x/crypto/ssh"
@@ -99,6 +100,10 @@ func (s *FileStore) loadUsers() error {
user.PublicKeys[i] = string(pubKey.Marshal())
}
}
+ // Clean HomeDir to handle trailing slashes and normalize path
+ if user.HomeDir != "" {
+ user.HomeDir = path.Clean(user.HomeDir)
+ }
s.users[user.Username] = user
}
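The path.Clean above keeps HomeDir in canonical form so the prefix checks in toAbsolutePath behave consistently; a tiny illustration with made-up values:

package main

import (
	"fmt"
	"path"
)

func main() {
	// Trailing slashes and redundant path elements are normalized away.
	fmt.Println(path.Clean("/sftp/alice/"))   // /sftp/alice
	fmt.Println(path.Clean("/sftp//alice/.")) // /sftp/alice
}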
diff --git a/weed/shell/command_ec_common.go b/weed/shell/command_ec_common.go
index f059b4e74..f2cc581da 100644
--- a/weed/shell/command_ec_common.go
+++ b/weed/shell/command_ec_common.go
@@ -26,12 +26,25 @@ type DataCenterId string
type EcNodeId string
type RackId string
+// EcDisk represents a single disk on a volume server
+type EcDisk struct {
+ diskId uint32
+ diskType string
+ freeEcSlots int
+ ecShardCount int // Total EC shards on this disk
+ // Map of volumeId -> shardBits for shards on this disk
+ ecShards map[needle.VolumeId]erasure_coding.ShardBits
+}
+
type EcNode struct {
info *master_pb.DataNodeInfo
dc DataCenterId
rack RackId
freeEcSlot int
+ // disks maps diskId -> EcDisk for disk-level balancing
+ disks map[uint32]*EcDisk
}
+
type CandidateEcNode struct {
ecNode *EcNode
shardCount int
@@ -229,7 +242,7 @@ func collectCollectionsForVolumeIds(t *master_pb.TopologyInfo, vids []needle.Vol
return collections
}
-func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, destinationEcNode *EcNode, applyBalancing bool) (err error) {
+func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, destinationEcNode *EcNode, destDiskId uint32, applyBalancing bool) (err error) {
if !commandEnv.isLocked() {
return fmt.Errorf("lock is lost")
@@ -242,7 +255,7 @@ func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode,
existingServerAddress := pb.NewServerAddressFromDataNode(existingLocation.info)
// ask destination node to copy shard and the ecx file from source node, and mount it
- copiedShardIds, err = oneServerCopyAndMountEcShardsFromSource(commandEnv.option.GrpcDialOption, destinationEcNode, []uint32{uint32(shardId)}, vid, collection, existingServerAddress)
+ copiedShardIds, err = oneServerCopyAndMountEcShardsFromSource(commandEnv.option.GrpcDialOption, destinationEcNode, []uint32{uint32(shardId)}, vid, collection, existingServerAddress, destDiskId)
if err != nil {
return err
}
@@ -259,7 +272,11 @@ func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode,
return err
}
- fmt.Printf("moved ec shard %d.%d %s => %s\n", vid, shardId, existingLocation.info.Id, destinationEcNode.info.Id)
+ if destDiskId > 0 {
+ fmt.Printf("moved ec shard %d.%d %s => %s (disk %d)\n", vid, shardId, existingLocation.info.Id, destinationEcNode.info.Id, destDiskId)
+ } else {
+ fmt.Printf("moved ec shard %d.%d %s => %s\n", vid, shardId, existingLocation.info.Id, destinationEcNode.info.Id)
+ }
}
@@ -272,7 +289,7 @@ func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode,
func oneServerCopyAndMountEcShardsFromSource(grpcDialOption grpc.DialOption,
targetServer *EcNode, shardIdsToCopy []uint32,
- volumeId needle.VolumeId, collection string, existingLocation pb.ServerAddress) (copiedShardIds []uint32, err error) {
+ volumeId needle.VolumeId, collection string, existingLocation pb.ServerAddress, destDiskId uint32) (copiedShardIds []uint32, err error) {
fmt.Printf("allocate %d.%v %s => %s\n", volumeId, shardIdsToCopy, existingLocation, targetServer.info.Id)
@@ -289,6 +306,7 @@ func oneServerCopyAndMountEcShardsFromSource(grpcDialOption grpc.DialOption,
CopyEcjFile: true,
CopyVifFile: true,
SourceDataNode: string(existingLocation),
+ DiskId: destDiskId,
})
if copyErr != nil {
return fmt.Errorf("copy %d.%v %s => %s : %v\n", volumeId, shardIdsToCopy, existingLocation, targetServer.info.Id, copyErr)
@@ -410,12 +428,74 @@ func collectEcVolumeServersByDc(topo *master_pb.TopologyInfo, selectedDataCenter
}
freeEcSlots := countFreeShardSlots(dn, types.HardDriveType)
- ecNodes = append(ecNodes, &EcNode{
+ ecNode := &EcNode{
info: dn,
dc: dc,
rack: rack,
freeEcSlot: int(freeEcSlots),
- })
+ disks: make(map[uint32]*EcDisk),
+ }
+
+ // Build disk-level information from volumes and EC shards
+ // First, discover all unique disk IDs from VolumeInfos (includes empty disks)
+ allDiskIds := make(map[uint32]string) // diskId -> diskType
+ for diskType, diskInfo := range dn.DiskInfos {
+ if diskInfo == nil {
+ continue
+ }
+ // Get all disk IDs from volumes
+ for _, vi := range diskInfo.VolumeInfos {
+ allDiskIds[vi.DiskId] = diskType
+ }
+ // Also get disk IDs from EC shards
+ for _, ecShardInfo := range diskInfo.EcShardInfos {
+ allDiskIds[ecShardInfo.DiskId] = diskType
+ }
+ }
+
+ // Group EC shards by disk_id
+ diskShards := make(map[uint32]map[needle.VolumeId]erasure_coding.ShardBits)
+ for _, diskInfo := range dn.DiskInfos {
+ if diskInfo == nil {
+ continue
+ }
+ for _, ecShardInfo := range diskInfo.EcShardInfos {
+ diskId := ecShardInfo.DiskId
+ if diskShards[diskId] == nil {
+ diskShards[diskId] = make(map[needle.VolumeId]erasure_coding.ShardBits)
+ }
+ vid := needle.VolumeId(ecShardInfo.Id)
+ diskShards[diskId][vid] = erasure_coding.ShardBits(ecShardInfo.EcIndexBits)
+ }
+ }
+
+ // Create EcDisk for each discovered disk
+ diskCount := len(allDiskIds)
+ if diskCount == 0 {
+ diskCount = 1
+ }
+ freePerDisk := int(freeEcSlots) / diskCount
+
+ for diskId, diskType := range allDiskIds {
+ shards := diskShards[diskId]
+ if shards == nil {
+ shards = make(map[needle.VolumeId]erasure_coding.ShardBits)
+ }
+ totalShardCount := 0
+ for _, shardBits := range shards {
+ totalShardCount += shardBits.ShardIdCount()
+ }
+
+ ecNode.disks[diskId] = &EcDisk{
+ diskId: diskId,
+ diskType: diskType,
+ freeEcSlots: freePerDisk,
+ ecShardCount: totalShardCount,
+ ecShards: shards,
+ }
+ }
+
+ ecNodes = append(ecNodes, ecNode)
totalFreeEcSlots += freeEcSlots
})
return
@@ -884,10 +964,16 @@ func (ecb *ecBalancer) doBalanceEcRack(ecRack *EcRack) error {
for _, shards := range fullDiskInfo.EcShardInfos {
if _, found := emptyNodeIds[shards.Id]; !found {
for _, shardId := range erasure_coding.ShardBits(shards.EcIndexBits).ShardIds() {
+ vid := needle.VolumeId(shards.Id)
+ destDiskId := pickBestDiskOnNode(emptyNode, vid)
- fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id)
+ if destDiskId > 0 {
+ fmt.Printf("%s moves ec shards %d.%d to %s (disk %d)\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id, destDiskId)
+ } else {
+ fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id)
+ }
- err := moveMountedShardToEcNode(ecb.commandEnv, fullNode, shards.Collection, needle.VolumeId(shards.Id), shardId, emptyNode, ecb.applyBalancing)
+ err := moveMountedShardToEcNode(ecb.commandEnv, fullNode, shards.Collection, vid, shardId, emptyNode, destDiskId, ecb.applyBalancing)
if err != nil {
return err
}
@@ -957,18 +1043,98 @@ func (ecb *ecBalancer) pickEcNodeToBalanceShardsInto(vid needle.VolumeId, existi
if len(targets) == 0 {
return nil, errors.New(details)
}
+
+ // When multiple nodes have the same shard count, prefer nodes with better disk distribution
+ // (i.e., nodes with more disks that have fewer shards of this volume)
+ if len(targets) > 1 {
+ slices.SortFunc(targets, func(a, b *EcNode) int {
+ aScore := diskDistributionScore(a, vid)
+ bScore := diskDistributionScore(b, vid)
+ return aScore - bScore // Lower score is better
+ })
+ return targets[0], nil
+ }
+
return targets[rand.IntN(len(targets))], nil
}
+// diskDistributionScore calculates a score for how well-distributed shards are on the node's disks
+// Lower score is better (means more room for balanced distribution)
+func diskDistributionScore(ecNode *EcNode, vid needle.VolumeId) int {
+ if len(ecNode.disks) == 0 {
+ return 0
+ }
+
+ // Sum the existing shard count for this volume on each disk
+ // Lower total means more room for new shards
+ score := 0
+ for _, disk := range ecNode.disks {
+ if shardBits, ok := disk.ecShards[vid]; ok {
+ score += shardBits.ShardIdCount() * 10 // Weight shards of this volume heavily
+ }
+ score += disk.ecShardCount // Also consider total shards on disk
+ }
+ return score
+}
+
+// pickBestDiskOnNode selects the best disk on a node for placing a new EC shard
+// It prefers disks with fewer shards and more free slots
+func pickBestDiskOnNode(ecNode *EcNode, vid needle.VolumeId) uint32 {
+ if len(ecNode.disks) == 0 {
+ return 0 // No disk info available, let the server decide
+ }
+
+ var bestDiskId uint32
+ bestScore := -1
+
+ for diskId, disk := range ecNode.disks {
+ if disk.freeEcSlots <= 0 {
+ continue
+ }
+
+ // Check if this volume already has shards on this disk
+ existingShards := 0
+ if shardBits, ok := disk.ecShards[vid]; ok {
+ existingShards = shardBits.ShardIdCount()
+ }
+
+ // Score: prefer disks with fewer total shards and fewer shards of this volume
+ // Lower score is better
+ score := disk.ecShardCount*10 + existingShards*100
+
+ if bestScore == -1 || score < bestScore {
+ bestScore = score
+ bestDiskId = diskId
+ }
+ }
+
+ return bestDiskId
+}
+
+// pickEcNodeAndDiskToBalanceShardsInto picks both a destination node and specific disk
+func (ecb *ecBalancer) pickEcNodeAndDiskToBalanceShardsInto(vid needle.VolumeId, existingLocation *EcNode, possibleDestinations []*EcNode) (*EcNode, uint32, error) {
+ node, err := ecb.pickEcNodeToBalanceShardsInto(vid, existingLocation, possibleDestinations)
+ if err != nil {
+ return nil, 0, err
+ }
+
+ diskId := pickBestDiskOnNode(node, vid)
+ return node, diskId, nil
+}
+
func (ecb *ecBalancer) pickOneEcNodeAndMoveOneShard(existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, possibleDestinationEcNodes []*EcNode) error {
- destNode, err := ecb.pickEcNodeToBalanceShardsInto(vid, existingLocation, possibleDestinationEcNodes)
+ destNode, destDiskId, err := ecb.pickEcNodeAndDiskToBalanceShardsInto(vid, existingLocation, possibleDestinationEcNodes)
if err != nil {
- fmt.Printf("WARNING: Could not find suitable taget node for %d.%d:\n%s", vid, shardId, err.Error())
+ fmt.Printf("WARNING: Could not find suitable target node for %d.%d:\n%s", vid, shardId, err.Error())
return nil
}
- fmt.Printf("%s moves ec shard %d.%d to %s\n", existingLocation.info.Id, vid, shardId, destNode.info.Id)
- return moveMountedShardToEcNode(ecb.commandEnv, existingLocation, collection, vid, shardId, destNode, ecb.applyBalancing)
+ if destDiskId > 0 {
+ fmt.Printf("%s moves ec shard %d.%d to %s (disk %d)\n", existingLocation.info.Id, vid, shardId, destNode.info.Id, destDiskId)
+ } else {
+ fmt.Printf("%s moves ec shard %d.%d to %s\n", existingLocation.info.Id, vid, shardId, destNode.info.Id)
+ }
+ return moveMountedShardToEcNode(ecb.commandEnv, existingLocation, collection, vid, shardId, destNode, destDiskId, ecb.applyBalancing)
}
func pickNEcShardsToMoveFrom(ecNodes []*EcNode, vid needle.VolumeId, n int) map[erasure_coding.ShardId]*EcNode {
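A worked sketch of the disk-scoring heuristic above. It would have to live in package shell (with fmt, needle, and erasure_coding imported) because the EcNode and EcDisk fields are unexported, and the numbers are made up purely for illustration: each existing shard of the same volume costs 100 points, overall occupancy costs 10 points per shard, and the lowest score wins.

// Hypothetical example, assumed to sit in package shell next to pickBestDiskOnNode.
func ExamplePickBestDiskOnNode() {
	vid := needle.VolumeId(42)
	node := &EcNode{
		disks: map[uint32]*EcDisk{
			1: {diskId: 1, freeEcSlots: 4, ecShardCount: 5,
				ecShards: map[needle.VolumeId]erasure_coding.ShardBits{}},
			2: {diskId: 2, freeEcSlots: 4, ecShardCount: 2,
				ecShards: map[needle.VolumeId]erasure_coding.ShardBits{vid: 0x1}}, // already holds shard 0 of vid
		},
	}
	// disk 1 scores 5*10 + 0*100 = 50; disk 2 scores 2*10 + 1*100 = 120.
	// Lower is better, so disk 1 is selected.
	fmt.Println(pickBestDiskOnNode(node, vid))
	// Output: 1
}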
diff --git a/weed/shell/command_volume_check_disk.go b/weed/shell/command_volume_check_disk.go
index d7b015979..4d775000f 100644
--- a/weed/shell/command_volume_check_disk.go
+++ b/weed/shell/command_volume_check_disk.go
@@ -10,6 +10,8 @@ import (
"math"
"math/rand/v2"
"net/http"
+ "strings"
+ "sync"
"time"
"slices"
@@ -32,6 +34,7 @@ type commandVolumeCheckDisk struct{}
type volumeCheckDisk struct {
commandEnv *CommandEnv
writer io.Writer
+ writerMu sync.Mutex
now time.Time
slowMode bool
@@ -40,6 +43,8 @@ type volumeCheckDisk struct {
syncDeletions bool
fixReadOnly bool
nonRepairThreshold float64
+
+ ewg *ErrorWaitGroup
}
func (c *commandVolumeCheckDisk) Name() string {
@@ -59,9 +64,9 @@ func (c *commandVolumeCheckDisk) Help() string {
append entries in B and not in A to A
optionally, for each non-writable volume replica A
- if volume is not full
+ select a writable volume replica B
+ if entries in A don't match B
prune late volume entries not matching its index file
- select a writable volume replica B
append missing entries from B into A
mark the volume as writable (healthy)
@@ -92,6 +97,7 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write
applyChangesAlias := fsckCommand.Bool("force", false, "apply the fix (alias for -apply)")
fixReadOnly := fsckCommand.Bool("fixReadOnly", false, "apply the fix even on readonly volumes (EXPERIMENTAL!)")
syncDeletions := fsckCommand.Bool("syncDeleted", false, "sync of deletions the fix")
+ maxParallelization := fsckCommand.Int("maxParallelization", DefaultMaxParallelization, "run up to X tasks in parallel, whenever possible")
nonRepairThreshold := fsckCommand.Float64("nonRepairThreshold", 0.3, "repair when missing keys is not more than this limit")
if err = fsckCommand.Parse(args); err != nil {
return nil
@@ -115,6 +121,8 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write
syncDeletions: *syncDeletions,
fixReadOnly: *fixReadOnly,
nonRepairThreshold: *nonRepairThreshold,
+
+ ewg: NewErrorWaitGroup(*maxParallelization),
}
// collect topology information
@@ -137,23 +145,21 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write
if err := vcd.checkWritableVolumes(volumeReplicas); err != nil {
return err
}
- if err := vcd.checkReadOnlyVolumes(volumeReplicas); err != nil {
- return err
- }
+ vcd.checkReadOnlyVolumes(volumeReplicas)
- return nil
+ return vcd.ewg.Wait()
}
// checkWritableVolumes fixes volume replicas which are not read-only.
func (vcd *volumeCheckDisk) checkWritableVolumes(volumeReplicas map[uint32][]*VolumeReplica) error {
- vcd.write("Pass #1 (writable volumes)\n")
+ vcd.write("Pass #1 (writable volumes)")
for _, replicas := range volumeReplicas {
// filter readonly replica
var writableReplicas []*VolumeReplica
for _, replica := range replicas {
if replica.info.ReadOnly {
- vcd.write("skipping readonly volume %d on %s\n", replica.info.Id, replica.location.dataNode.Id)
+ vcd.write("skipping readonly volume %d on %s", replica.info.Id, replica.location.dataNode.Id)
} else {
writableReplicas = append(writableReplicas, replica)
}
@@ -166,16 +172,23 @@ func (vcd *volumeCheckDisk) checkWritableVolumes(volumeReplicas map[uint32][]*Vo
a, b := writableReplicas[0], writableReplicas[1]
shouldSkip, err := vcd.shouldSkipVolume(a, b)
if err != nil {
- vcd.write("error checking if volume %d should be skipped: %v\n", a.info.Id, err)
+ vcd.write("error checking if volume %d should be skipped: %v", a.info.Id, err)
// Continue with sync despite error to be safe
} else if shouldSkip {
// always choose the larger volume to be the source
writableReplicas = append(writableReplicas[:1], writableReplicas[2:]...)
continue
}
- if err := vcd.syncTwoReplicas(a, b, true); err != nil {
- vcd.write("sync volume %d on %s and %s: %v\n", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, err)
+
+ modified, err := vcd.syncTwoReplicas(a, b, true)
+ if err != nil {
+ vcd.write("failed to sync volumes %d on %s and %s: %v", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, err)
+ } else {
+ if modified {
+ vcd.write("synced %s and %s for volume %d", a.location.dataNode.Id, b.location.dataNode.Id, a.info.Id)
+ }
}
+
// always choose the larger volume to be the source
if a.info.FileCount > b.info.FileCount {
writableReplicas = append(writableReplicas[:1], writableReplicas[2:]...)
@@ -204,7 +217,7 @@ func (vcd *volumeCheckDisk) makeVolumeWritable(vid uint32, vr *VolumeReplica) er
return err
}
- vcd.write("volume %d on %s is now writable\n", vid, vr.location.dataNode.Id)
+ vcd.write("volume %d on %s is now writable", vid, vr.location.dataNode.Id)
return nil
}
@@ -224,15 +237,15 @@ func (vcd *volumeCheckDisk) makeVolumeReadonly(vid uint32, vr *VolumeReplica) er
return err
}
- vcd.write("volume %d on %s is now read-only\n", vid, vr.location.dataNode.Id)
+ vcd.write("volume %d on %s is now read-only", vid, vr.location.dataNode.Id)
return nil
}
-func (vcd *volumeCheckDisk) checkReadOnlyVolumes(volumeReplicas map[uint32][]*VolumeReplica) error {
+func (vcd *volumeCheckDisk) checkReadOnlyVolumes(volumeReplicas map[uint32][]*VolumeReplica) {
if !vcd.fixReadOnly {
- return nil
+ return
}
- vcd.write("Pass #2 (read-only volumes)\n")
+ vcd.write("Pass #2 (read-only volumes)")
for vid, replicas := range volumeReplicas {
roReplicas := []*VolumeReplica{}
@@ -246,11 +259,11 @@ func (vcd *volumeCheckDisk) checkReadOnlyVolumes(volumeReplicas map[uint32][]*Vo
}
}
if len(roReplicas) == 0 {
- vcd.write("no read-only replicas for volume %d\n", vid)
+ vcd.write("no read-only replicas for volume %d", vid)
continue
}
if len(rwReplicas) == 0 {
- vcd.write("got %d read-only replicas for volume %d and no writable replicas to fix from\n", len(roReplicas), vid)
+ vcd.write("got %d read-only replicas for volume %d and no writable replicas to fix from", len(roReplicas), vid)
continue
}
@@ -261,35 +274,44 @@ func (vcd *volumeCheckDisk) checkReadOnlyVolumes(volumeReplicas map[uint32][]*Vo
skip, err := vcd.shouldSkipVolume(r, source)
if err != nil {
- vcd.write("error checking if volume %d should be skipped: %v\n", r.info.Id, err)
+				vcd.ewg.AddErrorf("failed to check if volume %d should be skipped: %v", r.info.Id, err)
continue
}
if skip {
continue
}
- // make volume writable...
- if err := vcd.makeVolumeWritable(vid, r); err != nil {
- return err
- }
+ vcd.ewg.Add(func() error {
+ // make volume writable...
+ if err := vcd.makeVolumeWritable(vid, r); err != nil {
+ return err
+ }
- // ...fix it...
- // TODO: test whether syncTwoReplicas() is enough to prune garbage entries on broken volumes.
- if err := vcd.syncTwoReplicas(source, r, false); err != nil {
- vcd.write("sync read-only volume %d on %s from %s: %v\n", vid, r.location.dataNode.Id, source.location.dataNode.Id, err)
+ // ...try to fix it...
+ // TODO: test whether syncTwoReplicas() is enough to prune garbage entries on broken volumes...
+ modified, err := vcd.syncTwoReplicas(source, r, false)
+ if err != nil {
+ vcd.write("sync read-only volume %d on %s from %s: %v", vid, r.location.dataNode.Id, source.location.dataNode.Id, err)
- // ...or revert it back to read-only, if something went wrong.
- if roErr := vcd.makeVolumeReadonly(vid, r); roErr != nil {
- return fmt.Errorf("failed to make volume %d on %s readonly after: %v: %v", vid, r.location.dataNode.Id, err, roErr)
+ if roErr := vcd.makeVolumeReadonly(vid, r); roErr != nil {
+ return fmt.Errorf("failed to revert volume %d on %s to readonly after: %v: %v", vid, r.location.dataNode.Id, err, roErr)
+ }
+ return err
+ } else {
+ if modified {
+ vcd.write("volume %d on %s is now synced to %d and writable", vid, r.location.dataNode.Id, source.location.dataNode.Id)
+ } else {
+ // ...or restore back to read-only, if no changes were made.
+ if err := vcd.makeVolumeReadonly(vid, r); err != nil {
+ return fmt.Errorf("failed to revert volume %d on %s to readonly: %v", vid, r.location.dataNode.Id, err)
+ }
+ }
}
- vcd.write("volume %d on %s is now read-only\n", vid, r.location.dataNode.Id)
- return err
- }
+ return nil
+ })
}
}
-
- return nil
}
func (vcd *volumeCheckDisk) grpcDialOption() grpc.DialOption {
@@ -297,12 +319,15 @@ func (vcd *volumeCheckDisk) grpcDialOption() grpc.DialOption {
}
func (vcd *volumeCheckDisk) write(format string, a ...any) {
- fmt.Fprintf(vcd.writer, format, a...)
+ vcd.writerMu.Lock()
+ defer vcd.writerMu.Unlock()
+ fmt.Fprintf(vcd.writer, strings.TrimRight(format, "\r\n "), a...)
+ fmt.Fprint(vcd.writer, "\n")
}
func (vcd *volumeCheckDisk) writeVerbose(format string, a ...any) {
if vcd.verbose {
- fmt.Fprintf(vcd.writer, format, a...)
+ vcd.write(format, a...)
}
}
@@ -388,7 +413,7 @@ func (vcd *volumeCheckDisk) shouldSkipVolume(a, b *VolumeReplica) (bool, error)
if doSyncDeletedCount && !eqDeletedFileCount {
return false, nil
}
- vcd.writeVerbose("skipping active volumes %d with the same file counts on %s and %s\n",
+ vcd.writeVerbose("skipping active volumes %d with the same file counts on %s and %s",
a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id)
} else {
return false, nil
@@ -399,35 +424,39 @@ func (vcd *volumeCheckDisk) shouldSkipVolume(a, b *VolumeReplica) (bool, error)
// syncTwoReplicas attempts to sync all entries from a source volume replica into a target. If bi-directional mode
// is enabled, changes from target are also synced back into the source.
-func (vcd *volumeCheckDisk) syncTwoReplicas(source, target *VolumeReplica, bidi bool) (err error) {
+// Returns true if source and/or target were modified, false otherwise.
+func (vcd *volumeCheckDisk) syncTwoReplicas(source, target *VolumeReplica, bidi bool) (modified bool, err error) {
sourceHasChanges, targetHasChanges := true, true
const maxIterations = 5
iteration := 0
+ modified = false
+
for (sourceHasChanges || targetHasChanges) && iteration < maxIterations {
iteration++
- vcd.writeVerbose("sync iteration %d/%d for volume %d\n", iteration, maxIterations, source.info.Id)
+ vcd.writeVerbose("sync iteration %d/%d for volume %d", iteration, maxIterations, source.info.Id)
prevSourceHasChanges, prevTargetHasChanges := sourceHasChanges, targetHasChanges
if sourceHasChanges, targetHasChanges, err = vcd.checkBoth(source, target, bidi); err != nil {
- return err
+ return modified, err
}
+ modified = modified || sourceHasChanges || targetHasChanges
// Detect if we're stuck in a loop with no progress
if iteration > 1 && prevSourceHasChanges == sourceHasChanges && prevTargetHasChanges == targetHasChanges && (sourceHasChanges || targetHasChanges) {
- vcd.write("volume %d sync is not making progress between %s and %s after iteration %d, stopping to prevent infinite loop\n",
+ vcd.write("volume %d sync is not making progress between %s and %s after iteration %d, stopping to prevent infinite loop",
source.info.Id, source.location.dataNode.Id, target.location.dataNode.Id, iteration)
- return fmt.Errorf("sync not making progress after %d iterations", iteration)
+ return modified, fmt.Errorf("sync not making progress after %d iterations", iteration)
}
}
if iteration >= maxIterations && (sourceHasChanges || targetHasChanges) {
- vcd.write("volume %d sync reached maximum iterations (%d) between %s and %s, may need manual intervention\n",
+ vcd.write("volume %d sync reached maximum iterations (%d) between %s and %s, may need manual intervention",
source.info.Id, maxIterations, source.location.dataNode.Id, target.location.dataNode.Id)
- return fmt.Errorf("reached maximum sync iterations (%d)", maxIterations)
+ return modified, fmt.Errorf("reached maximum sync iterations (%d)", maxIterations)
}
- return nil
+ return modified, nil
}
// checkBoth performs a sync between source and target volume replicas. If bi-directional mode is enabled, changes from target are also synced back into the source.
@@ -512,7 +541,7 @@ func (vcd *volumeCheckDisk) doVolumeCheckDisk(minuend, subtrahend *needle_map.Me
return nil
})
- vcd.write("volume %d %s has %d entries, %s missed %d and partially deleted %d entries\n",
+ vcd.write("volume %d %s has %d entries, %s missed %d and partially deleted %d entries",
source.info.Id, source.location.dataNode.Id, counter, target.location.dataNode.Id, len(missingNeedles), len(partiallyDeletedNeedles))
if counter == 0 || (len(missingNeedles) == 0 && len(partiallyDeletedNeedles) == 0) {
@@ -536,7 +565,7 @@ func (vcd *volumeCheckDisk) doVolumeCheckDisk(minuend, subtrahend *needle_map.Me
continue
}
- vcd.writeVerbose("read %s %s => %s\n", needleValue.Key.FileId(source.info.Id), source.location.dataNode.Id, target.location.dataNode.Id)
+ vcd.writeVerbose("read %s %s => %s", needleValue.Key.FileId(source.info.Id), source.location.dataNode.Id, target.location.dataNode.Id)
hasChanges = true
if err = vcd.writeNeedleBlobToTarget(pb.NewServerAddressFromDataNode(target.location.dataNode), source.info.Id, needleValue, needleBlob); err != nil {
@@ -549,7 +578,7 @@ func (vcd *volumeCheckDisk) doVolumeCheckDisk(minuend, subtrahend *needle_map.Me
var fidList []string
for _, needleValue := range partiallyDeletedNeedles {
fidList = append(fidList, needleValue.Key.FileId(source.info.Id))
- vcd.writeVerbose("delete %s %s => %s\n", needleValue.Key.FileId(source.info.Id), source.location.dataNode.Id, target.location.dataNode.Id)
+ vcd.writeVerbose("delete %s %s => %s", needleValue.Key.FileId(source.info.Id), source.location.dataNode.Id, target.location.dataNode.Id)
}
deleteResults := operation.DeleteFileIdsAtOneVolumeServer(
pb.NewServerAddressFromDataNode(target.location.dataNode),
@@ -604,7 +633,7 @@ func (vcd *volumeCheckDisk) readIndexDatabase(db *needle_map.MemDb, collection s
return err
}
- vcd.writeVerbose("load collection %s volume %d index size %d from %s ...\n", collection, volumeId, buf.Len(), volumeServer)
+ vcd.writeVerbose("load collection %s volume %d index size %d from %s ...", collection, volumeId, buf.Len(), volumeServer)
return db.LoadFilterFromReaderAt(bytes.NewReader(buf.Bytes()), true, false)
}
@@ -616,7 +645,7 @@ func (vcd *volumeCheckDisk) copyVolumeIndexFile(collection string, volumeId uint
copyFileClient, err := volumeServerClient.CopyFile(context.Background(), &volume_server_pb.CopyFileRequest{
VolumeId: volumeId,
- Ext: ".idx",
+ Ext: ext,
CompactionRevision: math.MaxUint32,
StopOffset: math.MaxInt64,
Collection: collection,
diff --git a/weed/shell/command_volume_check_disk_test.go b/weed/shell/command_volume_check_disk_test.go
index eee9103a8..ec958fbc4 100644
--- a/weed/shell/command_volume_check_disk_test.go
+++ b/weed/shell/command_volume_check_disk_test.go
@@ -278,14 +278,14 @@ func TestVolumeCheckDiskHelperMethods(t *testing.T) {
}
// Test write method
- vcd.write("test %s\n", "message")
+ vcd.write("test %s", "message")
if buf.String() != "test message\n" {
t.Errorf("write() output = %q, want %q", buf.String(), "test message\n")
}
// Test writeVerbose with verbose=true
buf.Reset()
- vcd.writeVerbose("verbose %d\n", 123)
+ vcd.writeVerbose("verbose %d", 123)
if buf.String() != "verbose 123\n" {
t.Errorf("writeVerbose() with verbose=true output = %q, want %q", buf.String(), "verbose 123\n")
}
@@ -293,7 +293,7 @@ func TestVolumeCheckDiskHelperMethods(t *testing.T) {
// Test writeVerbose with verbose=false
buf.Reset()
vcd.verbose = false
- vcd.writeVerbose("should not appear\n")
+ vcd.writeVerbose("should not appear")
if buf.String() != "" {
t.Errorf("writeVerbose() with verbose=false output = %q, want empty", buf.String())
}
diff --git a/weed/shell/command_volume_server_evacuate.go b/weed/shell/command_volume_server_evacuate.go
index 5c1805c89..6135eb3eb 100644
--- a/weed/shell/command_volume_server_evacuate.go
+++ b/weed/shell/command_volume_server_evacuate.go
@@ -197,8 +197,14 @@ func (c *commandVolumeServerEvacuate) moveAwayOneEcVolume(commandEnv *CommandEnv
if ecShardInfo.Collection != "" {
collectionPrefix = ecShardInfo.Collection + "_"
}
- fmt.Fprintf(os.Stdout, "moving ec volume %s%d.%d %s => %s\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id)
- err = moveMountedShardToEcNode(commandEnv, thisNode, ecShardInfo.Collection, needle.VolumeId(ecShardInfo.Id), shardId, emptyNode, applyChange)
+ vid := needle.VolumeId(ecShardInfo.Id)
+ destDiskId := pickBestDiskOnNode(emptyNode, vid)
+ if destDiskId > 0 {
+ fmt.Fprintf(os.Stdout, "moving ec volume %s%d.%d %s => %s (disk %d)\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id, destDiskId)
+ } else {
+ fmt.Fprintf(os.Stdout, "moving ec volume %s%d.%d %s => %s\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id)
+ }
+ err = moveMountedShardToEcNode(commandEnv, thisNode, ecShardInfo.Collection, vid, shardId, emptyNode, destDiskId, applyChange)
if err != nil {
return
} else {
diff --git a/weed/shell/common.go b/weed/shell/common.go
index 43571176e..cb2df5828 100644
--- a/weed/shell/common.go
+++ b/weed/shell/common.go
@@ -2,6 +2,7 @@ package shell
import (
"errors"
+ "fmt"
"sync"
)
@@ -64,6 +65,13 @@ func (ewg *ErrorWaitGroup) Add(f ErrorWaitGroupTask) {
}()
}
+// AddErrorf records an error on the ErrorWaitGroup without queueing a goroutine.
+func (ewg *ErrorWaitGroup) AddErrorf(format string, a ...interface{}) {
+ ewg.errorsMu.Lock()
+ ewg.errors = append(ewg.errors, fmt.Errorf(format, a...))
+ ewg.errorsMu.Unlock()
+}
+
// Wait sleeps until all ErrorWaitGroupTasks are completed, then returns errors for them.
func (ewg *ErrorWaitGroup) Wait() error {
ewg.wg.Wait()
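A short usage sketch of the pattern above, mirroring how checkReadOnlyVolumes now queues repairs; the worker count and task bodies are illustrative, and the code is assumed to sit in package shell:

// Hypothetical helper showing the Add / AddErrorf / Wait lifecycle.
func exampleErrorWaitGroupUsage() error {
	ewg := NewErrorWaitGroup(4) // run up to 4 tasks concurrently
	for i := 0; i < 3; i++ {
		i := i
		ewg.Add(func() error {
			fmt.Println("repairing volume", i) // per-volume work would go here
			return nil
		})
	}
	// Record a failure immediately, without spawning a goroutine.
	ewg.AddErrorf("volume %d skipped: no writable replica", 7)
	// Wait blocks until all queued tasks finish, then joins their errors
	// with any recorded via AddErrorf.
	return ewg.Wait()
}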
diff --git a/weed/storage/erasure_coding/placement/placement.go b/weed/storage/erasure_coding/placement/placement.go
new file mode 100644
index 000000000..67e21c1f8
--- /dev/null
+++ b/weed/storage/erasure_coding/placement/placement.go
@@ -0,0 +1,420 @@
+// Package placement provides consolidated EC shard placement logic used by
+// both shell commands and worker tasks.
+//
+// This package encapsulates the algorithms for:
+// - Selecting destination nodes/disks for EC shards
+// - Ensuring proper spread across racks, servers, and disks
+// - Balancing shards across the cluster
+package placement
+
+import (
+ "fmt"
+ "sort"
+)
+
+// DiskCandidate represents a disk that can receive EC shards
+type DiskCandidate struct {
+ NodeID string
+ DiskID uint32
+ DataCenter string
+ Rack string
+
+ // Capacity information
+ VolumeCount int64
+ MaxVolumeCount int64
+ ShardCount int // Current number of EC shards on this disk
+ FreeSlots int // Available slots for new shards
+
+ // Load information
+ LoadCount int // Number of active tasks on this disk
+}
+
+// NodeCandidate represents a server node that can receive EC shards
+type NodeCandidate struct {
+ NodeID string
+ DataCenter string
+ Rack string
+ FreeSlots int
+ ShardCount int // Total shards across all disks
+ Disks []*DiskCandidate // All disks on this node
+}
+
+// PlacementRequest configures EC shard placement behavior
+type PlacementRequest struct {
+ // ShardsNeeded is the total number of shards to place
+ ShardsNeeded int
+
+ // MaxShardsPerServer limits how many shards can be placed on a single server
+ // 0 means no limit (but prefer spreading when possible)
+ MaxShardsPerServer int
+
+ // MaxShardsPerRack limits how many shards can be placed in a single rack
+ // 0 means no limit
+ MaxShardsPerRack int
+
+ // MaxTaskLoad is the maximum task load count for a disk to be considered
+ MaxTaskLoad int
+
+ // PreferDifferentServers when true, spreads shards across different servers
+ // before using multiple disks on the same server
+ PreferDifferentServers bool
+
+ // PreferDifferentRacks when true, spreads shards across different racks
+ // before using multiple servers in the same rack
+ PreferDifferentRacks bool
+}
+
+// DefaultPlacementRequest returns the default placement configuration
+func DefaultPlacementRequest() PlacementRequest {
+ return PlacementRequest{
+ ShardsNeeded: 14,
+ MaxShardsPerServer: 0,
+ MaxShardsPerRack: 0,
+ MaxTaskLoad: 5,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+}
+
+// PlacementResult contains the selected destinations for EC shards
+type PlacementResult struct {
+ SelectedDisks []*DiskCandidate
+
+ // Statistics
+ ServersUsed int
+ RacksUsed int
+ DCsUsed int
+
+ // Distribution maps
+ ShardsPerServer map[string]int
+ ShardsPerRack map[string]int
+ ShardsPerDC map[string]int
+}
+
+// SelectDestinations selects the best disks for EC shard placement.
+// This is the main entry point for EC placement logic.
+//
+// The algorithm works in multiple passes:
+// 1. First pass: Select one disk from each rack (maximize rack diversity)
+// 2. Second pass: Select one disk from each unused server in used racks (maximize server diversity)
+// 3. Third pass: Select additional disks from servers already used (maximize disk diversity)
+func SelectDestinations(disks []*DiskCandidate, config PlacementRequest) (*PlacementResult, error) {
+ if len(disks) == 0 {
+ return nil, fmt.Errorf("no disk candidates provided")
+ }
+ if config.ShardsNeeded <= 0 {
+ return nil, fmt.Errorf("shardsNeeded must be positive, got %d", config.ShardsNeeded)
+ }
+
+ // Filter suitable disks
+ suitable := filterSuitableDisks(disks, config)
+ if len(suitable) == 0 {
+ return nil, fmt.Errorf("no suitable disks found after filtering")
+ }
+
+ // Build indexes for efficient lookup
+ rackToDisks := groupDisksByRack(suitable)
+
+ result := &PlacementResult{
+ SelectedDisks: make([]*DiskCandidate, 0, config.ShardsNeeded),
+ ShardsPerServer: make(map[string]int),
+ ShardsPerRack: make(map[string]int),
+ ShardsPerDC: make(map[string]int),
+ }
+
+ usedDisks := make(map[string]bool) // "nodeID:diskID" -> bool
+ usedServers := make(map[string]bool) // nodeID -> bool
+ usedRacks := make(map[string]bool) // "dc:rack" -> bool
+
+ // Pass 1: Select one disk from each rack (maximize rack diversity)
+ if config.PreferDifferentRacks {
+ // Sort racks by number of available servers (descending) to prioritize racks with more options
+ sortedRacks := sortRacksByServerCount(rackToDisks)
+ for _, rackKey := range sortedRacks {
+ if len(result.SelectedDisks) >= config.ShardsNeeded {
+ break
+ }
+ rackDisks := rackToDisks[rackKey]
+ // Select best disk from this rack, preferring a new server
+ disk := selectBestDiskFromRack(rackDisks, usedServers, usedDisks, config)
+ if disk != nil {
+ addDiskToResult(result, disk, usedDisks, usedServers, usedRacks)
+ }
+ }
+ }
+
+ // Pass 2: Select disks from unused servers in already-used racks
+ if config.PreferDifferentServers && len(result.SelectedDisks) < config.ShardsNeeded {
+ for _, rackKey := range getSortedRackKeys(rackToDisks) {
+ if len(result.SelectedDisks) >= config.ShardsNeeded {
+ break
+ }
+ rackDisks := rackToDisks[rackKey]
+ for _, disk := range sortDisksByScore(rackDisks) {
+ if len(result.SelectedDisks) >= config.ShardsNeeded {
+ break
+ }
+ diskKey := getDiskKey(disk)
+ if usedDisks[diskKey] {
+ continue
+ }
+ // Skip if server already used (we want different servers in this pass)
+ if usedServers[disk.NodeID] {
+ continue
+ }
+ // Check server limit
+ if config.MaxShardsPerServer > 0 && result.ShardsPerServer[disk.NodeID] >= config.MaxShardsPerServer {
+ continue
+ }
+ // Check rack limit
+ if config.MaxShardsPerRack > 0 && result.ShardsPerRack[getRackKey(disk)] >= config.MaxShardsPerRack {
+ continue
+ }
+ addDiskToResult(result, disk, usedDisks, usedServers, usedRacks)
+ }
+ }
+ }
+
+ // Pass 3: Fill remaining slots from already-used servers (different disks)
+ // Use round-robin across servers to balance shards evenly
+ if len(result.SelectedDisks) < config.ShardsNeeded {
+ // Group remaining disks by server
+ serverToRemainingDisks := make(map[string][]*DiskCandidate)
+ for _, disk := range suitable {
+ if !usedDisks[getDiskKey(disk)] {
+ serverToRemainingDisks[disk.NodeID] = append(serverToRemainingDisks[disk.NodeID], disk)
+ }
+ }
+
+ // Sort each server's disks by score
+ for serverID := range serverToRemainingDisks {
+ serverToRemainingDisks[serverID] = sortDisksByScore(serverToRemainingDisks[serverID])
+ }
+
+ // Round-robin: repeatedly select from the server with the fewest shards
+ for len(result.SelectedDisks) < config.ShardsNeeded {
+ // Find server with fewest shards that still has available disks
+ var bestServer string
+ minShards := -1
+ for serverID, disks := range serverToRemainingDisks {
+ if len(disks) == 0 {
+ continue
+ }
+ // Check server limit
+ if config.MaxShardsPerServer > 0 && result.ShardsPerServer[serverID] >= config.MaxShardsPerServer {
+ continue
+ }
+ shardCount := result.ShardsPerServer[serverID]
+ if minShards == -1 || shardCount < minShards {
+ minShards = shardCount
+ bestServer = serverID
+ } else if shardCount == minShards && serverID < bestServer {
+ // Tie-break by server name for determinism
+ bestServer = serverID
+ }
+ }
+
+ if bestServer == "" {
+ // No more servers with available disks
+ break
+ }
+
+ // Pop the best disk from this server
+ disks := serverToRemainingDisks[bestServer]
+ disk := disks[0]
+ serverToRemainingDisks[bestServer] = disks[1:]
+
+ // Check rack limit
+ if config.MaxShardsPerRack > 0 && result.ShardsPerRack[getRackKey(disk)] >= config.MaxShardsPerRack {
+ continue
+ }
+
+ addDiskToResult(result, disk, usedDisks, usedServers, usedRacks)
+ }
+ }
+
+ // Calculate final statistics
+ result.ServersUsed = len(usedServers)
+ result.RacksUsed = len(usedRacks)
+ dcSet := make(map[string]bool)
+ for _, disk := range result.SelectedDisks {
+ dcSet[disk.DataCenter] = true
+ }
+ result.DCsUsed = len(dcSet)
+
+ return result, nil
+}
+
+// filterSuitableDisks filters disks that are suitable for EC placement
+func filterSuitableDisks(disks []*DiskCandidate, config PlacementRequest) []*DiskCandidate {
+ var suitable []*DiskCandidate
+ for _, disk := range disks {
+ if disk.FreeSlots <= 0 {
+ continue
+ }
+ if config.MaxTaskLoad > 0 && disk.LoadCount > config.MaxTaskLoad {
+ continue
+ }
+ suitable = append(suitable, disk)
+ }
+ return suitable
+}
+
+// groupDisksByRack groups disks by their rack (dc:rack key)
+func groupDisksByRack(disks []*DiskCandidate) map[string][]*DiskCandidate {
+ result := make(map[string][]*DiskCandidate)
+ for _, disk := range disks {
+ key := getRackKey(disk)
+ result[key] = append(result[key], disk)
+ }
+ return result
+}
+
+// groupDisksByServer groups disks by their server
+func groupDisksByServer(disks []*DiskCandidate) map[string][]*DiskCandidate {
+ result := make(map[string][]*DiskCandidate)
+ for _, disk := range disks {
+ result[disk.NodeID] = append(result[disk.NodeID], disk)
+ }
+ return result
+}
+
+// getRackKey returns the unique key for a rack (dc:rack)
+func getRackKey(disk *DiskCandidate) string {
+ return fmt.Sprintf("%s:%s", disk.DataCenter, disk.Rack)
+}
+
+// getDiskKey returns the unique key for a disk (nodeID:diskID)
+func getDiskKey(disk *DiskCandidate) string {
+ return fmt.Sprintf("%s:%d", disk.NodeID, disk.DiskID)
+}
+
+// sortRacksByServerCount returns rack keys sorted by number of servers (descending)
+func sortRacksByServerCount(rackToDisks map[string][]*DiskCandidate) []string {
+ // Count unique servers per rack
+ rackServerCount := make(map[string]int)
+ for rackKey, disks := range rackToDisks {
+ servers := make(map[string]bool)
+ for _, disk := range disks {
+ servers[disk.NodeID] = true
+ }
+ rackServerCount[rackKey] = len(servers)
+ }
+
+ keys := getSortedRackKeys(rackToDisks)
+ sort.Slice(keys, func(i, j int) bool {
+ // Sort by server count (descending) to pick from racks with more options first
+ return rackServerCount[keys[i]] > rackServerCount[keys[j]]
+ })
+ return keys
+}
+
+// getSortedRackKeys returns rack keys in a deterministic order
+func getSortedRackKeys(rackToDisks map[string][]*DiskCandidate) []string {
+ keys := make([]string, 0, len(rackToDisks))
+ for k := range rackToDisks {
+ keys = append(keys, k)
+ }
+ sort.Strings(keys)
+ return keys
+}
+
+// selectBestDiskFromRack selects the best disk from a rack for EC placement
+// It prefers servers that haven't been used yet
+func selectBestDiskFromRack(disks []*DiskCandidate, usedServers, usedDisks map[string]bool, config PlacementRequest) *DiskCandidate {
+ var bestDisk *DiskCandidate
+ bestScore := -1.0
+ bestIsFromUnusedServer := false
+
+ for _, disk := range disks {
+ if usedDisks[getDiskKey(disk)] {
+ continue
+ }
+ isFromUnusedServer := !usedServers[disk.NodeID]
+ score := calculateDiskScore(disk)
+
+ // Prefer unused servers
+ if isFromUnusedServer && !bestIsFromUnusedServer {
+ bestDisk = disk
+ bestScore = score
+ bestIsFromUnusedServer = true
+ } else if isFromUnusedServer == bestIsFromUnusedServer && score > bestScore {
+ bestDisk = disk
+ bestScore = score
+ }
+ }
+
+ return bestDisk
+}
+
+// sortDisksByScore returns disks sorted by score (best first)
+func sortDisksByScore(disks []*DiskCandidate) []*DiskCandidate {
+ sorted := make([]*DiskCandidate, len(disks))
+ copy(sorted, disks)
+ sort.Slice(sorted, func(i, j int) bool {
+ return calculateDiskScore(sorted[i]) > calculateDiskScore(sorted[j])
+ })
+ return sorted
+}
+
+// calculateDiskScore calculates a score for a disk candidate
+// Higher score is better
+func calculateDiskScore(disk *DiskCandidate) float64 {
+ score := 0.0
+
+ // Primary factor: available capacity (lower utilization is better)
+ if disk.MaxVolumeCount > 0 {
+ utilization := float64(disk.VolumeCount) / float64(disk.MaxVolumeCount)
+ score += (1.0 - utilization) * 60.0 // Up to 60 points
+ } else {
+ score += 30.0 // Default if no max count
+ }
+
+ // Secondary factor: fewer shards already on this disk is better
+ score += float64(10-disk.ShardCount) * 2.0 // Up to 20 points
+
+ // Tertiary factor: lower load is better
+ score += float64(10 - disk.LoadCount) // Up to 10 points
+
+ return score
+}
+
+// addDiskToResult adds a disk to the result and updates tracking maps
+func addDiskToResult(result *PlacementResult, disk *DiskCandidate,
+ usedDisks, usedServers, usedRacks map[string]bool) {
+ diskKey := getDiskKey(disk)
+ rackKey := getRackKey(disk)
+
+ result.SelectedDisks = append(result.SelectedDisks, disk)
+ usedDisks[diskKey] = true
+ usedServers[disk.NodeID] = true
+ usedRacks[rackKey] = true
+ result.ShardsPerServer[disk.NodeID]++
+ result.ShardsPerRack[rackKey]++
+ result.ShardsPerDC[disk.DataCenter]++
+}
+
+// VerifySpread checks if the placement result meets diversity requirements
+func VerifySpread(result *PlacementResult, minServers, minRacks int) error {
+ if result.ServersUsed < minServers {
+ return fmt.Errorf("only %d servers used, need at least %d", result.ServersUsed, minServers)
+ }
+ if result.RacksUsed < minRacks {
+ return fmt.Errorf("only %d racks used, need at least %d", result.RacksUsed, minRacks)
+ }
+ return nil
+}
+
+// CalculateIdealDistribution returns the ideal number of shards per server
+// when we have a certain number of shards and servers
+func CalculateIdealDistribution(totalShards, numServers int) (min, max int) {
+ if numServers <= 0 {
+ return 0, totalShards
+ }
+ min = totalShards / numServers
+ max = min
+ if totalShards%numServers != 0 {
+ max = min + 1
+ }
+ return
+}
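A small worked example of the two exported helpers at the end of the package; the import path follows the module layout, and the numbers are chosen only for illustration:

package main

import (
	"fmt"

	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding/placement"
)

func main() {
	// 14 EC shards over 5 servers: 14/5 = 2 remainder 4, so the ideal
	// distribution is between 2 and 3 shards per server.
	minShards, maxShards := placement.CalculateIdealDistribution(14, 5)
	fmt.Println(minShards, maxShards) // 2 3

	// VerifySpread then checks that a placement result used enough
	// servers and racks.
	result := &placement.PlacementResult{ServersUsed: 5, RacksUsed: 2}
	if err := placement.VerifySpread(result, 4, 2); err != nil {
		fmt.Println("placement too concentrated:", err)
	} else {
		fmt.Println("placement spread is acceptable")
	}
}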
diff --git a/weed/storage/erasure_coding/placement/placement_test.go b/weed/storage/erasure_coding/placement/placement_test.go
new file mode 100644
index 000000000..6cb94a4da
--- /dev/null
+++ b/weed/storage/erasure_coding/placement/placement_test.go
@@ -0,0 +1,517 @@
+package placement
+
+import (
+"testing"
+)
+
+// Helper function to create disk candidates for testing
+func makeDisk(nodeID string, diskID uint32, dc, rack string, freeSlots int) *DiskCandidate {
+ return &DiskCandidate{
+ NodeID: nodeID,
+ DiskID: diskID,
+ DataCenter: dc,
+ Rack: rack,
+ VolumeCount: 0,
+ MaxVolumeCount: 100,
+ ShardCount: 0,
+ FreeSlots: freeSlots,
+ LoadCount: 0,
+ }
+}
+
+func TestSelectDestinations_SingleRack(t *testing.T) {
+ // Test: 3 servers in same rack, each with 2 disks, need 6 shards
+ // Expected: Should spread across all 6 disks (one per disk)
+ disks := []*DiskCandidate{
+ makeDisk("server1", 0, "dc1", "rack1", 10),
+ makeDisk("server1", 1, "dc1", "rack1", 10),
+ makeDisk("server2", 0, "dc1", "rack1", 10),
+ makeDisk("server2", 1, "dc1", "rack1", 10),
+ makeDisk("server3", 0, "dc1", "rack1", 10),
+ makeDisk("server3", 1, "dc1", "rack1", 10),
+ }
+
+ config := PlacementRequest{
+ ShardsNeeded: 6,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+
+ result, err := SelectDestinations(disks, config)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if len(result.SelectedDisks) != 6 {
+ t.Errorf("expected 6 selected disks, got %d", len(result.SelectedDisks))
+ }
+
+ // Verify all 3 servers are used
+ if result.ServersUsed != 3 {
+ t.Errorf("expected 3 servers used, got %d", result.ServersUsed)
+ }
+
+ // Verify each disk is unique
+ diskSet := make(map[string]bool)
+ for _, disk := range result.SelectedDisks {
+ key := getDiskKey(disk)
+ if diskSet[key] {
+ t.Errorf("disk %s selected multiple times", key)
+ }
+ diskSet[key] = true
+ }
+}
+
+func TestSelectDestinations_MultipleRacks(t *testing.T) {
+ // Test: 2 racks with 2 servers each, each server has 2 disks
+ // Need 8 shards
+ // Expected: Should spread across all 8 disks
+ disks := []*DiskCandidate{
+ makeDisk("server1", 0, "dc1", "rack1", 10),
+ makeDisk("server1", 1, "dc1", "rack1", 10),
+ makeDisk("server2", 0, "dc1", "rack1", 10),
+ makeDisk("server2", 1, "dc1", "rack1", 10),
+ makeDisk("server3", 0, "dc1", "rack2", 10),
+ makeDisk("server3", 1, "dc1", "rack2", 10),
+ makeDisk("server4", 0, "dc1", "rack2", 10),
+ makeDisk("server4", 1, "dc1", "rack2", 10),
+ }
+
+ config := PlacementRequest{
+ ShardsNeeded: 8,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+
+ result, err := SelectDestinations(disks, config)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if len(result.SelectedDisks) != 8 {
+ t.Errorf("expected 8 selected disks, got %d", len(result.SelectedDisks))
+ }
+
+ // Verify all 4 servers are used
+ if result.ServersUsed != 4 {
+ t.Errorf("expected 4 servers used, got %d", result.ServersUsed)
+ }
+
+ // Verify both racks are used
+ if result.RacksUsed != 2 {
+ t.Errorf("expected 2 racks used, got %d", result.RacksUsed)
+ }
+}
+
+func TestSelectDestinations_PrefersDifferentServers(t *testing.T) {
+ // Test: 4 servers with 4 disks each, need 4 shards
+ // Expected: Should use one disk from each server
+ disks := []*DiskCandidate{
+ makeDisk("server1", 0, "dc1", "rack1", 10),
+ makeDisk("server1", 1, "dc1", "rack1", 10),
+ makeDisk("server1", 2, "dc1", "rack1", 10),
+ makeDisk("server1", 3, "dc1", "rack1", 10),
+ makeDisk("server2", 0, "dc1", "rack1", 10),
+ makeDisk("server2", 1, "dc1", "rack1", 10),
+ makeDisk("server2", 2, "dc1", "rack1", 10),
+ makeDisk("server2", 3, "dc1", "rack1", 10),
+ makeDisk("server3", 0, "dc1", "rack1", 10),
+ makeDisk("server3", 1, "dc1", "rack1", 10),
+ makeDisk("server3", 2, "dc1", "rack1", 10),
+ makeDisk("server3", 3, "dc1", "rack1", 10),
+ makeDisk("server4", 0, "dc1", "rack1", 10),
+ makeDisk("server4", 1, "dc1", "rack1", 10),
+ makeDisk("server4", 2, "dc1", "rack1", 10),
+ makeDisk("server4", 3, "dc1", "rack1", 10),
+ }
+
+ config := PlacementRequest{
+ ShardsNeeded: 4,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+
+ result, err := SelectDestinations(disks, config)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if len(result.SelectedDisks) != 4 {
+ t.Errorf("expected 4 selected disks, got %d", len(result.SelectedDisks))
+ }
+
+ // Verify all 4 servers are used (one shard per server)
+ if result.ServersUsed != 4 {
+ t.Errorf("expected 4 servers used, got %d", result.ServersUsed)
+ }
+
+ // Each server should have exactly 1 shard
+ for server, count := range result.ShardsPerServer {
+ if count != 1 {
+ t.Errorf("server %s has %d shards, expected 1", server, count)
+ }
+ }
+}
+
+func TestSelectDestinations_SpilloverToMultipleDisksPerServer(t *testing.T) {
+ // Test: 2 servers with 4 disks each, need 6 shards
+ // Expected: First pick one from each server (2 shards), then one more from each (4 shards),
+ // then fill remaining from any server (6 shards)
+ disks := []*DiskCandidate{
+ makeDisk("server1", 0, "dc1", "rack1", 10),
+ makeDisk("server1", 1, "dc1", "rack1", 10),
+ makeDisk("server1", 2, "dc1", "rack1", 10),
+ makeDisk("server1", 3, "dc1", "rack1", 10),
+ makeDisk("server2", 0, "dc1", "rack1", 10),
+ makeDisk("server2", 1, "dc1", "rack1", 10),
+ makeDisk("server2", 2, "dc1", "rack1", 10),
+ makeDisk("server2", 3, "dc1", "rack1", 10),
+ }
+
+ config := PlacementRequest{
+ ShardsNeeded: 6,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+
+ result, err := SelectDestinations(disks, config)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if len(result.SelectedDisks) != 6 {
+ t.Errorf("expected 6 selected disks, got %d", len(result.SelectedDisks))
+ }
+
+ // Both servers should be used
+ if result.ServersUsed != 2 {
+ t.Errorf("expected 2 servers used, got %d", result.ServersUsed)
+ }
+
+ // Each server should have exactly 3 shards (balanced)
+ for server, count := range result.ShardsPerServer {
+ if count != 3 {
+ t.Errorf("server %s has %d shards, expected 3", server, count)
+ }
+ }
+}
+
+func TestSelectDestinations_MaxShardsPerServer(t *testing.T) {
+ // Test: 2 servers with 4 disks each, need 6 shards, max 2 per server
+ // Expected: Should only select 4 shards (2 per server limit)
+ disks := []*DiskCandidate{
+ makeDisk("server1", 0, "dc1", "rack1", 10),
+ makeDisk("server1", 1, "dc1", "rack1", 10),
+ makeDisk("server1", 2, "dc1", "rack1", 10),
+ makeDisk("server1", 3, "dc1", "rack1", 10),
+ makeDisk("server2", 0, "dc1", "rack1", 10),
+ makeDisk("server2", 1, "dc1", "rack1", 10),
+ makeDisk("server2", 2, "dc1", "rack1", 10),
+ makeDisk("server2", 3, "dc1", "rack1", 10),
+ }
+
+ config := PlacementRequest{
+ ShardsNeeded: 6,
+ MaxShardsPerServer: 2,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+
+ result, err := SelectDestinations(disks, config)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ // Should only get 4 shards due to server limit
+ if len(result.SelectedDisks) != 4 {
+ t.Errorf("expected 4 selected disks (limit 2 per server), got %d", len(result.SelectedDisks))
+ }
+
+ // No server should exceed the limit
+ for server, count := range result.ShardsPerServer {
+ if count > 2 {
+ t.Errorf("server %s has %d shards, exceeds limit of 2", server, count)
+ }
+ }
+}
+
+func TestSelectDestinations_14ShardsAcross7Servers(t *testing.T) {
+ // Test: Real-world EC scenario - 14 shards across 7 servers with 2 disks each
+ // Expected: Should spread evenly (2 shards per server)
+ var disks []*DiskCandidate
+ for i := 1; i <= 7; i++ {
+ serverID := "server" + string(rune('0'+i))
+ disks = append(disks, makeDisk(serverID, 0, "dc1", "rack1", 10))
+ disks = append(disks, makeDisk(serverID, 1, "dc1", "rack1", 10))
+ }
+
+ config := PlacementRequest{
+ ShardsNeeded: 14,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+
+ result, err := SelectDestinations(disks, config)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if len(result.SelectedDisks) != 14 {
+ t.Errorf("expected 14 selected disks, got %d", len(result.SelectedDisks))
+ }
+
+ // All 7 servers should be used
+ if result.ServersUsed != 7 {
+ t.Errorf("expected 7 servers used, got %d", result.ServersUsed)
+ }
+
+ // Each server should have exactly 2 shards
+ for server, count := range result.ShardsPerServer {
+ if count != 2 {
+ t.Errorf("server %s has %d shards, expected 2", server, count)
+ }
+ }
+}
+
+func TestSelectDestinations_FewerServersThanShards(t *testing.T) {
+ // Test: Only 3 servers but need 6 shards
+ // Expected: Should distribute evenly (2 per server)
+ disks := []*DiskCandidate{
+ makeDisk("server1", 0, "dc1", "rack1", 10),
+ makeDisk("server1", 1, "dc1", "rack1", 10),
+ makeDisk("server1", 2, "dc1", "rack1", 10),
+ makeDisk("server2", 0, "dc1", "rack1", 10),
+ makeDisk("server2", 1, "dc1", "rack1", 10),
+ makeDisk("server2", 2, "dc1", "rack1", 10),
+ makeDisk("server3", 0, "dc1", "rack1", 10),
+ makeDisk("server3", 1, "dc1", "rack1", 10),
+ makeDisk("server3", 2, "dc1", "rack1", 10),
+ }
+
+ config := PlacementRequest{
+ ShardsNeeded: 6,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+
+ result, err := SelectDestinations(disks, config)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if len(result.SelectedDisks) != 6 {
+ t.Errorf("expected 6 selected disks, got %d", len(result.SelectedDisks))
+ }
+
+ // All 3 servers should be used
+ if result.ServersUsed != 3 {
+ t.Errorf("expected 3 servers used, got %d", result.ServersUsed)
+ }
+
+ // Each server should have exactly 2 shards
+ for server, count := range result.ShardsPerServer {
+ if count != 2 {
+ t.Errorf("server %s has %d shards, expected 2", server, count)
+ }
+ }
+}
+
+func TestSelectDestinations_NoSuitableDisks(t *testing.T) {
+ // Test: All disks have no free slots
+ disks := []*DiskCandidate{
+ {NodeID: "server1", DiskID: 0, DataCenter: "dc1", Rack: "rack1", FreeSlots: 0},
+ {NodeID: "server2", DiskID: 0, DataCenter: "dc1", Rack: "rack1", FreeSlots: 0},
+ }
+
+ config := PlacementRequest{
+ ShardsNeeded: 4,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+
+ _, err := SelectDestinations(disks, config)
+ if err == nil {
+ t.Error("expected error for no suitable disks, got nil")
+ }
+}
+
+func TestSelectDestinations_EmptyInput(t *testing.T) {
+ config := DefaultPlacementRequest()
+ _, err := SelectDestinations([]*DiskCandidate{}, config)
+ if err == nil {
+ t.Error("expected error for empty input, got nil")
+ }
+}
+
+func TestSelectDestinations_FiltersByLoad(t *testing.T) {
+	// Test: one disk's task load exceeds MaxTaskLoad and should be filtered out
+ disks := []*DiskCandidate{
+ {NodeID: "server1", DiskID: 0, DataCenter: "dc1", Rack: "rack1", FreeSlots: 10, LoadCount: 10},
+ {NodeID: "server2", DiskID: 0, DataCenter: "dc1", Rack: "rack1", FreeSlots: 10, LoadCount: 2},
+ {NodeID: "server3", DiskID: 0, DataCenter: "dc1", Rack: "rack1", FreeSlots: 10, LoadCount: 1},
+ }
+
+ config := PlacementRequest{
+ ShardsNeeded: 2,
+ MaxTaskLoad: 5,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+
+ result, err := SelectDestinations(disks, config)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+	// Should only select from server2 and server3 (server1's LoadCount of 10 exceeds MaxTaskLoad of 5)
+ for _, disk := range result.SelectedDisks {
+ if disk.NodeID == "server1" {
+ t.Errorf("disk from server1 should not be selected (load too high)")
+ }
+ }
+}
+
+func TestCalculateDiskScore(t *testing.T) {
+ // Test that score calculation works as expected
+ lowUtilDisk := &DiskCandidate{
+ VolumeCount: 10,
+ MaxVolumeCount: 100,
+ ShardCount: 0,
+ LoadCount: 0,
+ }
+
+ highUtilDisk := &DiskCandidate{
+ VolumeCount: 90,
+ MaxVolumeCount: 100,
+ ShardCount: 5,
+ LoadCount: 5,
+ }
+
+ lowScore := calculateDiskScore(lowUtilDisk)
+ highScore := calculateDiskScore(highUtilDisk)
+
+ if lowScore <= highScore {
+ t.Errorf("low utilization disk should have higher score: low=%f, high=%f", lowScore, highScore)
+ }
+}
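
Editorial note, not part of this patch: the test above only pins down an ordering property (emptier, less-loaded disks must score higher), so the exact formula remains an assumption. Below is a minimal Go sketch of one scoring shape consistent with that property and with the calculateECScore weighting that appears later in this diff; the real calculateDiskScore in placement.go may differ.

// diskScoreSketch is a hypothetical stand-in for calculateDiskScore: higher is better.
// It rewards free capacity and penalizes already-placed shards and in-flight tasks.
func diskScoreSketch(volumeCount, maxVolumeCount, shardCount, loadCount int) float64 {
	score := 0.0
	if maxVolumeCount > 0 {
		utilization := float64(volumeCount) / float64(maxVolumeCount)
		score += (1.0 - utilization) * 50.0 // up to 50 points for free capacity
	}
	score -= float64(shardCount)       // prefer disks that hold fewer EC shards already
	score += 10.0 - float64(loadCount) // up to 10 points for a low task load
	return score
}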
+
+func TestCalculateIdealDistribution(t *testing.T) {
+ tests := []struct {
+ totalShards int
+ numServers int
+ expectedMin int
+ expectedMax int
+ }{
+ {14, 7, 2, 2}, // Even distribution
+ {14, 4, 3, 4}, // Uneven: 14/4 = 3 remainder 2
+ {6, 3, 2, 2}, // Even distribution
+ {7, 3, 2, 3}, // Uneven: 7/3 = 2 remainder 1
+ {10, 0, 0, 10}, // Edge case: no servers
+ {0, 5, 0, 0}, // Edge case: no shards
+ }
+
+ for _, tt := range tests {
+ min, max := CalculateIdealDistribution(tt.totalShards, tt.numServers)
+ if min != tt.expectedMin || max != tt.expectedMax {
+ t.Errorf("CalculateIdealDistribution(%d, %d) = (%d, %d), want (%d, %d)",
+				tt.totalShards, tt.numServers, min, max, tt.expectedMin, tt.expectedMax)
+ }
+ }
+}
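
Editorial note, not part of this patch: the table above fully determines the expected behavior, namely a floor/ceiling split of shards over servers with guards for the zero cases. A sketch matching those expectations (the actual CalculateIdealDistribution lives in placement.go, outside this hunk):

// idealDistributionSketch mirrors the table-driven expectations above:
// min = totalShards/numServers, and max = min, plus one when there is a remainder.
func idealDistributionSketch(totalShards, numServers int) (minShards, maxShards int) {
	if numServers <= 0 {
		return 0, totalShards // no servers: a single destination would have to take everything
	}
	if totalShards <= 0 {
		return 0, 0
	}
	minShards = totalShards / numServers
	maxShards = minShards
	if totalShards%numServers != 0 {
		maxShards++
	}
	return minShards, maxShards
}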
+
+func TestVerifySpread(t *testing.T) {
+ result := &PlacementResult{
+ ServersUsed: 3,
+ RacksUsed: 2,
+ }
+
+ // Should pass
+ if err := VerifySpread(result, 3, 2); err != nil {
+ t.Errorf("unexpected error: %v", err)
+ }
+
+ // Should fail - not enough servers
+ if err := VerifySpread(result, 4, 2); err == nil {
+ t.Error("expected error for insufficient servers")
+ }
+
+ // Should fail - not enough racks
+ if err := VerifySpread(result, 3, 3); err == nil {
+ t.Error("expected error for insufficient racks")
+ }
+}
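
Editorial note, not part of this patch: VerifySpread is exercised here only through its error/no-error contract. A sketch consistent with that contract, using the PlacementResult fields seen above and assuming the usual fmt import; the real implementation may report more detail:

// verifySpreadSketch returns an error when the achieved spread misses the requested minimums.
func verifySpreadSketch(result *PlacementResult, minServers, minRacks int) error {
	if result.ServersUsed < minServers {
		return fmt.Errorf("spread across %d servers, want at least %d", result.ServersUsed, minServers)
	}
	if result.RacksUsed < minRacks {
		return fmt.Errorf("spread across %d racks, want at least %d", result.RacksUsed, minRacks)
	}
	return nil
}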
+
+func TestSelectDestinations_MultiDC(t *testing.T) {
+ // Test: 2 DCs, each with 2 racks, each rack has 2 servers
+ disks := []*DiskCandidate{
+ // DC1, Rack1
+ makeDisk("dc1-r1-s1", 0, "dc1", "rack1", 10),
+ makeDisk("dc1-r1-s1", 1, "dc1", "rack1", 10),
+ makeDisk("dc1-r1-s2", 0, "dc1", "rack1", 10),
+ makeDisk("dc1-r1-s2", 1, "dc1", "rack1", 10),
+ // DC1, Rack2
+ makeDisk("dc1-r2-s1", 0, "dc1", "rack2", 10),
+ makeDisk("dc1-r2-s1", 1, "dc1", "rack2", 10),
+ makeDisk("dc1-r2-s2", 0, "dc1", "rack2", 10),
+ makeDisk("dc1-r2-s2", 1, "dc1", "rack2", 10),
+ // DC2, Rack1
+ makeDisk("dc2-r1-s1", 0, "dc2", "rack1", 10),
+ makeDisk("dc2-r1-s1", 1, "dc2", "rack1", 10),
+ makeDisk("dc2-r1-s2", 0, "dc2", "rack1", 10),
+ makeDisk("dc2-r1-s2", 1, "dc2", "rack1", 10),
+ // DC2, Rack2
+ makeDisk("dc2-r2-s1", 0, "dc2", "rack2", 10),
+ makeDisk("dc2-r2-s1", 1, "dc2", "rack2", 10),
+ makeDisk("dc2-r2-s2", 0, "dc2", "rack2", 10),
+ makeDisk("dc2-r2-s2", 1, "dc2", "rack2", 10),
+ }
+
+ config := PlacementRequest{
+ ShardsNeeded: 8,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+
+ result, err := SelectDestinations(disks, config)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if len(result.SelectedDisks) != 8 {
+ t.Errorf("expected 8 selected disks, got %d", len(result.SelectedDisks))
+ }
+
+ // Should use all 4 racks
+ if result.RacksUsed != 4 {
+ t.Errorf("expected 4 racks used, got %d", result.RacksUsed)
+ }
+
+ // Should use both DCs
+ if result.DCsUsed != 2 {
+ t.Errorf("expected 2 DCs used, got %d", result.DCsUsed)
+ }
+}
+
+func TestSelectDestinations_SameRackDifferentDC(t *testing.T) {
+ // Test: Same rack name in different DCs should be treated as different racks
+ disks := []*DiskCandidate{
+ makeDisk("dc1-s1", 0, "dc1", "rack1", 10),
+ makeDisk("dc2-s1", 0, "dc2", "rack1", 10),
+ }
+
+ config := PlacementRequest{
+ ShardsNeeded: 2,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
+
+ result, err := SelectDestinations(disks, config)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ // Should use 2 racks (dc1:rack1 and dc2:rack1 are different)
+ if result.RacksUsed != 2 {
+ t.Errorf("expected 2 racks used (different DCs), got %d", result.RacksUsed)
+ }
+}
diff --git a/weed/storage/store.go b/weed/storage/store.go
index cc07f8702..7a336d1ff 100644
--- a/weed/storage/store.go
+++ b/weed/storage/store.go
@@ -63,6 +63,7 @@ type Store struct {
Port int
GrpcPort int
PublicUrl string
+ Id string // volume server id, independent of ip:port for stable identification
Locations []*DiskLocation
dataCenter string // optional information, overwriting master setting if exists
rack string // optional information, overwriting master setting if exists
@@ -76,13 +77,13 @@ type Store struct {
}
func (s *Store) String() (str string) {
- str = fmt.Sprintf("Ip:%s, Port:%d, GrpcPort:%d PublicUrl:%s, dataCenter:%s, rack:%s, connected:%v, volumeSizeLimit:%d", s.Ip, s.Port, s.GrpcPort, s.PublicUrl, s.dataCenter, s.rack, s.connected, s.GetVolumeSizeLimit())
+ str = fmt.Sprintf("Id:%s, Ip:%s, Port:%d, GrpcPort:%d PublicUrl:%s, dataCenter:%s, rack:%s, connected:%v, volumeSizeLimit:%d", s.Id, s.Ip, s.Port, s.GrpcPort, s.PublicUrl, s.dataCenter, s.rack, s.connected, s.GetVolumeSizeLimit())
return
}
-func NewStore(grpcDialOption grpc.DialOption, ip string, port int, grpcPort int, publicUrl string, dirnames []string, maxVolumeCounts []int32,
+func NewStore(grpcDialOption grpc.DialOption, ip string, port int, grpcPort int, publicUrl string, id string, dirnames []string, maxVolumeCounts []int32,
minFreeSpaces []util.MinFreeSpace, idxFolder string, needleMapKind NeedleMapKind, diskTypes []DiskType, ldbTimeout int64) (s *Store) {
- s = &Store{grpcDialOption: grpcDialOption, Port: port, Ip: ip, GrpcPort: grpcPort, PublicUrl: publicUrl, NeedleMapKind: needleMapKind}
+ s = &Store{grpcDialOption: grpcDialOption, Port: port, Ip: ip, GrpcPort: grpcPort, PublicUrl: publicUrl, Id: id, NeedleMapKind: needleMapKind}
s.Locations = make([]*DiskLocation, 0)
var wg sync.WaitGroup
@@ -414,6 +415,7 @@ func (s *Store) CollectHeartbeat() *master_pb.Heartbeat {
Port: uint32(s.Port),
GrpcPort: uint32(s.GrpcPort),
PublicUrl: s.PublicUrl,
+ Id: s.Id,
MaxVolumeCounts: maxVolumeCounts,
MaxFileKey: NeedleIdToUint64(maxFileKey),
DataCenter: s.dataCenter,
@@ -467,6 +469,10 @@ func (s *Store) SetStopping() {
}
}
+func (s *Store) IsStopping() bool {
+ return s.isStopping
+}
+
func (s *Store) LoadNewVolumes() {
for _, location := range s.Locations {
location.loadExistingVolumes(s.NeedleMapKind, 0)
diff --git a/weed/storage/store_ec_delete.go b/weed/storage/store_ec_delete.go
index a3e028bbb..9fcb092a2 100644
--- a/weed/storage/store_ec_delete.go
+++ b/weed/storage/store_ec_delete.go
@@ -3,6 +3,7 @@ package storage
import (
"context"
"fmt"
+
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/glog"
@@ -21,7 +22,8 @@ func (s *Store) DeleteEcShardNeedle(ecVolume *erasure_coding.EcVolume, n *needle
return 0, err
}
- if cookie != n.Cookie {
+ // cookie == 0 indicates SkipCookieCheck was requested (e.g., orphan cleanup)
+ if cookie != 0 && cookie != n.Cookie {
return 0, fmt.Errorf("unexpected cookie %x", cookie)
}
@@ -45,22 +47,17 @@ func (s *Store) doDeleteNeedleFromAtLeastOneRemoteEcShards(ecVolume *erasure_cod
shardId, _ := intervals[0].ToShardIdAndOffset(erasure_coding.ErasureCodingLargeBlockSize, erasure_coding.ErasureCodingSmallBlockSize)
- hasDeletionSuccess := false
err = s.doDeleteNeedleFromRemoteEcShardServers(shardId, ecVolume, needleId)
if err == nil {
- hasDeletionSuccess = true
+ return nil
}
for shardId = erasure_coding.DataShardsCount; shardId < erasure_coding.TotalShardsCount; shardId++ {
if parityDeletionError := s.doDeleteNeedleFromRemoteEcShardServers(shardId, ecVolume, needleId); parityDeletionError == nil {
- hasDeletionSuccess = true
+ return nil
}
}
- if hasDeletionSuccess {
- return nil
- }
-
return err
}
@@ -77,11 +74,9 @@ func (s *Store) doDeleteNeedleFromRemoteEcShardServers(shardId erasure_coding.Sh
for _, sourceDataNode := range sourceDataNodes {
glog.V(4).Infof("delete from remote ec shard %d.%d from %s", ecVolume.VolumeId, shardId, sourceDataNode)
- err := s.doDeleteNeedleFromRemoteEcShard(sourceDataNode, ecVolume.VolumeId, ecVolume.Collection, ecVolume.Version, needleId)
- if err != nil {
+ if err := s.doDeleteNeedleFromRemoteEcShard(sourceDataNode, ecVolume.VolumeId, ecVolume.Collection, ecVolume.Version, needleId); err != nil {
return err
}
- glog.V(1).Infof("delete from remote ec shard %d.%d from %s: %v", ecVolume.VolumeId, shardId, sourceDataNode, err)
}
return nil
diff --git a/weed/storage/store_load_balancing_test.go b/weed/storage/store_load_balancing_test.go
index 15e709d53..35475a6ae 100644
--- a/weed/storage/store_load_balancing_test.go
+++ b/weed/storage/store_load_balancing_test.go
@@ -31,7 +31,7 @@ func newTestStore(t *testing.T, numDirs int) *Store {
diskTypes = append(diskTypes, types.HardDriveType)
}
- store := NewStore(nil, "localhost", 8080, 18080, "http://localhost:8080",
+ store := NewStore(nil, "localhost", 8080, 18080, "http://localhost:8080", "",
dirs, maxCounts, minFreeSpaces, "", NeedleMapInMemory, diskTypes, 3)
// Consume channel messages to prevent blocking
diff --git a/weed/topology/data_node.go b/weed/topology/data_node.go
index 4f2dbe464..07e00ac0a 100644
--- a/weed/topology/data_node.go
+++ b/weed/topology/data_node.go
@@ -269,6 +269,7 @@ func (dn *DataNode) ToDataNodeInfo() *master_pb.DataNodeInfo {
Id: string(dn.Id()),
DiskInfos: make(map[string]*master_pb.DiskInfo),
GrpcPort: uint32(dn.GrpcPort),
+ Address: dn.Url(), // ip:port for connecting to the volume server
}
for _, c := range dn.Children() {
disk := c.(*Disk)
diff --git a/weed/topology/rack.go b/weed/topology/rack.go
index f526cd84d..1e5c8b632 100644
--- a/weed/topology/rack.go
+++ b/weed/topology/rack.go
@@ -5,6 +5,7 @@ import (
"strings"
"time"
+ "github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
"github.com/seaweedfs/seaweedfs/weed/util"
@@ -34,17 +35,73 @@ func (r *Rack) FindDataNode(ip string, port int) *DataNode {
}
return nil
}
-func (r *Rack) GetOrCreateDataNode(ip string, port int, grpcPort int, publicUrl string, maxVolumeCounts map[string]uint32) *DataNode {
+
+// FindDataNodeById finds a DataNode by its ID using O(1) map lookup
+func (r *Rack) FindDataNodeById(id string) *DataNode {
+ r.RLock()
+ defer r.RUnlock()
+ if c, ok := r.children[NodeId(id)]; ok {
+ return c.(*DataNode)
+ }
+ return nil
+}
+
+func (r *Rack) GetOrCreateDataNode(ip string, port int, grpcPort int, publicUrl string, id string, maxVolumeCounts map[string]uint32) *DataNode {
r.Lock()
defer r.Unlock()
- for _, c := range r.children {
+
+ // Normalize the id parameter (trim whitespace)
+ id = strings.TrimSpace(id)
+
+ // Determine the node ID: use provided id, or fall back to ip:port for backward compatibility
+ nodeId := util.GetVolumeServerId(id, ip, port)
+
+ // First, try to find by node ID using O(1) map lookup (stable identity)
+ if c, ok := r.children[NodeId(nodeId)]; ok {
dn := c.(*DataNode)
- if dn.MatchLocation(ip, port) {
- dn.LastSeen = time.Now().Unix()
- return dn
+ // Log if IP or Port changed (e.g., pod rescheduled in K8s)
+ if dn.Ip != ip || dn.Port != port {
+ glog.V(0).Infof("DataNode %s address changed from %s:%d to %s:%d", nodeId, dn.Ip, dn.Port, ip, port)
}
+ // Update the IP/Port in case they changed
+ dn.Ip = ip
+ dn.Port = port
+ dn.GrpcPort = grpcPort
+ dn.PublicUrl = publicUrl
+ dn.LastSeen = time.Now().Unix()
+ return dn
}
- dn := NewDataNode(util.JoinHostPort(ip, port))
+
+ // For backward compatibility: if explicit id was provided, also check by ip:port
+ // to handle transition from old (ip:port) to new (explicit id) behavior
+ ipPortId := util.JoinHostPort(ip, port)
+ if nodeId != ipPortId {
+ for oldId, c := range r.children {
+ dn := c.(*DataNode)
+ if dn.MatchLocation(ip, port) {
+ // Only transition if the oldId exactly matches ip:port (legacy identification).
+ // If oldId is different, this is a node with an explicit id that happens to
+ // reuse the same ip:port - don't incorrectly merge them.
+ if string(oldId) != ipPortId {
+ glog.Warningf("Volume server with id %s has ip:port %s which is used by node %s", nodeId, ipPortId, oldId)
+ continue
+ }
+ // Found a legacy node identified by ip:port, transition it to use the new explicit id
+ glog.V(0).Infof("Volume server %s transitioning id from %s to %s", dn.Url(), oldId, nodeId)
+ // Re-key the node in the children map with the new id
+ delete(r.children, oldId)
+ dn.id = NodeId(nodeId)
+ r.children[NodeId(nodeId)] = dn
+ // Update connection info in case they changed
+ dn.GrpcPort = grpcPort
+ dn.PublicUrl = publicUrl
+ dn.LastSeen = time.Now().Unix()
+ return dn
+ }
+ }
+ }
+
+ dn := NewDataNode(nodeId)
dn.Ip = ip
dn.Port = port
dn.GrpcPort = grpcPort
diff --git a/weed/topology/topology_test.go b/weed/topology/topology_test.go
index 8515d2f81..e5a8969fc 100644
--- a/weed/topology/topology_test.go
+++ b/weed/topology/topology_test.go
@@ -34,7 +34,7 @@ func TestHandlingVolumeServerHeartbeat(t *testing.T) {
maxVolumeCounts := make(map[string]uint32)
maxVolumeCounts[""] = 25
maxVolumeCounts["ssd"] = 12
- dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", maxVolumeCounts)
+ dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", maxVolumeCounts)
{
volumeCount := 7
@@ -180,7 +180,7 @@ func TestAddRemoveVolume(t *testing.T) {
maxVolumeCounts := make(map[string]uint32)
maxVolumeCounts[""] = 25
maxVolumeCounts["ssd"] = 12
- dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", maxVolumeCounts)
+ dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", maxVolumeCounts)
v := storage.VolumeInfo{
Id: needle.VolumeId(1),
@@ -218,7 +218,7 @@ func TestVolumeReadOnlyStatusChange(t *testing.T) {
rack := dc.GetOrCreateRack("rack1")
maxVolumeCounts := make(map[string]uint32)
maxVolumeCounts[""] = 25
- dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", maxVolumeCounts)
+ dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", maxVolumeCounts)
// Create a writable volume
v := storage.VolumeInfo{
@@ -267,7 +267,7 @@ func TestVolumeReadOnlyAndRemoteStatusChange(t *testing.T) {
rack := dc.GetOrCreateRack("rack1")
maxVolumeCounts := make(map[string]uint32)
maxVolumeCounts[""] = 25
- dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", maxVolumeCounts)
+ dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", maxVolumeCounts)
// Create a writable, local volume
v := storage.VolumeInfo{
@@ -331,7 +331,7 @@ func TestListCollections(t *testing.T) {
topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 32*1024, 5, false)
dc := topo.GetOrCreateDataCenter("dc1")
rack := dc.GetOrCreateRack("rack1")
- dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", nil)
+ dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", nil)
topo.RegisterVolumeLayout(storage.VolumeInfo{
Id: needle.VolumeId(1111),
@@ -396,3 +396,112 @@ func TestListCollections(t *testing.T) {
})
}
}
+
+func TestDataNodeIdBasedIdentification(t *testing.T) {
+ topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 32*1024, 5, false)
+ dc := topo.GetOrCreateDataCenter("dc1")
+ rack := dc.GetOrCreateRack("rack1")
+
+ maxVolumeCounts := make(map[string]uint32)
+ maxVolumeCounts[""] = 10
+
+ // Test 1: Create a DataNode with explicit id
+ dn1 := rack.GetOrCreateDataNode("10.0.0.1", 8080, 18080, "10.0.0.1:8080", "node-1", maxVolumeCounts)
+ if string(dn1.Id()) != "node-1" {
+ t.Errorf("expected node id 'node-1', got '%s'", dn1.Id())
+ }
+ if dn1.Ip != "10.0.0.1" {
+ t.Errorf("expected ip '10.0.0.1', got '%s'", dn1.Ip)
+ }
+
+ // Test 2: Same id with different IP should return the same DataNode (K8s pod reschedule scenario)
+ dn2 := rack.GetOrCreateDataNode("10.0.0.2", 8080, 18080, "10.0.0.2:8080", "node-1", maxVolumeCounts)
+ if dn1 != dn2 {
+ t.Errorf("expected same DataNode for same id, got different nodes")
+ }
+ // IP should be updated to the new value
+ if dn2.Ip != "10.0.0.2" {
+ t.Errorf("expected ip to be updated to '10.0.0.2', got '%s'", dn2.Ip)
+ }
+ if dn2.PublicUrl != "10.0.0.2:8080" {
+ t.Errorf("expected publicUrl to be updated to '10.0.0.2:8080', got '%s'", dn2.PublicUrl)
+ }
+
+ // Test 3: Different id should create a new DataNode
+ dn3 := rack.GetOrCreateDataNode("10.0.0.3", 8080, 18080, "10.0.0.3:8080", "node-2", maxVolumeCounts)
+ if string(dn3.Id()) != "node-2" {
+ t.Errorf("expected node id 'node-2', got '%s'", dn3.Id())
+ }
+ if dn1 == dn3 {
+ t.Errorf("expected different DataNode for different id")
+ }
+
+ // Test 4: Empty id should fall back to ip:port (backward compatibility)
+ dn4 := rack.GetOrCreateDataNode("10.0.0.4", 8080, 18080, "10.0.0.4:8080", "", maxVolumeCounts)
+ if string(dn4.Id()) != "10.0.0.4:8080" {
+ t.Errorf("expected node id '10.0.0.4:8080' for empty id, got '%s'", dn4.Id())
+ }
+
+ // Test 5: Same ip:port with empty id should return the same DataNode
+ dn5 := rack.GetOrCreateDataNode("10.0.0.4", 8080, 18080, "10.0.0.4:8080", "", maxVolumeCounts)
+ if dn4 != dn5 {
+ t.Errorf("expected same DataNode for same ip:port with empty id")
+ }
+
+ // Verify we have 3 unique DataNodes total:
+ // - node-1 (dn1/dn2 share the same id)
+ // - node-2 (dn3)
+ // - 10.0.0.4:8080 (dn4/dn5 share the same ip:port)
+ children := rack.Children()
+ if len(children) != 3 {
+ t.Errorf("expected 3 DataNodes, got %d", len(children))
+ }
+
+ // Test 6: Transition from ip:port to explicit id
+ // First, the node exists with ip:port as id (dn4/dn5)
+ // Now the same volume server starts sending an explicit id
+ dn6 := rack.GetOrCreateDataNode("10.0.0.4", 8080, 18080, "10.0.0.4:8080", "node-4-explicit", maxVolumeCounts)
+ // Should return the same DataNode instance
+ if dn6 != dn4 {
+ t.Errorf("expected same DataNode instance during transition")
+ }
+ // But the id should now be updated to the explicit id
+ if string(dn6.Id()) != "node-4-explicit" {
+ t.Errorf("expected node id to transition to 'node-4-explicit', got '%s'", dn6.Id())
+ }
+ // The node should be re-keyed in the children map
+ if rack.FindDataNodeById("node-4-explicit") != dn6 {
+ t.Errorf("expected to find DataNode by new explicit id")
+ }
+ // Old ip:port key should no longer work
+ if rack.FindDataNodeById("10.0.0.4:8080") != nil {
+ t.Errorf("expected old ip:port id to be removed from children map")
+ }
+
+ // Still 3 unique DataNodes (node-1, node-2, node-4-explicit)
+ children = rack.Children()
+ if len(children) != 3 {
+ t.Errorf("expected 3 DataNodes after transition, got %d", len(children))
+ }
+
+ // Test 7: Prevent incorrect transition when a new node reuses ip:port of a node with explicit id
+ // Scenario: node-1 runs at 10.0.0.1:8080, dies, new node-99 starts at same ip:port
+ // The transition should NOT happen because node-1 already has an explicit id
+ dn7 := rack.GetOrCreateDataNode("10.0.0.1", 8080, 18080, "10.0.0.1:8080", "node-99", maxVolumeCounts)
+ // Should create a NEW DataNode, not reuse node-1
+ if dn7 == dn1 {
+ t.Errorf("expected new DataNode for node-99, got reused node-1")
+ }
+ if string(dn7.Id()) != "node-99" {
+ t.Errorf("expected node id 'node-99', got '%s'", dn7.Id())
+ }
+ // node-1 should still exist with its original id
+ if rack.FindDataNodeById("node-1") == nil {
+ t.Errorf("node-1 should still exist")
+ }
+ // Now we have 4 DataNodes
+ children = rack.Children()
+ if len(children) != 4 {
+ t.Errorf("expected 4 DataNodes, got %d", len(children))
+ }
+}
diff --git a/weed/util/network.go b/weed/util/network.go
index 328808dbc..f7dbeebb7 100644
--- a/weed/util/network.go
+++ b/weed/util/network.go
@@ -64,3 +64,14 @@ func JoinHostPort(host string, port int) string {
}
return net.JoinHostPort(host, portStr)
}
+
+// GetVolumeServerId returns the volume server ID.
+// If id is provided (non-empty after trimming), use it as the identifier.
+// Otherwise, fall back to ip:port for backward compatibility.
+func GetVolumeServerId(id, ip string, port int) string {
+ volumeServerId := strings.TrimSpace(id)
+ if volumeServerId == "" {
+ volumeServerId = JoinHostPort(ip, port)
+ }
+ return volumeServerId
+}
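
Editorial note, not part of this patch: the helper above is small enough that a short test pins down its fallback behavior exactly. A sketch that could live next to it in package util:

func TestGetVolumeServerId_Fallback(t *testing.T) {
	// An explicit id wins, even with surrounding whitespace.
	if got := GetVolumeServerId(" node-1 ", "10.0.0.1", 8080); got != "node-1" {
		t.Errorf("explicit id should be used, got %q", got)
	}
	// A blank or whitespace-only id falls back to ip:port.
	if got := GetVolumeServerId("   ", "10.0.0.1", 8080); got != "10.0.0.1:8080" {
		t.Errorf("blank id should fall back to ip:port, got %q", got)
	}
}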
diff --git a/weed/worker/tasks/erasure_coding/detection.go b/weed/worker/tasks/erasure_coding/detection.go
index cd74bed33..c5568fe26 100644
--- a/weed/worker/tasks/erasure_coding/detection.go
+++ b/weed/worker/tasks/erasure_coding/detection.go
@@ -9,6 +9,7 @@ import (
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
+ "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding/placement"
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
@@ -429,85 +430,100 @@ func createECTaskParams(multiPlan *topology.MultiDestinationPlan) *worker_pb.Era
}
// selectBestECDestinations selects multiple disks for EC shard placement with diversity
+// Uses the consolidated placement package for proper rack/server/disk spreading
func selectBestECDestinations(disks []*topology.DiskInfo, sourceRack, sourceDC string, shardsNeeded int) []*topology.DiskInfo {
if len(disks) == 0 {
return nil
}
- // Group disks by rack and DC for diversity
- rackGroups := make(map[string][]*topology.DiskInfo)
- for _, disk := range disks {
- rackKey := fmt.Sprintf("%s:%s", disk.DataCenter, disk.Rack)
- rackGroups[rackKey] = append(rackGroups[rackKey], disk)
+ // Convert topology.DiskInfo to placement.DiskCandidate
+ candidates := diskInfosToCandidates(disks)
+ if len(candidates) == 0 {
+ return nil
}
- var selected []*topology.DiskInfo
- usedRacks := make(map[string]bool)
+ // Configure placement for EC shards
+ config := placement.PlacementRequest{
+ ShardsNeeded: shardsNeeded,
+ MaxShardsPerServer: 0, // No hard limit, but prefer spreading
+ MaxShardsPerRack: 0, // No hard limit, but prefer spreading
+ MaxTaskLoad: topology.MaxTaskLoadForECPlacement,
+ PreferDifferentServers: true,
+ PreferDifferentRacks: true,
+ }
- // First pass: select one disk from each rack for maximum diversity
- for rackKey, rackDisks := range rackGroups {
- if len(selected) >= shardsNeeded {
- break
- }
+ // Use the shared placement algorithm
+ result, err := placement.SelectDestinations(candidates, config)
+ if err != nil {
+ glog.V(2).Infof("EC placement failed: %v", err)
+ return nil
+ }
- // Select best disk from this rack
- bestDisk := selectBestFromRack(rackDisks, sourceRack, sourceDC)
- if bestDisk != nil {
- selected = append(selected, bestDisk)
- usedRacks[rackKey] = true
+ // Convert back to topology.DiskInfo
+ return candidatesToDiskInfos(result.SelectedDisks, disks)
+}
+
+// diskInfosToCandidates converts topology.DiskInfo slice to placement.DiskCandidate slice
+func diskInfosToCandidates(disks []*topology.DiskInfo) []*placement.DiskCandidate {
+ var candidates []*placement.DiskCandidate
+ for _, disk := range disks {
+ if disk.DiskInfo == nil {
+ continue
}
- }
- // Second pass: if we need more disks, select from racks we've already used
- if len(selected) < shardsNeeded {
- for _, disk := range disks {
- if len(selected) >= shardsNeeded {
- break
- }
+		// Calculate free slots from the disk's max and current volume counts; clamp at zero if over-subscribed
+ freeSlots := int(disk.DiskInfo.MaxVolumeCount - disk.DiskInfo.VolumeCount)
+ if freeSlots < 0 {
+ freeSlots = 0
+ }
- // Skip if already selected
- alreadySelected := false
- for _, sel := range selected {
- if sel.NodeID == disk.NodeID && sel.DiskID == disk.DiskID {
- alreadySelected = true
- break
+		// Calculate the EC shard count for this specific disk.
+		// EcShardInfos covers every disk on the node, so filter by DiskId and sum each entry's actual shard count (see the note after this function).
+ ecShardCount := 0
+ if disk.DiskInfo.EcShardInfos != nil {
+ for _, shardInfo := range disk.DiskInfo.EcShardInfos {
+ if shardInfo.DiskId == disk.DiskID {
+ ecShardCount += erasure_coding.ShardBits(shardInfo.EcIndexBits).ShardIdCount()
}
}
-
- if !alreadySelected && isDiskSuitableForEC(disk) {
- selected = append(selected, disk)
- }
}
- }
- return selected
+ candidates = append(candidates, &placement.DiskCandidate{
+ NodeID: disk.NodeID,
+ DiskID: disk.DiskID,
+ DataCenter: disk.DataCenter,
+ Rack: disk.Rack,
+ VolumeCount: disk.DiskInfo.VolumeCount,
+ MaxVolumeCount: disk.DiskInfo.MaxVolumeCount,
+ ShardCount: ecShardCount,
+ FreeSlots: freeSlots,
+ LoadCount: disk.LoadCount,
+ })
+ }
+ return candidates
}
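
Editorial note, not part of this patch: EcIndexBits is a per-entry bitmask of which EC shards are present, which is why the loop above sums ShardIdCount() over the entries whose DiskId matches. A tiny illustration with hypothetical values, reusing the erasure_coding.ShardBits helper this file already imports:

// Bits 0, 3 and 9 set: this entry reports shards 0, 3 and 9, so it contributes 3 to the disk's count.
bits := erasure_coding.ShardBits(1<<0 | 1<<3 | 1<<9)
shardsOnDisk := bits.ShardIdCount() // 3
_ = shardsOnDisk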
-// selectBestFromRack selects the best disk from a rack for EC placement
-func selectBestFromRack(disks []*topology.DiskInfo, sourceRack, sourceDC string) *topology.DiskInfo {
- if len(disks) == 0 {
- return nil
+// candidatesToDiskInfos converts placement results back to topology.DiskInfo
+func candidatesToDiskInfos(candidates []*placement.DiskCandidate, originalDisks []*topology.DiskInfo) []*topology.DiskInfo {
+ // Create a map for quick lookup
+ diskMap := make(map[string]*topology.DiskInfo)
+ for _, disk := range originalDisks {
+ key := fmt.Sprintf("%s:%d", disk.NodeID, disk.DiskID)
+ diskMap[key] = disk
}
- var bestDisk *topology.DiskInfo
- bestScore := -1.0
-
- for _, disk := range disks {
- if !isDiskSuitableForEC(disk) {
- continue
- }
-
- score := calculateECScore(disk, sourceRack, sourceDC)
- if score > bestScore {
- bestScore = score
- bestDisk = disk
+ var result []*topology.DiskInfo
+ for _, candidate := range candidates {
+ key := fmt.Sprintf("%s:%d", candidate.NodeID, candidate.DiskID)
+ if disk, ok := diskMap[key]; ok {
+ result = append(result, disk)
}
}
-
- return bestDisk
+ return result
}
// calculateECScore calculates placement score for EC operations
+// Used for logging and plan metadata
func calculateECScore(disk *topology.DiskInfo, sourceRack, sourceDC string) float64 {
if disk.DiskInfo == nil {
return 0.0
@@ -524,14 +540,12 @@ func calculateECScore(disk *topology.DiskInfo, sourceRack, sourceDC string) floa
// Consider current load (secondary factor)
score += (10.0 - float64(disk.LoadCount)) // Up to 10 points for low load
- // Note: We don't penalize placing shards on the same rack/DC as source
- // since the original volume will be deleted after EC conversion.
- // This allows for better network efficiency and storage utilization.
-
return score
}
// isDiskSuitableForEC checks if a disk is suitable for EC placement
+// Note: kept for backward compatibility; the placement package now performs this
+// filtering internally.
func isDiskSuitableForEC(disk *topology.DiskInfo) bool {
if disk.DiskInfo == nil {
return false