From 61c0514a1c0119875dc7e7ebb8c45b2914c732bf Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 1 Dec 2025 13:31:35 -0800 Subject: filer: add username and keyPrefix support for Redis stores (#7591) * filer: add username and keyPrefix support for Redis stores Addresses https://github.com/seaweedfs/seaweedfs/issues/7299 - Add username config option to redis2, redis_cluster2, redis_lua, and redis_lua_cluster stores (sentinel stores already had it) - Add keyPrefix config option to all Redis stores to prefix all keys, useful for Envoy Redis Proxy or multi-tenant Redis setups * refactor: reduce duplication in redis.NewClient creation Address code review feedback by defining redis.Options once and conditionally setting TLSConfig instead of duplicating the entire NewClient call. * filer.toml: add username and keyPrefix to the redis2 example --- weed/command/scaffold/filer.toml | 18 ++++++++++++++++ weed/filer/redis2/redis_cluster_store.go | 6 +++++- weed/filer/redis2/redis_sentinel_store.go | 4 +++- weed/filer/redis2/redis_store.go | 26 +++++++++++------------ weed/filer/redis2/universal_redis_store.go | 30 +++++++++++++++++---------- weed/filer/redis_lua/redis_cluster_store.go | 6 +++++- weed/filer/redis_lua/redis_sentinel_store.go | 4 +++- weed/filer/redis_lua/redis_store.go | 6 +++++- weed/filer/redis_lua/universal_redis_store.go | 18 +++++++++++----- 9 files changed, 83 insertions(+), 35 deletions(-) diff --git a/weed/command/scaffold/filer.toml b/weed/command/scaffold/filer.toml index 61a7ced6d..9261591c4 100644 --- a/weed/command/scaffold/filer.toml +++ b/weed/command/scaffold/filer.toml @@ -201,8 +201,11 @@ table = "seaweedfs" [redis2] enabled = false address = "localhost:6379" +username = "" password = "" database = 0 +# prefix for filer redis keys +keyPrefix = "" enable_tls = false ca_cert_path = "" client_cert_path = "" @@ -217,6 +220,8 @@ masterName = "master" username = "" password = "" database = 0 +# prefix for filer redis keys +keyPrefix = "" enable_tls = false ca_cert_path = "" client_cert_path = "" @@ -232,7 +237,10 @@ addresses = [ "localhost:30005", "localhost:30006", ] +username = "" password = "" +# prefix for filer redis keys +keyPrefix = "" enable_tls = false ca_cert_path = "" client_cert_path = "" @@ -248,8 +256,11 @@ superLargeDirectories = [] [redis_lua] enabled = false address = "localhost:6379" +username = "" password = "" database = 0 +# prefix for filer redis keys +keyPrefix = "" enable_tls = false ca_cert_path = "" client_cert_path = "" @@ -264,6 +275,8 @@ masterName = "master" username = "" password = "" database = 0 +# prefix for filer redis keys +keyPrefix = "" enable_tls = false ca_cert_path = "" client_cert_path = "" @@ -279,7 +292,10 @@ addresses = [ "localhost:30005", "localhost:30006", ] +username = "" password = "" +# prefix for filer redis keys +keyPrefix = "" enable_tls = false ca_cert_path = "" client_cert_path = "" @@ -373,8 +389,10 @@ dialTimeOut = 10 enabled = false location = "/tmp/" address = "localhost:6379" +username = "" password = "" database = 1 +keyPrefix = "" [tikv] enabled = false diff --git a/weed/filer/redis2/redis_cluster_store.go b/weed/filer/redis2/redis_cluster_store.go index 6e4f11d22..5e1593e9e 100644 --- a/weed/filer/redis2/redis_cluster_store.go +++ b/weed/filer/redis2/redis_cluster_store.go @@ -25,20 +25,24 @@ func (store *RedisCluster2Store) Initialize(configuration util.Configuration, pr return store.initialize( configuration.GetStringSlice(prefix+"addresses"), + configuration.GetString(prefix+"username"),
configuration.GetString(prefix+"password"), + configuration.GetString(prefix+"keyPrefix"), configuration.GetBool(prefix+"useReadOnly"), configuration.GetBool(prefix+"routeByLatency"), configuration.GetStringSlice(prefix+"superLargeDirectories"), ) } -func (store *RedisCluster2Store) initialize(addresses []string, password string, readOnly, routeByLatency bool, superLargeDirectories []string) (err error) { +func (store *RedisCluster2Store) initialize(addresses []string, username string, password string, keyPrefix string, readOnly, routeByLatency bool, superLargeDirectories []string) (err error) { store.Client = redis.NewClusterClient(&redis.ClusterOptions{ Addrs: addresses, + Username: username, Password: password, ReadOnly: readOnly, RouteByLatency: routeByLatency, }) + store.keyPrefix = keyPrefix store.loadSuperLargeDirectories(superLargeDirectories) return } diff --git a/weed/filer/redis2/redis_sentinel_store.go b/weed/filer/redis2/redis_sentinel_store.go index 5fc368fc7..dc15285bd 100644 --- a/weed/filer/redis2/redis_sentinel_store.go +++ b/weed/filer/redis2/redis_sentinel_store.go @@ -26,10 +26,11 @@ func (store *Redis2SentinelStore) Initialize(configuration util.Configuration, p configuration.GetString(prefix+"username"), configuration.GetString(prefix+"password"), configuration.GetInt(prefix+"database"), + configuration.GetString(prefix+"keyPrefix"), ) } -func (store *Redis2SentinelStore) initialize(addresses []string, masterName string, username string, password string, database int) (err error) { +func (store *Redis2SentinelStore) initialize(addresses []string, masterName string, username string, password string, database int, keyPrefix string) (err error) { store.Client = redis.NewFailoverClient(&redis.FailoverOptions{ MasterName: masterName, SentinelAddrs: addresses, @@ -41,5 +42,6 @@ func (store *Redis2SentinelStore) initialize(addresses []string, masterName stri ReadTimeout: time.Second * 30, WriteTimeout: time.Second * 5, }) + store.keyPrefix = keyPrefix return } diff --git a/weed/filer/redis2/redis_store.go b/weed/filer/redis2/redis_store.go index f9322be42..7193699f9 100644 --- a/weed/filer/redis2/redis_store.go +++ b/weed/filer/redis2/redis_store.go @@ -27,8 +27,10 @@ func (store *Redis2Store) GetName() string { func (store *Redis2Store) Initialize(configuration util.Configuration, prefix string) (err error) { return store.initialize( configuration.GetString(prefix+"address"), + configuration.GetString(prefix+"username"), configuration.GetString(prefix+"password"), configuration.GetInt(prefix+"database"), + configuration.GetString(prefix+"keyPrefix"), configuration.GetStringSlice(prefix+"superLargeDirectories"), configuration.GetBool(prefix+"enable_mtls"), configuration.GetString(prefix+"ca_cert_path"), @@ -37,7 +39,13 @@ func (store *Redis2Store) Initialize(configuration util.Configuration, prefix st ) } -func (store *Redis2Store) initialize(hostPort string, password string, database int, superLargeDirectories []string, enableMtls bool, caCertPath string, clientCertPath string, clientKeyPath string) (err error) { +func (store *Redis2Store) initialize(hostPort string, username string, password string, database int, keyPrefix string, superLargeDirectories []string, enableMtls bool, caCertPath string, clientCertPath string, clientKeyPath string) (err error) { + opt := &redis.Options{ + Addr: hostPort, + Username: username, + Password: password, + DB: database, + } if enableMtls { clientCert, err := tls.LoadX509KeyPair(clientCertPath, clientKeyPath) if err != nil { @@ -59,25 
+67,15 @@ func (store *Redis2Store) initialize(hostPort string, password string, database glog.Fatalf("Error parsing redis host and port from %s: %v", hostPort, err) } - tlsConfig := &tls.Config{ + opt.TLSConfig = &tls.Config{ Certificates: []tls.Certificate{clientCert}, RootCAs: caCertPool, ServerName: redisHost, MinVersion: tls.VersionTLS12, } - store.Client = redis.NewClient(&redis.Options{ - Addr: hostPort, - Password: password, - DB: database, - TLSConfig: tlsConfig, - }) - } else { - store.Client = redis.NewClient(&redis.Options{ - Addr: hostPort, - Password: password, - DB: database, - }) } + store.Client = redis.NewClient(opt) + store.keyPrefix = keyPrefix store.loadSuperLargeDirectories(superLargeDirectories) return } diff --git a/weed/filer/redis2/universal_redis_store.go b/weed/filer/redis2/universal_redis_store.go index 0dbf7a72a..a8429a764 100644 --- a/weed/filer/redis2/universal_redis_store.go +++ b/weed/filer/redis2/universal_redis_store.go @@ -19,6 +19,7 @@ const ( type UniversalRedis2Store struct { Client redis.UniversalClient + keyPrefix string superLargeDirectoryHash map[string]bool } @@ -35,6 +36,13 @@ func (store *UniversalRedis2Store) loadSuperLargeDirectories(superLargeDirectori } } +func (store *UniversalRedis2Store) getKey(key string) string { + if store.keyPrefix == "" { + return key + } + return store.keyPrefix + key +} + func (store *UniversalRedis2Store) BeginTransaction(ctx context.Context) (context.Context, error) { return ctx, nil } @@ -57,7 +65,7 @@ func (store *UniversalRedis2Store) InsertEntry(ctx context.Context, entry *filer } if name != "" { - if err = store.Client.ZAddNX(ctx, genDirectoryListKey(dir), redis.Z{Score: 0, Member: name}).Err(); err != nil { + if err = store.Client.ZAddNX(ctx, store.getKey(genDirectoryListKey(dir)), redis.Z{Score: 0, Member: name}).Err(); err != nil { return fmt.Errorf("persisting %s in parent dir: %v", entry.FullPath, err) } } @@ -75,7 +83,7 @@ func (store *UniversalRedis2Store) doInsertEntry(ctx context.Context, entry *fil value = util.MaybeGzipData(value) } - if err = store.Client.Set(ctx, string(entry.FullPath), value, time.Duration(entry.TtlSec)*time.Second).Err(); err != nil { + if err = store.Client.Set(ctx, store.getKey(string(entry.FullPath)), value, time.Duration(entry.TtlSec)*time.Second).Err(); err != nil { return fmt.Errorf("persisting %s : %v", entry.FullPath, err) } return nil @@ -88,7 +96,7 @@ func (store *UniversalRedis2Store) UpdateEntry(ctx context.Context, entry *filer func (store *UniversalRedis2Store) FindEntry(ctx context.Context, fullpath util.FullPath) (entry *filer.Entry, err error) { - data, err := store.Client.Get(ctx, string(fullpath)).Result() + data, err := store.Client.Get(ctx, store.getKey(string(fullpath))).Result() if err == redis.Nil { return nil, filer_pb.ErrNotFound } @@ -110,12 +118,12 @@ func (store *UniversalRedis2Store) FindEntry(ctx context.Context, fullpath util. 
func (store *UniversalRedis2Store) DeleteEntry(ctx context.Context, fullpath util.FullPath) (err error) { - _, err = store.Client.Del(ctx, genDirectoryListKey(string(fullpath))).Result() + _, err = store.Client.Del(ctx, store.getKey(genDirectoryListKey(string(fullpath)))).Result() if err != nil { return fmt.Errorf("delete dir list %s : %v", fullpath, err) } - _, err = store.Client.Del(ctx, string(fullpath)).Result() + _, err = store.Client.Del(ctx, store.getKey(string(fullpath))).Result() if err != nil { return fmt.Errorf("delete %s : %v", fullpath, err) } @@ -125,7 +133,7 @@ func (store *UniversalRedis2Store) DeleteEntry(ctx context.Context, fullpath uti return nil } if name != "" { - _, err = store.Client.ZRem(ctx, genDirectoryListKey(dir), name).Result() + _, err = store.Client.ZRem(ctx, store.getKey(genDirectoryListKey(dir)), name).Result() if err != nil { return fmt.Errorf("DeleteEntry %s in parent dir: %v", fullpath, err) } @@ -140,7 +148,7 @@ func (store *UniversalRedis2Store) DeleteFolderChildren(ctx context.Context, ful return nil } - members, err := store.Client.ZRangeByLex(ctx, genDirectoryListKey(string(fullpath)), &redis.ZRangeBy{ + members, err := store.Client.ZRangeByLex(ctx, store.getKey(genDirectoryListKey(string(fullpath))), &redis.ZRangeBy{ Min: "-", Max: "+", }).Result() @@ -150,12 +158,12 @@ func (store *UniversalRedis2Store) DeleteFolderChildren(ctx context.Context, ful for _, fileName := range members { path := util.NewFullPath(string(fullpath), fileName) - _, err = store.Client.Del(ctx, string(path)).Result() + _, err = store.Client.Del(ctx, store.getKey(string(path))).Result() if err != nil { return fmt.Errorf("DeleteFolderChildren %s in parent dir: %v", fullpath, err) } // not efficient, but need to remove if it is a directory - store.Client.Del(ctx, genDirectoryListKey(string(path))) + store.Client.Del(ctx, store.getKey(genDirectoryListKey(string(path)))) } return nil @@ -167,7 +175,7 @@ func (store *UniversalRedis2Store) ListDirectoryPrefixedEntries(ctx context.Cont func (store *UniversalRedis2Store) ListDirectoryEntries(ctx context.Context, dirPath util.FullPath, startFileName string, includeStartFile bool, limit int64, eachEntryFunc filer.ListEachEntryFunc) (lastFileName string, err error) { - dirListKey := genDirectoryListKey(string(dirPath)) + dirListKey := store.getKey(genDirectoryListKey(string(dirPath))) min := "-" if startFileName != "" { @@ -201,7 +209,7 @@ func (store *UniversalRedis2Store) ListDirectoryEntries(ctx context.Context, dir } else { if entry.TtlSec > 0 { if entry.Attr.Crtime.Add(time.Duration(entry.TtlSec) * time.Second).Before(time.Now()) { - store.Client.Del(ctx, string(path)).Result() + store.Client.Del(ctx, store.getKey(string(path))).Result() store.Client.ZRem(ctx, dirListKey, fileName).Result() continue } diff --git a/weed/filer/redis_lua/redis_cluster_store.go b/weed/filer/redis_lua/redis_cluster_store.go index 251aadbcd..b64342fc2 100644 --- a/weed/filer/redis_lua/redis_cluster_store.go +++ b/weed/filer/redis_lua/redis_cluster_store.go @@ -25,20 +25,24 @@ func (store *RedisLuaClusterStore) Initialize(configuration util.Configuration, return store.initialize( configuration.GetStringSlice(prefix+"addresses"), + configuration.GetString(prefix+"username"), configuration.GetString(prefix+"password"), + configuration.GetString(prefix+"keyPrefix"), configuration.GetBool(prefix+"useReadOnly"), configuration.GetBool(prefix+"routeByLatency"), configuration.GetStringSlice(prefix+"superLargeDirectories"), ) } -func (store *RedisLuaClusterStore) 
initialize(addresses []string, password string, readOnly, routeByLatency bool, superLargeDirectories []string) (err error) { +func (store *RedisLuaClusterStore) initialize(addresses []string, username string, password string, keyPrefix string, readOnly, routeByLatency bool, superLargeDirectories []string) (err error) { store.Client = redis.NewClusterClient(&redis.ClusterOptions{ Addrs: addresses, + Username: username, Password: password, ReadOnly: readOnly, RouteByLatency: routeByLatency, }) + store.keyPrefix = keyPrefix store.loadSuperLargeDirectories(superLargeDirectories) return } diff --git a/weed/filer/redis_lua/redis_sentinel_store.go b/weed/filer/redis_lua/redis_sentinel_store.go index f22a7fa66..12a582ac3 100644 --- a/weed/filer/redis_lua/redis_sentinel_store.go +++ b/weed/filer/redis_lua/redis_sentinel_store.go @@ -26,10 +26,11 @@ func (store *RedisLuaSentinelStore) Initialize(configuration util.Configuration, configuration.GetString(prefix+"username"), configuration.GetString(prefix+"password"), configuration.GetInt(prefix+"database"), + configuration.GetString(prefix+"keyPrefix"), ) } -func (store *RedisLuaSentinelStore) initialize(addresses []string, masterName string, username string, password string, database int) (err error) { +func (store *RedisLuaSentinelStore) initialize(addresses []string, masterName string, username string, password string, database int, keyPrefix string) (err error) { store.Client = redis.NewFailoverClient(&redis.FailoverOptions{ MasterName: masterName, SentinelAddrs: addresses, @@ -41,5 +42,6 @@ func (store *RedisLuaSentinelStore) initialize(addresses []string, masterName st ReadTimeout: time.Second * 30, WriteTimeout: time.Second * 5, }) + store.keyPrefix = keyPrefix return } diff --git a/weed/filer/redis_lua/redis_store.go b/weed/filer/redis_lua/redis_store.go index 8574baa09..4f6354e96 100644 --- a/weed/filer/redis_lua/redis_store.go +++ b/weed/filer/redis_lua/redis_store.go @@ -21,18 +21,22 @@ func (store *RedisLuaStore) GetName() string { func (store *RedisLuaStore) Initialize(configuration util.Configuration, prefix string) (err error) { return store.initialize( configuration.GetString(prefix+"address"), + configuration.GetString(prefix+"username"), configuration.GetString(prefix+"password"), configuration.GetInt(prefix+"database"), + configuration.GetString(prefix+"keyPrefix"), configuration.GetStringSlice(prefix+"superLargeDirectories"), ) } -func (store *RedisLuaStore) initialize(hostPort string, password string, database int, superLargeDirectories []string) (err error) { +func (store *RedisLuaStore) initialize(hostPort string, username string, password string, database int, keyPrefix string, superLargeDirectories []string) (err error) { store.Client = redis.NewClient(&redis.Options{ Addr: hostPort, + Username: username, Password: password, DB: database, }) + store.keyPrefix = keyPrefix store.loadSuperLargeDirectories(superLargeDirectories) return } diff --git a/weed/filer/redis_lua/universal_redis_store.go b/weed/filer/redis_lua/universal_redis_store.go index 35f6d4991..0a02a0730 100644 --- a/weed/filer/redis_lua/universal_redis_store.go +++ b/weed/filer/redis_lua/universal_redis_store.go @@ -20,6 +20,7 @@ const ( type UniversalRedisLuaStore struct { Client redis.UniversalClient + keyPrefix string superLargeDirectoryHash map[string]bool } @@ -36,6 +37,13 @@ func (store *UniversalRedisLuaStore) loadSuperLargeDirectories(superLargeDirecto } } +func (store *UniversalRedisLuaStore) getKey(key string) string { + if store.keyPrefix == "" { + 
return key + } + return store.keyPrefix + key +} + func (store *UniversalRedisLuaStore) BeginTransaction(ctx context.Context) (context.Context, error) { return ctx, nil } @@ -60,7 +68,7 @@ func (store *UniversalRedisLuaStore) InsertEntry(ctx context.Context, entry *fil dir, name := entry.FullPath.DirAndName() err = stored_procedure.InsertEntryScript.Run(ctx, store.Client, - []string{string(entry.FullPath), genDirectoryListKey(dir)}, + []string{store.getKey(string(entry.FullPath)), store.getKey(genDirectoryListKey(dir))}, value, entry.TtlSec, store.isSuperLargeDirectory(dir), 0, name).Err() @@ -78,7 +86,7 @@ func (store *UniversalRedisLuaStore) UpdateEntry(ctx context.Context, entry *fil func (store *UniversalRedisLuaStore) FindEntry(ctx context.Context, fullpath util.FullPath) (entry *filer.Entry, err error) { - data, err := store.Client.Get(ctx, string(fullpath)).Result() + data, err := store.Client.Get(ctx, store.getKey(string(fullpath))).Result() if err == redis.Nil { return nil, filer_pb.ErrNotFound } @@ -103,7 +111,7 @@ func (store *UniversalRedisLuaStore) DeleteEntry(ctx context.Context, fullpath u dir, name := fullpath.DirAndName() err = stored_procedure.DeleteEntryScript.Run(ctx, store.Client, - []string{string(fullpath), genDirectoryListKey(string(fullpath)), genDirectoryListKey(dir)}, + []string{store.getKey(string(fullpath)), store.getKey(genDirectoryListKey(string(fullpath))), store.getKey(genDirectoryListKey(dir))}, store.isSuperLargeDirectory(dir), name).Err() if err != nil { @@ -120,7 +128,7 @@ func (store *UniversalRedisLuaStore) DeleteFolderChildren(ctx context.Context, f } err = stored_procedure.DeleteFolderChildrenScript.Run(ctx, store.Client, - []string{string(fullpath)}).Err() + []string{store.getKey(string(fullpath))}).Err() if err != nil { return fmt.Errorf("DeleteFolderChildren %s : %v", fullpath, err) @@ -135,7 +143,7 @@ func (store *UniversalRedisLuaStore) ListDirectoryPrefixedEntries(ctx context.Co func (store *UniversalRedisLuaStore) ListDirectoryEntries(ctx context.Context, dirPath util.FullPath, startFileName string, includeStartFile bool, limit int64, eachEntryFunc filer.ListEachEntryFunc) (lastFileName string, err error) { - dirListKey := genDirectoryListKey(string(dirPath)) + dirListKey := store.getKey(genDirectoryListKey(string(dirPath))) min := "-" if startFileName != "" { -- cgit v1.2.3 From 8c585a9682748353c78b96e5f55238074497b35a Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 1 Dec 2025 15:19:42 -0800 Subject: Fix S3 object tagging issue #7589 - Add X-Amz-Tagging header parsing in putToFiler function for PUT object operations - Store tags with X-Amz-Tagging- prefix in entry.Extended metadata - Add comprehensive test suite for S3 object tagging functionality - Tests cover upload tagging, API operations, special characters, and edge cases --- test/s3/tagging/Makefile | 23 ++ test/s3/tagging/README.md | 53 ++++ test/s3/tagging/s3_tagging_test.go | 433 ++++++++++++++++++++++++++++++++ weed/s3api/s3api_object_handlers_put.go | 27 +- 4 files changed, 534 insertions(+), 2 deletions(-) create mode 100644 test/s3/tagging/Makefile create mode 100644 test/s3/tagging/README.md create mode 100644 test/s3/tagging/s3_tagging_test.go diff --git a/test/s3/tagging/Makefile b/test/s3/tagging/Makefile new file mode 100644 index 000000000..6d3343a70 --- /dev/null +++ b/test/s3/tagging/Makefile @@ -0,0 +1,23 @@ +# S3 Object Tagging Tests +# Tests for GitHub issue #7589: S3 object Tags query comes back empty + +.PHONY: test + +# Run all tagging tests +test: + go test -v 
./... + +# Run specific test +test-upload: + go test -v -run TestObjectTaggingOnUpload + +test-special-chars: + go test -v -run TestObjectTaggingOnUploadWithSpecialCharacters + +test-api: + go test -v -run TestPutObjectTaggingAPI + +# Test configuration (override these for different environments) +S3_ENDPOINT ?= http://localhost:8333 +S3_ACCESS_KEY ?= some_access_key1 +S3_SECRET_KEY ?= some_secret_key1 diff --git a/test/s3/tagging/README.md b/test/s3/tagging/README.md new file mode 100644 index 000000000..73f2a46c4 --- /dev/null +++ b/test/s3/tagging/README.md @@ -0,0 +1,53 @@ +# S3 Object Tagging Tests + +This directory contains tests for S3 object tagging functionality. + +## Issue Reference + +These tests were created to verify the fix for [GitHub Issue #7589](https://github.com/seaweedfs/seaweedfs/issues/7589): +**S3 object Tags query comes back empty** + +## Problem Description + +When uploading an object with tags using the `X-Amz-Tagging` header, the tags were not being stored. +When querying the object tagging with `GetObjectTagging`, the response was empty. + +This was a regression between SeaweedFS 4.00 and 4.01. + +## Root Cause + +The `putToFiler` function in `s3api_object_handlers_put.go` was not parsing the `X-Amz-Tagging` header +and storing the tags in the entry's Extended metadata. The code was only copying user metadata +(headers starting with `X-Amz-Meta-`) but not object tags. + +## Fix + +Added tag parsing logic to `putToFiler` that: +1. Reads the `X-Amz-Tagging` header +2. Parses it using `url.ParseQuery()` for proper URL decoding +3. Stores each tag with the prefix `X-Amz-Tagging-` in the entry's Extended metadata + +## Running Tests + +```bash +# Run all tagging tests +cd test/s3/tagging +make test + +# Run specific test +make test-upload + +# Or using go test directly +go test -v ./... +``` + +## Test Cases + +1. **TestObjectTaggingOnUpload** - Basic test for tags sent during object upload +2. **TestObjectTaggingOnUploadWithSpecialCharacters** - Tests URL-encoded tag values +3. **TestObjectTaggingOnUploadWithEmptyValue** - Tests tags with empty values +4. **TestPutObjectTaggingAPI** - Tests the PutObjectTagging API separately +5. **TestDeleteObjectTagging** - Tests tag deletion +6. **TestTagsNotPreservedAfterObjectOverwrite** - Verifies AWS S3 behavior on overwrite +7. **TestMaximumNumberOfTags** - Tests storing the maximum 10 tags +8. 
**TestTagCountHeader** - Tests the x-amz-tagging-count header in HeadObject diff --git a/test/s3/tagging/s3_tagging_test.go b/test/s3/tagging/s3_tagging_test.go new file mode 100644 index 000000000..1dd4895b6 --- /dev/null +++ b/test/s3/tagging/s3_tagging_test.go @@ -0,0 +1,433 @@ +package tagging + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// S3TestConfig holds configuration for S3 tests +type S3TestConfig struct { + Endpoint string + AccessKey string + SecretKey string + Region string + BucketPrefix string + UseSSL bool + SkipVerifySSL bool +} + +// getDefaultConfig returns a fresh instance of the default test configuration +func getDefaultConfig() *S3TestConfig { + return &S3TestConfig{ + Endpoint: "http://localhost:8333", // Default SeaweedFS S3 port + AccessKey: "some_access_key1", + SecretKey: "some_secret_key1", + Region: "us-east-1", + BucketPrefix: "test-tagging-", + UseSSL: false, + SkipVerifySSL: true, + } +} + +// getS3Client creates an AWS S3 client for testing +func getS3Client(t *testing.T) *s3.Client { + defaultConfig := getDefaultConfig() + cfg, err := config.LoadDefaultConfig(context.TODO(), + config.WithRegion(defaultConfig.Region), + config.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + defaultConfig.AccessKey, + defaultConfig.SecretKey, + "", + )), + config.WithEndpointResolverWithOptions(aws.EndpointResolverWithOptionsFunc( + func(service, region string, options ...interface{}) (aws.Endpoint, error) { + return aws.Endpoint{ + URL: defaultConfig.Endpoint, + SigningRegion: defaultConfig.Region, + }, nil + })), + ) + require.NoError(t, err) + + client := s3.NewFromConfig(cfg, func(o *s3.Options) { + o.UsePathStyle = true + }) + return client +} + +// createTestBucket creates a test bucket with a unique name +func createTestBucket(t *testing.T, client *s3.Client) string { + defaultConfig := getDefaultConfig() + bucketName := fmt.Sprintf("%s%d", defaultConfig.BucketPrefix, time.Now().UnixNano()) + + _, err := client.CreateBucket(context.TODO(), &s3.CreateBucketInput{ + Bucket: aws.String(bucketName), + }) + require.NoError(t, err) + + // Wait for bucket metadata to be fully processed + time.Sleep(50 * time.Millisecond) + + return bucketName +} + +// cleanupTestBucket removes the test bucket and all its contents +func cleanupTestBucket(t *testing.T, client *s3.Client, bucketName string) { + // First, delete all objects in the bucket + listResp, err := client.ListObjectsV2(context.TODO(), &s3.ListObjectsV2Input{ + Bucket: aws.String(bucketName), + }) + if err == nil { + for _, obj := range listResp.Contents { + _, err := client.DeleteObject(context.TODO(), &s3.DeleteObjectInput{ + Bucket: aws.String(bucketName), + Key: obj.Key, + }) + if err != nil { + t.Logf("Warning: failed to delete object %s: %v", *obj.Key, err) + } + } + } + + // Then delete the bucket + _, err = client.DeleteBucket(context.TODO(), &s3.DeleteBucketInput{ + Bucket: aws.String(bucketName), + }) + if err != nil { + t.Logf("Warning: failed to delete bucket %s: %v", bucketName, err) + } +} + +// TestObjectTaggingOnUpload tests that tags sent during object upload (via X-Amz-Tagging header) +// are properly stored and can be retrieved. 
This is the fix for GitHub issue #7589. +func TestObjectTaggingOnUpload(t *testing.T) { + client := getS3Client(t) + bucketName := createTestBucket(t, client) + defer cleanupTestBucket(t, client, bucketName) + + objectKey := "test-object-with-tags" + objectContent := "Hello, World!" + + // Put object with tags using the Tagging parameter (X-Amz-Tagging header) + _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: strings.NewReader(objectContent), + Tagging: aws.String("env=production&team=platform"), + }) + require.NoError(t, err, "Should be able to put object with tags") + + // Get the tags back + tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to get object tags") + + // Verify tags were stored correctly + require.Len(t, tagResp.TagSet, 2, "Should have 2 tags") + + // Build a map for easier assertion + tagMap := make(map[string]string) + for _, tag := range tagResp.TagSet { + tagMap[*tag.Key] = *tag.Value + } + + assert.Equal(t, "production", tagMap["env"], "env tag should be 'production'") + assert.Equal(t, "platform", tagMap["team"], "team tag should be 'platform'") +} + +// TestObjectTaggingOnUploadWithSpecialCharacters tests tags with URL-encoded characters +func TestObjectTaggingOnUploadWithSpecialCharacters(t *testing.T) { + client := getS3Client(t) + bucketName := createTestBucket(t, client) + defer cleanupTestBucket(t, client, bucketName) + + objectKey := "test-object-with-special-tags" + objectContent := "Hello, World!" + + // Put object with tags containing special characters + // AWS SDK will URL-encode these automatically + _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: strings.NewReader(objectContent), + Tagging: aws.String("timestamp=2025-07-16 14:40:39&path=/tmp/file.txt"), + }) + require.NoError(t, err, "Should be able to put object with special character tags") + + // Get the tags back + tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to get object tags") + + // Verify tags were stored and URL-decoded correctly + require.Len(t, tagResp.TagSet, 2, "Should have 2 tags") + + tagMap := make(map[string]string) + for _, tag := range tagResp.TagSet { + tagMap[*tag.Key] = *tag.Value + } + + assert.Equal(t, "2025-07-16 14:40:39", tagMap["timestamp"], "timestamp tag should be decoded correctly") + assert.Equal(t, "/tmp/file.txt", tagMap["path"], "path tag should be decoded correctly") +} + +// TestObjectTaggingOnUploadWithEmptyValue tests tags with empty values +func TestObjectTaggingOnUploadWithEmptyValue(t *testing.T) { + client := getS3Client(t) + bucketName := createTestBucket(t, client) + defer cleanupTestBucket(t, client, bucketName) + + objectKey := "test-object-with-empty-tag" + objectContent := "Hello, World!" 
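+	// Note: in the X-Amz-Tagging wire format, "marker=&env=dev" is parsed with
+	// url.ParseQuery on the server, so "marker" resolves to an empty-string value.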
+ + // Put object with a tag that has an empty value + _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: strings.NewReader(objectContent), + Tagging: aws.String("marker=&env=dev"), + }) + require.NoError(t, err, "Should be able to put object with empty tag value") + + // Get the tags back + tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to get object tags") + + // Verify tags were stored correctly + require.Len(t, tagResp.TagSet, 2, "Should have 2 tags") + + tagMap := make(map[string]string) + for _, tag := range tagResp.TagSet { + tagMap[*tag.Key] = *tag.Value + } + + assert.Equal(t, "", tagMap["marker"], "marker tag should have empty value") + assert.Equal(t, "dev", tagMap["env"], "env tag should be 'dev'") +} + +// TestPutObjectTaggingAPI tests the PutObjectTagging API separately from upload +func TestPutObjectTaggingAPI(t *testing.T) { + client := getS3Client(t) + bucketName := createTestBucket(t, client) + defer cleanupTestBucket(t, client, bucketName) + + objectKey := "test-object-for-tagging-api" + objectContent := "Hello, World!" + + // First, put object without tags + _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: strings.NewReader(objectContent), + }) + require.NoError(t, err, "Should be able to put object without tags") + + // Get tags - should be empty + tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to get object tags") + assert.Len(t, tagResp.TagSet, 0, "Should have no tags initially") + + // Now add tags using PutObjectTagging API + _, err = client.PutObjectTagging(context.TODO(), &s3.PutObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Tagging: &types.Tagging{ + TagSet: []types.Tag{ + {Key: aws.String("env"), Value: aws.String("staging")}, + {Key: aws.String("version"), Value: aws.String("1.0")}, + }, + }, + }) + require.NoError(t, err, "Should be able to put object tags via API") + + // Get tags - should now have the tags + tagResp, err = client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to get object tags after PutObjectTagging") + require.Len(t, tagResp.TagSet, 2, "Should have 2 tags") + + tagMap := make(map[string]string) + for _, tag := range tagResp.TagSet { + tagMap[*tag.Key] = *tag.Value + } + + assert.Equal(t, "staging", tagMap["env"], "env tag should be 'staging'") + assert.Equal(t, "1.0", tagMap["version"], "version tag should be '1.0'") +} + +// TestDeleteObjectTagging tests the DeleteObjectTagging API +func TestDeleteObjectTagging(t *testing.T) { + client := getS3Client(t) + bucketName := createTestBucket(t, client) + defer cleanupTestBucket(t, client, bucketName) + + objectKey := "test-object-for-delete-tags" + objectContent := "Hello, World!" 
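+	// DeleteObjectTagging removes the entire tag set in one call; the S3 API
+	// has no per-key tag deletion.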
+ + // Put object with tags + _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: strings.NewReader(objectContent), + Tagging: aws.String("env=production"), + }) + require.NoError(t, err, "Should be able to put object with tags") + + // Verify tags exist + tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to get object tags") + require.Len(t, tagResp.TagSet, 1, "Should have 1 tag") + + // Delete tags + _, err = client.DeleteObjectTagging(context.TODO(), &s3.DeleteObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to delete object tags") + + // Verify tags are deleted + tagResp, err = client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to get object tags after deletion") + assert.Len(t, tagResp.TagSet, 0, "Should have no tags after deletion") +} + +// TestTagsNotPreservedAfterObjectOverwrite tests that tags are NOT preserved when an object is overwritten +// This matches AWS S3 behavior where overwriting an object replaces all metadata including tags +func TestTagsNotPreservedAfterObjectOverwrite(t *testing.T) { + client := getS3Client(t) + bucketName := createTestBucket(t, client) + defer cleanupTestBucket(t, client, bucketName) + + objectKey := "test-object-overwrite-tags" + objectContent := "Original content" + + // Put object with tags + _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: strings.NewReader(objectContent), + Tagging: aws.String("original=true"), + }) + require.NoError(t, err, "Should be able to put object with tags") + + // Verify original tags exist + tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to get object tags") + require.Len(t, tagResp.TagSet, 1, "Should have 1 tag") + assert.Equal(t, "original", *tagResp.TagSet[0].Key) + + // Overwrite the object WITHOUT tags + _, err = client.PutObject(context.TODO(), &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: strings.NewReader("New content"), + }) + require.NoError(t, err, "Should be able to overwrite object") + + // Tags should be gone after overwrite (matches AWS S3 behavior) + tagResp, err = client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to get object tags after overwrite") + assert.Len(t, tagResp.TagSet, 0, "Tags should be cleared after object overwrite") +} + +// TestMaximumNumberOfTags tests that we can store the maximum 10 tags per object +func TestMaximumNumberOfTags(t *testing.T) { + client := getS3Client(t) + bucketName := createTestBucket(t, client) + defer cleanupTestBucket(t, client, bucketName) + + objectKey := "test-object-max-tags" + objectContent := "Hello, World!" 
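+	// S3 allows at most 10 tags per object, with keys up to 128 characters
+	// and values up to 256 characters.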
+ + // Build 10 tags (S3 max) + tags := []string{} + for i := 1; i <= 10; i++ { + tags = append(tags, fmt.Sprintf("key%d=value%d", i, i)) + } + tagging := strings.Join(tags, "&") + + // Put object with 10 tags + _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: strings.NewReader(objectContent), + Tagging: aws.String(tagging), + }) + require.NoError(t, err, "Should be able to put object with 10 tags") + + // Get the tags back + tagResp, err := client.GetObjectTagging(context.TODO(), &s3.GetObjectTaggingInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to get object tags") + assert.Len(t, tagResp.TagSet, 10, "Should have 10 tags") +} + +// TestTagCountHeader tests that the x-amz-tagging-count header is returned in HeadObject +func TestTagCountHeader(t *testing.T) { + client := getS3Client(t) + bucketName := createTestBucket(t, client) + defer cleanupTestBucket(t, client, bucketName) + + objectKey := "test-object-tag-count" + objectContent := "Hello, World!" + + // Put object with tags + _, err := client.PutObject(context.TODO(), &s3.PutObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + Body: strings.NewReader(objectContent), + Tagging: aws.String("env=prod&team=backend&version=2.0"), + }) + require.NoError(t, err, "Should be able to put object with tags") + + // Head object to get tag count + headResp, err := client.HeadObject(context.TODO(), &s3.HeadObjectInput{ + Bucket: aws.String(bucketName), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Should be able to head object") + + // Check tag count header + if headResp.TagCount != nil { + assert.Equal(t, int32(3), *headResp.TagCount, "Tag count should be 3") + } else { + t.Log("Warning: TagCount header not returned - this may be expected depending on implementation") + } +} diff --git a/weed/s3api/s3api_object_handlers_put.go b/weed/s3api/s3api_object_handlers_put.go index 100796b2e..f848790de 100644 --- a/weed/s3api/s3api_object_handlers_put.go +++ b/weed/s3api/s3api_object_handlers_put.go @@ -8,6 +8,7 @@ import ( "fmt" "io" "net/http" + "net/url" "path/filepath" "strconv" "strings" @@ -548,6 +549,28 @@ func (s3a *S3ApiServer) putToFiler(r *http.Request, filePath string, dataReader } } + // Parse and store object tags from X-Amz-Tagging header + // Fix for GitHub issue #7589: Tags sent during object upload were not being stored + if tagging := r.Header.Get(s3_constants.AmzObjectTagging); tagging != "" { + parsedTags, err := url.ParseQuery(tagging) + if err != nil { + glog.Warningf("putToFiler: Invalid S3 tag format in header '%s': %v", tagging, err) + return "", s3err.ErrInvalidTag, SSEResponseMetadata{} + } + for key, values := range parsedTags { + if len(values) > 1 { + glog.Warningf("putToFiler: Duplicate tag key '%s' in header", key) + return "", s3err.ErrInvalidTag, SSEResponseMetadata{} + } + value := "" + if len(values) > 0 { + value = values[0] + } + entry.Extended[s3_constants.AmzObjectTagging+"-"+key] = []byte(value) + } + glog.V(3).Infof("putToFiler: stored %d tags from X-Amz-Tagging header", len(parsedTags)) + } + // Set SSE-C metadata if customerKey != nil && len(sseIV) > 0 { // Store IV as RAW bytes (matches filer behavior - filer decodes base64 headers and stores raw bytes) @@ -680,9 +703,9 @@ func filerErrorToS3Error(err error) s3err.ErrorCode { if err == nil { return s3err.ErrNone } - + errString := err.Error() - + switch { case 
errString == constants.ErrMsgBadDigest: return s3err.ErrBadDigest -- cgit v1.2.3 From a33e5a9e6a696049e8f0e1b62b9e3ded8201675f Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 1 Dec 2025 15:40:06 -0800 Subject: Add S3 object tagging tests to CI workflow - Modified test/s3/tagging/s3_tagging_test.go to use environment variables for configurable endpoint and credentials - Added s3-tagging-tests job to .github/workflows/s3-go-tests.yml to run tagging tests in CI - Tests will now run automatically on pull requests --- .github/workflows/s3-go-tests.yml | 111 +++++++++++++++++++++++++++++++++++++ test/s3/tagging/s3_tagging_test.go | 19 ++++++- 2 files changed, 127 insertions(+), 3 deletions(-) diff --git a/.github/workflows/s3-go-tests.yml b/.github/workflows/s3-go-tests.yml index 6e04a8d44..556cf534e 100644 --- a/.github/workflows/s3-go-tests.yml +++ b/.github/workflows/s3-go-tests.yml @@ -411,4 +411,115 @@ jobs: path: test/s3/versioning/weed-test*.log retention-days: 7 + s3-tagging-tests: + name: S3 Tagging Tests + runs-on: ubuntu-22.04 + timeout-minutes: 20 + + steps: + - name: Check out code + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version-file: 'go.mod' + id: go + + - name: Install SeaweedFS + run: | + go install -buildvcs=false + + - name: Run S3 Tagging Tests + timeout-minutes: 15 + run: | + cd weed + set -x + # Create clean data directory for this test run + export WEED_DATA_DIR="/tmp/seaweedfs-tagging-test-$(date +%s)" + mkdir -p "$WEED_DATA_DIR" + weed server -filer -filer.maxMB=64 -s3 -ip.bind 0.0.0.0 \ + -dir="$WEED_DATA_DIR" \ + -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 \ + -volume.max=100 -volume.preStopSeconds=1 \ + -master.port=9338 -volume.port=8084 -filer.port=8893 -s3.port=8006 -metricsPort=9329 \ + -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" -master.peers=none & + pid=$! + + # Wait for all SeaweedFS components to be ready + echo "Waiting for SeaweedFS components to start..." + for i in {1..30}; do + if curl -s http://localhost:9338/cluster/status > /dev/null 2>&1; then + echo "Master server is ready" + break + fi + echo "Waiting for master server... ($i/30)" + sleep 2 + done + + for i in {1..30}; do + if curl -s http://localhost:8084/status > /dev/null 2>&1; then + echo "Volume server is ready" + break + fi + echo "Waiting for volume server... ($i/30)" + sleep 2 + done + + for i in {1..30}; do + if curl -s http://localhost:8893/ > /dev/null 2>&1; then + echo "Filer is ready" + break + fi + echo "Waiting for filer... ($i/30)" + sleep 2 + done + + for i in {1..30}; do + if curl -s http://localhost:8006/ > /dev/null 2>&1; then + echo "S3 server is ready" + break + fi + echo "Waiting for S3 server... ($i/30)" + sleep 2 + done + + echo "All SeaweedFS components are ready!" + cd ../test/s3/tagging + + # Set environment variables for the test + export S3_ENDPOINT="http://localhost:8006" + export S3_ACCESS_KEY="0555b35654ad1656d804" + export S3_SECRET_KEY="h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q==" + + # Additional wait for S3-Filer integration to be fully ready + echo "Waiting additional 10 seconds for S3-Filer integration..." + sleep 10 + + # Test S3 connection before running tests + echo "Testing S3 connection..."
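+          # curl -f treats HTTP status >= 400 as a failure, so this loop only
+          # succeeds once S3 answers cleanly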
+ for i in {1..10}; do + if curl -s -f http://localhost:8006/ > /dev/null 2>&1; then + echo "S3 connection test successful" + break + fi + echo "S3 connection test failed, retrying... ($i/10)" + sleep 2 + done + + echo "✅ S3 server is responding, starting tests..." + + go test -v ./... + kill -9 $pid || true + # Clean up data directory + rm -rf "$WEED_DATA_DIR" || true + + - name: Upload test logs on failure + if: failure() + uses: actions/upload-artifact@v5 + with: + name: s3-tagging-test-logs + path: test/s3/tagging/weed-test*.log + retention-days: 3 + # Removed SSE-C integration tests and compatibility job \ No newline at end of file diff --git a/test/s3/tagging/s3_tagging_test.go b/test/s3/tagging/s3_tagging_test.go index 1dd4895b6..c490ca1aa 100644 --- a/test/s3/tagging/s3_tagging_test.go +++ b/test/s3/tagging/s3_tagging_test.go @@ -3,6 +3,7 @@ package tagging import ( "context" "fmt" + "os" "strings" "testing" "time" @@ -29,10 +30,22 @@ type S3TestConfig struct { // getDefaultConfig returns a fresh instance of the default test configuration func getDefaultConfig() *S3TestConfig { + endpoint := os.Getenv("S3_ENDPOINT") + if endpoint == "" { + endpoint = "http://localhost:8333" // Default SeaweedFS S3 port + } + accessKey := os.Getenv("S3_ACCESS_KEY") + if accessKey == "" { + accessKey = "some_access_key1" + } + secretKey := os.Getenv("S3_SECRET_KEY") + if secretKey == "" { + secretKey = "some_secret_key1" + } return &S3TestConfig{ - Endpoint: "http://localhost:8333", // Default SeaweedFS S3 port - AccessKey: "some_access_key1", - SecretKey: "some_secret_key1", + Endpoint: endpoint, + AccessKey: accessKey, + SecretKey: secretKey, Region: "us-east-1", BucketPrefix: "test-tagging-", UseSSL: false, -- cgit v1.2.3 From 310be2aece950ad6360b5c0e7d3c61fb40a572d4 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 1 Dec 2025 15:43:32 -0800 Subject: Fix CI workflow: remove cd weed since working directory is already set to weed --- .github/workflows/s3-go-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/s3-go-tests.yml b/.github/workflows/s3-go-tests.yml index 556cf534e..bcd4b9652 100644 --- a/.github/workflows/s3-go-tests.yml +++ b/.github/workflows/s3-go-tests.yml @@ -433,7 +433,6 @@ jobs: - name: Run S3 Tagging Tests timeout-minutes: 15 run: | - cd weed set -x # Create clean data directory for this test run export WEED_DATA_DIR="/tmp/seaweedfs-tagging-test-$(date +%s)" -- cgit v1.2.3 From 1eafaecd70f57db1223ca063da27ba5ff608d871 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 1 Dec 2025 15:47:19 -0800 Subject: Add comment to s3-tagging-tests job to trigger CI re-run --- .github/workflows/s3-go-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/s3-go-tests.yml b/.github/workflows/s3-go-tests.yml index bcd4b9652..480931e46 100644 --- a/.github/workflows/s3-go-tests.yml +++ b/.github/workflows/s3-go-tests.yml @@ -412,6 +412,7 @@ jobs: retention-days: 7 s3-tagging-tests: + # CI job for S3 object tagging tests name: S3 Tagging Tests runs-on: ubuntu-22.04 timeout-minutes: 20 -- cgit v1.2.3 From 60487e75f31910b944ecb43f68e43940bdebbd04 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 1 Dec 2025 16:08:48 -0800 Subject: Fix port conflict in s3-tagging-tests CI job by changing volume port from 8084 to 8085 --- .github/workflows/s3-go-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/s3-go-tests.yml b/.github/workflows/s3-go-tests.yml index 480931e46..74ca57269 100644 --- 
a/.github/workflows/s3-go-tests.yml +++ b/.github/workflows/s3-go-tests.yml @@ -442,7 +442,7 @@ jobs: -dir="$WEED_DATA_DIR" \ -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 \ -volume.max=100 -volume.preStopSeconds=1 \ - -master.port=9338 -volume.port=8084 -filer.port=8893 -s3.port=8006 -metricsPort=9329 \ + -master.port=9338 -volume.port=8085 -filer.port=8893 -s3.port=8006 -metricsPort=9329 \ -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" -master.peers=none & pid=$! @@ -458,7 +458,7 @@ jobs: done for i in {1..30}; do - if curl -s http://localhost:8084/status > /dev/null 2>&1; then + if curl -s http://localhost:8085/status > /dev/null 2>&1; then echo "Volume server is ready" break fi -- cgit v1.2.3 From ec417955941a658c4ab0cfc66c697e1e5932eecf Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 1 Dec 2025 16:22:51 -0800 Subject: Update s3-tagging-tests to use Makefile server management like other S3 tests --- .github/workflows/s3-go-tests.yml | 90 ++--------- test/s3/tagging/Makefile | 322 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 324 insertions(+), 88 deletions(-) diff --git a/.github/workflows/s3-go-tests.yml b/.github/workflows/s3-go-tests.yml index 74ca57269..54023fa78 100644 --- a/.github/workflows/s3-go-tests.yml +++ b/.github/workflows/s3-go-tests.yml @@ -433,86 +433,24 @@ jobs: - name: Run S3 Tagging Tests timeout-minutes: 15 + working-directory: test/s3/tagging run: | set -x - # Create clean data directory for this test run - export WEED_DATA_DIR="/tmp/seaweedfs-tagging-test-$(date +%s)" - mkdir -p "$WEED_DATA_DIR" - weed server -filer -filer.maxMB=64 -s3 -ip.bind 0.0.0.0 \ - -dir="$WEED_DATA_DIR" \ - -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 \ - -volume.max=100 -volume.preStopSeconds=1 \ - -master.port=9338 -volume.port=8085 -filer.port=8893 -s3.port=8006 -metricsPort=9329 \ - -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config="$GITHUB_WORKSPACE/docker/compose/s3.json" -master.peers=none & - pid=$! - - # Wait for all SeaweedFS components to be ready - echo "Waiting for SeaweedFS components to start..."
- for i in {1..30}; do - if curl -s http://localhost:9338/cluster/status > /dev/null 2>&1; then - echo "Master server is ready" - break - fi - echo "Waiting for master server... ($i/30)" - sleep 2 - done - - for i in {1..30}; do - if curl -s http://localhost:8085/status > /dev/null 2>&1; then - echo "Volume server is ready" - break - fi - echo "Waiting for volume server... ($i/30)" - sleep 2 - done - - for i in {1..30}; do - if curl -s http://localhost:8893/ > /dev/null 2>&1; then - echo "Filer is ready" - break - fi - echo "Waiting for filer... ($i/30)" - sleep 2 - done - - for i in {1..30}; do - if curl -s http://localhost:8006/ > /dev/null 2>&1; then - echo "S3 server is ready" - break - fi - echo "Waiting for S3 server... ($i/30)" - sleep 2 - done - - echo "All SeaweedFS components are ready!" - cd ../test/s3/tagging - - # Set environment variables for the test - export S3_ENDPOINT="http://localhost:8006" - export S3_ACCESS_KEY="0555b35654ad1656d804" - export S3_SECRET_KEY="h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q==" - - # Additional wait for S3-Filer integration to be fully ready - echo "Waiting additional 10 seconds for S3-Filer integration..." - sleep 10 + echo "=== System Information ===" + uname -a + free -h - # Test S3 connection before running tests - echo "Testing S3 connection..." - for i in {1..10}; do - if curl -s -f http://localhost:8006/ > /dev/null 2>&1; then - echo "S3 connection test successful" - break + # Run the specific test that is equivalent to AWS S3 tagging behavior + make test-with-server || { + echo "❌ Test failed, checking logs..." + if [ -f weed-test.log ]; then + echo "=== Server logs ===" + tail -100 weed-test.log fi - echo "S3 connection test failed, retrying... ($i/10)" - sleep 2 - done - - echo "✅ S3 server is responding, starting tests..." - - go test -v ./... - kill -9 $pid || true - # Clean up data directory - rm -rf "$WEED_DATA_DIR" || true + echo "=== Process information ===" + ps aux | grep -E "(weed|test)" || true + exit 1 + } - name: Upload test logs on failure if: failure() diff --git a/test/s3/tagging/Makefile b/test/s3/tagging/Makefile index 6d3343a70..6e7b3fb88 100644 --- a/test/s3/tagging/Makefile +++ b/test/s3/tagging/Makefile @@ -1,23 +1,321 @@ # S3 Object Tagging Tests # Tests for GitHub issue #7589: S3 object Tags query comes back empty -.PHONY: test +.PHONY: help build-weed setup-server start-server stop-server test-tagging test-tagging-quick test-tagging-comprehensive test-all clean logs check-deps test-with-server -# Run all tagging tests -test: - go test -v ./... 
+# Configuration +WEED_BINARY := ../../../weed/weed_binary +S3_PORT := 8006 +MASTER_PORT := 9338 +VOLUME_PORT := 8085 +FILER_PORT := 8893 +TEST_TIMEOUT := 10m +TEST_PATTERN := TestObjectTagging -# Run specific test +# Default target +help: + @echo "S3 Object Tagging Tests Makefile" + @echo "" + @echo "Available targets:" + @echo " help - Show this help message" + @echo " build-weed - Build the SeaweedFS binary" + @echo " check-deps - Check dependencies and build binary if needed" + @echo " start-server - Start SeaweedFS server for testing" + @echo " stop-server - Stop SeaweedFS server" + @echo " test-tagging - Run all tagging tests" + @echo " test-tagging-quick - Run core tagging tests only" + @echo " test-tagging-comprehensive - Run comprehensive tagging tests" + @echo " test-with-server - Start server, run tests, stop server" + @echo " logs - Show server logs" + @echo " clean - Clean up test artifacts and stop server" + @echo " health-check - Check if server is accessible" + @echo "" + @echo "Configuration:" + @echo " S3_PORT=${S3_PORT}" + @echo " TEST_TIMEOUT=${TEST_TIMEOUT}" + +# Build the SeaweedFS binary +build-weed: + @echo "Building SeaweedFS binary..." + @cd ../../../weed && go build -o weed_binary . + @chmod +x $(WEED_BINARY) + @echo "✅ SeaweedFS binary built at $(WEED_BINARY)" + +check-deps: build-weed + @echo "Checking dependencies..." + @echo "🔍 DEBUG: Checking Go installation..." + @command -v go >/dev/null 2>&1 || (echo "Go is required but not installed" && exit 1) + @echo "🔍 DEBUG: Go version: $$(go version)" + @echo "🔍 DEBUG: Checking binary at $(WEED_BINARY)..." + @test -f $(WEED_BINARY) || (echo "SeaweedFS binary not found at $(WEED_BINARY)" && exit 1) + @echo "🔍 DEBUG: Binary size: $$(ls -lh $(WEED_BINARY) | awk '{print $$5}')" + @echo "🔍 DEBUG: Binary permissions: $$(ls -la $(WEED_BINARY) | awk '{print $$1}')" + @echo "🔍 DEBUG: Checking Go module dependencies..." + @go list -m github.com/aws/aws-sdk-go-v2 >/dev/null 2>&1 || (echo "AWS SDK Go v2 not found. Run 'go mod tidy'." && exit 1) + @go list -m github.com/stretchr/testify >/dev/null 2>&1 || (echo "Testify not found. Run 'go mod tidy'." && exit 1) + @echo "✅ All dependencies are available" + +# Start SeaweedFS server for testing +start-server: check-deps + @echo "Starting SeaweedFS server..." + @echo "🔍 DEBUG: Current working directory: $$(pwd)" + @echo "🔍 DEBUG: Checking for existing weed processes..." + @ps aux | grep weed | grep -v grep || echo "No existing weed processes found" + @echo "🔍 DEBUG: Cleaning up any existing PID file..." + @rm -f weed-server.pid + @echo "🔍 DEBUG: Checking for port conflicts..." + @if netstat -tlnp 2>/dev/null | grep $(S3_PORT) >/dev/null; then \ + echo "⚠️ Port $(S3_PORT) is already in use, trying to find the process..."; \ + netstat -tlnp 2>/dev/null | grep $(S3_PORT) || true; \ + else \ + echo "✅ Port $(S3_PORT) is available"; \ + fi + @echo "🔍 DEBUG: Checking binary at $(WEED_BINARY)" + @ls -la $(WEED_BINARY) || (echo "❌ Binary not found!" && exit 1) + @echo "🔍 DEBUG: Checking config file at ../../../docker/compose/s3.json" + @ls -la ../../../docker/compose/s3.json || echo "⚠️ Config file not found, continuing without it" + @echo "🔍 DEBUG: Creating volume directory..." + @mkdir -p ./test-volume-data + @echo "🔍 DEBUG: Launching SeaweedFS server in background..." 
+ @echo "🔍 DEBUG: Command: $(WEED_BINARY) server -filer -filer.maxMB=64 -s3 -ip.bind 0.0.0.0 -dir=./test-volume-data -master.raftHashicorp -master.electionTimeout 1s -master.volumeSizeLimitMB=100 -volume.max=100 -volume.preStopSeconds=1 -master.port=$(MASTER_PORT) -volume.port=$(VOLUME_PORT) -filer.port=$(FILER_PORT) -s3.port=$(S3_PORT) -metricsPort=9329 -s3.allowEmptyFolder=false -s3.allowDeleteBucketNotEmpty=true -s3.config=../../../docker/compose/s3.json -master.peers=none" + @$(WEED_BINARY) server \ + -filer \ + -filer.maxMB=64 \ + -s3 \ + -ip.bind 0.0.0.0 \ + -dir=./test-volume-data \ + -master.raftHashicorp \ + -master.electionTimeout 1s \ + -master.volumeSizeLimitMB=100 \ + -volume.max=100 \ + -volume.preStopSeconds=1 \ + -master.port=$(MASTER_PORT) \ + -volume.port=$(VOLUME_PORT) \ + -filer.port=$(FILER_PORT) \ + -s3.port=$(S3_PORT) \ + -metricsPort=9329 \ + -s3.allowEmptyFolder=false \ + -s3.allowDeleteBucketNotEmpty=true \ + -s3.config=../../../docker/compose/s3.json \ + -master.peers=none \ + > weed-test.log 2>&1 & echo $$! > weed-server.pid + @echo "🔍 DEBUG: Server PID: $$(cat weed-server.pid 2>/dev/null || echo 'PID file not found')" + @echo "🔍 DEBUG: Checking if PID is still running..." + @sleep 2 + @if [ -f weed-server.pid ]; then \ + SERVER_PID=$$(cat weed-server.pid); \ + ps -p $$SERVER_PID || echo "⚠️ Server PID $$SERVER_PID not found after 2 seconds"; \ + else \ + echo "⚠️ PID file not found"; \ + fi + @echo "🔍 DEBUG: Waiting for server to start (up to 90 seconds)..." + @for i in $$(seq 1 90); do \ + echo "🔍 DEBUG: Attempt $$i/90 - checking port $(S3_PORT)"; \ + if curl -s http://localhost:$(S3_PORT) >/dev/null 2>&1; then \ + echo "✅ SeaweedFS server started successfully on port $(S3_PORT) after $$i seconds"; \ + exit 0; \ + fi; \ + if [ $$i -eq 5 ]; then \ + echo "🔍 DEBUG: After 5 seconds, checking process and logs..."; \ + ps aux | grep weed | grep -v grep || echo "No weed processes found"; \ + if [ -f weed-test.log ]; then \ + echo "=== First server logs ==="; \ + head -20 weed-test.log; \ + fi; \ + fi; \ + if [ $$i -eq 15 ]; then \ + echo "🔍 DEBUG: After 15 seconds, checking port bindings..."; \ + netstat -tlnp 2>/dev/null | grep $(S3_PORT) || echo "Port $(S3_PORT) not bound"; \ + netstat -tlnp 2>/dev/null | grep $(MASTER_PORT) || echo "Port $(MASTER_PORT) not bound"; \ + netstat -tlnp 2>/dev/null | grep $(VOLUME_PORT) || echo "Port $(VOLUME_PORT) not bound"; \ + fi; \ + if [ $$i -eq 30 ]; then \ + echo "⚠️ Server taking longer than expected (30s), checking logs..."; \ + if [ -f weed-test.log ]; then \ + echo "=== Recent server logs ==="; \ + tail -20 weed-test.log; \ + fi; \ + fi; \ + sleep 1; \ + done; \ + echo "❌ Server failed to start within 90 seconds"; \ + echo "🔍 DEBUG: Final process check:"; \ + ps aux | grep weed | grep -v grep || echo "No weed processes found"; \ + echo "🔍 DEBUG: Final port check:"; \ + netstat -tlnp 2>/dev/null | grep -E "($(S3_PORT)|$(MASTER_PORT)|$(VOLUME_PORT))" || echo "No ports bound"; \ + echo "=== Full server logs ==="; \ + if [ -f weed-test.log ]; then \ + cat weed-test.log; \ + else \ + echo "No log file found"; \ + fi; \ + exit 1 + +# Stop SeaweedFS server +stop-server: + @echo "Stopping SeaweedFS server..." 
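+	@# Graceful shutdown: send TERM first, escalate to KILL only if the process lingers.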
+ @if [ -f weed-server.pid ]; then \ + SERVER_PID=$$(cat weed-server.pid); \ + echo "Killing server PID $$SERVER_PID"; \ + if ps -p $$SERVER_PID >/dev/null 2>&1; then \ + kill -TERM $$SERVER_PID 2>/dev/null || true; \ + sleep 2; \ + if ps -p $$SERVER_PID >/dev/null 2>&1; then \ + echo "Process still running, sending KILL signal..."; \ + kill -KILL $$SERVER_PID 2>/dev/null || true; \ + sleep 1; \ + fi; \ + else \ + echo "Process $$SERVER_PID not found (already stopped)"; \ + fi; \ + rm -f weed-server.pid; \ + else \ + echo "No PID file found, checking for running processes..."; \ + echo "⚠️ Skipping automatic process cleanup to avoid CI issues"; \ + echo "Note: Any remaining weed processes should be cleaned up by the CI environment"; \ + fi + @echo "✅ SeaweedFS server stopped" + +# Show server logs +logs: + @if test -f weed-test.log; then \ + echo "=== SeaweedFS Server Logs ==="; \ + tail -f weed-test.log; \ + else \ + echo "No log file found. Server may not be running."; \ + fi + +# Core tagging tests (basic functionality) +test-tagging-quick: check-deps + @echo "Running core tagging tests..." + @go test -v -timeout=$(TEST_TIMEOUT) -run "TestObjectTaggingOnUpload|TestPutObjectTaggingAPI" . + @echo "✅ Core tagging tests completed" + +# All tagging tests (comprehensive) +test-tagging: check-deps + @echo "Running all tagging tests..." + @go test -v -timeout=$(TEST_TIMEOUT) -run "$(TEST_PATTERN)" . + @echo "✅ All tagging tests completed" + +# Comprehensive tagging tests (all features) +test-tagging-comprehensive: check-deps + @echo "Running comprehensive tagging tests..." + @go test -v -timeout=$(TEST_TIMEOUT) -run "TestObjectTagging" . + @echo "✅ Comprehensive tagging tests completed" + +# All tests without server management +test-tagging-simple: check-deps + @echo "Running tagging tests (assuming server is already running)..." + @go test -v -timeout=$(TEST_TIMEOUT) . + @echo "✅ All tagging tests completed" + +# Start server, run tests, stop server +test-with-server: start-server + @echo "Running tagging tests with managed server..." + @sleep 5 # Give server time to fully start + @make test-tagging-comprehensive || (echo "Tests failed, stopping server..." && make stop-server && exit 1) + @make stop-server + @echo "✅ All tests completed with managed server" + +# Health check +health-check: + @echo "Checking server health..." + @if curl -s http://localhost:$(S3_PORT) >/dev/null 2>&1; then \ + echo "✅ Server is accessible on port $(S3_PORT)"; \ + else \ + echo "❌ Server is not accessible on port $(S3_PORT)"; \ + exit 1; \ + fi + +# Clean up +clean: + @echo "Cleaning up test artifacts..." + @make stop-server + @rm -f weed-test.log + @rm -f weed-server.pid + @rm -rf ./test-volume-data + @rm -f tagging.test + @go clean -testcache + @echo "✅ Cleanup completed" + +# Individual test targets for specific functionality test-upload: - go test -v -run TestObjectTaggingOnUpload + @echo "Running upload tagging tests..." + @go test -v -timeout=$(TEST_TIMEOUT) -run "TestObjectTaggingOnUpload" . test-special-chars: - go test -v -run TestObjectTaggingOnUploadWithSpecialCharacters + @echo "Running special characters tagging tests..." + @go test -v -timeout=$(TEST_TIMEOUT) -run "TestObjectTaggingOnUploadWithSpecialCharacters" . test-api: - go test -v -run TestPutObjectTaggingAPI + @echo "Running API tagging tests..." + @go test -v -timeout=$(TEST_TIMEOUT) -run "TestPutObjectTaggingAPI" . + +test-get: + @echo "Running get tagging tests..." + @go test -v -timeout=$(TEST_TIMEOUT) -run "TestGetObjectTaggingAPI" . 
+ +test-delete: + @echo "Running delete tagging tests..." + @go test -v -timeout=$(TEST_TIMEOUT) -run "TestDeleteObjectTaggingAPI" . + +# Development targets +dev-start: start-server + @echo "Development server started. Access S3 API at http://localhost:$(S3_PORT)" + @echo "To stop: make stop-server" + +dev-test: check-deps + @echo "Running tests in development mode..." + @go test -v -timeout=$(TEST_TIMEOUT) -run "TestObjectTaggingOnUpload" . + +# CI targets +ci-test: check-deps + @echo "Running tests in CI mode..." + @go test -v -timeout=$(TEST_TIMEOUT) -race . + +# All targets +test-all: test-tagging test-tagging-comprehensive + @echo "✅ All tagging tests completed" + +# Benchmark targets +benchmark-tagging: + @echo "Running tagging performance benchmarks..." + @go test -v -timeout=$(TEST_TIMEOUT) -bench=. -benchmem . + +# Coverage targets +coverage: + @echo "Running tests with coverage..." + @go test -v -timeout=$(TEST_TIMEOUT) -coverprofile=coverage.out . + @go tool cover -html=coverage.out -o coverage.html + @echo "Coverage report generated: coverage.html" + +# Format and lint +fmt: + @echo "Formatting Go code..." + @go fmt . + +lint: + @echo "Running linter..." + @golint . || echo "golint not available, skipping..." + +# Install dependencies for development +install-deps: + @echo "Installing Go dependencies..." + @go mod tidy + @go mod download + +# Show current configuration +show-config: + @echo "Current configuration:" + @echo " WEED_BINARY: $(WEED_BINARY)" + @echo " S3_PORT: $(S3_PORT)" + @echo " TEST_TIMEOUT: $(TEST_TIMEOUT)" + @echo " TEST_PATTERN: $(TEST_PATTERN)" -# Test configuration (override these for different environments) -S3_ENDPOINT ?= http://localhost:8333 -S3_ACCESS_KEY ?= some_access_key1 -S3_SECRET_KEY ?= some_secret_key1 +# Legacy targets for backward compatibility +test: test-with-server +test-verbose: test-tagging-comprehensive +test-single: test-upload +test-clean: clean +build: check-deps +setup: check-deps -- cgit v1.2.3 From 208d008fe3ea5e9089082e605c9f77483e524a8d Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 1 Dec 2025 16:34:13 -0800 Subject: Fix tagging test pattern to run our comprehensive tests instead of basic tests --- test/s3/tagging/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/s3/tagging/Makefile b/test/s3/tagging/Makefile index 6e7b3fb88..aa2f18f7c 100644 --- a/test/s3/tagging/Makefile +++ b/test/s3/tagging/Makefile @@ -10,7 +10,7 @@ MASTER_PORT := 9338 VOLUME_PORT := 8085 FILER_PORT := 8893 TEST_TIMEOUT := 10m -TEST_PATTERN := TestObjectTagging +TEST_PATTERN := TestObjectTaggingOnUpload|TestPutObjectTaggingAPI|TestDeleteObjectTagging|TestTag # Default target help: @@ -200,7 +200,7 @@ test-tagging: check-deps # Comprehensive tagging tests (all features) test-tagging-comprehensive: check-deps @echo "Running comprehensive tagging tests..." - @go test -v -timeout=$(TEST_TIMEOUT) -run "TestObjectTagging" . + @go test -v -timeout=$(TEST_TIMEOUT) . 
@echo "✅ Comprehensive tagging tests completed" # All tests without server management -- cgit v1.2.3 From 1a67e6118ef5953bfd98d29d8c0b59a53f5484ff Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 1 Dec 2025 16:39:22 -0800 Subject: Set S3_ENDPOINT environment variable in CI workflow for tagging tests --- .github/workflows/s3-go-tests.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/s3-go-tests.yml b/.github/workflows/s3-go-tests.yml index 54023fa78..78387d4f1 100644 --- a/.github/workflows/s3-go-tests.yml +++ b/.github/workflows/s3-go-tests.yml @@ -440,6 +440,11 @@ jobs: uname -a free -h + # Set environment variables for the test + export S3_ENDPOINT="http://localhost:8006" + export S3_ACCESS_KEY="0555b35654ad1656d804" + export S3_SECRET_KEY="h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q==" + # Run the specific test that is equivalent to AWS S3 tagging behavior make test-with-server || { echo "❌ Test failed, checking logs..." -- cgit v1.2.3 From 099a351f3b084454f0d437b2d967c0f4a3bb9e1f Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 1 Dec 2025 17:41:42 -0800 Subject: Fix issue #6847: S3 chunked encoding includes headers in stored content (#7595) * Fix issue #6847: S3 chunked encoding includes headers in stored content - Add hasTrailer flag to s3ChunkedReader to track trailer presence - Update state transition logic to properly handle trailers in unsigned streaming - Enhance parseChunkChecksum to handle multiple trailer lines - Skip checksum verification for unsigned streaming uploads - Add test case for mixed format handling (unsigned headers with signed chunks) - Remove redundant CRLF reading in trailer processing This fixes the issue where chunk-signature and x-amz headers were appearing in stored file content when using chunked encoding with newer AWS SDKs. 
* Fix checksum validation for unsigned streaming uploads - Always validate checksum for data integrity regardless of signing - Correct checksum value in test case - Addresses PR review feedback about checksum verification * Add warning log when multiple checksum headers found in trailer - Log a warning when multiple valid checksum headers appear in trailers - Uses last checksum header as suggested by CodeRabbit reviewer - Improves debugging for edge cases with multiple checksum algorithms * Improve trailer parsing robustness in parseChunkChecksum - Remove redundant trimTrailingWhitespace call since readChunkLine already trims - Use bytes.TrimSpace for both key and value to handle whitespace around colon separator - Follows HTTP header specifications for optional whitespace around separators - Addresses Gemini Code Assist review feedback --- weed/s3api/chunked_reader_v4.go | 77 ++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/weed/s3api/chunked_reader_v4.go b/weed/s3api/chunked_reader_v4.go index c21b57009..f841c3e1e 100644 --- a/weed/s3api/chunked_reader_v4.go +++ b/weed/s3api/chunked_reader_v4.go @@ -116,6 +116,7 @@ func (iam *IdentityAccessManagement) newChunkedReader(req *http.Request) (io.Rea } checkSumWriter := getCheckSumWriter(checksumAlgorithm) + hasTrailer := amzTrailerHeader != "" return &s3ChunkedReader{ cred: credential, @@ -129,6 +130,7 @@ func (iam *IdentityAccessManagement) newChunkedReader(req *http.Request) (io.Rea checkSumWriter: checkSumWriter, state: readChunkHeader, iam: iam, + hasTrailer: hasTrailer, }, s3err.ErrNone } @@ -170,6 +172,7 @@ type s3ChunkedReader struct { n uint64 // Unread bytes in chunk err error iam *IdentityAccessManagement + hasTrailer bool } // Read chunk reads the chunk token signature portion. @@ -281,10 +284,10 @@ func (cr *s3ChunkedReader) Read(buf []byte) (n int, err error) { } // If we're using unsigned streaming upload, there is no signature to verify at each chunk. - if cr.chunkSignature != "" { - cr.state = verifyChunk - } else if cr.lastChunk { + if cr.lastChunk && cr.hasTrailer { cr.state = readTrailerChunk + } else if cr.chunkSignature != "" { + cr.state = verifyChunk } else { cr.state = readChunkHeader } @@ -304,7 +307,11 @@ func (cr *s3ChunkedReader) Read(buf []byte) (n int, err error) { // This implementation currently only supports the first case. // TODO: Implement the second case (signed upload with additional checksum computation for each chunk) - extractedCheckSumAlgorithm, extractedChecksum := parseChunkChecksum(cr.reader) + extractedCheckSumAlgorithm, extractedChecksum, err := parseChunkChecksum(cr.reader) + if err != nil { + cr.err = err + return 0, err + } if extractedCheckSumAlgorithm.String() != cr.checkSumAlgorithm { errorMessage := fmt.Sprintf("checksum algorithm in trailer '%s' does not match the one advertised in the header '%s'", extractedCheckSumAlgorithm.String(), cr.checkSumAlgorithm) @@ -313,6 +320,7 @@ func (cr *s3ChunkedReader) Read(buf []byte) (n int, err error) { return 0, cr.err } + // Validate checksum for data integrity (required for both signed and unsigned streaming with trailers) computedChecksum := cr.checkSumWriter.Sum(nil) base64Checksum := base64.StdEncoding.EncodeToString(computedChecksum) if string(extractedChecksum) != base64Checksum { @@ -324,11 +332,6 @@ func (cr *s3ChunkedReader) Read(buf []byte) (n int, err error) { // TODO: Extract signature from trailer chunk and verify it. // For now, we just read the trailer chunk and discard it. 
- // Reading remaining CRLF. - for i := 0; i < 2; i++ { - cr.err = readCRLF(cr.reader) - } - cr.state = eofChunk case readChunk: @@ -506,41 +509,37 @@ func parseS3ChunkExtension(buf []byte) ([]byte, []byte) { return buf[:semi], parseChunkSignature(buf[semi:]) } -func parseChunkChecksum(b *bufio.Reader) (ChecksumAlgorithm, []byte) { - // When using unsigned upload, this would be the raw contents of the trailer chunk: - // - // x-amz-checksum-crc32:YABb/g==\n\r\n\r\n // Trailer chunk (note optional \n character) - // \r\n // CRLF - // - // When using signed upload with an additional checksum algorithm, this would be the raw contents of the trailer chunk: - // - // x-amz-checksum-crc32:YABb/g==\n\r\n // Trailer chunk (note optional \n character) - // trailer-signature\r\n - // \r\n // CRLF - // - - // x-amz-checksum-crc32:YABb/g==\n - bytesRead, err := readChunkLine(b) - if err != nil { - return ChecksumAlgorithmNone, nil - } +func parseChunkChecksum(b *bufio.Reader) (ChecksumAlgorithm, []byte, error) { + // Read trailer lines until empty line + var checksumAlgorithm ChecksumAlgorithm + var checksum []byte - // Split on ':' - parts := bytes.SplitN(bytesRead, []byte(":"), 2) - checksumKey := string(parts[0]) - checksumValue := parts[1] + for { + bytesRead, err := readChunkLine(b) + if err != nil { + return ChecksumAlgorithmNone, nil, err + } - // Discard all trailing whitespace characters - checksumValue = trimTrailingWhitespace(checksumValue) + if len(bytesRead) == 0 { + break + } - // If the checksum key is not a supported checksum algorithm, return an error. - // TODO: Bubble that error up to the caller - extractedAlgorithm, err := extractChecksumAlgorithm(checksumKey) - if err != nil { - return ChecksumAlgorithmNone, nil + parts := bytes.SplitN(bytesRead, []byte(":"), 2) + if len(parts) == 2 { + key := string(bytes.TrimSpace(parts[0])) + value := bytes.TrimSpace(parts[1]) + if alg, err := extractChecksumAlgorithm(key); err == nil { + if checksumAlgorithm != ChecksumAlgorithmNone { + glog.V(3).Infof("multiple checksum headers found in trailer, using last: %s", key) + } + checksumAlgorithm = alg + checksum = value + } + // Ignore other trailer headers like x-amz-trailer-signature + } } - return extractedAlgorithm, checksumValue + return checksumAlgorithm, checksum, nil } func parseChunkSignature(chunk []byte) []byte { -- cgit v1.2.3 From 733ca8e6df3de57e2dcc0923fb2f166d3222921d Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Tue, 2 Dec 2025 09:24:31 -0800 Subject: Fix SSE-S3 copy: preserve encryption metadata and set chunk SSE type (#7598) * Fix SSE-S3 copy: preserve encryption metadata and set chunk SSE type Fixes GitHub #7562: Copying objects between encrypted buckets was failing. Root causes: 1. processMetadataBytes was re-adding SSE headers from source entry, undoing the encryption header filtering. Now uses dstEntry.Extended which is already filtered. 2. SSE-S3 streaming copy returned nil metadata. Now properly generates and returns SSE-S3 destination metadata (SeaweedFSSSES3Key, AES256 header) via ExecuteStreamingCopyWithMetadata. 3. Chunks created during streaming copy didn't have SseType set. Now sets SseType and per-chunk SseMetadata with chunk-specific IVs for SSE-S3, enabling proper decryption on GetObject. 
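For context on point 3: a chunk-specific IV in AES-CTR mode is derived from the object's base IV plus the chunk's byte offset into the plaintext. The sketch below is a hypothetical stand-in for that counter arithmetic; the actual helper, calculateIVWithOffset, is noted in a review bullet below and likewise returns ([]byte, int), where the int is the intra-block skip:

    package main

    import "fmt"

    // deriveChunkIV advances a 16-byte big-endian AES-CTR counter by
    // offset/16 full blocks, and reports offset%16 as the number of
    // keystream bytes to skip inside the first block.
    func deriveChunkIV(baseIV []byte, offset int64) ([]byte, int) {
        iv := make([]byte, len(baseIV))
        copy(iv, baseIV)
        carry := uint64(offset) / 16 // full AES blocks to advance
        for i := len(iv) - 1; i >= 0 && carry > 0; i-- {
            sum := uint64(iv[i]) + (carry & 0xff)
            iv[i] = byte(sum)
            carry = (carry >> 8) + (sum >> 8)
        }
        return iv, int(offset % 16) // intra-block skip
    }

    func main() {
        base := make([]byte, 16) // all-zero base IV, for illustration only
        iv, skip := deriveChunkIV(base, 1<<20)
        fmt.Printf("%x skip=%d\n", iv, skip) // counter advanced by 65536 blocks
    }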
* Address review: make SSE-S3 metadata serialization failures fatal errors - In executeEncryptCopy: return error instead of just logging if SerializeSSES3Metadata fails - In createChunkFromData: return error if chunk SSE-S3 metadata serialization fails This ensures objects/chunks are never created without proper encryption metadata, preventing unreadable/corrupted data. * fmt * Refactor: reuse function names instead of creating WithMetadata variants - Change ExecuteStreamingCopy to return (*EncryptionSpec, error) directly - Remove ExecuteStreamingCopyWithMetadata wrapper - Change executeStreamingReencryptCopy to return (*EncryptionSpec, error) - Remove executeStreamingReencryptCopyWithMetadata wrapper - Update callers to ignore encryption spec with _ where not needed * Add TODO documenting large file SSE-S3 copy limitation The streaming copy approach encrypts the entire stream with a single IV but stores data in chunks with per-chunk IVs. This causes decryption issues for large files. Small inline files work correctly. This is a known architectural issue that needs separate work to fix. * Use chunk-by-chunk encryption for SSE-S3 copy (consistent with SSE-C/SSE-KMS) Instead of streaming encryption (which had IV mismatch issues for multi-chunk files), SSE-S3 now uses the same chunk-by-chunk approach as SSE-C and SSE-KMS: 1. Extended copyMultipartCrossEncryption to handle SSE-S3: - Added SSE-S3 source decryption in copyCrossEncryptionChunk - Added SSE-S3 destination encryption with per-chunk IVs - Added object-level metadata generation for SSE-S3 destinations 2. Updated routing in executeEncryptCopy/executeDecryptCopy/executeReencryptCopy to use copyMultipartCrossEncryption for all SSE-S3 scenarios 3. Removed streaming copy functions (shouldUseStreamingCopy, executeStreamingReencryptCopy) as they're no longer used 4. Added large file (1MB) integration test to verify chunk-by-chunk copy works This ensures consistent behavior across all SSE types and fixes data corruption that occurred with large files in the streaming copy approach. * fmt * fmt * Address review: fail explicitly if SSE-S3 metadata is missing Instead of silently ignoring missing SSE-S3 metadata (which could create unreadable objects), now explicitly fail the copy operation with a clear error message if: - First chunk is missing - First chunk doesn't have SSE-S3 type - First chunk has empty SSE metadata - Deserialization fails * Address review: improve comment to reflect full scope of chunk creation * Address review: fail explicitly if baseIV is empty for SSE-S3 chunk encryption If DestinationIV is not set when encrypting SSE-S3 chunks, the chunk would be created without SseMetadata, causing GetObject decryption to fail later. Now fails explicitly with a clear error message. Note: calculateIVWithOffset returns ([]byte, int) not ([]byte, error) - the int is a skip amount for intra-block alignment, not an error code. * Address review: handle 0-byte files in SSE-S3 copy For 0-byte files, there are no chunks to get metadata from. Generate an IV for the object-level metadata to ensure even empty files are properly marked as SSE-S3 encrypted. Also validate that we don't have a non-empty file with no chunks (which would indicate an internal error). 
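As a minimal illustration of the 0-byte handling above: all the empty-object path needs is a fresh random IV of one AES block, so the object-level SSE-S3 metadata is complete even with nothing to encrypt. A sketch, assuming a 16-byte block size (not the exact SeaweedFS code):

    package main

    import (
        "crypto/aes"
        "crypto/rand"
        "fmt"
        "io"
    )

    // randomIV returns a fresh random IV of one AES block (16 bytes),
    // suitable for the object-level metadata of an empty object.
    func randomIV() ([]byte, error) {
        iv := make([]byte, aes.BlockSize)
        if _, err := io.ReadFull(rand.Reader, iv); err != nil {
            return nil, fmt.Errorf("generate IV: %w", err)
        }
        return iv, nil
    }

    func main() {
        iv, err := randomIV()
        if err != nil {
            panic(err)
        }
        fmt.Printf("%x\n", iv)
    }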
--- test/s3/sse/s3_sse_integration_test.go | 72 ++++++++++++++ weed/s3api/s3api_object_handlers_copy.go | 119 +++++++++++++++++++++-- weed/s3api/s3api_object_handlers_copy_unified.go | 108 ++------------------ weed/s3api/s3api_streaming_copy.go | 60 ++++++++++-- 4 files changed, 244 insertions(+), 115 deletions(-) diff --git a/test/s3/sse/s3_sse_integration_test.go b/test/s3/sse/s3_sse_integration_test.go index 7b939ea76..4b7eb0ddc 100644 --- a/test/s3/sse/s3_sse_integration_test.go +++ b/test/s3/sse/s3_sse_integration_test.go @@ -2082,6 +2082,78 @@ func TestCopyToBucketDefaultEncryptedRegression(t *testing.T) { require.NoError(t, err, "Failed to read object") assertDataEqual(t, testData, data, "Data mismatch") }) + + t.Run("LargeFileCopyEncrypted_ToTemp_ToEncrypted", func(t *testing.T) { + // Test with large file (1MB) to exercise chunk-by-chunk copy path + // This verifies consistent behavior with SSE-C and SSE-KMS + largeTestData := generateTestData(1024 * 1024) // 1MB + objectKey := "large-file-test.bin" + + // Step 1: Upload large object to source bucket (will be automatically encrypted) + _, err = client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(srcBucket), + Key: aws.String(objectKey), + Body: bytes.NewReader(largeTestData), + }) + require.NoError(t, err, "Failed to upload large file to source bucket") + + // Verify source object is encrypted + srcHead, err := client.HeadObject(ctx, &s3.HeadObjectInput{ + Bucket: aws.String(srcBucket), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Failed to HEAD source object") + assert.Equal(t, types.ServerSideEncryptionAes256, srcHead.ServerSideEncryption, + "Source object should be SSE-S3 encrypted") + + // Step 2: Copy to temp bucket (unencrypted) - exercises chunk-by-chunk decrypt + _, err = client.CopyObject(ctx, &s3.CopyObjectInput{ + Bucket: aws.String(tempBucket), + Key: aws.String(objectKey), + CopySource: aws.String(fmt.Sprintf("%s/%s", srcBucket, objectKey)), + }) + require.NoError(t, err, "Failed to copy large file to temp bucket") + + // Verify temp object is unencrypted and data is correct + tempGet, err := client.GetObject(ctx, &s3.GetObjectInput{ + Bucket: aws.String(tempBucket), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Failed to GET temp object") + tempData, err := io.ReadAll(tempGet.Body) + tempGet.Body.Close() + require.NoError(t, err, "Failed to read temp object") + assertDataEqual(t, largeTestData, tempData, "Temp object data mismatch after decrypt") + + // Step 3: Copy from temp bucket to dest bucket (with default encryption) + // This exercises chunk-by-chunk encrypt copy + _, err = client.CopyObject(ctx, &s3.CopyObjectInput{ + Bucket: aws.String(dstBucket), + Key: aws.String(objectKey), + CopySource: aws.String(fmt.Sprintf("%s/%s", tempBucket, objectKey)), + }) + require.NoError(t, err, "Failed to copy large file to destination bucket") + + // Verify destination object is encrypted + dstHead, err := client.HeadObject(ctx, &s3.HeadObjectInput{ + Bucket: aws.String(dstBucket), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Failed to HEAD destination object") + assert.Equal(t, types.ServerSideEncryptionAes256, dstHead.ServerSideEncryption, + "Destination object should be SSE-S3 encrypted via bucket default") + + // Verify destination object content is correct after re-encryption + dstGet, err := client.GetObject(ctx, &s3.GetObjectInput{ + Bucket: aws.String(dstBucket), + Key: aws.String(objectKey), + }) + require.NoError(t, err, "Failed to GET destination 
object") + dstData, err := io.ReadAll(dstGet.Body) + dstGet.Body.Close() + require.NoError(t, err, "Failed to read destination object") + assertDataEqual(t, largeTestData, dstData, "Large file data mismatch after re-encryption") + }) } // REGRESSION TESTS FOR CRITICAL BUGS FIXED diff --git a/weed/s3api/s3api_object_handlers_copy.go b/weed/s3api/s3api_object_handlers_copy.go index 0c465d3db..09d009372 100644 --- a/weed/s3api/s3api_object_handlers_copy.go +++ b/weed/s3api/s3api_object_handlers_copy.go @@ -199,7 +199,9 @@ func (s3a *S3ApiServer) CopyObjectHandler(w http.ResponseWriter, r *http.Request } // Process metadata and tags and apply to destination - processedMetadata, tagErr := processMetadataBytes(r.Header, entry.Extended, replaceMeta, replaceTagging) + // Use dstEntry.Extended (already filtered) as the source, not entry.Extended, + // to preserve the encryption header filtering. Fixes GitHub #7562. + processedMetadata, tagErr := processMetadataBytes(r.Header, dstEntry.Extended, replaceMeta, replaceTagging) if tagErr != nil { s3err.WriteErrorResponse(w, r, s3err.ErrInvalidCopySource) return @@ -1522,7 +1524,7 @@ func (s3a *S3ApiServer) copyMultipartSSECChunk(chunk *filer_pb.FileChunk, copySo } // copyMultipartCrossEncryption handles all cross-encryption and decrypt-only copy scenarios -// This unified function supports: SSE-C↔SSE-KMS, SSE-C→Plain, SSE-KMS→Plain +// This unified function supports: SSE-C↔SSE-KMS↔SSE-S3, and any→Plain func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *http.Request, state *EncryptionState, dstBucket, dstPath string) ([]*filer_pb.FileChunk, map[string][]byte, error) { var dstChunks []*filer_pb.FileChunk @@ -1531,6 +1533,7 @@ func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *h var destKMSKeyID string var destKMSEncryptionContext map[string]string var destKMSBucketKeyEnabled bool + var destSSES3Key *SSES3Key if state.DstSSEC { var err error @@ -1544,7 +1547,13 @@ func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *h if err != nil { return nil, nil, fmt.Errorf("failed to parse destination SSE-KMS headers: %w", err) } - } else { + } else if state.DstSSES3 { + // Generate SSE-S3 key for destination + var err error + destSSES3Key, err = GenerateSSES3Key() + if err != nil { + return nil, nil, fmt.Errorf("failed to generate SSE-S3 key: %w", err) + } } // Parse source encryption parameters @@ -1563,12 +1572,18 @@ func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *h var err error if chunk.GetSseType() == filer_pb.SSEType_SSE_C { - copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, sourceSSECKey, destSSECKey, destKMSKeyID, destKMSEncryptionContext, destKMSBucketKeyEnabled, dstPath, dstBucket, state) + copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, sourceSSECKey, destSSECKey, destKMSKeyID, destKMSEncryptionContext, destKMSBucketKeyEnabled, destSSES3Key, dstPath, dstBucket, state) } else if chunk.GetSseType() == filer_pb.SSEType_SSE_KMS { - copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, nil, destSSECKey, destKMSKeyID, destKMSEncryptionContext, destKMSBucketKeyEnabled, dstPath, dstBucket, state) + copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, nil, destSSECKey, destKMSKeyID, destKMSEncryptionContext, destKMSBucketKeyEnabled, destSSES3Key, dstPath, dstBucket, state) + } else if chunk.GetSseType() == filer_pb.SSEType_SSE_S3 { + copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, nil, destSSECKey, destKMSKeyID, 
destKMSEncryptionContext, destKMSBucketKeyEnabled, destSSES3Key, dstPath, dstBucket, state) } else { - // Unencrypted chunk, copy directly - copiedChunk, err = s3a.copySingleChunk(chunk, dstPath) + // Unencrypted chunk - may need encryption if destination requires it + if state.DstSSEC || state.DstSSEKMS || state.DstSSES3 { + copiedChunk, err = s3a.copyCrossEncryptionChunk(chunk, nil, destSSECKey, destKMSKeyID, destKMSEncryptionContext, destKMSBucketKeyEnabled, destSSES3Key, dstPath, dstBucket, state) + } else { + copiedChunk, err = s3a.copySingleChunk(chunk, dstPath) + } } if err != nil { @@ -1619,6 +1634,40 @@ func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *h } else { glog.Errorf("Failed to serialize SSE-KMS metadata: %v", serErr) } + } else if state.DstSSES3 && destSSES3Key != nil { + // For SSE-S3 destination, create object-level metadata + var sses3Metadata *SSES3Key + if len(dstChunks) == 0 { + // Handle 0-byte files - generate IV for metadata even though there's no content to encrypt + if entry.Attributes.FileSize != 0 { + return nil, nil, fmt.Errorf("internal error: no chunks created for non-empty SSE-S3 destination object") + } + // Generate IV for 0-byte object metadata + iv := make([]byte, s3_constants.AESBlockSize) + if _, err := io.ReadFull(rand.Reader, iv); err != nil { + return nil, nil, fmt.Errorf("generate IV for 0-byte object: %w", err) + } + destSSES3Key.IV = iv + sses3Metadata = destSSES3Key + } else { + // For non-empty objects, use the first chunk's metadata + if dstChunks[0].GetSseType() != filer_pb.SSEType_SSE_S3 || len(dstChunks[0].GetSseMetadata()) == 0 { + return nil, nil, fmt.Errorf("internal error: first chunk is missing expected SSE-S3 metadata for destination object") + } + keyManager := GetSSES3KeyManager() + var err error + sses3Metadata, err = DeserializeSSES3Metadata(dstChunks[0].GetSseMetadata(), keyManager) + if err != nil { + return nil, nil, fmt.Errorf("failed to deserialize SSE-S3 metadata from first chunk: %w", err) + } + } + // Use the derived key with its IV for object-level metadata + keyData, serErr := SerializeSSES3Metadata(sses3Metadata) + if serErr != nil { + return nil, nil, fmt.Errorf("failed to serialize SSE-S3 metadata: %w", serErr) + } + dstMetadata[s3_constants.SeaweedFSSSES3Key] = keyData + dstMetadata[s3_constants.AmzServerSideEncryption] = []byte("AES256") } // For unencrypted destination, no metadata needed (dstMetadata remains empty) @@ -1626,7 +1675,7 @@ func (s3a *S3ApiServer) copyMultipartCrossEncryption(entry *filer_pb.Entry, r *h } // copyCrossEncryptionChunk handles copying a single chunk with cross-encryption support -func (s3a *S3ApiServer) copyCrossEncryptionChunk(chunk *filer_pb.FileChunk, sourceSSECKey *SSECustomerKey, destSSECKey *SSECustomerKey, destKMSKeyID string, destKMSEncryptionContext map[string]string, destKMSBucketKeyEnabled bool, dstPath, dstBucket string, state *EncryptionState) (*filer_pb.FileChunk, error) { +func (s3a *S3ApiServer) copyCrossEncryptionChunk(chunk *filer_pb.FileChunk, sourceSSECKey *SSECustomerKey, destSSECKey *SSECustomerKey, destKMSKeyID string, destKMSEncryptionContext map[string]string, destKMSBucketKeyEnabled bool, destSSES3Key *SSES3Key, dstPath, dstBucket string, state *EncryptionState) (*filer_pb.FileChunk, error) { // Create destination chunk dstChunk := s3a.createDestinationChunk(chunk, chunk.Offset, chunk.Size) @@ -1726,6 +1775,30 @@ func (s3a *S3ApiServer) copyCrossEncryptionChunk(chunk *filer_pb.FileChunk, sour previewLen = len(finalData) } + } else 
if chunk.GetSseType() == filer_pb.SSEType_SSE_S3 { + // Decrypt SSE-S3 source + if len(chunk.GetSseMetadata()) == 0 { + return nil, fmt.Errorf("SSE-S3 chunk missing per-chunk metadata") + } + + keyManager := GetSSES3KeyManager() + sourceSSEKey, err := DeserializeSSES3Metadata(chunk.GetSseMetadata(), keyManager) + if err != nil { + return nil, fmt.Errorf("failed to deserialize SSE-S3 metadata: %w", err) + } + + decryptedReader, decErr := CreateSSES3DecryptedReader(bytes.NewReader(encryptedData), sourceSSEKey, sourceSSEKey.IV) + if decErr != nil { + return nil, fmt.Errorf("create SSE-S3 decrypted reader: %w", decErr) + } + + decryptedData, readErr := io.ReadAll(decryptedReader) + if readErr != nil { + return nil, fmt.Errorf("decrypt SSE-S3 chunk data: %w", readErr) + } + finalData = decryptedData + glog.V(4).Infof("Decrypted SSE-S3 chunk, size: %d", len(finalData)) + } else { // Source is unencrypted finalData = encryptedData @@ -1787,6 +1860,36 @@ func (s3a *S3ApiServer) copyCrossEncryptionChunk(chunk *filer_pb.FileChunk, sour dstChunk.SseMetadata = kmsMetadata glog.V(4).Infof("Re-encrypted chunk with SSE-KMS") + + } else if state.DstSSES3 && destSSES3Key != nil { + // Encrypt with SSE-S3 + encryptedReader, iv, encErr := CreateSSES3EncryptedReader(bytes.NewReader(finalData), destSSES3Key) + if encErr != nil { + return nil, fmt.Errorf("create SSE-S3 encrypted reader: %w", encErr) + } + + reencryptedData, readErr := io.ReadAll(encryptedReader) + if readErr != nil { + return nil, fmt.Errorf("re-encrypt with SSE-S3: %w", readErr) + } + finalData = reencryptedData + + // Create per-chunk SSE-S3 metadata with chunk-specific IV + chunkSSEKey := &SSES3Key{ + Key: destSSES3Key.Key, + KeyID: destSSES3Key.KeyID, + Algorithm: destSSES3Key.Algorithm, + IV: iv, + } + sses3Metadata, err := SerializeSSES3Metadata(chunkSSEKey) + if err != nil { + return nil, fmt.Errorf("serialize SSE-S3 metadata: %w", err) + } + + dstChunk.SseType = filer_pb.SSEType_SSE_S3 + dstChunk.SseMetadata = sses3Metadata + + glog.V(4).Infof("Re-encrypted chunk with SSE-S3") } // For unencrypted destination, finalData remains as decrypted plaintext diff --git a/weed/s3api/s3api_object_handlers_copy_unified.go b/weed/s3api/s3api_object_handlers_copy_unified.go index 255c3eb2d..f1b4ff280 100644 --- a/weed/s3api/s3api_object_handlers_copy_unified.go +++ b/weed/s3api/s3api_object_handlers_copy_unified.go @@ -1,7 +1,6 @@ package s3api import ( - "context" "errors" "fmt" "net/http" @@ -133,9 +132,9 @@ func (s3a *S3ApiServer) executeEncryptCopy(entry *filer_pb.Entry, r *http.Reques } if state.DstSSES3 { - // Use streaming copy for SSE-S3 encryption - chunks, err := s3a.executeStreamingReencryptCopy(entry, r, state, dstPath) - return chunks, nil, err + // Use chunk-by-chunk copy for SSE-S3 encryption (consistent with SSE-C and SSE-KMS) + glog.V(2).Infof("Plain→SSE-S3 copy: using unified multipart encrypt copy") + return s3a.copyMultipartCrossEncryption(entry, r, state, dstBucket, dstPath) } return nil, nil, fmt.Errorf("unknown target encryption type") @@ -143,30 +142,18 @@ func (s3a *S3ApiServer) executeEncryptCopy(entry *filer_pb.Entry, r *http.Reques // executeDecryptCopy handles encrypted → plain copies func (s3a *S3ApiServer) executeDecryptCopy(entry *filer_pb.Entry, r *http.Request, state *EncryptionState, dstPath string) ([]*filer_pb.FileChunk, map[string][]byte, error) { - // Use unified multipart-aware decrypt copy for all encryption types - if state.SrcSSEC || state.SrcSSEKMS { + // Use unified multipart-aware decrypt copy for all 
encryption types (consistent chunk-by-chunk) + if state.SrcSSEC || state.SrcSSEKMS || state.SrcSSES3 { glog.V(2).Infof("Encrypted→Plain copy: using unified multipart decrypt copy") return s3a.copyMultipartCrossEncryption(entry, r, state, "", dstPath) } - if state.SrcSSES3 { - // Use streaming copy for SSE-S3 decryption - chunks, err := s3a.executeStreamingReencryptCopy(entry, r, state, dstPath) - return chunks, nil, err - } - return nil, nil, fmt.Errorf("unknown source encryption type") } // executeReencryptCopy handles encrypted → encrypted copies with different keys/methods func (s3a *S3ApiServer) executeReencryptCopy(entry *filer_pb.Entry, r *http.Request, state *EncryptionState, dstBucket, dstPath string) ([]*filer_pb.FileChunk, map[string][]byte, error) { - // Check if we should use streaming copy for better performance - if s3a.shouldUseStreamingCopy(entry, state) { - chunks, err := s3a.executeStreamingReencryptCopy(entry, r, state, dstPath) - return chunks, nil, err - } - - // Fallback to chunk-by-chunk approach for compatibility + // Use chunk-by-chunk approach for all cross-encryption scenarios (consistent behavior) if state.SrcSSEC && state.DstSSEC { return s3a.copyChunksWithSSEC(entry, r) } @@ -177,83 +164,8 @@ func (s3a *S3ApiServer) executeReencryptCopy(entry *filer_pb.Entry, r *http.Requ return chunks, dstMetadata, err } - if state.SrcSSEC && state.DstSSEKMS { - // SSE-C → SSE-KMS: use unified multipart-aware cross-encryption copy - glog.V(2).Infof("SSE-C→SSE-KMS cross-encryption copy: using unified multipart copy") - return s3a.copyMultipartCrossEncryption(entry, r, state, dstBucket, dstPath) - } - - if state.SrcSSEKMS && state.DstSSEC { - // SSE-KMS → SSE-C: use unified multipart-aware cross-encryption copy - glog.V(2).Infof("SSE-KMS→SSE-C cross-encryption copy: using unified multipart copy") - return s3a.copyMultipartCrossEncryption(entry, r, state, dstBucket, dstPath) - } - - // Handle SSE-S3 cross-encryption scenarios - if state.SrcSSES3 || state.DstSSES3 { - // Any scenario involving SSE-S3 uses streaming copy - chunks, err := s3a.executeStreamingReencryptCopy(entry, r, state, dstPath) - return chunks, nil, err - } - - return nil, nil, fmt.Errorf("unsupported cross-encryption scenario") -} - -// shouldUseStreamingCopy determines if streaming copy should be used -func (s3a *S3ApiServer) shouldUseStreamingCopy(entry *filer_pb.Entry, state *EncryptionState) bool { - // Use streaming copy for large files or when beneficial - fileSize := entry.Attributes.FileSize - - // Use streaming for files larger than 10MB - if fileSize > 10*1024*1024 { - return true - } - - // Check if this is a multipart encrypted object - isMultipartEncrypted := false - if state.IsSourceEncrypted() { - encryptedChunks := 0 - for _, chunk := range entry.GetChunks() { - if chunk.GetSseType() != filer_pb.SSEType_NONE { - encryptedChunks++ - } - } - isMultipartEncrypted = encryptedChunks > 1 - } - - // For multipart encrypted objects, avoid streaming copy to use per-chunk metadata approach - if isMultipartEncrypted { - glog.V(3).Infof("Multipart encrypted object detected, using chunk-by-chunk approach") - return false - } - - // Use streaming for cross-encryption scenarios (for single-part objects only) - if state.IsSourceEncrypted() && state.IsTargetEncrypted() { - srcType := s3a.getEncryptionTypeString(state.SrcSSEC, state.SrcSSEKMS, state.SrcSSES3) - dstType := s3a.getEncryptionTypeString(state.DstSSEC, state.DstSSEKMS, state.DstSSES3) - if srcType != dstType { - return true - } - } - - // Use 
streaming for compressed files - if isCompressedEntry(entry) { - return true - } - - // Use streaming for SSE-S3 scenarios (always) - if state.SrcSSES3 || state.DstSSES3 { - return true - } - - return false -} - -// executeStreamingReencryptCopy performs streaming re-encryption copy -func (s3a *S3ApiServer) executeStreamingReencryptCopy(entry *filer_pb.Entry, r *http.Request, state *EncryptionState, dstPath string) ([]*filer_pb.FileChunk, error) { - // Create streaming copy manager - streamingManager := NewStreamingCopyManager(s3a) - - // Execute streaming copy - return streamingManager.ExecuteStreamingCopy(context.Background(), entry, r, dstPath, state) + // All other cross-encryption scenarios use unified multipart copy + // This includes: SSE-C↔SSE-KMS, SSE-C↔SSE-S3, SSE-KMS↔SSE-S3, SSE-S3↔SSE-S3 + glog.V(2).Infof("Cross-encryption copy: using unified multipart copy") + return s3a.copyMultipartCrossEncryption(entry, r, state, dstBucket, dstPath) } diff --git a/weed/s3api/s3api_streaming_copy.go b/weed/s3api/s3api_streaming_copy.go index 457986858..94729c003 100644 --- a/weed/s3api/s3api_streaming_copy.go +++ b/weed/s3api/s3api_streaming_copy.go @@ -59,18 +59,19 @@ func NewStreamingCopyManager(s3a *S3ApiServer) *StreamingCopyManager { } } -// ExecuteStreamingCopy performs a streaming copy operation -func (scm *StreamingCopyManager) ExecuteStreamingCopy(ctx context.Context, entry *filer_pb.Entry, r *http.Request, dstPath string, state *EncryptionState) ([]*filer_pb.FileChunk, error) { +// ExecuteStreamingCopy performs a streaming copy operation and returns the encryption spec +// The encryption spec is needed for SSE-S3 to properly set destination metadata (fixes GitHub #7562) +func (scm *StreamingCopyManager) ExecuteStreamingCopy(ctx context.Context, entry *filer_pb.Entry, r *http.Request, dstPath string, state *EncryptionState) ([]*filer_pb.FileChunk, *EncryptionSpec, error) { // Create streaming copy specification spec, err := scm.createStreamingSpec(entry, r, state) if err != nil { - return nil, fmt.Errorf("create streaming spec: %w", err) + return nil, nil, fmt.Errorf("create streaming spec: %w", err) } // Create source reader from entry sourceReader, err := scm.createSourceReader(entry) if err != nil { - return nil, fmt.Errorf("create source reader: %w", err) + return nil, nil, fmt.Errorf("create source reader: %w", err) } defer sourceReader.Close() @@ -79,11 +80,16 @@ func (scm *StreamingCopyManager) ExecuteStreamingCopy(ctx context.Context, entry // Create processing pipeline processedReader, err := scm.createProcessingPipeline(spec) if err != nil { - return nil, fmt.Errorf("create processing pipeline: %w", err) + return nil, nil, fmt.Errorf("create processing pipeline: %w", err) } // Stream to destination - return scm.streamToDestination(ctx, processedReader, spec, dstPath) + chunks, err := scm.streamToDestination(ctx, processedReader, spec, dstPath) + if err != nil { + return nil, nil, err + } + + return chunks, spec.EncryptionSpec, nil } // createStreamingSpec creates a streaming specification based on copy parameters @@ -453,8 +459,8 @@ func (scm *StreamingCopyManager) streamToChunks(ctx context.Context, reader io.R for { n, err := reader.Read(buffer) if n > 0 { - // Create chunk for this data - chunk, chunkErr := scm.createChunkFromData(buffer[:n], offset, dstPath) + // Create chunk for this data, setting SSE type and per-chunk metadata (including chunk-specific IVs for SSE-S3) + chunk, chunkErr := scm.createChunkFromData(buffer[:n], offset, dstPath, spec.EncryptionSpec) if 
chunkErr != nil { return nil, fmt.Errorf("create chunk from data: %w", chunkErr) } @@ -474,7 +480,7 @@ func (scm *StreamingCopyManager) streamToChunks(ctx context.Context, reader io.R } // createChunkFromData creates a chunk from streaming data -func (scm *StreamingCopyManager) createChunkFromData(data []byte, offset int64, dstPath string) (*filer_pb.FileChunk, error) { +func (scm *StreamingCopyManager) createChunkFromData(data []byte, offset int64, dstPath string, encSpec *EncryptionSpec) (*filer_pb.FileChunk, error) { // Assign new volume assignResult, err := scm.s3a.assignNewVolume(dstPath) if err != nil { @@ -487,6 +493,42 @@ func (scm *StreamingCopyManager) createChunkFromData(data []byte, offset int64, Size: uint64(len(data)), } + // Set SSE type and metadata on chunk if destination is encrypted + // This is critical for GetObject to know to decrypt the data - fixes GitHub #7562 + if encSpec != nil && encSpec.NeedsEncryption { + switch encSpec.DestinationType { + case EncryptionTypeSSEC: + chunk.SseType = filer_pb.SSEType_SSE_C + // SSE-C metadata is handled at object level, not per-chunk for streaming copy + case EncryptionTypeSSEKMS: + chunk.SseType = filer_pb.SSEType_SSE_KMS + // SSE-KMS metadata is handled at object level, not per-chunk for streaming copy + case EncryptionTypeSSES3: + chunk.SseType = filer_pb.SSEType_SSE_S3 + // Create per-chunk SSE-S3 metadata with chunk-specific IV + if sseKey, ok := encSpec.DestinationKey.(*SSES3Key); ok { + // Calculate chunk-specific IV using base IV and chunk offset + baseIV := encSpec.DestinationIV + if len(baseIV) == 0 { + return nil, fmt.Errorf("SSE-S3 encryption requires DestinationIV to be set for chunk at offset %d", offset) + } + chunkIV, _ := calculateIVWithOffset(baseIV, offset) + // Create chunk key with the chunk-specific IV + chunkSSEKey := &SSES3Key{ + Key: sseKey.Key, + KeyID: sseKey.KeyID, + Algorithm: sseKey.Algorithm, + IV: chunkIV, + } + chunkMetadata, serErr := SerializeSSES3Metadata(chunkSSEKey) + if serErr != nil { + return nil, fmt.Errorf("failed to serialize chunk SSE-S3 metadata: %w", serErr) + } + chunk.SseMetadata = chunkMetadata + } + } + } + // Set file ID if err := scm.s3a.setChunkFileId(chunk, assignResult); err != nil { return nil, err -- cgit v1.2.3 From ee775825bcd46911adeb811241bad56836dab05f Mon Sep 17 00:00:00 2001 From: Lisandro Pin Date: Tue, 2 Dec 2025 18:29:27 +0100 Subject: Parallelize read-only volume check pass for `volume.check.disk`. 
(#7602) --- weed/shell/command_volume_check_disk.go | 52 ++++++++++++++++++--------------- weed/shell/common.go | 8 +++++ 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/weed/shell/command_volume_check_disk.go b/weed/shell/command_volume_check_disk.go index d7b015979..78b2486dd 100644 --- a/weed/shell/command_volume_check_disk.go +++ b/weed/shell/command_volume_check_disk.go @@ -40,6 +40,8 @@ type volumeCheckDisk struct { syncDeletions bool fixReadOnly bool nonRepairThreshold float64 + + ewg *ErrorWaitGroup } func (c *commandVolumeCheckDisk) Name() string { @@ -92,6 +94,7 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write applyChangesAlias := fsckCommand.Bool("force", false, "apply the fix (alias for -apply)") fixReadOnly := fsckCommand.Bool("fixReadOnly", false, "apply the fix even on readonly volumes (EXPERIMENTAL!)") syncDeletions := fsckCommand.Bool("syncDeleted", false, "sync of deletions the fix") + maxParallelization := fsckCommand.Int("maxParallelization", DefaultMaxParallelization, "run up to X tasks in parallel, whenever possible") nonRepairThreshold := fsckCommand.Float64("nonRepairThreshold", 0.3, "repair when missing keys is not more than this limit") if err = fsckCommand.Parse(args); err != nil { return nil @@ -115,6 +118,8 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write syncDeletions: *syncDeletions, fixReadOnly: *fixReadOnly, nonRepairThreshold: *nonRepairThreshold, + + ewg: NewErrorWaitGroup(*maxParallelization), } // collect topology information @@ -137,11 +142,9 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write if err := vcd.checkWritableVolumes(volumeReplicas); err != nil { return err } - if err := vcd.checkReadOnlyVolumes(volumeReplicas); err != nil { - return err - } + vcd.checkReadOnlyVolumes(volumeReplicas) - return nil + return vcd.ewg.Wait() } // checkWritableVolumes fixes volume replicas which are not read-only. @@ -228,9 +231,9 @@ func (vcd *volumeCheckDisk) makeVolumeReadonly(vid uint32, vr *VolumeReplica) er return nil } -func (vcd *volumeCheckDisk) checkReadOnlyVolumes(volumeReplicas map[uint32][]*VolumeReplica) error { +func (vcd *volumeCheckDisk) checkReadOnlyVolumes(volumeReplicas map[uint32][]*VolumeReplica) { if !vcd.fixReadOnly { - return nil + return } vcd.write("Pass #2 (read-only volumes)\n") @@ -261,35 +264,38 @@ func (vcd *volumeCheckDisk) checkReadOnlyVolumes(volumeReplicas map[uint32][]*Vo skip, err := vcd.shouldSkipVolume(r, source) if err != nil { - vcd.write("error checking if volume %d should be skipped: %v\n", r.info.Id, err) + vcd.ewg.AddErrorf("failed to check if volume %d should be skipped: %v\n", r.info.Id, err) continue } if skip { continue } - // make volume writable... - if err := vcd.makeVolumeWritable(vid, r); err != nil { - return err - } + vcd.ewg.Add(func() error { + // make volume writable... + if err := vcd.makeVolumeWritable(vid, r); err != nil { + return err + } + + // ...fix it... + // TODO: test whether syncTwoReplicas() is enough to prune garbage entries on broken volumes. + if err := vcd.syncTwoReplicas(source, r, false); err != nil { + vcd.write("sync read-only volume %d on %s from %s: %v\n", vid, r.location.dataNode.Id, source.location.dataNode.Id, err) - // ...fix it... - // TODO: test whether syncTwoReplicas() is enough to prune garbage entries on broken volumes. 
-		if err := vcd.syncTwoReplicas(source, r, false); err != nil {
-			vcd.write("sync read-only volume %d on %s from %s: %v\n", vid, r.location.dataNode.Id, source.location.dataNode.Id, err)
+			// ...or revert it back to read-only, if something went wrong.
+			// TODO: we should keep unchanged volumes as read-only, so we don't modify valid volumes which are full.
+			if roErr := vcd.makeVolumeReadonly(vid, r); roErr != nil {
+				return fmt.Errorf("failed to make volume %d on %s readonly after: %v: %v", vid, r.location.dataNode.Id, err, roErr)
+			}
+			vcd.write("volume %d on %s is now read-only\n", vid, r.location.dataNode.Id)

-			// ...or revert it back to read-only, if something went wrong.
-			if roErr := vcd.makeVolumeReadonly(vid, r); roErr != nil {
-				return fmt.Errorf("failed to make volume %d on %s readonly after: %v: %v", vid, r.location.dataNode.Id, err, roErr)
+				return err
 			}
-			vcd.write("volume %d on %s is now read-only\n", vid, r.location.dataNode.Id)
-			return err
-		}
+
+			return nil
+		})
 	}
 }
-
-	return nil
}

 func (vcd *volumeCheckDisk) grpcDialOption() grpc.DialOption {
diff --git a/weed/shell/common.go b/weed/shell/common.go
index 43571176e..cb2df5828 100644
--- a/weed/shell/common.go
+++ b/weed/shell/common.go
@@ -2,6 +2,7 @@ package shell

 import (
 	"errors"
+	"fmt"
 	"sync"
 )

@@ -64,6 +65,13 @@ func (ewg *ErrorWaitGroup) Add(f ErrorWaitGroupTask) {
 	}()
 }

+// AddErrorf adds an error to an ErrorWaitGroupTask result, without queueing any goroutines.
+func (ewg *ErrorWaitGroup) AddErrorf(format string, a ...interface{}) {
+	ewg.errorsMu.Lock()
+	ewg.errors = append(ewg.errors, fmt.Errorf(format, a...))
+	ewg.errorsMu.Unlock()
+}
+
 // Wait sleeps until all ErrorWaitGroupTasks are completed, then returns errors for them.
 func (ewg *ErrorWaitGroup) Wait() error {
 	ewg.wg.Wait()
-- cgit v1.2.3

From ebb06a3908990c31dcfb4995a69682d836228179 Mon Sep 17 00:00:00 2001
From: Lisandro Pin
Date: Tue, 2 Dec 2025 19:14:24 +0100
Subject: Mutex command output writes for `volume.check.disk`. (#7605)

Prevents potential screen garbling when operations are parallelized. Also simplifies logging by automatically adding newlines on output, if necessary.

Co-authored-by: Chris Lu
---
 weed/shell/command_volume_check_disk.go      | 44 ++++++++++++++++------------
 weed/shell/command_volume_check_disk_test.go |  6 ++--
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/weed/shell/command_volume_check_disk.go b/weed/shell/command_volume_check_disk.go
index 78b2486dd..dbb64e239 100644
--- a/weed/shell/command_volume_check_disk.go
+++ b/weed/shell/command_volume_check_disk.go
@@ -10,6 +10,8 @@ import (
 	"math"
 	"math/rand/v2"
 	"net/http"
+	"strings"
+	"sync"
 	"time"

 	"slices"
@@ -32,6 +34,7 @@ type commandVolumeCheckDisk struct{}
 type volumeCheckDisk struct {
 	commandEnv *CommandEnv
 	writer     io.Writer
+	writerMu   sync.Mutex
 	now        time.Time

 	slowMode bool
@@ -149,14 +152,14 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write

 // checkWritableVolumes fixes volume replicas which are not read-only.
func (vcd *volumeCheckDisk) checkWritableVolumes(volumeReplicas map[uint32][]*VolumeReplica) error { - vcd.write("Pass #1 (writable volumes)\n") + vcd.write("Pass #1 (writable volumes)") for _, replicas := range volumeReplicas { // filter readonly replica var writableReplicas []*VolumeReplica for _, replica := range replicas { if replica.info.ReadOnly { - vcd.write("skipping readonly volume %d on %s\n", replica.info.Id, replica.location.dataNode.Id) + vcd.write("skipping readonly volume %d on %s", replica.info.Id, replica.location.dataNode.Id) } else { writableReplicas = append(writableReplicas, replica) } @@ -169,7 +172,7 @@ func (vcd *volumeCheckDisk) checkWritableVolumes(volumeReplicas map[uint32][]*Vo a, b := writableReplicas[0], writableReplicas[1] shouldSkip, err := vcd.shouldSkipVolume(a, b) if err != nil { - vcd.write("error checking if volume %d should be skipped: %v\n", a.info.Id, err) + vcd.write("error checking if volume %d should be skipped: %v", a.info.Id, err) // Continue with sync despite error to be safe } else if shouldSkip { // always choose the larger volume to be the source @@ -177,7 +180,7 @@ func (vcd *volumeCheckDisk) checkWritableVolumes(volumeReplicas map[uint32][]*Vo continue } if err := vcd.syncTwoReplicas(a, b, true); err != nil { - vcd.write("sync volume %d on %s and %s: %v\n", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, err) + vcd.write("sync volume %d on %s and %s: %v", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, err) } // always choose the larger volume to be the source if a.info.FileCount > b.info.FileCount { @@ -207,7 +210,7 @@ func (vcd *volumeCheckDisk) makeVolumeWritable(vid uint32, vr *VolumeReplica) er return err } - vcd.write("volume %d on %s is now writable\n", vid, vr.location.dataNode.Id) + vcd.write("volume %d on %s is now writable", vid, vr.location.dataNode.Id) return nil } @@ -227,7 +230,7 @@ func (vcd *volumeCheckDisk) makeVolumeReadonly(vid uint32, vr *VolumeReplica) er return err } - vcd.write("volume %d on %s is now read-only\n", vid, vr.location.dataNode.Id) + vcd.write("volume %d on %s is now read-only", vid, vr.location.dataNode.Id) return nil } @@ -235,7 +238,7 @@ func (vcd *volumeCheckDisk) checkReadOnlyVolumes(volumeReplicas map[uint32][]*Vo if !vcd.fixReadOnly { return } - vcd.write("Pass #2 (read-only volumes)\n") + vcd.write("Pass #2 (read-only volumes)") for vid, replicas := range volumeReplicas { roReplicas := []*VolumeReplica{} @@ -249,11 +252,11 @@ func (vcd *volumeCheckDisk) checkReadOnlyVolumes(volumeReplicas map[uint32][]*Vo } } if len(roReplicas) == 0 { - vcd.write("no read-only replicas for volume %d\n", vid) + vcd.write("no read-only replicas for volume %d", vid) continue } if len(rwReplicas) == 0 { - vcd.write("got %d read-only replicas for volume %d and no writable replicas to fix from\n", len(roReplicas), vid) + vcd.write("got %d read-only replicas for volume %d and no writable replicas to fix from", len(roReplicas), vid) continue } @@ -303,12 +306,15 @@ func (vcd *volumeCheckDisk) grpcDialOption() grpc.DialOption { } func (vcd *volumeCheckDisk) write(format string, a ...any) { - fmt.Fprintf(vcd.writer, format, a...) + vcd.writerMu.Lock() + defer vcd.writerMu.Unlock() + fmt.Fprintf(vcd.writer, strings.TrimRight(format, "\r\n "), a...) + fmt.Fprint(vcd.writer, "\n") } func (vcd *volumeCheckDisk) writeVerbose(format string, a ...any) { if vcd.verbose { - fmt.Fprintf(vcd.writer, format, a...) + vcd.write(format, a...) 
} } @@ -394,7 +400,7 @@ func (vcd *volumeCheckDisk) shouldSkipVolume(a, b *VolumeReplica) (bool, error) if doSyncDeletedCount && !eqDeletedFileCount { return false, nil } - vcd.writeVerbose("skipping active volumes %d with the same file counts on %s and %s\n", + vcd.writeVerbose("skipping active volumes %d with the same file counts on %s and %s", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id) } else { return false, nil @@ -412,7 +418,7 @@ func (vcd *volumeCheckDisk) syncTwoReplicas(source, target *VolumeReplica, bidi for (sourceHasChanges || targetHasChanges) && iteration < maxIterations { iteration++ - vcd.writeVerbose("sync iteration %d/%d for volume %d\n", iteration, maxIterations, source.info.Id) + vcd.writeVerbose("sync iteration %d/%d for volume %d", iteration, maxIterations, source.info.Id) prevSourceHasChanges, prevTargetHasChanges := sourceHasChanges, targetHasChanges if sourceHasChanges, targetHasChanges, err = vcd.checkBoth(source, target, bidi); err != nil { @@ -421,14 +427,14 @@ func (vcd *volumeCheckDisk) syncTwoReplicas(source, target *VolumeReplica, bidi // Detect if we're stuck in a loop with no progress if iteration > 1 && prevSourceHasChanges == sourceHasChanges && prevTargetHasChanges == targetHasChanges && (sourceHasChanges || targetHasChanges) { - vcd.write("volume %d sync is not making progress between %s and %s after iteration %d, stopping to prevent infinite loop\n", + vcd.write("volume %d sync is not making progress between %s and %s after iteration %d, stopping to prevent infinite loop", source.info.Id, source.location.dataNode.Id, target.location.dataNode.Id, iteration) return fmt.Errorf("sync not making progress after %d iterations", iteration) } } if iteration >= maxIterations && (sourceHasChanges || targetHasChanges) { - vcd.write("volume %d sync reached maximum iterations (%d) between %s and %s, may need manual intervention\n", + vcd.write("volume %d sync reached maximum iterations (%d) between %s and %s, may need manual intervention", source.info.Id, maxIterations, source.location.dataNode.Id, target.location.dataNode.Id) return fmt.Errorf("reached maximum sync iterations (%d)", maxIterations) } @@ -518,7 +524,7 @@ func (vcd *volumeCheckDisk) doVolumeCheckDisk(minuend, subtrahend *needle_map.Me return nil }) - vcd.write("volume %d %s has %d entries, %s missed %d and partially deleted %d entries\n", + vcd.write("volume %d %s has %d entries, %s missed %d and partially deleted %d entries", source.info.Id, source.location.dataNode.Id, counter, target.location.dataNode.Id, len(missingNeedles), len(partiallyDeletedNeedles)) if counter == 0 || (len(missingNeedles) == 0 && len(partiallyDeletedNeedles) == 0) { @@ -542,7 +548,7 @@ func (vcd *volumeCheckDisk) doVolumeCheckDisk(minuend, subtrahend *needle_map.Me continue } - vcd.writeVerbose("read %s %s => %s\n", needleValue.Key.FileId(source.info.Id), source.location.dataNode.Id, target.location.dataNode.Id) + vcd.writeVerbose("read %s %s => %s", needleValue.Key.FileId(source.info.Id), source.location.dataNode.Id, target.location.dataNode.Id) hasChanges = true if err = vcd.writeNeedleBlobToTarget(pb.NewServerAddressFromDataNode(target.location.dataNode), source.info.Id, needleValue, needleBlob); err != nil { @@ -555,7 +561,7 @@ func (vcd *volumeCheckDisk) doVolumeCheckDisk(minuend, subtrahend *needle_map.Me var fidList []string for _, needleValue := range partiallyDeletedNeedles { fidList = append(fidList, needleValue.Key.FileId(source.info.Id)) - vcd.writeVerbose("delete %s %s => %s\n", 
needleValue.Key.FileId(source.info.Id), source.location.dataNode.Id, target.location.dataNode.Id) + vcd.writeVerbose("delete %s %s => %s", needleValue.Key.FileId(source.info.Id), source.location.dataNode.Id, target.location.dataNode.Id) } deleteResults := operation.DeleteFileIdsAtOneVolumeServer( pb.NewServerAddressFromDataNode(target.location.dataNode), @@ -610,7 +616,7 @@ func (vcd *volumeCheckDisk) readIndexDatabase(db *needle_map.MemDb, collection s return err } - vcd.writeVerbose("load collection %s volume %d index size %d from %s ...\n", collection, volumeId, buf.Len(), volumeServer) + vcd.writeVerbose("load collection %s volume %d index size %d from %s ...", collection, volumeId, buf.Len(), volumeServer) return db.LoadFilterFromReaderAt(bytes.NewReader(buf.Bytes()), true, false) } diff --git a/weed/shell/command_volume_check_disk_test.go b/weed/shell/command_volume_check_disk_test.go index eee9103a8..ec958fbc4 100644 --- a/weed/shell/command_volume_check_disk_test.go +++ b/weed/shell/command_volume_check_disk_test.go @@ -278,14 +278,14 @@ func TestVolumeCheckDiskHelperMethods(t *testing.T) { } // Test write method - vcd.write("test %s\n", "message") + vcd.write("test %s", "message") if buf.String() != "test message\n" { t.Errorf("write() output = %q, want %q", buf.String(), "test message\n") } // Test writeVerbose with verbose=true buf.Reset() - vcd.writeVerbose("verbose %d\n", 123) + vcd.writeVerbose("verbose %d", 123) if buf.String() != "verbose 123\n" { t.Errorf("writeVerbose() with verbose=true output = %q, want %q", buf.String(), "verbose 123\n") } @@ -293,7 +293,7 @@ func TestVolumeCheckDiskHelperMethods(t *testing.T) { // Test writeVerbose with verbose=false buf.Reset() vcd.verbose = false - vcd.writeVerbose("should not appear\n") + vcd.writeVerbose("should not appear") if buf.String() != "" { t.Errorf("writeVerbose() with verbose=false output = %q, want empty", buf.String()) } -- cgit v1.2.3 From 4f038820dc480a7374b928a17cc9121474ba4172 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Tue, 2 Dec 2025 12:30:15 -0800 Subject: Add disk-aware EC rebalancing (#7597) * Add placement package for EC shard placement logic - Consolidate EC shard placement algorithm for reuse across shell and worker tasks - Support multi-pass selection: racks, then servers, then disks - Include proper spread verification and scoring functions - Comprehensive test coverage for various cluster topologies * Make ec.balance disk-aware for multi-disk servers - Add EcDisk struct to track individual disks on volume servers - Update EcNode to maintain per-disk shard distribution - Parse disk_id from EC shard information during topology collection - Implement pickBestDiskOnNode() for selecting best disk per shard - Add diskDistributionScore() for tie-breaking node selection - Update all move operations to specify target disk in RPC calls - Improves shard balance within multi-disk servers, not just across servers * Use placement package in EC detection for consistent disk-level placement - Replace custom EC disk selection logic with shared placement package - Convert topology DiskInfo to placement.DiskCandidate format - Use SelectDestinations() for multi-rack/server/disk spreading - Convert placement results back to topology DiskInfo for task creation - Ensures EC detection uses same placement logic as shell commands * Make volume server evacuation disk-aware - Use pickBestDiskOnNode() when selecting evacuation target disk - Specify target disk in evacuation RPC requests - Maintains balanced disk distribution 
during server evacuations * Rename PlacementConfig to PlacementRequest for clarity PlacementRequest better reflects that this is a request for placement rather than a configuration object. This improves API semantics. * Rename DefaultConfig to DefaultPlacementRequest Aligns with the PlacementRequest type naming for consistency * Address review comments from Gemini and CodeRabbit Fix HIGH issues: - Fix empty disk discovery: Now discovers all disks from VolumeInfos, not just from EC shards. This ensures disks without EC shards are still considered for placement. - Fix EC shard count calculation in detection.go: Now correctly filters by DiskId and sums actual shard counts using ShardBits.ShardIdCount() instead of just counting EcShardInfo entries. Fix MEDIUM issues: - Add disk ID to evacuation log messages for consistency with other logging - Remove unused serverToDisks variable in placement.go - Fix comment that incorrectly said 'ascending' when sorting is 'descending' * add ec tests * Update ec-integration-tests.yml * Update ec_integration_test.go * Fix EC integration tests CI: build weed binary and update actions - Add 'Build weed binary' step before running tests - Update actions/setup-go from v4 to v6 (Node20 compatibility) - Update actions/checkout from v2 to v4 (Node20 compatibility) - Move working-directory to test step only * Add disk-aware EC rebalancing integration tests - Add TestDiskAwareECRebalancing test with multi-disk cluster setup - Test EC encode with disk awareness (shows disk ID in output) - Test EC balance with disk-level shard distribution - Add helper functions for disk-level verification: - startMultiDiskCluster: 3 servers x 4 disks each - countShardsPerDisk: track shards per disk per server - calculateDiskShardVariance: measure distribution balance - Verify no single disk is overloaded with shards --- .github/workflows/ec-integration-tests.yml | 41 ++ test/erasure_coding/ec_integration_test.go | 438 ++++++++++++++++- weed/shell/command_ec_common.go | 190 +++++++- weed/shell/command_volume_server_evacuate.go | 10 +- weed/storage/erasure_coding/placement/placement.go | 420 +++++++++++++++++ .../erasure_coding/placement/placement_test.go | 517 +++++++++++++++++++++ weed/worker/tasks/erasure_coding/detection.go | 130 +++--- 7 files changed, 1673 insertions(+), 73 deletions(-) create mode 100644 .github/workflows/ec-integration-tests.yml create mode 100644 weed/storage/erasure_coding/placement/placement.go create mode 100644 weed/storage/erasure_coding/placement/placement_test.go diff --git a/.github/workflows/ec-integration-tests.yml b/.github/workflows/ec-integration-tests.yml new file mode 100644 index 000000000..ea476b77c --- /dev/null +++ b/.github/workflows/ec-integration-tests.yml @@ -0,0 +1,41 @@ +name: "EC Integration Tests" + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +permissions: + contents: read + +jobs: + ec-integration-tests: + name: EC Integration Tests + runs-on: ubuntu-22.04 + timeout-minutes: 30 + steps: + - name: Set up Go 1.x + uses: actions/setup-go@v6 + with: + go-version: ^1.24 + id: go + + - name: Check out code into the Go module directory + uses: actions/checkout@v4 + + - name: Build weed binary + run: | + cd weed && go build -o weed . 
+ + - name: Run EC Integration Tests + working-directory: test/erasure_coding + run: | + go test -v + + - name: Archive logs + if: failure() + uses: actions/upload-artifact@v4 + with: + name: ec-integration-test-logs + path: test/erasure_coding \ No newline at end of file diff --git a/test/erasure_coding/ec_integration_test.go b/test/erasure_coding/ec_integration_test.go index 87b9b40ba..67f8eed04 100644 --- a/test/erasure_coding/ec_integration_test.go +++ b/test/erasure_coding/ec_integration_test.go @@ -5,9 +5,11 @@ import ( "context" "fmt" "io" + "math" "os" "os/exec" "path/filepath" + "strings" "testing" "time" @@ -139,6 +141,15 @@ func TestECEncodingVolumeLocationTimingBug(t *testing.T) { t.Logf("EC encoding completed successfully") } + // Add detailed logging for EC encoding command + t.Logf("Debug: Executing EC encoding command for volume %d", volumeId) + t.Logf("Debug: Command arguments: %v", args) + if err != nil { + t.Logf("Debug: EC encoding command failed with error: %v", err) + } else { + t.Logf("Debug: EC encoding command completed successfully") + } + // The key test: check if the fix prevents the timing issue if contains(outputStr, "Collecting volume locations") && contains(outputStr, "before EC encoding") { t.Logf("FIX DETECTED: Volume locations collected BEFORE EC encoding (timing bug prevented)") @@ -526,7 +537,8 @@ func uploadTestData(data []byte, masterAddress string) (needle.VolumeId, error) func getVolumeLocations(commandEnv *shell.CommandEnv, volumeId needle.VolumeId) ([]string, error) { // Retry mechanism to handle timing issues with volume registration - for i := 0; i < 10; i++ { + // Increase retry attempts for volume location retrieval + for i := 0; i < 20; i++ { // Increased from 10 to 20 retries locations, ok := commandEnv.MasterClient.GetLocationsClone(uint32(volumeId)) if ok { var result []string @@ -646,3 +658,427 @@ func TestECEncodingRegressionPrevention(t *testing.T) { t.Log("Timing pattern regression test passed") }) } + +// TestDiskAwareECRebalancing tests EC shard placement across multiple disks per server +// This verifies the disk-aware EC rebalancing feature works correctly +func TestDiskAwareECRebalancing(t *testing.T) { + if testing.Short() { + t.Skip("Skipping disk-aware integration test in short mode") + } + + testDir, err := os.MkdirTemp("", "seaweedfs_disk_aware_ec_test_") + require.NoError(t, err) + defer os.RemoveAll(testDir) + + ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second) + defer cancel() + + // Start cluster with MULTIPLE DISKS per volume server + cluster, err := startMultiDiskCluster(ctx, testDir) + require.NoError(t, err) + defer cluster.Stop() + + // Wait for servers to be ready + require.NoError(t, waitForServer("127.0.0.1:9334", 30*time.Second)) + for i := 0; i < 3; i++ { + require.NoError(t, waitForServer(fmt.Sprintf("127.0.0.1:809%d", i), 30*time.Second)) + } + + // Wait longer for volume servers to register with master and create volumes + t.Log("Waiting for volume servers to register with master...") + time.Sleep(10 * time.Second) + + // Create command environment + options := &shell.ShellOptions{ + Masters: stringPtr("127.0.0.1:9334"), + GrpcDialOption: grpc.WithInsecure(), + FilerGroup: stringPtr("default"), + } + commandEnv := shell.NewCommandEnv(options) + + // Connect to master with longer timeout + ctx2, cancel2 := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel2() + go commandEnv.MasterClient.KeepConnectedToMaster(ctx2) + commandEnv.MasterClient.WaitUntilConnected(ctx2) 
+ + // Wait for master client to fully sync + time.Sleep(5 * time.Second) + + // Upload test data to create a volume - retry if volumes not ready + var volumeId needle.VolumeId + testData := []byte("Disk-aware EC rebalancing test data - this needs to be large enough to create a volume") + for retry := 0; retry < 5; retry++ { + volumeId, err = uploadTestDataToMaster(testData, "127.0.0.1:9334") + if err == nil { + break + } + t.Logf("Upload attempt %d failed: %v, retrying...", retry+1, err) + time.Sleep(3 * time.Second) + } + require.NoError(t, err, "Failed to upload test data after retries") + t.Logf("Created volume %d for disk-aware EC test", volumeId) + + // Wait for volume to be registered + time.Sleep(3 * time.Second) + + t.Run("verify_multi_disk_setup", func(t *testing.T) { + // Verify that each server has multiple disk directories + for server := 0; server < 3; server++ { + diskCount := 0 + for disk := 0; disk < 4; disk++ { + diskDir := filepath.Join(testDir, fmt.Sprintf("server%d_disk%d", server, disk)) + if _, err := os.Stat(diskDir); err == nil { + diskCount++ + } + } + assert.Equal(t, 4, diskCount, "Server %d should have 4 disk directories", server) + t.Logf("Server %d has %d disk directories", server, diskCount) + } + }) + + t.Run("ec_encode_with_disk_awareness", func(t *testing.T) { + // Get lock first + lockCmd := shell.Commands[findCommandIndex("lock")] + var lockOutput bytes.Buffer + err := lockCmd.Do([]string{}, commandEnv, &lockOutput) + if err != nil { + t.Logf("Lock command failed: %v", err) + } + + // Execute EC encoding + var output bytes.Buffer + ecEncodeCmd := shell.Commands[findCommandIndex("ec.encode")] + args := []string{"-volumeId", fmt.Sprintf("%d", volumeId), "-collection", "test", "-force"} + + // Capture output + oldStdout := os.Stdout + oldStderr := os.Stderr + r, w, _ := os.Pipe() + os.Stdout = w + os.Stderr = w + + err = ecEncodeCmd.Do(args, commandEnv, &output) + + w.Close() + os.Stdout = oldStdout + os.Stderr = oldStderr + + capturedOutput, _ := io.ReadAll(r) + outputStr := string(capturedOutput) + output.String() + + t.Logf("EC encode output:\n%s", outputStr) + + if err != nil { + t.Logf("EC encoding completed with error: %v", err) + } else { + t.Logf("EC encoding completed successfully") + } + }) + + t.Run("verify_disk_level_shard_distribution", func(t *testing.T) { + // Wait for shards to be distributed + time.Sleep(2 * time.Second) + + // Count shards on each disk of each server + diskDistribution := countShardsPerDisk(testDir, uint32(volumeId)) + + totalShards := 0 + disksWithShards := 0 + maxShardsOnSingleDisk := 0 + + t.Logf("Disk-level shard distribution for volume %d:", volumeId) + for server, disks := range diskDistribution { + for diskId, shardCount := range disks { + if shardCount > 0 { + t.Logf(" %s disk %d: %d shards", server, diskId, shardCount) + totalShards += shardCount + disksWithShards++ + if shardCount > maxShardsOnSingleDisk { + maxShardsOnSingleDisk = shardCount + } + } + } + } + + t.Logf("Summary: %d total shards across %d disks (max %d on single disk)", + totalShards, disksWithShards, maxShardsOnSingleDisk) + + // EC creates 14 shards (10 data + 4 parity), plus .ecx and .ecj files + // We should see shards distributed across multiple disks + if disksWithShards > 1 { + t.Logf("PASS: Shards distributed across %d disks", disksWithShards) + } else { + t.Logf("INFO: Shards on %d disk(s) - may be expected if volume was on single disk", disksWithShards) + } + }) + + t.Run("test_ec_balance_disk_awareness", func(t *testing.T) { + // 
Calculate initial disk balance variance + initialDistribution := countShardsPerDisk(testDir, uint32(volumeId)) + initialVariance := calculateDiskShardVariance(initialDistribution) + t.Logf("Initial disk shard variance: %.2f", initialVariance) + + // Run ec.balance command + var output bytes.Buffer + ecBalanceCmd := shell.Commands[findCommandIndex("ec.balance")] + + oldStdout := os.Stdout + oldStderr := os.Stderr + r, w, _ := os.Pipe() + os.Stdout = w + os.Stderr = w + + err := ecBalanceCmd.Do([]string{"-force"}, commandEnv, &output) + + w.Close() + os.Stdout = oldStdout + os.Stderr = oldStderr + + capturedOutput, _ := io.ReadAll(r) + outputStr := string(capturedOutput) + output.String() + + if err != nil { + t.Logf("ec.balance error: %v", err) + } + t.Logf("ec.balance output:\n%s", outputStr) + + // Wait for balance to complete + time.Sleep(2 * time.Second) + + // Calculate final disk balance variance + finalDistribution := countShardsPerDisk(testDir, uint32(volumeId)) + finalVariance := calculateDiskShardVariance(finalDistribution) + t.Logf("Final disk shard variance: %.2f", finalVariance) + + t.Logf("Variance change: %.2f -> %.2f", initialVariance, finalVariance) + }) + + t.Run("verify_no_disk_overload", func(t *testing.T) { + // Verify that no single disk has too many shards of the same volume + diskDistribution := countShardsPerDisk(testDir, uint32(volumeId)) + + for server, disks := range diskDistribution { + for diskId, shardCount := range disks { + // With 14 EC shards and 12 disks (3 servers x 4 disks), ideally ~1-2 shards per disk + // Allow up to 4 shards per disk as a reasonable threshold + if shardCount > 4 { + t.Logf("WARNING: %s disk %d has %d shards (may indicate imbalance)", + server, diskId, shardCount) + } + } + } + }) +} + +// MultiDiskCluster represents a test cluster with multiple disks per volume server +type MultiDiskCluster struct { + masterCmd *exec.Cmd + volumeServers []*exec.Cmd + testDir string +} + +func (c *MultiDiskCluster) Stop() { + // Stop volume servers first + for _, cmd := range c.volumeServers { + if cmd != nil && cmd.Process != nil { + cmd.Process.Kill() + cmd.Wait() + } + } + + // Stop master server + if c.masterCmd != nil && c.masterCmd.Process != nil { + c.masterCmd.Process.Kill() + c.masterCmd.Wait() + } +} + +// startMultiDiskCluster starts a SeaweedFS cluster with multiple disks per volume server +func startMultiDiskCluster(ctx context.Context, dataDir string) (*MultiDiskCluster, error) { + weedBinary := findWeedBinary() + if weedBinary == "" { + return nil, fmt.Errorf("weed binary not found") + } + + cluster := &MultiDiskCluster{testDir: dataDir} + + // Create master directory + masterDir := filepath.Join(dataDir, "master") + os.MkdirAll(masterDir, 0755) + + // Start master server on a different port to avoid conflict + masterCmd := exec.CommandContext(ctx, weedBinary, "master", + "-port", "9334", + "-mdir", masterDir, + "-volumeSizeLimitMB", "10", + "-ip", "127.0.0.1", + ) + + masterLogFile, err := os.Create(filepath.Join(masterDir, "master.log")) + if err != nil { + return nil, fmt.Errorf("failed to create master log file: %v", err) + } + masterCmd.Stdout = masterLogFile + masterCmd.Stderr = masterLogFile + + if err := masterCmd.Start(); err != nil { + return nil, fmt.Errorf("failed to start master server: %v", err) + } + cluster.masterCmd = masterCmd + + // Wait for master to be ready + time.Sleep(2 * time.Second) + + // Start 3 volume servers, each with 4 disks + const numServers = 3 + const disksPerServer = 4 + + for i := 0; i < 
numServers; i++ { + // Create 4 disk directories per server + var diskDirs []string + var maxVolumes []string + + for d := 0; d < disksPerServer; d++ { + diskDir := filepath.Join(dataDir, fmt.Sprintf("server%d_disk%d", i, d)) + if err := os.MkdirAll(diskDir, 0755); err != nil { + cluster.Stop() + return nil, fmt.Errorf("failed to create disk dir: %v", err) + } + diskDirs = append(diskDirs, diskDir) + maxVolumes = append(maxVolumes, "5") + } + + port := fmt.Sprintf("809%d", i) + rack := fmt.Sprintf("rack%d", i) + + volumeCmd := exec.CommandContext(ctx, weedBinary, "volume", + "-port", port, + "-dir", strings.Join(diskDirs, ","), + "-max", strings.Join(maxVolumes, ","), + "-mserver", "127.0.0.1:9334", + "-ip", "127.0.0.1", + "-dataCenter", "dc1", + "-rack", rack, + ) + + // Create log file for this volume server + logDir := filepath.Join(dataDir, fmt.Sprintf("server%d_logs", i)) + os.MkdirAll(logDir, 0755) + volumeLogFile, err := os.Create(filepath.Join(logDir, "volume.log")) + if err != nil { + cluster.Stop() + return nil, fmt.Errorf("failed to create volume log file: %v", err) + } + volumeCmd.Stdout = volumeLogFile + volumeCmd.Stderr = volumeLogFile + + if err := volumeCmd.Start(); err != nil { + cluster.Stop() + return nil, fmt.Errorf("failed to start volume server %d: %v", i, err) + } + cluster.volumeServers = append(cluster.volumeServers, volumeCmd) + } + + // Wait for volume servers to register with master + // Multi-disk servers may take longer to initialize + time.Sleep(8 * time.Second) + + return cluster, nil +} + +// uploadTestDataToMaster uploads test data to a specific master address +func uploadTestDataToMaster(data []byte, masterAddress string) (needle.VolumeId, error) { + assignResult, err := operation.Assign(context.Background(), func(ctx context.Context) pb.ServerAddress { + return pb.ServerAddress(masterAddress) + }, grpc.WithInsecure(), &operation.VolumeAssignRequest{ + Count: 1, + Collection: "test", + Replication: "000", + }) + if err != nil { + return 0, err + } + + uploader, err := operation.NewUploader() + if err != nil { + return 0, err + } + + uploadResult, err, _ := uploader.Upload(context.Background(), bytes.NewReader(data), &operation.UploadOption{ + UploadUrl: "http://" + assignResult.Url + "/" + assignResult.Fid, + Filename: "testfile.txt", + MimeType: "text/plain", + }) + if err != nil { + return 0, err + } + + if uploadResult.Error != "" { + return 0, fmt.Errorf("upload error: %s", uploadResult.Error) + } + + fid, err := needle.ParseFileIdFromString(assignResult.Fid) + if err != nil { + return 0, err + } + + return fid.VolumeId, nil +} + +// countShardsPerDisk counts EC shards on each disk of each server +// Returns map: "serverN" -> map[diskId]shardCount +func countShardsPerDisk(testDir string, volumeId uint32) map[string]map[int]int { + result := make(map[string]map[int]int) + + const numServers = 3 + const disksPerServer = 4 + + for server := 0; server < numServers; server++ { + serverKey := fmt.Sprintf("server%d", server) + result[serverKey] = make(map[int]int) + + for disk := 0; disk < disksPerServer; disk++ { + diskDir := filepath.Join(testDir, fmt.Sprintf("server%d_disk%d", server, disk)) + count, err := countECShardFiles(diskDir, volumeId) + if err == nil && count > 0 { + result[serverKey][disk] = count + } + } + } + + return result +} + +// calculateDiskShardVariance measures how evenly shards are distributed across disks +// Lower variance means better distribution +func calculateDiskShardVariance(distribution map[string]map[int]int) float64 { + var 
counts []float64 + + for _, disks := range distribution { + for _, count := range disks { + if count > 0 { + counts = append(counts, float64(count)) + } + } + } + + if len(counts) == 0 { + return 0 + } + + // Calculate mean + mean := 0.0 + for _, c := range counts { + mean += c + } + mean /= float64(len(counts)) + + // Calculate variance + variance := 0.0 + for _, c := range counts { + variance += (c - mean) * (c - mean) + } + + return math.Sqrt(variance / float64(len(counts))) +} diff --git a/weed/shell/command_ec_common.go b/weed/shell/command_ec_common.go index f059b4e74..f2cc581da 100644 --- a/weed/shell/command_ec_common.go +++ b/weed/shell/command_ec_common.go @@ -26,12 +26,25 @@ type DataCenterId string type EcNodeId string type RackId string +// EcDisk represents a single disk on a volume server +type EcDisk struct { + diskId uint32 + diskType string + freeEcSlots int + ecShardCount int // Total EC shards on this disk + // Map of volumeId -> shardBits for shards on this disk + ecShards map[needle.VolumeId]erasure_coding.ShardBits +} + type EcNode struct { info *master_pb.DataNodeInfo dc DataCenterId rack RackId freeEcSlot int + // disks maps diskId -> EcDisk for disk-level balancing + disks map[uint32]*EcDisk } + type CandidateEcNode struct { ecNode *EcNode shardCount int @@ -229,7 +242,7 @@ func collectCollectionsForVolumeIds(t *master_pb.TopologyInfo, vids []needle.Vol return collections } -func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, destinationEcNode *EcNode, applyBalancing bool) (err error) { +func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, destinationEcNode *EcNode, destDiskId uint32, applyBalancing bool) (err error) { if !commandEnv.isLocked() { return fmt.Errorf("lock is lost") @@ -242,7 +255,7 @@ func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode, existingServerAddress := pb.NewServerAddressFromDataNode(existingLocation.info) // ask destination node to copy shard and the ecx file from source node, and mount it - copiedShardIds, err = oneServerCopyAndMountEcShardsFromSource(commandEnv.option.GrpcDialOption, destinationEcNode, []uint32{uint32(shardId)}, vid, collection, existingServerAddress) + copiedShardIds, err = oneServerCopyAndMountEcShardsFromSource(commandEnv.option.GrpcDialOption, destinationEcNode, []uint32{uint32(shardId)}, vid, collection, existingServerAddress, destDiskId) if err != nil { return err } @@ -259,7 +272,11 @@ func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode, return err } - fmt.Printf("moved ec shard %d.%d %s => %s\n", vid, shardId, existingLocation.info.Id, destinationEcNode.info.Id) + if destDiskId > 0 { + fmt.Printf("moved ec shard %d.%d %s => %s (disk %d)\n", vid, shardId, existingLocation.info.Id, destinationEcNode.info.Id, destDiskId) + } else { + fmt.Printf("moved ec shard %d.%d %s => %s\n", vid, shardId, existingLocation.info.Id, destinationEcNode.info.Id) + } } @@ -272,7 +289,7 @@ func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode, func oneServerCopyAndMountEcShardsFromSource(grpcDialOption grpc.DialOption, targetServer *EcNode, shardIdsToCopy []uint32, - volumeId needle.VolumeId, collection string, existingLocation pb.ServerAddress) (copiedShardIds []uint32, err error) { + volumeId needle.VolumeId, collection string, existingLocation 
pb.ServerAddress, destDiskId uint32) (copiedShardIds []uint32, err error) { fmt.Printf("allocate %d.%v %s => %s\n", volumeId, shardIdsToCopy, existingLocation, targetServer.info.Id) @@ -289,6 +306,7 @@ func oneServerCopyAndMountEcShardsFromSource(grpcDialOption grpc.DialOption, CopyEcjFile: true, CopyVifFile: true, SourceDataNode: string(existingLocation), + DiskId: destDiskId, }) if copyErr != nil { return fmt.Errorf("copy %d.%v %s => %s : %v\n", volumeId, shardIdsToCopy, existingLocation, targetServer.info.Id, copyErr) @@ -410,12 +428,74 @@ func collectEcVolumeServersByDc(topo *master_pb.TopologyInfo, selectedDataCenter } freeEcSlots := countFreeShardSlots(dn, types.HardDriveType) - ecNodes = append(ecNodes, &EcNode{ + ecNode := &EcNode{ info: dn, dc: dc, rack: rack, freeEcSlot: int(freeEcSlots), - }) + disks: make(map[uint32]*EcDisk), + } + + // Build disk-level information from volumes and EC shards + // First, discover all unique disk IDs from VolumeInfos (includes empty disks) + allDiskIds := make(map[uint32]string) // diskId -> diskType + for diskType, diskInfo := range dn.DiskInfos { + if diskInfo == nil { + continue + } + // Get all disk IDs from volumes + for _, vi := range diskInfo.VolumeInfos { + allDiskIds[vi.DiskId] = diskType + } + // Also get disk IDs from EC shards + for _, ecShardInfo := range diskInfo.EcShardInfos { + allDiskIds[ecShardInfo.DiskId] = diskType + } + } + + // Group EC shards by disk_id + diskShards := make(map[uint32]map[needle.VolumeId]erasure_coding.ShardBits) + for _, diskInfo := range dn.DiskInfos { + if diskInfo == nil { + continue + } + for _, ecShardInfo := range diskInfo.EcShardInfos { + diskId := ecShardInfo.DiskId + if diskShards[diskId] == nil { + diskShards[diskId] = make(map[needle.VolumeId]erasure_coding.ShardBits) + } + vid := needle.VolumeId(ecShardInfo.Id) + diskShards[diskId][vid] = erasure_coding.ShardBits(ecShardInfo.EcIndexBits) + } + } + + // Create EcDisk for each discovered disk + diskCount := len(allDiskIds) + if diskCount == 0 { + diskCount = 1 + } + freePerDisk := int(freeEcSlots) / diskCount + + for diskId, diskType := range allDiskIds { + shards := diskShards[diskId] + if shards == nil { + shards = make(map[needle.VolumeId]erasure_coding.ShardBits) + } + totalShardCount := 0 + for _, shardBits := range shards { + totalShardCount += shardBits.ShardIdCount() + } + + ecNode.disks[diskId] = &EcDisk{ + diskId: diskId, + diskType: diskType, + freeEcSlots: freePerDisk, + ecShardCount: totalShardCount, + ecShards: shards, + } + } + + ecNodes = append(ecNodes, ecNode) totalFreeEcSlots += freeEcSlots }) return @@ -884,10 +964,16 @@ func (ecb *ecBalancer) doBalanceEcRack(ecRack *EcRack) error { for _, shards := range fullDiskInfo.EcShardInfos { if _, found := emptyNodeIds[shards.Id]; !found { for _, shardId := range erasure_coding.ShardBits(shards.EcIndexBits).ShardIds() { + vid := needle.VolumeId(shards.Id) + destDiskId := pickBestDiskOnNode(emptyNode, vid) - fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id) + if destDiskId > 0 { + fmt.Printf("%s moves ec shards %d.%d to %s (disk %d)\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id, destDiskId) + } else { + fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id) + } - err := moveMountedShardToEcNode(ecb.commandEnv, fullNode, shards.Collection, needle.VolumeId(shards.Id), shardId, emptyNode, ecb.applyBalancing) + err := moveMountedShardToEcNode(ecb.commandEnv, fullNode, 
shards.Collection, vid, shardId, emptyNode, destDiskId, ecb.applyBalancing) if err != nil { return err } @@ -957,18 +1043,98 @@ func (ecb *ecBalancer) pickEcNodeToBalanceShardsInto(vid needle.VolumeId, existi if len(targets) == 0 { return nil, errors.New(details) } + + // When multiple nodes have the same shard count, prefer nodes with better disk distribution + // (i.e., nodes with more disks that have fewer shards of this volume) + if len(targets) > 1 { + slices.SortFunc(targets, func(a, b *EcNode) int { + aScore := diskDistributionScore(a, vid) + bScore := diskDistributionScore(b, vid) + return aScore - bScore // Lower score is better + }) + return targets[0], nil + } + return targets[rand.IntN(len(targets))], nil } +// diskDistributionScore calculates a score for how well-distributed shards are on the node's disks +// Lower score is better (means more room for balanced distribution) +func diskDistributionScore(ecNode *EcNode, vid needle.VolumeId) int { + if len(ecNode.disks) == 0 { + return 0 + } + + // Sum the existing shard count for this volume on each disk + // Lower total means more room for new shards + score := 0 + for _, disk := range ecNode.disks { + if shardBits, ok := disk.ecShards[vid]; ok { + score += shardBits.ShardIdCount() * 10 // Weight shards of this volume heavily + } + score += disk.ecShardCount // Also consider total shards on disk + } + return score +} + +// pickBestDiskOnNode selects the best disk on a node for placing a new EC shard +// It prefers disks with fewer shards and more free slots +func pickBestDiskOnNode(ecNode *EcNode, vid needle.VolumeId) uint32 { + if len(ecNode.disks) == 0 { + return 0 // No disk info available, let the server decide + } + + var bestDiskId uint32 + bestScore := -1 + + for diskId, disk := range ecNode.disks { + if disk.freeEcSlots <= 0 { + continue + } + + // Check if this volume already has shards on this disk + existingShards := 0 + if shardBits, ok := disk.ecShards[vid]; ok { + existingShards = shardBits.ShardIdCount() + } + + // Score: prefer disks with fewer total shards and fewer shards of this volume + // Lower score is better + score := disk.ecShardCount*10 + existingShards*100 + + if bestScore == -1 || score < bestScore { + bestScore = score + bestDiskId = diskId + } + } + + return bestDiskId +} + +// pickEcNodeAndDiskToBalanceShardsInto picks both a destination node and specific disk +func (ecb *ecBalancer) pickEcNodeAndDiskToBalanceShardsInto(vid needle.VolumeId, existingLocation *EcNode, possibleDestinations []*EcNode) (*EcNode, uint32, error) { + node, err := ecb.pickEcNodeToBalanceShardsInto(vid, existingLocation, possibleDestinations) + if err != nil { + return nil, 0, err + } + + diskId := pickBestDiskOnNode(node, vid) + return node, diskId, nil +} + func (ecb *ecBalancer) pickOneEcNodeAndMoveOneShard(existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, possibleDestinationEcNodes []*EcNode) error { - destNode, err := ecb.pickEcNodeToBalanceShardsInto(vid, existingLocation, possibleDestinationEcNodes) + destNode, destDiskId, err := ecb.pickEcNodeAndDiskToBalanceShardsInto(vid, existingLocation, possibleDestinationEcNodes) if err != nil { - fmt.Printf("WARNING: Could not find suitable taget node for %d.%d:\n%s", vid, shardId, err.Error()) + fmt.Printf("WARNING: Could not find suitable target node for %d.%d:\n%s", vid, shardId, err.Error()) return nil } - fmt.Printf("%s moves ec shard %d.%d to %s\n", existingLocation.info.Id, vid, shardId, destNode.info.Id) - return 
moveMountedShardToEcNode(ecb.commandEnv, existingLocation, collection, vid, shardId, destNode, ecb.applyBalancing) + if destDiskId > 0 { + fmt.Printf("%s moves ec shard %d.%d to %s (disk %d)\n", existingLocation.info.Id, vid, shardId, destNode.info.Id, destDiskId) + } else { + fmt.Printf("%s moves ec shard %d.%d to %s\n", existingLocation.info.Id, vid, shardId, destNode.info.Id) + } + return moveMountedShardToEcNode(ecb.commandEnv, existingLocation, collection, vid, shardId, destNode, destDiskId, ecb.applyBalancing) } func pickNEcShardsToMoveFrom(ecNodes []*EcNode, vid needle.VolumeId, n int) map[erasure_coding.ShardId]*EcNode { diff --git a/weed/shell/command_volume_server_evacuate.go b/weed/shell/command_volume_server_evacuate.go index 5c1805c89..6135eb3eb 100644 --- a/weed/shell/command_volume_server_evacuate.go +++ b/weed/shell/command_volume_server_evacuate.go @@ -197,8 +197,14 @@ func (c *commandVolumeServerEvacuate) moveAwayOneEcVolume(commandEnv *CommandEnv if ecShardInfo.Collection != "" { collectionPrefix = ecShardInfo.Collection + "_" } - fmt.Fprintf(os.Stdout, "moving ec volume %s%d.%d %s => %s\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id) - err = moveMountedShardToEcNode(commandEnv, thisNode, ecShardInfo.Collection, needle.VolumeId(ecShardInfo.Id), shardId, emptyNode, applyChange) + vid := needle.VolumeId(ecShardInfo.Id) + destDiskId := pickBestDiskOnNode(emptyNode, vid) + if destDiskId > 0 { + fmt.Fprintf(os.Stdout, "moving ec volume %s%d.%d %s => %s (disk %d)\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id, destDiskId) + } else { + fmt.Fprintf(os.Stdout, "moving ec volume %s%d.%d %s => %s\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id) + } + err = moveMountedShardToEcNode(commandEnv, thisNode, ecShardInfo.Collection, vid, shardId, emptyNode, destDiskId, applyChange) if err != nil { return } else { diff --git a/weed/storage/erasure_coding/placement/placement.go b/weed/storage/erasure_coding/placement/placement.go new file mode 100644 index 000000000..67e21c1f8 --- /dev/null +++ b/weed/storage/erasure_coding/placement/placement.go @@ -0,0 +1,420 @@ +// Package placement provides consolidated EC shard placement logic used by +// both shell commands and worker tasks. 
+// +// This package encapsulates the algorithms for: +// - Selecting destination nodes/disks for EC shards +// - Ensuring proper spread across racks, servers, and disks +// - Balancing shards across the cluster +package placement + +import ( + "fmt" + "sort" +) + +// DiskCandidate represents a disk that can receive EC shards +type DiskCandidate struct { + NodeID string + DiskID uint32 + DataCenter string + Rack string + + // Capacity information + VolumeCount int64 + MaxVolumeCount int64 + ShardCount int // Current number of EC shards on this disk + FreeSlots int // Available slots for new shards + + // Load information + LoadCount int // Number of active tasks on this disk +} + +// NodeCandidate represents a server node that can receive EC shards +type NodeCandidate struct { + NodeID string + DataCenter string + Rack string + FreeSlots int + ShardCount int // Total shards across all disks + Disks []*DiskCandidate // All disks on this node +} + +// PlacementRequest configures EC shard placement behavior +type PlacementRequest struct { + // ShardsNeeded is the total number of shards to place + ShardsNeeded int + + // MaxShardsPerServer limits how many shards can be placed on a single server + // 0 means no limit (but prefer spreading when possible) + MaxShardsPerServer int + + // MaxShardsPerRack limits how many shards can be placed in a single rack + // 0 means no limit + MaxShardsPerRack int + + // MaxTaskLoad is the maximum task load count for a disk to be considered + MaxTaskLoad int + + // PreferDifferentServers when true, spreads shards across different servers + // before using multiple disks on the same server + PreferDifferentServers bool + + // PreferDifferentRacks when true, spreads shards across different racks + // before using multiple servers in the same rack + PreferDifferentRacks bool +} + +// DefaultPlacementRequest returns the default placement configuration +func DefaultPlacementRequest() PlacementRequest { + return PlacementRequest{ + ShardsNeeded: 14, + MaxShardsPerServer: 0, + MaxShardsPerRack: 0, + MaxTaskLoad: 5, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } +} + +// PlacementResult contains the selected destinations for EC shards +type PlacementResult struct { + SelectedDisks []*DiskCandidate + + // Statistics + ServersUsed int + RacksUsed int + DCsUsed int + + // Distribution maps + ShardsPerServer map[string]int + ShardsPerRack map[string]int + ShardsPerDC map[string]int +} + +// SelectDestinations selects the best disks for EC shard placement. +// This is the main entry point for EC placement logic. +// +// The algorithm works in multiple passes: +// 1. First pass: Select one disk from each rack (maximize rack diversity) +// 2. Second pass: Select one disk from each unused server in used racks (maximize server diversity) +// 3. 
Third pass: Select additional disks from servers already used (maximize disk diversity) +func SelectDestinations(disks []*DiskCandidate, config PlacementRequest) (*PlacementResult, error) { + if len(disks) == 0 { + return nil, fmt.Errorf("no disk candidates provided") + } + if config.ShardsNeeded <= 0 { + return nil, fmt.Errorf("shardsNeeded must be positive, got %d", config.ShardsNeeded) + } + + // Filter suitable disks + suitable := filterSuitableDisks(disks, config) + if len(suitable) == 0 { + return nil, fmt.Errorf("no suitable disks found after filtering") + } + + // Build indexes for efficient lookup + rackToDisks := groupDisksByRack(suitable) + + result := &PlacementResult{ + SelectedDisks: make([]*DiskCandidate, 0, config.ShardsNeeded), + ShardsPerServer: make(map[string]int), + ShardsPerRack: make(map[string]int), + ShardsPerDC: make(map[string]int), + } + + usedDisks := make(map[string]bool) // "nodeID:diskID" -> bool + usedServers := make(map[string]bool) // nodeID -> bool + usedRacks := make(map[string]bool) // "dc:rack" -> bool + + // Pass 1: Select one disk from each rack (maximize rack diversity) + if config.PreferDifferentRacks { + // Sort racks by number of available servers (descending) to prioritize racks with more options + sortedRacks := sortRacksByServerCount(rackToDisks) + for _, rackKey := range sortedRacks { + if len(result.SelectedDisks) >= config.ShardsNeeded { + break + } + rackDisks := rackToDisks[rackKey] + // Select best disk from this rack, preferring a new server + disk := selectBestDiskFromRack(rackDisks, usedServers, usedDisks, config) + if disk != nil { + addDiskToResult(result, disk, usedDisks, usedServers, usedRacks) + } + } + } + + // Pass 2: Select disks from unused servers in already-used racks + if config.PreferDifferentServers && len(result.SelectedDisks) < config.ShardsNeeded { + for _, rackKey := range getSortedRackKeys(rackToDisks) { + if len(result.SelectedDisks) >= config.ShardsNeeded { + break + } + rackDisks := rackToDisks[rackKey] + for _, disk := range sortDisksByScore(rackDisks) { + if len(result.SelectedDisks) >= config.ShardsNeeded { + break + } + diskKey := getDiskKey(disk) + if usedDisks[diskKey] { + continue + } + // Skip if server already used (we want different servers in this pass) + if usedServers[disk.NodeID] { + continue + } + // Check server limit + if config.MaxShardsPerServer > 0 && result.ShardsPerServer[disk.NodeID] >= config.MaxShardsPerServer { + continue + } + // Check rack limit + if config.MaxShardsPerRack > 0 && result.ShardsPerRack[getRackKey(disk)] >= config.MaxShardsPerRack { + continue + } + addDiskToResult(result, disk, usedDisks, usedServers, usedRacks) + } + } + } + + // Pass 3: Fill remaining slots from already-used servers (different disks) + // Use round-robin across servers to balance shards evenly + if len(result.SelectedDisks) < config.ShardsNeeded { + // Group remaining disks by server + serverToRemainingDisks := make(map[string][]*DiskCandidate) + for _, disk := range suitable { + if !usedDisks[getDiskKey(disk)] { + serverToRemainingDisks[disk.NodeID] = append(serverToRemainingDisks[disk.NodeID], disk) + } + } + + // Sort each server's disks by score + for serverID := range serverToRemainingDisks { + serverToRemainingDisks[serverID] = sortDisksByScore(serverToRemainingDisks[serverID]) + } + + // Round-robin: repeatedly select from the server with the fewest shards + for len(result.SelectedDisks) < config.ShardsNeeded { + // Find server with fewest shards that still has available disks + var bestServer 
string + minShards := -1 + for serverID, disks := range serverToRemainingDisks { + if len(disks) == 0 { + continue + } + // Check server limit + if config.MaxShardsPerServer > 0 && result.ShardsPerServer[serverID] >= config.MaxShardsPerServer { + continue + } + shardCount := result.ShardsPerServer[serverID] + if minShards == -1 || shardCount < minShards { + minShards = shardCount + bestServer = serverID + } else if shardCount == minShards && serverID < bestServer { + // Tie-break by server name for determinism + bestServer = serverID + } + } + + if bestServer == "" { + // No more servers with available disks + break + } + + // Pop the best disk from this server + disks := serverToRemainingDisks[bestServer] + disk := disks[0] + serverToRemainingDisks[bestServer] = disks[1:] + + // Check rack limit + if config.MaxShardsPerRack > 0 && result.ShardsPerRack[getRackKey(disk)] >= config.MaxShardsPerRack { + continue + } + + addDiskToResult(result, disk, usedDisks, usedServers, usedRacks) + } + } + + // Calculate final statistics + result.ServersUsed = len(usedServers) + result.RacksUsed = len(usedRacks) + dcSet := make(map[string]bool) + for _, disk := range result.SelectedDisks { + dcSet[disk.DataCenter] = true + } + result.DCsUsed = len(dcSet) + + return result, nil +} + +// filterSuitableDisks filters disks that are suitable for EC placement +func filterSuitableDisks(disks []*DiskCandidate, config PlacementRequest) []*DiskCandidate { + var suitable []*DiskCandidate + for _, disk := range disks { + if disk.FreeSlots <= 0 { + continue + } + if config.MaxTaskLoad > 0 && disk.LoadCount > config.MaxTaskLoad { + continue + } + suitable = append(suitable, disk) + } + return suitable +} + +// groupDisksByRack groups disks by their rack (dc:rack key) +func groupDisksByRack(disks []*DiskCandidate) map[string][]*DiskCandidate { + result := make(map[string][]*DiskCandidate) + for _, disk := range disks { + key := getRackKey(disk) + result[key] = append(result[key], disk) + } + return result +} + +// groupDisksByServer groups disks by their server +func groupDisksByServer(disks []*DiskCandidate) map[string][]*DiskCandidate { + result := make(map[string][]*DiskCandidate) + for _, disk := range disks { + result[disk.NodeID] = append(result[disk.NodeID], disk) + } + return result +} + +// getRackKey returns the unique key for a rack (dc:rack) +func getRackKey(disk *DiskCandidate) string { + return fmt.Sprintf("%s:%s", disk.DataCenter, disk.Rack) +} + +// getDiskKey returns the unique key for a disk (nodeID:diskID) +func getDiskKey(disk *DiskCandidate) string { + return fmt.Sprintf("%s:%d", disk.NodeID, disk.DiskID) +} + +// sortRacksByServerCount returns rack keys sorted by number of servers (descending) +func sortRacksByServerCount(rackToDisks map[string][]*DiskCandidate) []string { + // Count unique servers per rack + rackServerCount := make(map[string]int) + for rackKey, disks := range rackToDisks { + servers := make(map[string]bool) + for _, disk := range disks { + servers[disk.NodeID] = true + } + rackServerCount[rackKey] = len(servers) + } + + keys := getSortedRackKeys(rackToDisks) + sort.Slice(keys, func(i, j int) bool { + // Sort by server count (descending) to pick from racks with more options first + return rackServerCount[keys[i]] > rackServerCount[keys[j]] + }) + return keys +} + +// getSortedRackKeys returns rack keys in a deterministic order +func getSortedRackKeys(rackToDisks map[string][]*DiskCandidate) []string { + keys := make([]string, 0, len(rackToDisks)) + for k := range rackToDisks { +
keys = append(keys, k) + } + sort.Strings(keys) + return keys +} + +// selectBestDiskFromRack selects the best disk from a rack for EC placement +// It prefers servers that haven't been used yet +func selectBestDiskFromRack(disks []*DiskCandidate, usedServers, usedDisks map[string]bool, config PlacementRequest) *DiskCandidate { + var bestDisk *DiskCandidate + bestScore := -1.0 + bestIsFromUnusedServer := false + + for _, disk := range disks { + if usedDisks[getDiskKey(disk)] { + continue + } + isFromUnusedServer := !usedServers[disk.NodeID] + score := calculateDiskScore(disk) + + // Prefer unused servers + if isFromUnusedServer && !bestIsFromUnusedServer { + bestDisk = disk + bestScore = score + bestIsFromUnusedServer = true + } else if isFromUnusedServer == bestIsFromUnusedServer && score > bestScore { + bestDisk = disk + bestScore = score + } + } + + return bestDisk +} + +// sortDisksByScore returns disks sorted by score (best first) +func sortDisksByScore(disks []*DiskCandidate) []*DiskCandidate { + sorted := make([]*DiskCandidate, len(disks)) + copy(sorted, disks) + sort.Slice(sorted, func(i, j int) bool { + return calculateDiskScore(sorted[i]) > calculateDiskScore(sorted[j]) + }) + return sorted +} + +// calculateDiskScore calculates a score for a disk candidate +// Higher score is better +func calculateDiskScore(disk *DiskCandidate) float64 { + score := 0.0 + + // Primary factor: available capacity (lower utilization is better) + if disk.MaxVolumeCount > 0 { + utilization := float64(disk.VolumeCount) / float64(disk.MaxVolumeCount) + score += (1.0 - utilization) * 60.0 // Up to 60 points + } else { + score += 30.0 // Default if no max count + } + + // Secondary factor: fewer shards already on this disk is better + score += float64(10-disk.ShardCount) * 2.0 // Up to 20 points + + // Tertiary factor: lower load is better + score += float64(10 - disk.LoadCount) // Up to 10 points + + return score +} + +// addDiskToResult adds a disk to the result and updates tracking maps +func addDiskToResult(result *PlacementResult, disk *DiskCandidate, + usedDisks, usedServers, usedRacks map[string]bool) { + diskKey := getDiskKey(disk) + rackKey := getRackKey(disk) + + result.SelectedDisks = append(result.SelectedDisks, disk) + usedDisks[diskKey] = true + usedServers[disk.NodeID] = true + usedRacks[rackKey] = true + result.ShardsPerServer[disk.NodeID]++ + result.ShardsPerRack[rackKey]++ + result.ShardsPerDC[disk.DataCenter]++ +} + +// VerifySpread checks if the placement result meets diversity requirements +func VerifySpread(result *PlacementResult, minServers, minRacks int) error { + if result.ServersUsed < minServers { + return fmt.Errorf("only %d servers used, need at least %d", result.ServersUsed, minServers) + } + if result.RacksUsed < minRacks { + return fmt.Errorf("only %d racks used, need at least %d", result.RacksUsed, minRacks) + } + return nil +} + +// CalculateIdealDistribution returns the ideal number of shards per server +// when we have a certain number of shards and servers +func CalculateIdealDistribution(totalShards, numServers int) (min, max int) { + if numServers <= 0 { + return 0, totalShards + } + min = totalShards / numServers + max = min + if totalShards%numServers != 0 { + max = min + 1 + } + return +} diff --git a/weed/storage/erasure_coding/placement/placement_test.go b/weed/storage/erasure_coding/placement/placement_test.go new file mode 100644 index 000000000..6cb94a4da --- /dev/null +++ b/weed/storage/erasure_coding/placement/placement_test.go @@ -0,0 +1,517 @@ +package 
placement + +import ( +"testing" +) + +// Helper function to create disk candidates for testing +func makeDisk(nodeID string, diskID uint32, dc, rack string, freeSlots int) *DiskCandidate { + return &DiskCandidate{ + NodeID: nodeID, + DiskID: diskID, + DataCenter: dc, + Rack: rack, + VolumeCount: 0, + MaxVolumeCount: 100, + ShardCount: 0, + FreeSlots: freeSlots, + LoadCount: 0, + } +} + +func TestSelectDestinations_SingleRack(t *testing.T) { + // Test: 3 servers in same rack, each with 2 disks, need 6 shards + // Expected: Should spread across all 6 disks (one per disk) + disks := []*DiskCandidate{ + makeDisk("server1", 0, "dc1", "rack1", 10), + makeDisk("server1", 1, "dc1", "rack1", 10), + makeDisk("server2", 0, "dc1", "rack1", 10), + makeDisk("server2", 1, "dc1", "rack1", 10), + makeDisk("server3", 0, "dc1", "rack1", 10), + makeDisk("server3", 1, "dc1", "rack1", 10), + } + + config := PlacementRequest{ + ShardsNeeded: 6, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } + + result, err := SelectDestinations(disks, config) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(result.SelectedDisks) != 6 { + t.Errorf("expected 6 selected disks, got %d", len(result.SelectedDisks)) + } + + // Verify all 3 servers are used + if result.ServersUsed != 3 { + t.Errorf("expected 3 servers used, got %d", result.ServersUsed) + } + + // Verify each disk is unique + diskSet := make(map[string]bool) + for _, disk := range result.SelectedDisks { + key := getDiskKey(disk) + if diskSet[key] { + t.Errorf("disk %s selected multiple times", key) + } + diskSet[key] = true + } +} + +func TestSelectDestinations_MultipleRacks(t *testing.T) { + // Test: 2 racks with 2 servers each, each server has 2 disks + // Need 8 shards + // Expected: Should spread across all 8 disks + disks := []*DiskCandidate{ + makeDisk("server1", 0, "dc1", "rack1", 10), + makeDisk("server1", 1, "dc1", "rack1", 10), + makeDisk("server2", 0, "dc1", "rack1", 10), + makeDisk("server2", 1, "dc1", "rack1", 10), + makeDisk("server3", 0, "dc1", "rack2", 10), + makeDisk("server3", 1, "dc1", "rack2", 10), + makeDisk("server4", 0, "dc1", "rack2", 10), + makeDisk("server4", 1, "dc1", "rack2", 10), + } + + config := PlacementRequest{ + ShardsNeeded: 8, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } + + result, err := SelectDestinations(disks, config) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(result.SelectedDisks) != 8 { + t.Errorf("expected 8 selected disks, got %d", len(result.SelectedDisks)) + } + + // Verify all 4 servers are used + if result.ServersUsed != 4 { + t.Errorf("expected 4 servers used, got %d", result.ServersUsed) + } + + // Verify both racks are used + if result.RacksUsed != 2 { + t.Errorf("expected 2 racks used, got %d", result.RacksUsed) + } +} + +func TestSelectDestinations_PrefersDifferentServers(t *testing.T) { + // Test: 4 servers with 4 disks each, need 4 shards + // Expected: Should use one disk from each server + disks := []*DiskCandidate{ + makeDisk("server1", 0, "dc1", "rack1", 10), + makeDisk("server1", 1, "dc1", "rack1", 10), + makeDisk("server1", 2, "dc1", "rack1", 10), + makeDisk("server1", 3, "dc1", "rack1", 10), + makeDisk("server2", 0, "dc1", "rack1", 10), + makeDisk("server2", 1, "dc1", "rack1", 10), + makeDisk("server2", 2, "dc1", "rack1", 10), + makeDisk("server2", 3, "dc1", "rack1", 10), + makeDisk("server3", 0, "dc1", "rack1", 10), + makeDisk("server3", 1, "dc1", "rack1", 10), + makeDisk("server3", 2, "dc1", "rack1", 10), + 
makeDisk("server3", 3, "dc1", "rack1", 10), + makeDisk("server4", 0, "dc1", "rack1", 10), + makeDisk("server4", 1, "dc1", "rack1", 10), + makeDisk("server4", 2, "dc1", "rack1", 10), + makeDisk("server4", 3, "dc1", "rack1", 10), + } + + config := PlacementRequest{ + ShardsNeeded: 4, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } + + result, err := SelectDestinations(disks, config) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(result.SelectedDisks) != 4 { + t.Errorf("expected 4 selected disks, got %d", len(result.SelectedDisks)) + } + + // Verify all 4 servers are used (one shard per server) + if result.ServersUsed != 4 { + t.Errorf("expected 4 servers used, got %d", result.ServersUsed) + } + + // Each server should have exactly 1 shard + for server, count := range result.ShardsPerServer { + if count != 1 { + t.Errorf("server %s has %d shards, expected 1", server, count) + } + } +} + +func TestSelectDestinations_SpilloverToMultipleDisksPerServer(t *testing.T) { + // Test: 2 servers with 4 disks each, need 6 shards + // Expected: First pick one from each server (2 shards), then one more from each (4 shards), + // then fill remaining from any server (6 shards) + disks := []*DiskCandidate{ + makeDisk("server1", 0, "dc1", "rack1", 10), + makeDisk("server1", 1, "dc1", "rack1", 10), + makeDisk("server1", 2, "dc1", "rack1", 10), + makeDisk("server1", 3, "dc1", "rack1", 10), + makeDisk("server2", 0, "dc1", "rack1", 10), + makeDisk("server2", 1, "dc1", "rack1", 10), + makeDisk("server2", 2, "dc1", "rack1", 10), + makeDisk("server2", 3, "dc1", "rack1", 10), + } + + config := PlacementRequest{ + ShardsNeeded: 6, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } + + result, err := SelectDestinations(disks, config) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(result.SelectedDisks) != 6 { + t.Errorf("expected 6 selected disks, got %d", len(result.SelectedDisks)) + } + + // Both servers should be used + if result.ServersUsed != 2 { + t.Errorf("expected 2 servers used, got %d", result.ServersUsed) + } + + // Each server should have exactly 3 shards (balanced) + for server, count := range result.ShardsPerServer { + if count != 3 { + t.Errorf("server %s has %d shards, expected 3", server, count) + } + } +} + +func TestSelectDestinations_MaxShardsPerServer(t *testing.T) { + // Test: 2 servers with 4 disks each, need 6 shards, max 2 per server + // Expected: Should only select 4 shards (2 per server limit) + disks := []*DiskCandidate{ + makeDisk("server1", 0, "dc1", "rack1", 10), + makeDisk("server1", 1, "dc1", "rack1", 10), + makeDisk("server1", 2, "dc1", "rack1", 10), + makeDisk("server1", 3, "dc1", "rack1", 10), + makeDisk("server2", 0, "dc1", "rack1", 10), + makeDisk("server2", 1, "dc1", "rack1", 10), + makeDisk("server2", 2, "dc1", "rack1", 10), + makeDisk("server2", 3, "dc1", "rack1", 10), + } + + config := PlacementRequest{ + ShardsNeeded: 6, + MaxShardsPerServer: 2, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } + + result, err := SelectDestinations(disks, config) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Should only get 4 shards due to server limit + if len(result.SelectedDisks) != 4 { + t.Errorf("expected 4 selected disks (limit 2 per server), got %d", len(result.SelectedDisks)) + } + + // No server should exceed the limit + for server, count := range result.ShardsPerServer { + if count > 2 { + t.Errorf("server %s has %d shards, exceeds limit of 2", server, count) + } 
+ } +} + +func TestSelectDestinations_14ShardsAcross7Servers(t *testing.T) { + // Test: Real-world EC scenario - 14 shards across 7 servers with 2 disks each + // Expected: Should spread evenly (2 shards per server) + var disks []*DiskCandidate + for i := 1; i <= 7; i++ { + serverID := "server" + string(rune('0'+i)) + disks = append(disks, makeDisk(serverID, 0, "dc1", "rack1", 10)) + disks = append(disks, makeDisk(serverID, 1, "dc1", "rack1", 10)) + } + + config := PlacementRequest{ + ShardsNeeded: 14, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } + + result, err := SelectDestinations(disks, config) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(result.SelectedDisks) != 14 { + t.Errorf("expected 14 selected disks, got %d", len(result.SelectedDisks)) + } + + // All 7 servers should be used + if result.ServersUsed != 7 { + t.Errorf("expected 7 servers used, got %d", result.ServersUsed) + } + + // Each server should have exactly 2 shards + for server, count := range result.ShardsPerServer { + if count != 2 { + t.Errorf("server %s has %d shards, expected 2", server, count) + } + } +} + +func TestSelectDestinations_FewerServersThanShards(t *testing.T) { + // Test: Only 3 servers but need 6 shards + // Expected: Should distribute evenly (2 per server) + disks := []*DiskCandidate{ + makeDisk("server1", 0, "dc1", "rack1", 10), + makeDisk("server1", 1, "dc1", "rack1", 10), + makeDisk("server1", 2, "dc1", "rack1", 10), + makeDisk("server2", 0, "dc1", "rack1", 10), + makeDisk("server2", 1, "dc1", "rack1", 10), + makeDisk("server2", 2, "dc1", "rack1", 10), + makeDisk("server3", 0, "dc1", "rack1", 10), + makeDisk("server3", 1, "dc1", "rack1", 10), + makeDisk("server3", 2, "dc1", "rack1", 10), + } + + config := PlacementRequest{ + ShardsNeeded: 6, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } + + result, err := SelectDestinations(disks, config) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(result.SelectedDisks) != 6 { + t.Errorf("expected 6 selected disks, got %d", len(result.SelectedDisks)) + } + + // All 3 servers should be used + if result.ServersUsed != 3 { + t.Errorf("expected 3 servers used, got %d", result.ServersUsed) + } + + // Each server should have exactly 2 shards + for server, count := range result.ShardsPerServer { + if count != 2 { + t.Errorf("server %s has %d shards, expected 2", server, count) + } + } +} + +func TestSelectDestinations_NoSuitableDisks(t *testing.T) { + // Test: All disks have no free slots + disks := []*DiskCandidate{ + {NodeID: "server1", DiskID: 0, DataCenter: "dc1", Rack: "rack1", FreeSlots: 0}, + {NodeID: "server2", DiskID: 0, DataCenter: "dc1", Rack: "rack1", FreeSlots: 0}, + } + + config := PlacementRequest{ + ShardsNeeded: 4, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } + + _, err := SelectDestinations(disks, config) + if err == nil { + t.Error("expected error for no suitable disks, got nil") + } +} + +func TestSelectDestinations_EmptyInput(t *testing.T) { + config := DefaultPlacementRequest() + _, err := SelectDestinations([]*DiskCandidate{}, config) + if err == nil { + t.Error("expected error for empty input, got nil") + } +} + +func TestSelectDestinations_FiltersByLoad(t *testing.T) { + // Test: Some disks have too high load + disks := []*DiskCandidate{ + {NodeID: "server1", DiskID: 0, DataCenter: "dc1", Rack: "rack1", FreeSlots: 10, LoadCount: 10}, + {NodeID: "server2", DiskID: 0, DataCenter: "dc1", Rack: "rack1", FreeSlots: 10, LoadCount: 2}, + 
{NodeID: "server3", DiskID: 0, DataCenter: "dc1", Rack: "rack1", FreeSlots: 10, LoadCount: 1}, + } + + config := PlacementRequest{ + ShardsNeeded: 2, + MaxTaskLoad: 5, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } + + result, err := SelectDestinations(disks, config) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Should only select from server2 and server3 (server1 has too high load) + for _, disk := range result.SelectedDisks { + if disk.NodeID == "server1" { + t.Errorf("disk from server1 should not be selected (load too high)") + } + } +} + +func TestCalculateDiskScore(t *testing.T) { + // Test that score calculation works as expected + lowUtilDisk := &DiskCandidate{ + VolumeCount: 10, + MaxVolumeCount: 100, + ShardCount: 0, + LoadCount: 0, + } + + highUtilDisk := &DiskCandidate{ + VolumeCount: 90, + MaxVolumeCount: 100, + ShardCount: 5, + LoadCount: 5, + } + + lowScore := calculateDiskScore(lowUtilDisk) + highScore := calculateDiskScore(highUtilDisk) + + if lowScore <= highScore { + t.Errorf("low utilization disk should have higher score: low=%f, high=%f", lowScore, highScore) + } +} + +func TestCalculateIdealDistribution(t *testing.T) { + tests := []struct { + totalShards int + numServers int + expectedMin int + expectedMax int + }{ + {14, 7, 2, 2}, // Even distribution + {14, 4, 3, 4}, // Uneven: 14/4 = 3 remainder 2 + {6, 3, 2, 2}, // Even distribution + {7, 3, 2, 3}, // Uneven: 7/3 = 2 remainder 1 + {10, 0, 0, 10}, // Edge case: no servers + {0, 5, 0, 0}, // Edge case: no shards + } + + for _, tt := range tests { + min, max := CalculateIdealDistribution(tt.totalShards, tt.numServers) + if min != tt.expectedMin || max != tt.expectedMax { + t.Errorf("CalculateIdealDistribution(%d, %d) = (%d, %d), want (%d, %d)", +tt.totalShards, tt.numServers, min, max, tt.expectedMin, tt.expectedMax) + } + } +} + +func TestVerifySpread(t *testing.T) { + result := &PlacementResult{ + ServersUsed: 3, + RacksUsed: 2, + } + + // Should pass + if err := VerifySpread(result, 3, 2); err != nil { + t.Errorf("unexpected error: %v", err) + } + + // Should fail - not enough servers + if err := VerifySpread(result, 4, 2); err == nil { + t.Error("expected error for insufficient servers") + } + + // Should fail - not enough racks + if err := VerifySpread(result, 3, 3); err == nil { + t.Error("expected error for insufficient racks") + } +} + +func TestSelectDestinations_MultiDC(t *testing.T) { + // Test: 2 DCs, each with 2 racks, each rack has 2 servers + disks := []*DiskCandidate{ + // DC1, Rack1 + makeDisk("dc1-r1-s1", 0, "dc1", "rack1", 10), + makeDisk("dc1-r1-s1", 1, "dc1", "rack1", 10), + makeDisk("dc1-r1-s2", 0, "dc1", "rack1", 10), + makeDisk("dc1-r1-s2", 1, "dc1", "rack1", 10), + // DC1, Rack2 + makeDisk("dc1-r2-s1", 0, "dc1", "rack2", 10), + makeDisk("dc1-r2-s1", 1, "dc1", "rack2", 10), + makeDisk("dc1-r2-s2", 0, "dc1", "rack2", 10), + makeDisk("dc1-r2-s2", 1, "dc1", "rack2", 10), + // DC2, Rack1 + makeDisk("dc2-r1-s1", 0, "dc2", "rack1", 10), + makeDisk("dc2-r1-s1", 1, "dc2", "rack1", 10), + makeDisk("dc2-r1-s2", 0, "dc2", "rack1", 10), + makeDisk("dc2-r1-s2", 1, "dc2", "rack1", 10), + // DC2, Rack2 + makeDisk("dc2-r2-s1", 0, "dc2", "rack2", 10), + makeDisk("dc2-r2-s1", 1, "dc2", "rack2", 10), + makeDisk("dc2-r2-s2", 0, "dc2", "rack2", 10), + makeDisk("dc2-r2-s2", 1, "dc2", "rack2", 10), + } + + config := PlacementRequest{ + ShardsNeeded: 8, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } + + result, err := SelectDestinations(disks, config) + if 
err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(result.SelectedDisks) != 8 { + t.Errorf("expected 8 selected disks, got %d", len(result.SelectedDisks)) + } + + // Should use all 4 racks + if result.RacksUsed != 4 { + t.Errorf("expected 4 racks used, got %d", result.RacksUsed) + } + + // Should use both DCs + if result.DCsUsed != 2 { + t.Errorf("expected 2 DCs used, got %d", result.DCsUsed) + } +} + +func TestSelectDestinations_SameRackDifferentDC(t *testing.T) { + // Test: Same rack name in different DCs should be treated as different racks + disks := []*DiskCandidate{ + makeDisk("dc1-s1", 0, "dc1", "rack1", 10), + makeDisk("dc2-s1", 0, "dc2", "rack1", 10), + } + + config := PlacementRequest{ + ShardsNeeded: 2, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } + + result, err := SelectDestinations(disks, config) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Should use 2 racks (dc1:rack1 and dc2:rack1 are different) + if result.RacksUsed != 2 { + t.Errorf("expected 2 racks used (different DCs), got %d", result.RacksUsed) + } +} diff --git a/weed/worker/tasks/erasure_coding/detection.go b/weed/worker/tasks/erasure_coding/detection.go index cd74bed33..c5568fe26 100644 --- a/weed/worker/tasks/erasure_coding/detection.go +++ b/weed/worker/tasks/erasure_coding/detection.go @@ -9,6 +9,7 @@ import ( "github.com/seaweedfs/seaweedfs/weed/glog" "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" + "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding/placement" "github.com/seaweedfs/seaweedfs/weed/worker/tasks/base" "github.com/seaweedfs/seaweedfs/weed/worker/types" ) @@ -429,85 +430,100 @@ func createECTaskParams(multiPlan *topology.MultiDestinationPlan) *worker_pb.Era } // selectBestECDestinations selects multiple disks for EC shard placement with diversity +// Uses the consolidated placement package for proper rack/server/disk spreading func selectBestECDestinations(disks []*topology.DiskInfo, sourceRack, sourceDC string, shardsNeeded int) []*topology.DiskInfo { if len(disks) == 0 { return nil } - // Group disks by rack and DC for diversity - rackGroups := make(map[string][]*topology.DiskInfo) - for _, disk := range disks { - rackKey := fmt.Sprintf("%s:%s", disk.DataCenter, disk.Rack) - rackGroups[rackKey] = append(rackGroups[rackKey], disk) + // Convert topology.DiskInfo to placement.DiskCandidate + candidates := diskInfosToCandidates(disks) + if len(candidates) == 0 { + return nil } - var selected []*topology.DiskInfo - usedRacks := make(map[string]bool) + // Configure placement for EC shards + config := placement.PlacementRequest{ + ShardsNeeded: shardsNeeded, + MaxShardsPerServer: 0, // No hard limit, but prefer spreading + MaxShardsPerRack: 0, // No hard limit, but prefer spreading + MaxTaskLoad: topology.MaxTaskLoadForECPlacement, + PreferDifferentServers: true, + PreferDifferentRacks: true, + } - // First pass: select one disk from each rack for maximum diversity - for rackKey, rackDisks := range rackGroups { - if len(selected) >= shardsNeeded { - break - } + // Use the shared placement algorithm + result, err := placement.SelectDestinations(candidates, config) + if err != nil { + glog.V(2).Infof("EC placement failed: %v", err) + return nil + } - // Select best disk from this rack - bestDisk := selectBestFromRack(rackDisks, sourceRack, sourceDC) - if bestDisk != nil { - selected = append(selected, bestDisk) - usedRacks[rackKey] = true + // Convert back to 
topology.DiskInfo + return candidatesToDiskInfos(result.SelectedDisks, disks) +} + +// diskInfosToCandidates converts topology.DiskInfo slice to placement.DiskCandidate slice +func diskInfosToCandidates(disks []*topology.DiskInfo) []*placement.DiskCandidate { + var candidates []*placement.DiskCandidate + for _, disk := range disks { + if disk.DiskInfo == nil { + continue } - } - // Second pass: if we need more disks, select from racks we've already used - if len(selected) < shardsNeeded { - for _, disk := range disks { - if len(selected) >= shardsNeeded { - break - } + // Calculate free slots (using default max if not set) + freeSlots := int(disk.DiskInfo.MaxVolumeCount - disk.DiskInfo.VolumeCount) + if freeSlots < 0 { + freeSlots = 0 + } - // Skip if already selected - alreadySelected := false - for _, sel := range selected { - if sel.NodeID == disk.NodeID && sel.DiskID == disk.DiskID { - alreadySelected = true - break + // Calculate EC shard count for this specific disk + // EcShardInfos contains all shards, so we need to filter by DiskId and sum actual shard counts + ecShardCount := 0 + if disk.DiskInfo.EcShardInfos != nil { + for _, shardInfo := range disk.DiskInfo.EcShardInfos { + if shardInfo.DiskId == disk.DiskID { + ecShardCount += erasure_coding.ShardBits(shardInfo.EcIndexBits).ShardIdCount() } } - - if !alreadySelected && isDiskSuitableForEC(disk) { - selected = append(selected, disk) - } } - } - return selected + candidates = append(candidates, &placement.DiskCandidate{ + NodeID: disk.NodeID, + DiskID: disk.DiskID, + DataCenter: disk.DataCenter, + Rack: disk.Rack, + VolumeCount: disk.DiskInfo.VolumeCount, + MaxVolumeCount: disk.DiskInfo.MaxVolumeCount, + ShardCount: ecShardCount, + FreeSlots: freeSlots, + LoadCount: disk.LoadCount, + }) + } + return candidates } -// selectBestFromRack selects the best disk from a rack for EC placement -func selectBestFromRack(disks []*topology.DiskInfo, sourceRack, sourceDC string) *topology.DiskInfo { - if len(disks) == 0 { - return nil +// candidatesToDiskInfos converts placement results back to topology.DiskInfo +func candidatesToDiskInfos(candidates []*placement.DiskCandidate, originalDisks []*topology.DiskInfo) []*topology.DiskInfo { + // Create a map for quick lookup + diskMap := make(map[string]*topology.DiskInfo) + for _, disk := range originalDisks { + key := fmt.Sprintf("%s:%d", disk.NodeID, disk.DiskID) + diskMap[key] = disk } - var bestDisk *topology.DiskInfo - bestScore := -1.0 - - for _, disk := range disks { - if !isDiskSuitableForEC(disk) { - continue - } - - score := calculateECScore(disk, sourceRack, sourceDC) - if score > bestScore { - bestScore = score - bestDisk = disk + var result []*topology.DiskInfo + for _, candidate := range candidates { + key := fmt.Sprintf("%s:%d", candidate.NodeID, candidate.DiskID) + if disk, ok := diskMap[key]; ok { + result = append(result, disk) } } - - return bestDisk + return result } // calculateECScore calculates placement score for EC operations +// Used for logging and plan metadata func calculateECScore(disk *topology.DiskInfo, sourceRack, sourceDC string) float64 { if disk.DiskInfo == nil { return 0.0 @@ -524,14 +540,12 @@ func calculateECScore(disk *topology.DiskInfo, sourceRack, sourceDC string) floa // Consider current load (secondary factor) score += (10.0 - float64(disk.LoadCount)) // Up to 10 points for low load - // Note: We don't penalize placing shards on the same rack/DC as source - // since the original volume will be deleted after EC conversion. 
- // This allows for better network efficiency and storage utilization. - return score } // isDiskSuitableForEC checks if a disk is suitable for EC placement +// Note: This is kept for backward compatibility but the placement package +// handles filtering internally func isDiskSuitableForEC(disk *topology.DiskInfo) bool { if disk.DiskInfo == nil { return false -- cgit v1.2.3 From 51841a2e04c973aa9c13422ad771858f98eee182 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Tue, 2 Dec 2025 17:00:05 -0800 Subject: fix: skip cookie validation for EC volume deletion when SkipCookieCheck is set (#7608) fix: EC volume deletion issues Fixes #7489 1. Skip cookie check for EC volume deletion when SkipCookieCheck is set When batch deleting files from EC volumes with SkipCookieCheck=true (e.g., orphan file cleanup), the cookie is not available. The deletion was failing with 'unexpected cookie 0' because DeleteEcShardNeedle always validated the cookie. 2. Optimize doDeleteNeedleFromAtLeastOneRemoteEcShards to return early Return immediately when a deletion succeeds, instead of continuing to try all parity shards unnecessarily. 3. Remove useless log message that always logged nil error The log at V(1) was logging err after checking it was nil. Regression introduced in commit 7bdae5172 (Jan 3, 2023) when EC batch delete support was added. --- weed/storage/store_ec_delete.go | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/weed/storage/store_ec_delete.go b/weed/storage/store_ec_delete.go index a3e028bbb..9fcb092a2 100644 --- a/weed/storage/store_ec_delete.go +++ b/weed/storage/store_ec_delete.go @@ -3,6 +3,7 @@ package storage import ( "context" "fmt" + "github.com/seaweedfs/seaweedfs/weed/pb" "github.com/seaweedfs/seaweedfs/weed/glog" @@ -21,7 +22,8 @@ func (s *Store) DeleteEcShardNeedle(ecVolume *erasure_coding.EcVolume, n *needle return 0, err } - if cookie != n.Cookie { + // cookie == 0 indicates SkipCookieCheck was requested (e.g., orphan cleanup) + if cookie != 0 && cookie != n.Cookie { return 0, fmt.Errorf("unexpected cookie %x", cookie) } @@ -45,22 +47,17 @@ func (s *Store) doDeleteNeedleFromAtLeastOneRemoteEcShards(ecVolume *erasure_cod shardId, _ := intervals[0].ToShardIdAndOffset(erasure_coding.ErasureCodingLargeBlockSize, erasure_coding.ErasureCodingSmallBlockSize) - hasDeletionSuccess := false err = s.doDeleteNeedleFromRemoteEcShardServers(shardId, ecVolume, needleId) if err == nil { - hasDeletionSuccess = true + return nil } for shardId = erasure_coding.DataShardsCount; shardId < erasure_coding.TotalShardsCount; shardId++ { if parityDeletionError := s.doDeleteNeedleFromRemoteEcShardServers(shardId, ecVolume, needleId); parityDeletionError == nil { - hasDeletionSuccess = true + return nil } } - if hasDeletionSuccess { - return nil - } - return err } @@ -77,11 +74,9 @@ func (s *Store) doDeleteNeedleFromRemoteEcShardServers(shardId erasure_coding.Sh for _, sourceDataNode := range sourceDataNodes { glog.V(4).Infof("delete from remote ec shard %d.%d from %s", ecVolume.VolumeId, shardId, sourceDataNode) - err := s.doDeleteNeedleFromRemoteEcShard(sourceDataNode, ecVolume.VolumeId, ecVolume.Collection, ecVolume.Version, needleId) - if err != nil { + if err := s.doDeleteNeedleFromRemoteEcShard(sourceDataNode, ecVolume.VolumeId, ecVolume.Collection, ecVolume.Version, needleId); err != nil { return err } - glog.V(1).Infof("delete from remote ec shard %d.%d from %s: %v", ecVolume.VolumeId, shardId, sourceDataNode, err) } return nil -- cgit v1.2.3 From 
5ed0b00fb937a01d7ffecd89fbab97e5ce7f6aaa Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Tue, 2 Dec 2025 22:08:11 -0800 Subject: Support separate volume server ID independent of RPC bind address (#7609) * pb: add id field to Heartbeat message for stable volume server identification This adds an 'id' field to the Heartbeat protobuf message that allows volume servers to identify themselves independently of their IP:port address. Ref: https://github.com/seaweedfs/seaweedfs/issues/7487 * storage: add Id field to Store struct Add Id field to Store struct and include it in CollectHeartbeat(). The Id field provides a stable volume server identity independent of IP:port. Ref: https://github.com/seaweedfs/seaweedfs/issues/7487 * topology: support id-based DataNode identification Update GetOrCreateDataNode to accept an id parameter for stable node identification. When id is provided, the DataNode can maintain its identity even when its IP address changes (e.g., in Kubernetes pod reschedules). For backward compatibility: - If id is provided, use it as the node ID - If id is empty, fall back to ip:port Ref: https://github.com/seaweedfs/seaweedfs/issues/7487 * volume: add -id flag for stable volume server identity Add -id command line flag to volume server that allows specifying a stable identifier independent of the IP address. This is useful for Kubernetes deployments with hostPath volumes where pods can be rescheduled to different nodes while the persisted data remains on the original node. Usage: weed volume -id=node-1 -ip=10.0.0.1 ... If -id is not specified, it defaults to ip:port for backward compatibility. Fixes https://github.com/seaweedfs/seaweedfs/issues/7487 * server: add -volume.id flag to weed server command Support the -volume.id flag in the all-in-one 'weed server' command, consistent with the standalone 'weed volume' command. Usage: weed server -volume.id=node-1 ... Ref: https://github.com/seaweedfs/seaweedfs/issues/7487 * topology: add test for id-based DataNode identification Test the key scenarios: 1. Create DataNode with explicit id 2. Same id with different IP returns same DataNode (K8s reschedule) 3. IP/PublicUrl are updated when node reconnects with new address 4. Different id creates new DataNode 5. Empty id falls back to ip:port (backward compatibility) Ref: https://github.com/seaweedfs/seaweedfs/issues/7487 * pb: add address field to DataNodeInfo for proper node addressing Previously, DataNodeInfo.Id was used as the node address, which worked when Id was always ip:port. Now that Id can be an explicit string, we need a separate Address field for connection purposes. Changes: - Add 'address' field to DataNodeInfo protobuf message - Update ToDataNodeInfo() to populate the address field - Update NewServerAddressFromDataNode() to use Address (with Id fallback) - Fix LookupEcVolume to use dn.Url() instead of dn.Id() Ref: https://github.com/seaweedfs/seaweedfs/issues/7487 * fix: trim whitespace from volume server id and fix test - Trim whitespace from -id flag to treat ' ' as empty - Fix store_load_balancing_test.go to include id parameter in NewStore call Ref: https://github.com/seaweedfs/seaweedfs/issues/7487 * refactor: extract GetVolumeServerId to util package Move the volume server ID determination logic to a shared utility function to avoid code duplication between volume.go and rack.go. 
Ref: https://github.com/seaweedfs/seaweedfs/issues/7487 * fix: improve transition logic for legacy nodes - Use exact ip:port match instead of net.SplitHostPort heuristic - Update GrpcPort and PublicUrl during transition for consistency - Remove unused net import Ref: https://github.com/seaweedfs/seaweedfs/issues/7487 * fix: add id normalization and address change logging - Normalize id parameter at function boundary (trim whitespace) - Log when DataNode IP:Port changes (helps debug K8s pod rescheduling) Ref: https://github.com/seaweedfs/seaweedfs/issues/7487 --- weed/command/server.go | 1 + weed/command/volume.go | 7 +- weed/pb/master.proto | 2 + weed/pb/master_pb/master.pb.go | 26 ++++++- weed/pb/server_address.go | 12 ++- weed/server/master_grpc_server.go | 4 +- weed/server/master_grpc_server_volume.go | 2 +- weed/server/volume_server.go | 4 +- weed/storage/store.go | 8 +- weed/storage/store_load_balancing_test.go | 2 +- weed/topology/data_node.go | 1 + weed/topology/rack.go | 69 +++++++++++++++-- weed/topology/topology_test.go | 119 ++++++++++++++++++++++++++++-- weed/util/network.go | 11 +++ 14 files changed, 240 insertions(+), 28 deletions(-) diff --git a/weed/command/server.go b/weed/command/server.go index 47df30fc2..d729502f0 100644 --- a/weed/command/server.go +++ b/weed/command/server.go @@ -133,6 +133,7 @@ func init() { serverOptions.v.port = cmdServer.Flag.Int("volume.port", 8080, "volume server http listen port") serverOptions.v.portGrpc = cmdServer.Flag.Int("volume.port.grpc", 0, "volume server grpc listen port") serverOptions.v.publicPort = cmdServer.Flag.Int("volume.port.public", 0, "volume server public port") + serverOptions.v.id = cmdServer.Flag.String("volume.id", "", "volume server id. If empty, default to ip:port") serverOptions.v.indexType = cmdServer.Flag.String("volume.index", "memory", "Choose [memory|leveldb|leveldbMedium|leveldbLarge] mode for memory~performance balance.") serverOptions.v.diskType = cmdServer.Flag.String("volume.disk", "", "[hdd|ssd|] hard drive or solid state drive or any tag") serverOptions.v.fixJpgOrientation = cmdServer.Flag.Bool("volume.images.fix.orientation", false, "Adjust jpg orientation when uploading.") diff --git a/weed/command/volume.go b/weed/command/volume.go index e21437e9a..514553172 100644 --- a/weed/command/volume.go +++ b/weed/command/volume.go @@ -41,6 +41,7 @@ type VolumeServerOptions struct { folderMaxLimits []int32 idxFolder *string ip *string + id *string publicUrl *string bindIp *string mastersString *string @@ -78,6 +79,7 @@ func init() { v.portGrpc = cmdVolume.Flag.Int("port.grpc", 0, "grpc listen port") v.publicPort = cmdVolume.Flag.Int("port.public", 0, "port opened to public") v.ip = cmdVolume.Flag.String("ip", util.DetectedHostAddress(), "ip or server name, also used as identifier") + v.id = cmdVolume.Flag.String("id", "", "volume server id. If empty, default to ip:port") v.publicUrl = cmdVolume.Flag.String("publicUrl", "", "Publicly accessible address") v.bindIp = cmdVolume.Flag.String("ip.bind", "", "ip address to bind to. 
If empty, default to same as -ip option.") v.mastersString = cmdVolume.Flag.String("master", "localhost:9333", "comma-separated master servers") @@ -253,8 +255,11 @@ func (v VolumeServerOptions) startVolumeServer(volumeFolders, maxVolumeCounts, v volumeNeedleMapKind = storage.NeedleMapLevelDbLarge } + // Determine volume server ID: if not specified, use ip:port + volumeServerId := util.GetVolumeServerId(*v.id, *v.ip, *v.port) + volumeServer := weed_server.NewVolumeServer(volumeMux, publicVolumeMux, - *v.ip, *v.port, *v.portGrpc, *v.publicUrl, + *v.ip, *v.port, *v.portGrpc, *v.publicUrl, volumeServerId, v.folders, v.folderMaxLimits, minFreeSpaces, diskTypes, *v.idxFolder, volumeNeedleMapKind, diff --git a/weed/pb/master.proto b/weed/pb/master.proto index f8049c466..afbf31de9 100644 --- a/weed/pb/master.proto +++ b/weed/pb/master.proto @@ -81,6 +81,7 @@ message Heartbeat { map max_volume_counts = 4; uint32 grpc_port = 20; repeated string location_uuids = 21; + string id = 22; // volume server id, independent of ip:port for stable identification } message HeartbeatResponse { @@ -289,6 +290,7 @@ message DataNodeInfo { string id = 1; map diskInfos = 2; uint32 grpc_port = 3; + string address = 4; // ip:port for connecting to the volume server } message RackInfo { string id = 1; diff --git a/weed/pb/master_pb/master.pb.go b/weed/pb/master_pb/master.pb.go index 19df43d71..41d46fad1 100644 --- a/weed/pb/master_pb/master.pb.go +++ b/weed/pb/master_pb/master.pb.go @@ -44,6 +44,7 @@ type Heartbeat struct { MaxVolumeCounts map[string]uint32 `protobuf:"bytes,4,rep,name=max_volume_counts,json=maxVolumeCounts,proto3" json:"max_volume_counts,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"varint,2,opt,name=value"` GrpcPort uint32 `protobuf:"varint,20,opt,name=grpc_port,json=grpcPort,proto3" json:"grpc_port,omitempty"` LocationUuids []string `protobuf:"bytes,21,rep,name=location_uuids,json=locationUuids,proto3" json:"location_uuids,omitempty"` + Id string `protobuf:"bytes,22,opt,name=id,proto3" json:"id,omitempty"` // volume server id, independent of ip:port for stable identification unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -204,6 +205,13 @@ func (x *Heartbeat) GetLocationUuids() []string { return nil } +func (x *Heartbeat) GetId() string { + if x != nil { + return x.Id + } + return "" +} + type HeartbeatResponse struct { state protoimpl.MessageState `protogen:"open.v1"` VolumeSizeLimit uint64 `protobuf:"varint,1,opt,name=volume_size_limit,json=volumeSizeLimit,proto3" json:"volume_size_limit,omitempty"` @@ -2039,6 +2047,7 @@ type DataNodeInfo struct { Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` DiskInfos map[string]*DiskInfo `protobuf:"bytes,2,rep,name=diskInfos,proto3" json:"diskInfos,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` GrpcPort uint32 `protobuf:"varint,3,opt,name=grpc_port,json=grpcPort,proto3" json:"grpc_port,omitempty"` + Address string `protobuf:"bytes,4,opt,name=address,proto3" json:"address,omitempty"` // ip:port for connecting to the volume server unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -2094,6 +2103,13 @@ func (x *DataNodeInfo) GetGrpcPort() uint32 { return 0 } +func (x *DataNodeInfo) GetAddress() string { + if x != nil { + return x.Address + } + return "" +} + type RackInfo struct { state protoimpl.MessageState `protogen:"open.v1"` Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` @@ -4038,7 +4054,7 @@ var File_master_proto 
protoreflect.FileDescriptor const file_master_proto_rawDesc = "" + "\n" + - "\fmaster.proto\x12\tmaster_pb\"\xc0\a\n" + + "\fmaster.proto\x12\tmaster_pb\"\xd0\a\n" + "\tHeartbeat\x12\x0e\n" + "\x02ip\x18\x01 \x01(\tR\x02ip\x12\x12\n" + "\x04port\x18\x02 \x01(\rR\x04port\x12\x1d\n" + @@ -4063,7 +4079,8 @@ const file_master_proto_rawDesc = "" + "\x10has_no_ec_shards\x18\x13 \x01(\bR\rhasNoEcShards\x12U\n" + "\x11max_volume_counts\x18\x04 \x03(\v2).master_pb.Heartbeat.MaxVolumeCountsEntryR\x0fmaxVolumeCounts\x12\x1b\n" + "\tgrpc_port\x18\x14 \x01(\rR\bgrpcPort\x12%\n" + - "\x0elocation_uuids\x18\x15 \x03(\tR\rlocationUuids\x1aB\n" + + "\x0elocation_uuids\x18\x15 \x03(\tR\rlocationUuids\x12\x0e\n" + + "\x02id\x18\x16 \x01(\tR\x02id\x1aB\n" + "\x14MaxVolumeCountsEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + "\x05value\x18\x02 \x01(\rR\x05value:\x028\x01\"\xcd\x02\n" + @@ -4254,11 +4271,12 @@ const file_master_proto_rawDesc = "" + "\fvolume_infos\x18\x06 \x03(\v2#.master_pb.VolumeInformationMessageR\vvolumeInfos\x12P\n" + "\x0eec_shard_infos\x18\a \x03(\v2*.master_pb.VolumeEcShardInformationMessageR\fecShardInfos\x12.\n" + "\x13remote_volume_count\x18\b \x01(\x03R\x11remoteVolumeCount\x12\x17\n" + - "\adisk_id\x18\t \x01(\rR\x06diskId\"\xd4\x01\n" + + "\adisk_id\x18\t \x01(\rR\x06diskId\"\xee\x01\n" + "\fDataNodeInfo\x12\x0e\n" + "\x02id\x18\x01 \x01(\tR\x02id\x12D\n" + "\tdiskInfos\x18\x02 \x03(\v2&.master_pb.DataNodeInfo.DiskInfosEntryR\tdiskInfos\x12\x1b\n" + - "\tgrpc_port\x18\x03 \x01(\rR\bgrpcPort\x1aQ\n" + + "\tgrpc_port\x18\x03 \x01(\rR\bgrpcPort\x12\x18\n" + + "\aaddress\x18\x04 \x01(\tR\aaddress\x1aQ\n" + "\x0eDiskInfosEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12)\n" + "\x05value\x18\x02 \x01(\v2\x13.master_pb.DiskInfoR\x05value:\x028\x01\"\xf0\x01\n" + diff --git a/weed/pb/server_address.go b/weed/pb/server_address.go index a0aa79ae4..943b85519 100644 --- a/weed/pb/server_address.go +++ b/weed/pb/server_address.go @@ -2,11 +2,12 @@ package pb import ( "fmt" - "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" - "github.com/seaweedfs/seaweedfs/weed/util" "net" "strconv" "strings" + + "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" + "github.com/seaweedfs/seaweedfs/weed/util" ) type ServerAddress string @@ -32,7 +33,12 @@ func NewServerAddressWithGrpcPort(address string, grpcPort int) ServerAddress { } func NewServerAddressFromDataNode(dn *master_pb.DataNodeInfo) ServerAddress { - return NewServerAddressWithGrpcPort(dn.Id, int(dn.GrpcPort)) + // Use Address field if available (new behavior), fall back to Id for backward compatibility + addr := dn.Address + if addr == "" { + addr = dn.Id // backward compatibility: old nodes use ip:port as id + } + return NewServerAddressWithGrpcPort(addr, int(dn.GrpcPort)) } func NewServerAddressFromLocation(dn *master_pb.Location) ServerAddress { diff --git a/weed/server/master_grpc_server.go b/weed/server/master_grpc_server.go index dcf279e1d..e053d9ea7 100644 --- a/weed/server/master_grpc_server.go +++ b/weed/server/master_grpc_server.go @@ -137,8 +137,8 @@ func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServ dcName, rackName := ms.Topo.Configuration.Locate(heartbeat.Ip, heartbeat.DataCenter, heartbeat.Rack) dc := ms.Topo.GetOrCreateDataCenter(dcName) rack := dc.GetOrCreateRack(rackName) - dn = rack.GetOrCreateDataNode(heartbeat.Ip, int(heartbeat.Port), int(heartbeat.GrpcPort), heartbeat.PublicUrl, heartbeat.MaxVolumeCounts) - glog.V(0).Infof("added volume server %d: %v:%d %v", dn.Counter, 
heartbeat.GetIp(), heartbeat.GetPort(), heartbeat.LocationUuids) + dn = rack.GetOrCreateDataNode(heartbeat.Ip, int(heartbeat.Port), int(heartbeat.GrpcPort), heartbeat.PublicUrl, heartbeat.Id, heartbeat.MaxVolumeCounts) + glog.V(0).Infof("added volume server %d: %v (id=%s, ip=%v:%d) %v", dn.Counter, dn.Id(), heartbeat.Id, heartbeat.GetIp(), heartbeat.GetPort(), heartbeat.LocationUuids) uuidlist, err := ms.RegisterUuids(heartbeat) if err != nil { if stream_err := stream.Send(&master_pb.HeartbeatResponse{ diff --git a/weed/server/master_grpc_server_volume.go b/weed/server/master_grpc_server_volume.go index a7ef8e7e9..d00cb5df4 100644 --- a/weed/server/master_grpc_server_volume.go +++ b/weed/server/master_grpc_server_volume.go @@ -253,7 +253,7 @@ func (ms *MasterServer) LookupEcVolume(ctx context.Context, req *master_pb.Looku var locations []*master_pb.Location for _, dn := range shardLocations { locations = append(locations, &master_pb.Location{ - Url: string(dn.Id()), + Url: dn.Url(), PublicUrl: dn.PublicUrl, DataCenter: dn.GetDataCenterId(), }) diff --git a/weed/server/volume_server.go b/weed/server/volume_server.go index 4f8a7fb0d..65909996a 100644 --- a/weed/server/volume_server.go +++ b/weed/server/volume_server.go @@ -55,7 +55,7 @@ type VolumeServer struct { } func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, - port int, grpcPort int, publicUrl string, + port int, grpcPort int, publicUrl string, id string, folders []string, maxCounts []int32, minFreeSpaces []util.MinFreeSpace, diskTypes []types.DiskType, idxFolder string, needleMapKind storage.NeedleMapKind, @@ -114,7 +114,7 @@ func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, vs.checkWithMaster() - vs.store = storage.NewStore(vs.grpcDialOption, ip, port, grpcPort, publicUrl, folders, maxCounts, minFreeSpaces, idxFolder, vs.needleMapKind, diskTypes, ldbTimeout) + vs.store = storage.NewStore(vs.grpcDialOption, ip, port, grpcPort, publicUrl, id, folders, maxCounts, minFreeSpaces, idxFolder, vs.needleMapKind, diskTypes, ldbTimeout) vs.guard = security.NewGuard(whiteList, signingKey, expiresAfterSec, readSigningKey, readExpiresAfterSec) handleStaticResources(adminMux) diff --git a/weed/storage/store.go b/weed/storage/store.go index cc07f8702..30f33d6d9 100644 --- a/weed/storage/store.go +++ b/weed/storage/store.go @@ -63,6 +63,7 @@ type Store struct { Port int GrpcPort int PublicUrl string + Id string // volume server id, independent of ip:port for stable identification Locations []*DiskLocation dataCenter string // optional information, overwriting master setting if exists rack string // optional information, overwriting master setting if exists @@ -76,13 +77,13 @@ type Store struct { } func (s *Store) String() (str string) { - str = fmt.Sprintf("Ip:%s, Port:%d, GrpcPort:%d PublicUrl:%s, dataCenter:%s, rack:%s, connected:%v, volumeSizeLimit:%d", s.Ip, s.Port, s.GrpcPort, s.PublicUrl, s.dataCenter, s.rack, s.connected, s.GetVolumeSizeLimit()) + str = fmt.Sprintf("Id:%s, Ip:%s, Port:%d, GrpcPort:%d PublicUrl:%s, dataCenter:%s, rack:%s, connected:%v, volumeSizeLimit:%d", s.Id, s.Ip, s.Port, s.GrpcPort, s.PublicUrl, s.dataCenter, s.rack, s.connected, s.GetVolumeSizeLimit()) return } -func NewStore(grpcDialOption grpc.DialOption, ip string, port int, grpcPort int, publicUrl string, dirnames []string, maxVolumeCounts []int32, +func NewStore(grpcDialOption grpc.DialOption, ip string, port int, grpcPort int, publicUrl string, id string, dirnames []string, maxVolumeCounts []int32, minFreeSpaces 
[]util.MinFreeSpace, idxFolder string, needleMapKind NeedleMapKind, diskTypes []DiskType, ldbTimeout int64) (s *Store) { - s = &Store{grpcDialOption: grpcDialOption, Port: port, Ip: ip, GrpcPort: grpcPort, PublicUrl: publicUrl, NeedleMapKind: needleMapKind} + s = &Store{grpcDialOption: grpcDialOption, Port: port, Ip: ip, GrpcPort: grpcPort, PublicUrl: publicUrl, Id: id, NeedleMapKind: needleMapKind} s.Locations = make([]*DiskLocation, 0) var wg sync.WaitGroup @@ -414,6 +415,7 @@ func (s *Store) CollectHeartbeat() *master_pb.Heartbeat { Port: uint32(s.Port), GrpcPort: uint32(s.GrpcPort), PublicUrl: s.PublicUrl, + Id: s.Id, MaxVolumeCounts: maxVolumeCounts, MaxFileKey: NeedleIdToUint64(maxFileKey), DataCenter: s.dataCenter, diff --git a/weed/storage/store_load_balancing_test.go b/weed/storage/store_load_balancing_test.go index 15e709d53..35475a6ae 100644 --- a/weed/storage/store_load_balancing_test.go +++ b/weed/storage/store_load_balancing_test.go @@ -31,7 +31,7 @@ func newTestStore(t *testing.T, numDirs int) *Store { diskTypes = append(diskTypes, types.HardDriveType) } - store := NewStore(nil, "localhost", 8080, 18080, "http://localhost:8080", + store := NewStore(nil, "localhost", 8080, 18080, "http://localhost:8080", "", dirs, maxCounts, minFreeSpaces, "", NeedleMapInMemory, diskTypes, 3) // Consume channel messages to prevent blocking diff --git a/weed/topology/data_node.go b/weed/topology/data_node.go index 4f2dbe464..07e00ac0a 100644 --- a/weed/topology/data_node.go +++ b/weed/topology/data_node.go @@ -269,6 +269,7 @@ func (dn *DataNode) ToDataNodeInfo() *master_pb.DataNodeInfo { Id: string(dn.Id()), DiskInfos: make(map[string]*master_pb.DiskInfo), GrpcPort: uint32(dn.GrpcPort), + Address: dn.Url(), // ip:port for connecting to the volume server } for _, c := range dn.Children() { disk := c.(*Disk) diff --git a/weed/topology/rack.go b/weed/topology/rack.go index f526cd84d..1e5c8b632 100644 --- a/weed/topology/rack.go +++ b/weed/topology/rack.go @@ -5,6 +5,7 @@ import ( "strings" "time" + "github.com/seaweedfs/seaweedfs/weed/glog" "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" "github.com/seaweedfs/seaweedfs/weed/storage/types" "github.com/seaweedfs/seaweedfs/weed/util" @@ -34,17 +35,73 @@ func (r *Rack) FindDataNode(ip string, port int) *DataNode { } return nil } -func (r *Rack) GetOrCreateDataNode(ip string, port int, grpcPort int, publicUrl string, maxVolumeCounts map[string]uint32) *DataNode { + +// FindDataNodeById finds a DataNode by its ID using O(1) map lookup +func (r *Rack) FindDataNodeById(id string) *DataNode { + r.RLock() + defer r.RUnlock() + if c, ok := r.children[NodeId(id)]; ok { + return c.(*DataNode) + } + return nil +} + +func (r *Rack) GetOrCreateDataNode(ip string, port int, grpcPort int, publicUrl string, id string, maxVolumeCounts map[string]uint32) *DataNode { r.Lock() defer r.Unlock() - for _, c := range r.children { + + // Normalize the id parameter (trim whitespace) + id = strings.TrimSpace(id) + + // Determine the node ID: use provided id, or fall back to ip:port for backward compatibility + nodeId := util.GetVolumeServerId(id, ip, port) + + // First, try to find by node ID using O(1) map lookup (stable identity) + if c, ok := r.children[NodeId(nodeId)]; ok { dn := c.(*DataNode) - if dn.MatchLocation(ip, port) { - dn.LastSeen = time.Now().Unix() - return dn + // Log if IP or Port changed (e.g., pod rescheduled in K8s) + if dn.Ip != ip || dn.Port != port { + glog.V(0).Infof("DataNode %s address changed from %s:%d to %s:%d", nodeId, dn.Ip, dn.Port, ip, 
port) } + // Update the IP/Port in case they changed + dn.Ip = ip + dn.Port = port + dn.GrpcPort = grpcPort + dn.PublicUrl = publicUrl + dn.LastSeen = time.Now().Unix() + return dn } - dn := NewDataNode(util.JoinHostPort(ip, port)) + + // For backward compatibility: if explicit id was provided, also check by ip:port + // to handle transition from old (ip:port) to new (explicit id) behavior + ipPortId := util.JoinHostPort(ip, port) + if nodeId != ipPortId { + for oldId, c := range r.children { + dn := c.(*DataNode) + if dn.MatchLocation(ip, port) { + // Only transition if the oldId exactly matches ip:port (legacy identification). + // If oldId is different, this is a node with an explicit id that happens to + // reuse the same ip:port - don't incorrectly merge them. + if string(oldId) != ipPortId { + glog.Warningf("Volume server with id %s has ip:port %s which is used by node %s", nodeId, ipPortId, oldId) + continue + } + // Found a legacy node identified by ip:port, transition it to use the new explicit id + glog.V(0).Infof("Volume server %s transitioning id from %s to %s", dn.Url(), oldId, nodeId) + // Re-key the node in the children map with the new id + delete(r.children, oldId) + dn.id = NodeId(nodeId) + r.children[NodeId(nodeId)] = dn + // Update connection info in case they changed + dn.GrpcPort = grpcPort + dn.PublicUrl = publicUrl + dn.LastSeen = time.Now().Unix() + return dn + } + } + } + + dn := NewDataNode(nodeId) dn.Ip = ip dn.Port = port dn.GrpcPort = grpcPort diff --git a/weed/topology/topology_test.go b/weed/topology/topology_test.go index 8515d2f81..e5a8969fc 100644 --- a/weed/topology/topology_test.go +++ b/weed/topology/topology_test.go @@ -34,7 +34,7 @@ func TestHandlingVolumeServerHeartbeat(t *testing.T) { maxVolumeCounts := make(map[string]uint32) maxVolumeCounts[""] = 25 maxVolumeCounts["ssd"] = 12 - dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", maxVolumeCounts) + dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", maxVolumeCounts) { volumeCount := 7 @@ -180,7 +180,7 @@ func TestAddRemoveVolume(t *testing.T) { maxVolumeCounts := make(map[string]uint32) maxVolumeCounts[""] = 25 maxVolumeCounts["ssd"] = 12 - dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", maxVolumeCounts) + dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", maxVolumeCounts) v := storage.VolumeInfo{ Id: needle.VolumeId(1), @@ -218,7 +218,7 @@ func TestVolumeReadOnlyStatusChange(t *testing.T) { rack := dc.GetOrCreateRack("rack1") maxVolumeCounts := make(map[string]uint32) maxVolumeCounts[""] = 25 - dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", maxVolumeCounts) + dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", maxVolumeCounts) // Create a writable volume v := storage.VolumeInfo{ @@ -267,7 +267,7 @@ func TestVolumeReadOnlyAndRemoteStatusChange(t *testing.T) { rack := dc.GetOrCreateRack("rack1") maxVolumeCounts := make(map[string]uint32) maxVolumeCounts[""] = 25 - dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", maxVolumeCounts) + dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", maxVolumeCounts) // Create a writable, local volume v := storage.VolumeInfo{ @@ -331,7 +331,7 @@ func TestListCollections(t *testing.T) { topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 32*1024, 5, false) dc := topo.GetOrCreateDataCenter("dc1") rack := dc.GetOrCreateRack("rack1") - dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", nil) + dn := 
rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", nil) topo.RegisterVolumeLayout(storage.VolumeInfo{ Id: needle.VolumeId(1111), @@ -396,3 +396,112 @@ func TestListCollections(t *testing.T) { }) } } + +func TestDataNodeIdBasedIdentification(t *testing.T) { + topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 32*1024, 5, false) + dc := topo.GetOrCreateDataCenter("dc1") + rack := dc.GetOrCreateRack("rack1") + + maxVolumeCounts := make(map[string]uint32) + maxVolumeCounts[""] = 10 + + // Test 1: Create a DataNode with explicit id + dn1 := rack.GetOrCreateDataNode("10.0.0.1", 8080, 18080, "10.0.0.1:8080", "node-1", maxVolumeCounts) + if string(dn1.Id()) != "node-1" { + t.Errorf("expected node id 'node-1', got '%s'", dn1.Id()) + } + if dn1.Ip != "10.0.0.1" { + t.Errorf("expected ip '10.0.0.1', got '%s'", dn1.Ip) + } + + // Test 2: Same id with different IP should return the same DataNode (K8s pod reschedule scenario) + dn2 := rack.GetOrCreateDataNode("10.0.0.2", 8080, 18080, "10.0.0.2:8080", "node-1", maxVolumeCounts) + if dn1 != dn2 { + t.Errorf("expected same DataNode for same id, got different nodes") + } + // IP should be updated to the new value + if dn2.Ip != "10.0.0.2" { + t.Errorf("expected ip to be updated to '10.0.0.2', got '%s'", dn2.Ip) + } + if dn2.PublicUrl != "10.0.0.2:8080" { + t.Errorf("expected publicUrl to be updated to '10.0.0.2:8080', got '%s'", dn2.PublicUrl) + } + + // Test 3: Different id should create a new DataNode + dn3 := rack.GetOrCreateDataNode("10.0.0.3", 8080, 18080, "10.0.0.3:8080", "node-2", maxVolumeCounts) + if string(dn3.Id()) != "node-2" { + t.Errorf("expected node id 'node-2', got '%s'", dn3.Id()) + } + if dn1 == dn3 { + t.Errorf("expected different DataNode for different id") + } + + // Test 4: Empty id should fall back to ip:port (backward compatibility) + dn4 := rack.GetOrCreateDataNode("10.0.0.4", 8080, 18080, "10.0.0.4:8080", "", maxVolumeCounts) + if string(dn4.Id()) != "10.0.0.4:8080" { + t.Errorf("expected node id '10.0.0.4:8080' for empty id, got '%s'", dn4.Id()) + } + + // Test 5: Same ip:port with empty id should return the same DataNode + dn5 := rack.GetOrCreateDataNode("10.0.0.4", 8080, 18080, "10.0.0.4:8080", "", maxVolumeCounts) + if dn4 != dn5 { + t.Errorf("expected same DataNode for same ip:port with empty id") + } + + // Verify we have 3 unique DataNodes total: + // - node-1 (dn1/dn2 share the same id) + // - node-2 (dn3) + // - 10.0.0.4:8080 (dn4/dn5 share the same ip:port) + children := rack.Children() + if len(children) != 3 { + t.Errorf("expected 3 DataNodes, got %d", len(children)) + } + + // Test 6: Transition from ip:port to explicit id + // First, the node exists with ip:port as id (dn4/dn5) + // Now the same volume server starts sending an explicit id + dn6 := rack.GetOrCreateDataNode("10.0.0.4", 8080, 18080, "10.0.0.4:8080", "node-4-explicit", maxVolumeCounts) + // Should return the same DataNode instance + if dn6 != dn4 { + t.Errorf("expected same DataNode instance during transition") + } + // But the id should now be updated to the explicit id + if string(dn6.Id()) != "node-4-explicit" { + t.Errorf("expected node id to transition to 'node-4-explicit', got '%s'", dn6.Id()) + } + // The node should be re-keyed in the children map + if rack.FindDataNodeById("node-4-explicit") != dn6 { + t.Errorf("expected to find DataNode by new explicit id") + } + // Old ip:port key should no longer work + if rack.FindDataNodeById("10.0.0.4:8080") != nil { + t.Errorf("expected old ip:port id to be removed from children 
map") + } + + // Still 3 unique DataNodes (node-1, node-2, node-4-explicit) + children = rack.Children() + if len(children) != 3 { + t.Errorf("expected 3 DataNodes after transition, got %d", len(children)) + } + + // Test 7: Prevent incorrect transition when a new node reuses ip:port of a node with explicit id + // Scenario: node-1 runs at 10.0.0.1:8080, dies, new node-99 starts at same ip:port + // The transition should NOT happen because node-1 already has an explicit id + dn7 := rack.GetOrCreateDataNode("10.0.0.1", 8080, 18080, "10.0.0.1:8080", "node-99", maxVolumeCounts) + // Should create a NEW DataNode, not reuse node-1 + if dn7 == dn1 { + t.Errorf("expected new DataNode for node-99, got reused node-1") + } + if string(dn7.Id()) != "node-99" { + t.Errorf("expected node id 'node-99', got '%s'", dn7.Id()) + } + // node-1 should still exist with its original id + if rack.FindDataNodeById("node-1") == nil { + t.Errorf("node-1 should still exist") + } + // Now we have 4 DataNodes + children = rack.Children() + if len(children) != 4 { + t.Errorf("expected 4 DataNodes, got %d", len(children)) + } +} diff --git a/weed/util/network.go b/weed/util/network.go index 328808dbc..f7dbeebb7 100644 --- a/weed/util/network.go +++ b/weed/util/network.go @@ -64,3 +64,14 @@ func JoinHostPort(host string, port int) string { } return net.JoinHostPort(host, portStr) } + +// GetVolumeServerId returns the volume server ID. +// If id is provided (non-empty after trimming), use it as the identifier. +// Otherwise, fall back to ip:port for backward compatibility. +func GetVolumeServerId(id, ip string, port int) string { + volumeServerId := strings.TrimSpace(id) + if volumeServerId == "" { + volumeServerId = JoinHostPort(ip, port) + } + return volumeServerId +} -- cgit v1.2.3 From e9da64f62a141d06dd1cd913a739b91cad016dce Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Tue, 2 Dec 2025 23:19:14 -0800 Subject: fix: volume server healthz now checks local conditions only (#7610) This fixes issue #6823 where a single volume server shutdown would cause other healthy volume servers to fail their health checks and get restarted by Kubernetes, causing a cascading failure. Previously, the healthz handler checked if all replicated volumes could reach their remote replicas via GetWritableRemoteReplications(). When a volume server went down, the master would remove it from the volume location list. Other volume servers would then fail their healthz checks because they couldn't find all required replicas, causing Kubernetes to restart them. The healthz endpoint now only checks local conditions: 1. Is the server shutting down? 2. Is the server heartbeating with the master? This follows the principle that a health check should only verify the health of THIS server, not the overall cluster state. Fixes #6823 --- weed/server/volume_server_handlers_admin.go | 31 +++++++++++++++++------------ weed/storage/store.go | 4 ++++ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/weed/server/volume_server_handlers_admin.go b/weed/server/volume_server_handlers_admin.go index ec6490662..a54369277 100644 --- a/weed/server/volume_server_handlers_admin.go +++ b/weed/server/volume_server_handlers_admin.go @@ -4,28 +4,33 @@ import ( "net/http" "path/filepath" - "github.com/seaweedfs/seaweedfs/weed/topology" "github.com/seaweedfs/seaweedfs/weed/util/version" "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" "github.com/seaweedfs/seaweedfs/weed/stats" ) +// healthzHandler checks the local health of the volume server. 
+// It only checks local conditions to avoid cascading failures when remote +// volume servers go down. Previously, this handler checked if all replicated +// volumes could reach their remote replicas, which caused healthy volume +// servers to fail health checks when a peer went down. +// See https://github.com/seaweedfs/seaweedfs/issues/6823 func (vs *VolumeServer) healthzHandler(w http.ResponseWriter, r *http.Request) { w.Header().Set("Server", "SeaweedFS Volume "+version.VERSION) - volumeInfos := vs.store.VolumeInfos() - for _, vinfo := range volumeInfos { - if len(vinfo.Collection) == 0 { - continue - } - if vinfo.ReplicaPlacement.GetCopyCount() > 1 { - _, err := topology.GetWritableRemoteReplications(vs.store, vs.grpcDialOption, vinfo.Id, vs.GetMaster) - if err != nil { - w.WriteHeader(http.StatusServiceUnavailable) - return - } - } + + // Check if the server is shutting down + if vs.store.IsStopping() { + w.WriteHeader(http.StatusServiceUnavailable) + return } + + // Check if we can communicate with master + if !vs.isHeartbeating { + w.WriteHeader(http.StatusServiceUnavailable) + return + } + w.WriteHeader(http.StatusOK) } diff --git a/weed/storage/store.go b/weed/storage/store.go index 30f33d6d9..7a336d1ff 100644 --- a/weed/storage/store.go +++ b/weed/storage/store.go @@ -469,6 +469,10 @@ func (s *Store) SetStopping() { } } +func (s *Store) IsStopping() bool { + return s.isStopping +} + func (s *Store) LoadNewVolumes() { for _, location := range s.Locations { location.loadExistingVolumes(s.NeedleMapKind, 0) -- cgit v1.2.3 From 3bcadc9f9052d40ab38dc0cc407065ab5cc10061 Mon Sep 17 00:00:00 2001 From: Xiao Wei <403828237@qq.com> Date: Thu, 4 Dec 2025 02:23:59 +0800 Subject: fix: update getVersioningState to signal non-existent buckets with Er… (#7613) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: update getVersioningState to signal non-existent buckets with ErrNotFound This change modifies the getVersioningState function to return filer_pb.ErrNotFound when a requested bucket does not exist, allowing callers to handle the situation appropriately, such as auto-creating the bucket in PUT handlers. This improves error handling and clarity in the API's behavior regarding bucket existence. * Update weed/s3api/s3api_bucket_config.go Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --------- Co-authored-by: 洪晓威 Co-authored-by: Chris Lu Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- weed/s3api/s3api_bucket_config.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/weed/s3api/s3api_bucket_config.go b/weed/s3api/s3api_bucket_config.go index 00449d80a..a10374339 100644 --- a/weed/s3api/s3api_bucket_config.go +++ b/weed/s3api/s3api_bucket_config.go @@ -519,7 +519,9 @@ func (s3a *S3ApiServer) getVersioningState(bucket string) (string, error) { config, errCode := s3a.getBucketConfig(bucket) if errCode != s3err.ErrNone { if errCode == s3err.ErrNoSuchBucket { - return "", nil + // Signal to callers that the bucket does not exist so they can + // decide whether to auto-create it (e.g., in PUT handlers). 
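+ // A hypothetical caller-side sketch (illustration only, not part of this
+ // change) of how a PUT handler can react to this sentinel error:
+ //
+ //	state, err := s3a.getVersioningState(bucket)
+ //	if errors.Is(err, filer_pb.ErrNotFound) {
+ //		// bucket is missing: auto-create it, then re-read the state
+ //	}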
+ return "", filer_pb.ErrNotFound } glog.Errorf("getVersioningState: failed to get bucket config for %s: %v", bucket, errCode) return "", fmt.Errorf("failed to get bucket config: %v", errCode) -- cgit v1.2.3 From d59cc1b09f3eff5771bbf66753b2c4c9fbe50c78 Mon Sep 17 00:00:00 2001 From: Lisandro Pin Date: Wed, 3 Dec 2025 20:33:35 +0100 Subject: Fix handling of fixed read-only volumes for `volume.check.disk`. (#7612) There's unfortunately no way to tell whether a volume is flagged read-only because it got full, or because it is faulty. To address this, modify the check logic so all read-only volumes are processed; if no changes are written (i.e. the volume is healthy) it is kept as read-only. Volumes which are modified in this process are deemed fixed, and switched to writable. --- weed/shell/command_volume_check_disk.go | 55 +++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/weed/shell/command_volume_check_disk.go b/weed/shell/command_volume_check_disk.go index dbb64e239..4d775000f 100644 --- a/weed/shell/command_volume_check_disk.go +++ b/weed/shell/command_volume_check_disk.go @@ -64,9 +64,9 @@ func (c *commandVolumeCheckDisk) Help() string { append entries in B and not in A to A optionally, for each non-writable volume replica A - if volume is not full + select a writable volume replica B + if entries in A don't match B prune late volume entries not matching its index file - select a writable volume replica B append missing entries from B into A mark the volume as writable (healthy) @@ -179,9 +179,16 @@ func (vcd *volumeCheckDisk) checkWritableVolumes(volumeReplicas map[uint32][]*Vo writableReplicas = append(writableReplicas[:1], writableReplicas[2:]...) continue } - if err := vcd.syncTwoReplicas(a, b, true); err != nil { - vcd.write("sync volume %d on %s and %s: %v", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, err) + + modified, err := vcd.syncTwoReplicas(a, b, true) + if err != nil { + vcd.write("failed to sync volume %d on %s and %s: %v", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, err) + } else { + if modified { + vcd.write("synced %s and %s for volume %d", a.location.dataNode.Id, b.location.dataNode.Id, a.info.Id) + } } + // always choose the larger volume to be the source if a.info.FileCount > b.info.FileCount { writableReplicas = append(writableReplicas[:1], writableReplicas[2:]...) @@ -280,19 +287,25 @@ func (vcd *volumeCheckDisk) checkReadOnlyVolumes(volumeReplicas map[uint32][]*Vo return err } - // ...fix it... - // TODO: test whether syncTwoReplicas() is enough to prune garbage entries on broken volumes. - if err := vcd.syncTwoReplicas(source, r, false); err != nil { - vcd.write("sync read-only volume %d on %s from %s: %v\n", vid, r.location.dataNode.Id, source.location.dataNode.Id, err) + // ...try to fix it... + // TODO: test whether syncTwoReplicas() is enough to prune garbage entries on broken volumes... + modified, err := vcd.syncTwoReplicas(source, r, false) + if err != nil { + vcd.write("sync read-only volume %d on %s from %s: %v", vid, r.location.dataNode.Id, source.location.dataNode.Id, err) - // ...or revert it back to read-only, if something went wrong. - // TODO: we should keep unchanged volumes as read-only, so we don't modify valid volumes which are full. 
if roErr := vcd.makeVolumeReadonly(vid, r); roErr != nil { - return fmt.Errorf("failed to make volume %d on %s readonly after: %v: %v", vid, r.location.dataNode.Id, err, roErr) + return fmt.Errorf("failed to revert volume %d on %s to readonly after: %v: %v", vid, r.location.dataNode.Id, err, roErr) } - vcd.write("volume %d on %s is now read-only\n", vid, r.location.dataNode.Id) - return err + } else { + if modified { + vcd.write("volume %d on %s is now synced to %s and writable", vid, r.location.dataNode.Id, source.location.dataNode.Id) + } else { + // ...or restore back to read-only, if no changes were made. + if err := vcd.makeVolumeReadonly(vid, r); err != nil { + return fmt.Errorf("failed to revert volume %d on %s to readonly: %v", vid, r.location.dataNode.Id, err) + } + } } return nil @@ -411,35 +424,39 @@ func (vcd *volumeCheckDisk) shouldSkipVolume(a, b *VolumeReplica) (bool, error) // syncTwoReplicas attempts to sync all entries from a source volume replica into a target. If bi-directional mode // is enabled, changes from target are also synced back into the source. -func (vcd *volumeCheckDisk) syncTwoReplicas(source, target *VolumeReplica, bidi bool) (err error) { +// Returns true if source and/or target were modified, false otherwise. +func (vcd *volumeCheckDisk) syncTwoReplicas(source, target *VolumeReplica, bidi bool) (modified bool, err error) { sourceHasChanges, targetHasChanges := true, true const maxIterations = 5 iteration := 0 + modified = false + for (sourceHasChanges || targetHasChanges) && iteration < maxIterations { iteration++ vcd.writeVerbose("sync iteration %d/%d for volume %d", iteration, maxIterations, source.info.Id) prevSourceHasChanges, prevTargetHasChanges := sourceHasChanges, targetHasChanges if sourceHasChanges, targetHasChanges, err = vcd.checkBoth(source, target, bidi); err != nil { - return err + return modified, err } + modified = modified || sourceHasChanges || targetHasChanges // Detect if we're stuck in a loop with no progress if iteration > 1 && prevSourceHasChanges == sourceHasChanges && prevTargetHasChanges == targetHasChanges && (sourceHasChanges || targetHasChanges) { vcd.write("volume %d sync is not making progress between %s and %s after iteration %d, stopping to prevent infinite loop", source.info.Id, source.location.dataNode.Id, target.location.dataNode.Id, iteration) - return fmt.Errorf("sync not making progress after %d iterations", iteration) + return modified, fmt.Errorf("sync not making progress after %d iterations", iteration) } } if iteration >= maxIterations && (sourceHasChanges || targetHasChanges) { vcd.write("volume %d sync reached maximum iterations (%d) between %s and %s, may need manual intervention", source.info.Id, maxIterations, source.location.dataNode.Id, target.location.dataNode.Id) - return fmt.Errorf("reached maximum sync iterations (%d)", maxIterations) + return modified, fmt.Errorf("reached maximum sync iterations (%d)", maxIterations) } - return nil + return modified, nil } // checkBoth performs a sync between source and target volume replicas. If bi-directional mode is enabled, changes from target are also synced back into the source. 
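The (modified, err) pair gives callers a three-way outcome, which is what the read-only handling above relies on. A minimal caller sketch (hypothetical variable names, mirroring the logic of checkReadOnlyVolumes; not part of the diff itself):

```go
// Sketch only: consuming the new return contract of syncTwoReplicas.
modified, err := vcd.syncTwoReplicas(source, readOnlyReplica, false)
switch {
case err != nil:
	// Sync failed: revert the replica to read-only so a faulty
	// volume is not left writable.
case modified:
	// Entries were pruned or appended: the volume was broken and is
	// now repaired, so it stays writable.
default:
	// Nothing was written: the volume was healthy (e.g., merely
	// full), so restore its read-only flag.
}
```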
@@ -628,7 +645,7 @@ func (vcd *volumeCheckDisk) copyVolumeIndexFile(collection string, volumeId uint copyFileClient, err := volumeServerClient.CopyFile(context.Background(), &volume_server_pb.CopyFileRequest{ VolumeId: volumeId, - Ext: ".idx", + Ext: ext, CompactionRevision: math.MaxUint32, StopOffset: math.MaxInt64, Collection: collection, -- cgit v1.2.3 From e361daa7547556c69e3b7691b3254d8ddc4a2b3c Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Wed, 3 Dec 2025 13:42:05 -0800 Subject: fix: SFTP HomeDir path translation for user operations (#7611) * fix: SFTP HomeDir path translation for user operations When users have a non-root HomeDir (e.g., '/sftp/user'), their SFTP operations should be relative to that directory. Previously, when a user uploaded to '/' via SFTP, the path was not translated to their home directory, causing 'permission denied for / for permission write'. This fix adds a toAbsolutePath() method that implements chroot-like behavior where the user's HomeDir becomes their root. All file and directory operations now translate paths through this method. Example: User with HomeDir='/sftp/user' uploading to '/' now correctly maps to '/sftp/user'. Fixes: https://github.com/seaweedfs/seaweedfs/issues/7470 * test: add SFTP integration tests Add comprehensive integration tests for the SFTP server including: - HomeDir path translation tests (verifies fix for issue #7470) - Basic file upload/download operations - Directory operations (mkdir, rmdir, list) - Large file handling (1MB test) - File rename operations - Stat/Lstat operations - Path edge cases (trailing slashes, .., unicode filenames) - Admin root access verification The test framework starts a complete SeaweedFS cluster with: - Master server - Volume server - Filer server - SFTP server with test user credentials Test users are configured in testdata/userstore.json: - admin: HomeDir=/ with full access - testuser: HomeDir=/sftp/testuser with access to home - readonly: HomeDir=/public with read-only access * fix: correct SFTP HomeDir path translation and add CI Fix path.Join issue where paths starting with '/' weren't joined correctly. path.Join('/sftp/user', '/file') returns '/file' instead of '/sftp/user/file'. Now we strip the leading '/' before joining. Test improvements: - Update go.mod to Go 1.24 - Fix weed binary discovery to prefer local build over PATH - Add stabilization delay after service startup - All 8 SFTP integration tests pass locally Add GitHub Actions workflow for SFTP tests: - Runs on push/PR affecting sftpd code or tests - Tests HomeDir path translation, file ops, directory ops - Covers issue #7470 fix verification * security: update golang.org/x/crypto to v0.45.0 Addresses security vulnerability in golang.org/x/crypto < 0.45.0 * security: use proper SSH host key verification in tests Replace ssh.InsecureIgnoreHostKey() with ssh.FixedHostKey() that verifies the server's host key matches the known test key we generated. This addresses CodeQL warning go/insecure-hostkeycallback. Also updates go.mod to specify go 1.24.0 explicitly. * security: fix path traversal vulnerability in SFTP toAbsolutePath The previous implementation had a critical security vulnerability: - Path traversal via '../..' could escape the HomeDir chroot jail - Absolute paths were not correctly prefixed with HomeDir The fix: 1. Concatenate HomeDir with userPath directly, then clean 2. Add security check to ensure final path stays within HomeDir 3. 
If traversal detected, safely return HomeDir instead Also adds path traversal prevention tests to verify the fix. * fix: address PR review comments 1. Fix SkipCleanup check to use actual test config instead of default - Added skipCleanup field to SftpTestFramework struct - Store config.SkipCleanup during Setup() - Use f.skipCleanup in Cleanup() instead of DefaultTestConfig() 2. Fix path prefix check false positive in mkdir - Changed from strings.HasPrefix(absPath, fs.user.HomeDir) - To: absPath == fs.user.HomeDir || strings.HasPrefix(absPath, fs.user.HomeDir+"/") - Prevents matching partial directory names (e.g., /sftp/username when HomeDir is /sftp/user) * fix: check write permission on parent dir for mkdir Aligns makeDir's permission check with newFileWriter for consistency. To create a directory, a user needs write permission on the parent directory, not mkdir permission on the new directory path. * fix: refine SFTP path traversal logic and tests 1. Refine toAbsolutePath: - Use path.Join with strings.TrimPrefix for idiomatic path construction - Return explicit error on path traversal attempt instead of clamping - Updated all call sites to handle the error 2. Add Unit Tests: - Added sftp_server_test.go to verify toAbsolutePath logic - Covers normal paths, root path, and various traversal attempts 3. Update Integration Tests: - Updated PathTraversalPrevention test to reflect that standard SFTP clients sanitize paths before sending. The test now verifies successful containment within the jail rather than blocking (since the server receives a clean path). - The server-side blocking is verified by the new unit tests. 4. Makefile: - Removed -v from default test target * fix: address PR comments on tests and makefile 1. Enhanced Unit Tests: - Added edge cases (empty path, multiple slashes, trailing slash) to sftp_server_test.go 2. Makefile Improvements: - Added 'all' target as default entry point 3. Code Clarity: - Added comment to mkdir permission check explaining defensive nature of HomeDir check * fix: address PR review comments on permissions and tests 1. Security: - Added write permission check on target directory in renameEntry 2. Logging: - Changed dispatch log verbosity from V(0) to V(1) 3. Testing: - Updated Makefile .PHONY targets - Added unit test cases for empty/root HomeDir behavior in toAbsolutePath * fix: set SFTP starting directory to virtual root 1. Critical Fix: - Changed sftp.WithStartDirectory from fs.user.HomeDir to '/' - Prevents double-prefixing when toAbsolutePath translates paths - Users now correctly start at their virtual root which maps to HomeDir 2. Test Improvements: - Use pointer for homeDir in tests for clearer nil vs empty distinction * fix: clean HomeDir at config load time Clean HomeDir path when loading users from JSON config. This handles trailing slashes and other path anomalies at the source, ensuring consistency throughout the codebase and avoiding repeated cleaning on every toAbsolutePath call. * test: strengthen assertions and add error checking in SFTP tests 1. Add error checking for cleanup operations in TestWalk 2. Strengthen cwd assertion to expect '/' explicitly in TestCurrentWorkingDirectory 3. 
Add error checking for cleanup in PathTraversalPrevention test --- .github/workflows/sftp-tests.yml | 92 ++++++ test/sftp/Makefile | 41 +++ test/sftp/README.md | 91 ++++++ test/sftp/basic_test.go | 652 ++++++++++++++++++++++++++++++++++++++ test/sftp/framework.go | 423 +++++++++++++++++++++++++ test/sftp/go.mod | 17 + test/sftp/go.sum | 64 ++++ test/sftp/testdata/userstore.json | 36 +++ weed/sftpd/sftp_file_writer.go | 5 +- weed/sftpd/sftp_filer.go | 82 +++-- weed/sftpd/sftp_server.go | 24 ++ weed/sftpd/sftp_server_test.go | 103 ++++++ weed/sftpd/sftp_service.go | 4 +- weed/sftpd/user/filestore.go | 5 + 14 files changed, 1607 insertions(+), 32 deletions(-) create mode 100644 .github/workflows/sftp-tests.yml create mode 100644 test/sftp/Makefile create mode 100644 test/sftp/README.md create mode 100644 test/sftp/basic_test.go create mode 100644 test/sftp/framework.go create mode 100644 test/sftp/go.mod create mode 100644 test/sftp/go.sum create mode 100644 test/sftp/testdata/userstore.json create mode 100644 weed/sftpd/sftp_server_test.go diff --git a/.github/workflows/sftp-tests.yml b/.github/workflows/sftp-tests.yml new file mode 100644 index 000000000..d2ec47eb4 --- /dev/null +++ b/.github/workflows/sftp-tests.yml @@ -0,0 +1,92 @@ +name: "SFTP Integration Tests" + +on: + push: + branches: [ master, main ] + paths: + - 'weed/sftpd/**' + - 'weed/command/sftp.go' + - 'test/sftp/**' + - '.github/workflows/sftp-tests.yml' + pull_request: + branches: [ master, main ] + paths: + - 'weed/sftpd/**' + - 'weed/command/sftp.go' + - 'test/sftp/**' + - '.github/workflows/sftp-tests.yml' + +concurrency: + group: ${{ github.head_ref }}/sftp-tests + cancel-in-progress: true + +permissions: + contents: read + +env: + GO_VERSION: '1.24' + TEST_TIMEOUT: '15m' + +jobs: + sftp-integration: + name: SFTP Integration Testing + runs-on: ubuntu-22.04 + timeout-minutes: 20 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go ${{ env.GO_VERSION }} + uses: actions/setup-go@v5 + with: + go-version: ${{ env.GO_VERSION }} + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y openssh-client + + - name: Build SeaweedFS + run: | + cd weed + go build -o weed . + chmod +x weed + ./weed version + + - name: Run SFTP Integration Tests + run: | + cd test/sftp + + echo "🧪 Running SFTP integration tests..." + echo "============================================" + + # Install test dependencies + go mod download + + # Run all SFTP tests + go test -v -timeout=${{ env.TEST_TIMEOUT }} ./... + + echo "============================================" + echo "✅ SFTP integration tests completed" + + - name: Test Summary + if: always() + run: | + echo "## 🔐 SFTP Integration Test Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Test Coverage" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **HomeDir Path Translation**: User home directory mapping (fixes #7470)" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **File Operations**: Upload, download, delete" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **Directory Operations**: Create, list, remove" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **Large File Handling**: 1MB+ file support" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **Path Edge Cases**: Unicode, trailing slashes, .. 
paths" >> $GITHUB_STEP_SUMMARY + echo "- ✅ **Admin Access**: Root user verification" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Test Configuration" >> $GITHUB_STEP_SUMMARY + echo "| User | HomeDir | Permissions |" >> $GITHUB_STEP_SUMMARY + echo "|------|---------|-------------|" >> $GITHUB_STEP_SUMMARY + echo "| admin | / | Full access |" >> $GITHUB_STEP_SUMMARY + echo "| testuser | /sftp/testuser | Home directory only |" >> $GITHUB_STEP_SUMMARY + echo "| readonly | /public | Read-only |" >> $GITHUB_STEP_SUMMARY + diff --git a/test/sftp/Makefile b/test/sftp/Makefile new file mode 100644 index 000000000..bc46dd3ce --- /dev/null +++ b/test/sftp/Makefile @@ -0,0 +1,41 @@ +.PHONY: all build test test-verbose test-short test-homedir test-debug clean deps tidy + +all: build test + +# Build the weed binary first +build: + cd ../../weed && go build -o weed . + +# Install test dependencies +deps: + go mod download + +# Run all tests +test: build deps + go test -timeout 5m ./... + +# Run tests with verbose output +test-verbose: build deps + go test -v -timeout 5m ./... + +# Run quick tests only (skip integration tests) +test-short: deps + go test -short -v ./... + +# Run specific test +test-homedir: build deps + go test -v -timeout 5m -run TestHomeDirPathTranslation ./... + +# Run tests with debug output from SeaweedFS +test-debug: build deps + go test -v -timeout 5m ./... 2>&1 | tee test.log + +# Clean up test artifacts +clean: + rm -f test.log + go clean -testcache + +# Update go.sum +tidy: + go mod tidy + diff --git a/test/sftp/README.md b/test/sftp/README.md new file mode 100644 index 000000000..e2908f166 --- /dev/null +++ b/test/sftp/README.md @@ -0,0 +1,91 @@ +# SeaweedFS SFTP Integration Tests + +This directory contains integration tests for the SeaweedFS SFTP server. + +## Prerequisites + +1. Build the SeaweedFS binary: + ```bash + cd ../../weed + go build -o weed . + ``` + +2. Ensure `ssh-keygen` is available (for generating test SSH host keys) + +## Running Tests + +### Run all tests +```bash +make test +``` + +### Run tests with verbose output +```bash +make test-verbose +``` + +### Run a specific test +```bash +go test -v -run TestHomeDirPathTranslation +``` + +### Skip long-running tests +```bash +go test -short ./... +``` + +## Test Structure + +- `framework.go` - Test framework that starts SeaweedFS cluster with SFTP +- `basic_test.go` - Basic SFTP operation tests including: + - HomeDir path translation (fixes issue #7470) + - File upload/download + - Directory operations + - Large file handling + - Edge cases + +## Test Configuration + +Tests use `testdata/userstore.json` which defines test users: + +| Username | Password | HomeDir | Permissions | +|----------|----------|---------|-------------| +| admin | adminpassword | / | Full access | +| testuser | testuserpassword | /sftp/testuser | Full access to home | +| readonly | readonlypassword | /public | Read-only | + +## Key Tests + +### TestHomeDirPathTranslation + +Tests the fix for [issue #7470](https://github.com/seaweedfs/seaweedfs/issues/7470) where +users with a non-root HomeDir (e.g., `/sftp/testuser`) could not upload files to `/` +because the path wasn't being translated to their home directory. + +The test verifies: +- Uploading to `/` correctly maps to the user's HomeDir +- Creating directories at `/` works +- Listing `/` shows the user's home directory contents +- All path operations respect the HomeDir translation + +## Debugging + +To debug test failures: + +1. 
Enable verbose output: + ```bash + go test -v -run TestName + ``` + +2. Keep test artifacts (don't cleanup): + ```go + config := DefaultTestConfig() + config.SkipCleanup = true + ``` + +3. Enable debug logging: + ```go + config := DefaultTestConfig() + config.EnableDebug = true + ``` + diff --git a/test/sftp/basic_test.go b/test/sftp/basic_test.go new file mode 100644 index 000000000..e5ffe90d1 --- /dev/null +++ b/test/sftp/basic_test.go @@ -0,0 +1,652 @@ +package sftp + +import ( + "bytes" + "io" + "path" + "testing" + + "github.com/stretchr/testify/require" +) + +// TestHomeDirPathTranslation tests that SFTP operations correctly translate +// paths relative to the user's HomeDir. +// This is the fix for https://github.com/seaweedfs/seaweedfs/issues/7470 +func TestHomeDirPathTranslation(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + // Test with user "testuser" who has HomeDir="/sftp/testuser" + // When they upload to "/", it should actually go to "/sftp/testuser" + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + // Test 1: Upload file to "/" (should map to /sftp/testuser/) + t.Run("UploadToRoot", func(t *testing.T) { + testContent := []byte("Hello from SFTP test!") + filename := "test_upload.txt" + + // Create file at "/" from user's perspective + file, err := sftpClient.Create("/" + filename) + require.NoError(t, err, "should be able to create file at /") + + _, err = file.Write(testContent) + require.NoError(t, err, "should be able to write to file") + err = file.Close() + require.NoError(t, err, "should be able to close file") + + // Verify file exists and has correct content + readFile, err := sftpClient.Open("/" + filename) + require.NoError(t, err, "should be able to open file") + defer readFile.Close() + + content, err := io.ReadAll(readFile) + require.NoError(t, err, "should be able to read file") + require.Equal(t, testContent, content, "file content should match") + + // Clean up + err = sftpClient.Remove("/" + filename) + require.NoError(t, err, "should be able to remove file") + }) + + // Test 2: Create directory at "/" (should map to /sftp/testuser/) + t.Run("CreateDirAtRoot", func(t *testing.T) { + dirname := "test_dir" + + err := sftpClient.Mkdir("/" + dirname) + require.NoError(t, err, "should be able to create directory at /") + + // Verify directory exists + info, err := sftpClient.Stat("/" + dirname) + require.NoError(t, err, "should be able to stat directory") + require.True(t, info.IsDir(), "should be a directory") + + // Clean up + err = sftpClient.RemoveDirectory("/" + dirname) + require.NoError(t, err, "should be able to remove directory") + }) + + // Test 3: List directory at "/" (should list /sftp/testuser/) + t.Run("ListRoot", func(t *testing.T) { + // Create a test file first + testContent := []byte("list test content") + filename := "list_test.txt" + + file, err := sftpClient.Create("/" + filename) + require.NoError(t, err) + _, err = file.Write(testContent) + require.NoError(t, err) + file.Close() + + // List root directory + files, err := sftpClient.ReadDir("/") + require.NoError(t, err, "should be able to list root directory") + + // Should find 
our test file + found := false + for _, f := range files { + if f.Name() == filename { + found = true + break + } + } + require.True(t, found, "should find test file in listing") + + // Clean up + err = sftpClient.Remove("/" + filename) + require.NoError(t, err) + }) + + // Test 4: Nested directory operations + t.Run("NestedOperations", func(t *testing.T) { + // Create nested directory structure + err := sftpClient.MkdirAll("/nested/dir/structure") + require.NoError(t, err, "should be able to create nested directories") + + // Create file in nested directory + testContent := []byte("nested file content") + file, err := sftpClient.Create("/nested/dir/structure/file.txt") + require.NoError(t, err) + _, err = file.Write(testContent) + require.NoError(t, err) + file.Close() + + // Verify file exists + readFile, err := sftpClient.Open("/nested/dir/structure/file.txt") + require.NoError(t, err) + content, err := io.ReadAll(readFile) + require.NoError(t, err) + readFile.Close() + require.Equal(t, testContent, content) + + // Clean up + err = sftpClient.Remove("/nested/dir/structure/file.txt") + require.NoError(t, err) + err = sftpClient.RemoveDirectory("/nested/dir/structure") + require.NoError(t, err) + err = sftpClient.RemoveDirectory("/nested/dir") + require.NoError(t, err) + err = sftpClient.RemoveDirectory("/nested") + require.NoError(t, err) + }) + + // Test 5: Rename operation + t.Run("RenameFile", func(t *testing.T) { + testContent := []byte("rename test content") + + file, err := sftpClient.Create("/original.txt") + require.NoError(t, err) + _, err = file.Write(testContent) + require.NoError(t, err) + file.Close() + + // Rename file + err = sftpClient.Rename("/original.txt", "/renamed.txt") + require.NoError(t, err, "should be able to rename file") + + // Verify old file doesn't exist + _, err = sftpClient.Stat("/original.txt") + require.Error(t, err, "original file should not exist") + + // Verify new file exists with correct content + readFile, err := sftpClient.Open("/renamed.txt") + require.NoError(t, err, "renamed file should exist") + content, err := io.ReadAll(readFile) + require.NoError(t, err) + readFile.Close() + require.Equal(t, testContent, content) + + // Clean up + err = sftpClient.Remove("/renamed.txt") + require.NoError(t, err) + }) +} + +// TestAdminRootAccess tests that admin user with HomeDir="/" can access everything +func TestAdminRootAccess(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + // Connect as admin with HomeDir="/" + sftpClient, sshConn, err := fw.ConnectSFTP("admin", "adminpassword") + require.NoError(t, err, "failed to connect as admin") + defer sshConn.Close() + defer sftpClient.Close() + + // Admin should be able to create directories anywhere + t.Run("CreateAnyDirectory", func(t *testing.T) { + // Create the user's home directory structure + err := sftpClient.MkdirAll("/sftp/testuser") + require.NoError(t, err, "admin should be able to create any directory") + + // Create file in that directory + testContent := []byte("admin created this") + file, err := sftpClient.Create("/sftp/testuser/admin_file.txt") + require.NoError(t, err) + _, err = file.Write(testContent) + require.NoError(t, err) + file.Close() + + // Verify file exists + info, err := 
sftpClient.Stat("/sftp/testuser/admin_file.txt") + require.NoError(t, err) + require.False(t, info.IsDir()) + + // Clean up + err = sftpClient.Remove("/sftp/testuser/admin_file.txt") + require.NoError(t, err) + }) +} + +// TestLargeFileUpload tests uploading larger files through SFTP +func TestLargeFileUpload(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + // Create a 1MB file + t.Run("Upload1MB", func(t *testing.T) { + size := 1024 * 1024 // 1MB + testData := bytes.Repeat([]byte("A"), size) + + file, err := sftpClient.Create("/large_file.bin") + require.NoError(t, err) + n, err := file.Write(testData) + require.NoError(t, err) + require.Equal(t, size, n) + file.Close() + + // Verify file size + info, err := sftpClient.Stat("/large_file.bin") + require.NoError(t, err) + require.Equal(t, int64(size), info.Size()) + + // Verify content + readFile, err := sftpClient.Open("/large_file.bin") + require.NoError(t, err) + content, err := io.ReadAll(readFile) + require.NoError(t, err) + readFile.Close() + require.Equal(t, testData, content) + + // Clean up + err = sftpClient.Remove("/large_file.bin") + require.NoError(t, err) + }) +} + +// TestStatOperations tests Stat and Lstat operations +func TestStatOperations(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + // Create a test file + testContent := []byte("stat test content") + file, err := sftpClient.Create("/stat_test.txt") + require.NoError(t, err) + _, err = file.Write(testContent) + require.NoError(t, err) + file.Close() + + t.Run("StatFile", func(t *testing.T) { + info, err := sftpClient.Stat("/stat_test.txt") + require.NoError(t, err) + require.Equal(t, "stat_test.txt", info.Name()) + require.Equal(t, int64(len(testContent)), info.Size()) + require.False(t, info.IsDir()) + }) + + t.Run("StatDirectory", func(t *testing.T) { + err := sftpClient.Mkdir("/stat_dir") + require.NoError(t, err) + + info, err := sftpClient.Stat("/stat_dir") + require.NoError(t, err) + require.Equal(t, "stat_dir", info.Name()) + require.True(t, info.IsDir()) + + // Clean up + err = sftpClient.RemoveDirectory("/stat_dir") + require.NoError(t, err) + }) + + t.Run("StatRoot", func(t *testing.T) { + // Should be able to stat "/" which maps to user's home directory + info, err := sftpClient.Stat("/") + require.NoError(t, err, "should be able to stat root (home) directory") + require.True(t, info.IsDir(), "root should be a directory") + }) + + // Clean up + err = sftpClient.Remove("/stat_test.txt") + require.NoError(t, err) +} + +// TestWalk tests walking directory trees +func TestWalk(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in 
short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + // Create directory structure + err = sftpClient.MkdirAll("/walk/a/b") + require.NoError(t, err) + err = sftpClient.MkdirAll("/walk/c") + require.NoError(t, err) + + // Create files + for _, p := range []string{"/walk/file1.txt", "/walk/a/file2.txt", "/walk/a/b/file3.txt", "/walk/c/file4.txt"} { + file, err := sftpClient.Create(p) + require.NoError(t, err) + file.Write([]byte("test")) + file.Close() + } + + t.Run("WalkEntireTree", func(t *testing.T) { + var paths []string + walker := sftpClient.Walk("/walk") + for walker.Step() { + if walker.Err() != nil { + continue + } + paths = append(paths, walker.Path()) + } + + // Should find all directories and files + require.Contains(t, paths, "/walk") + require.Contains(t, paths, "/walk/a") + require.Contains(t, paths, "/walk/a/b") + require.Contains(t, paths, "/walk/c") + }) + + // Clean up + for _, p := range []string{"/walk/file1.txt", "/walk/a/file2.txt", "/walk/a/b/file3.txt", "/walk/c/file4.txt"} { + require.NoError(t, sftpClient.Remove(p)) + } + for _, p := range []string{"/walk/a/b", "/walk/a", "/walk/c", "/walk"} { + require.NoError(t, sftpClient.RemoveDirectory(p)) + } +} + +// TestCurrentWorkingDirectory tests that Getwd and Chdir work correctly +func TestCurrentWorkingDirectory(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + // Create test directory + err = sftpClient.Mkdir("/cwd_test") + require.NoError(t, err) + + t.Run("GetCurrentDir", func(t *testing.T) { + cwd, err := sftpClient.Getwd() + require.NoError(t, err) + // The initial working directory should be the user's home directory + // which from the user's perspective is "/" + require.Equal(t, "/", cwd, "initial working directory should be the virtual root") + }) + + t.Run("ChangeAndCreate", func(t *testing.T) { + // Create file in subdirectory using relative path after chdir + // Note: pkg/sftp doesn't support Chdir, so we test using absolute paths + file, err := sftpClient.Create("/cwd_test/relative_file.txt") + require.NoError(t, err) + file.Write([]byte("test")) + file.Close() + + // Verify using absolute path + _, err = sftpClient.Stat("/cwd_test/relative_file.txt") + require.NoError(t, err) + + // Clean up + sftpClient.Remove("/cwd_test/relative_file.txt") + }) + + // Clean up + err = sftpClient.RemoveDirectory("/cwd_test") + require.NoError(t, err) +} + +// TestPathEdgeCases tests various edge cases in path handling +func TestPathEdgeCases(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + 
require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + t.Run("PathWithDotDot", func(t *testing.T) { + // Create directory structure + err := sftpClient.MkdirAll("/edge/subdir") + require.NoError(t, err) + + // Create file using path with .. + file, err := sftpClient.Create("/edge/subdir/../file.txt") + require.NoError(t, err) + file.Write([]byte("test")) + file.Close() + + // Verify file was created in /edge + _, err = sftpClient.Stat("/edge/file.txt") + require.NoError(t, err, "file should be created in parent directory") + + // Clean up + sftpClient.Remove("/edge/file.txt") + sftpClient.RemoveDirectory("/edge/subdir") + sftpClient.RemoveDirectory("/edge") + }) + + t.Run("PathWithTrailingSlash", func(t *testing.T) { + err := sftpClient.Mkdir("/trailing") + require.NoError(t, err) + + // Stat with trailing slash + info, err := sftpClient.Stat("/trailing/") + require.NoError(t, err) + require.True(t, info.IsDir()) + + // Clean up + sftpClient.RemoveDirectory("/trailing") + }) + + t.Run("CreateFileAtRootPath", func(t *testing.T) { + // This is the exact scenario from issue #7470 + // User with HomeDir="/sftp/testuser" uploads to "/" + file, err := sftpClient.Create("/issue7470.txt") + require.NoError(t, err, "should be able to create file at / (issue #7470)") + file.Write([]byte("This tests the fix for issue #7470")) + file.Close() + + // Verify + _, err = sftpClient.Stat("/issue7470.txt") + require.NoError(t, err) + + // Clean up + sftpClient.Remove("/issue7470.txt") + }) + + // Security test: path traversal attacks should be blocked + t.Run("PathTraversalPrevention", func(t *testing.T) { + // User's HomeDir is "/sftp/testuser" + // Attempting to escape via "../.." should NOT create files outside home directory + + // First, create a valid file to ensure we can write + validFile, err := sftpClient.Create("/valid.txt") + require.NoError(t, err) + validFile.Write([]byte("valid")) + validFile.Close() + + // Try various path traversal attempts + // These should either: + // 1. Be blocked (error returned), OR + // 2. Be safely resolved to stay within home directory + + traversalPaths := []string{ + "/../escape.txt", + "/../../escape.txt", + "/../../../escape.txt", + "/subdir/../../escape.txt", + "/./../../escape.txt", + } + + for _, traversalPath := range traversalPaths { + t.Run(traversalPath, func(t *testing.T) { + // Note: The pkg/sftp client sanitizes paths locally before sending them to the server. + // So "/../escape.txt" becomes "/escape.txt" on the wire. + // Therefore, we cannot trigger the server-side path traversal block with this client. + // Instead, we verify that the file is created successfully within the jail (contained). + // The server-side protection logic is verified in unit tests (sftpd/sftp_server_test.go). 
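+			// Worked illustration (values hedged to this test's fixtures): the client
+			// cleans "/../escape.txt" to "/escape.txt" on the wire, and toAbsolutePath
+			// then resolves it to "/sftp/testuser/escape.txt", which is still inside
+			// the jail, so the Create below succeeds and the file stays contained.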
+ + file, err := sftpClient.Create(traversalPath) + require.NoError(t, err, "creation should succeed because client sanitizes path") + file.Close() + + // Clean up + err = sftpClient.Remove(traversalPath) + require.NoError(t, err) + }) + } + + // Clean up + sftpClient.Remove("/valid.txt") + }) +} + +// TestFileContent tests reading and writing file content correctly +func TestFileContent(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + config := DefaultTestConfig() + config.EnableDebug = testing.Verbose() + + fw := NewSftpTestFramework(t, config) + err := fw.Setup(config) + require.NoError(t, err, "failed to setup test framework") + defer fw.Cleanup() + + sftpClient, sshConn, err := fw.ConnectSFTP("testuser", "testuserpassword") + require.NoError(t, err, "failed to connect as testuser") + defer sshConn.Close() + defer sftpClient.Close() + + t.Run("BinaryContent", func(t *testing.T) { + // Create binary data with all byte values + data := make([]byte, 256) + for i := 0; i < 256; i++ { + data[i] = byte(i) + } + + file, err := sftpClient.Create("/binary.bin") + require.NoError(t, err) + n, err := file.Write(data) + require.NoError(t, err) + require.Equal(t, 256, n) + file.Close() + + // Read back + readFile, err := sftpClient.Open("/binary.bin") + require.NoError(t, err) + content, err := io.ReadAll(readFile) + require.NoError(t, err) + readFile.Close() + + require.Equal(t, data, content, "binary content should match") + + // Clean up + sftpClient.Remove("/binary.bin") + }) + + t.Run("EmptyFile", func(t *testing.T) { + file, err := sftpClient.Create("/empty.txt") + require.NoError(t, err) + file.Close() + + info, err := sftpClient.Stat("/empty.txt") + require.NoError(t, err) + require.Equal(t, int64(0), info.Size()) + + // Clean up + sftpClient.Remove("/empty.txt") + }) + + t.Run("UnicodeFilename", func(t *testing.T) { + filename := "/文件名.txt" + content := []byte("Unicode content: 你好世界") + + file, err := sftpClient.Create(filename) + require.NoError(t, err) + file.Write(content) + file.Close() + + // Read back + readFile, err := sftpClient.Open(filename) + require.NoError(t, err) + readContent, err := io.ReadAll(readFile) + require.NoError(t, err) + readFile.Close() + + require.Equal(t, content, readContent) + + // Verify in listing + files, err := sftpClient.ReadDir("/") + require.NoError(t, err) + found := false + for _, f := range files { + if f.Name() == path.Base(filename) { + found = true + break + } + } + require.True(t, found, "should find unicode filename in listing") + + // Clean up + sftpClient.Remove(filename) + }) +} + diff --git a/test/sftp/framework.go b/test/sftp/framework.go new file mode 100644 index 000000000..5572eac28 --- /dev/null +++ b/test/sftp/framework.go @@ -0,0 +1,423 @@ +package sftp + +import ( + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "runtime" + "syscall" + "testing" + "time" + + "github.com/pkg/sftp" + "github.com/stretchr/testify/require" + "golang.org/x/crypto/ssh" +) + +// SftpTestFramework provides utilities for SFTP integration testing +type SftpTestFramework struct { + t *testing.T + tempDir string + dataDir string + masterProcess *os.Process + volumeProcess *os.Process + filerProcess *os.Process + sftpProcess *os.Process + masterAddr string + volumeAddr string + filerAddr string + sftpAddr string + weedBinary string + userStoreFile string + hostKeyFile string + isSetup bool + skipCleanup bool +} + +// TestConfig holds configuration for SFTP tests +type TestConfig struct { + NumVolumes int + 
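	// EnableDebug streams each service's stdout/stderr to the test output.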
EnableDebug bool + SkipCleanup bool // for debugging failed tests + UserStoreFile string +} + +// DefaultTestConfig returns a default configuration for SFTP tests +func DefaultTestConfig() *TestConfig { + return &TestConfig{ + NumVolumes: 3, + EnableDebug: false, + SkipCleanup: false, + UserStoreFile: "", + } +} + +// NewSftpTestFramework creates a new SFTP testing framework +func NewSftpTestFramework(t *testing.T, config *TestConfig) *SftpTestFramework { + if config == nil { + config = DefaultTestConfig() + } + + tempDir, err := os.MkdirTemp("", "seaweedfs_sftp_test_") + require.NoError(t, err) + + // Generate SSH host key for SFTP server + hostKeyFile := filepath.Join(tempDir, "ssh_host_key") + cmd := exec.Command("ssh-keygen", "-t", "ed25519", "-f", hostKeyFile, "-N", "") + err = cmd.Run() + require.NoError(t, err, "failed to generate SSH host key") + + // Use provided userstore or copy the test one + userStoreFile := config.UserStoreFile + if userStoreFile == "" { + // Copy test userstore to temp dir + userStoreFile = filepath.Join(tempDir, "userstore.json") + testDataPath := findTestDataPath() + input, err := os.ReadFile(filepath.Join(testDataPath, "userstore.json")) + require.NoError(t, err, "failed to read test userstore.json") + err = os.WriteFile(userStoreFile, input, 0644) + require.NoError(t, err, "failed to write userstore.json") + } + + return &SftpTestFramework{ + t: t, + tempDir: tempDir, + dataDir: filepath.Join(tempDir, "data"), + masterAddr: "127.0.0.1:19333", + volumeAddr: "127.0.0.1:18080", + filerAddr: "127.0.0.1:18888", + sftpAddr: "127.0.0.1:12022", + weedBinary: findWeedBinary(), + userStoreFile: userStoreFile, + hostKeyFile: hostKeyFile, + isSetup: false, + } +} + +// Setup starts SeaweedFS cluster with SFTP server +func (f *SftpTestFramework) Setup(config *TestConfig) error { + if f.isSetup { + return fmt.Errorf("framework already setup") + } + + // Create all data directories + dirs := []string{ + f.dataDir, + filepath.Join(f.dataDir, "master"), + filepath.Join(f.dataDir, "volume"), + } + for _, dir := range dirs { + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create directory %s: %v", dir, err) + } + } + + // Start master + if err := f.startMaster(config); err != nil { + return fmt.Errorf("failed to start master: %v", err) + } + + // Wait for master to be ready + if err := f.waitForService(f.masterAddr, 30*time.Second); err != nil { + return fmt.Errorf("master not ready: %v", err) + } + + // Start volume server + if err := f.startVolumeServer(config); err != nil { + return fmt.Errorf("failed to start volume server: %v", err) + } + + // Wait for volume server to be ready + if err := f.waitForService(f.volumeAddr, 30*time.Second); err != nil { + return fmt.Errorf("volume server not ready: %v", err) + } + + // Start filer + if err := f.startFiler(config); err != nil { + return fmt.Errorf("failed to start filer: %v", err) + } + + // Wait for filer to be ready + if err := f.waitForService(f.filerAddr, 30*time.Second); err != nil { + return fmt.Errorf("filer not ready: %v", err) + } + + // Start SFTP server + if err := f.startSftpServer(config); err != nil { + return fmt.Errorf("failed to start SFTP server: %v", err) + } + + // Wait for SFTP server to be ready + if err := f.waitForService(f.sftpAddr, 30*time.Second); err != nil { + return fmt.Errorf("SFTP server not ready: %v", err) + } + + // Additional wait for all services to stabilize (gRPC endpoints) + time.Sleep(500 * time.Millisecond) + + f.skipCleanup = config.SkipCleanup + 
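	// Mark the framework initialized only after every service is reachable;
	// Setup refuses to run a second time.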
f.isSetup = true + return nil +} + +// Cleanup stops all processes and removes temporary files +func (f *SftpTestFramework) Cleanup() { + // Stop processes in reverse order + processes := []*os.Process{f.sftpProcess, f.filerProcess, f.volumeProcess, f.masterProcess} + for _, proc := range processes { + if proc != nil { + proc.Signal(syscall.SIGTERM) + proc.Wait() + } + } + + // Remove temp directory + if !f.skipCleanup { + os.RemoveAll(f.tempDir) + } +} + +// GetSftpAddr returns the SFTP server address +func (f *SftpTestFramework) GetSftpAddr() string { + return f.sftpAddr +} + +// GetFilerAddr returns the filer address +func (f *SftpTestFramework) GetFilerAddr() string { + return f.filerAddr +} + +// ConnectSFTP creates an SFTP client connection with the given credentials +func (f *SftpTestFramework) ConnectSFTP(username, password string) (*sftp.Client, *ssh.Client, error) { + // Load the known host public key for verification + hostKeyCallback, err := f.getHostKeyCallback() + if err != nil { + return nil, nil, fmt.Errorf("failed to get host key callback: %v", err) + } + + config := &ssh.ClientConfig{ + User: username, + Auth: []ssh.AuthMethod{ + ssh.Password(password), + }, + HostKeyCallback: hostKeyCallback, + Timeout: 5 * time.Second, + } + + sshConn, err := ssh.Dial("tcp", f.sftpAddr, config) + if err != nil { + return nil, nil, fmt.Errorf("failed to connect SSH: %v", err) + } + + sftpClient, err := sftp.NewClient(sshConn) + if err != nil { + sshConn.Close() + return nil, nil, fmt.Errorf("failed to create SFTP client: %v", err) + } + + return sftpClient, sshConn, nil +} + +// getHostKeyCallback returns a callback that verifies the server's host key +// matches the known test server key we generated +func (f *SftpTestFramework) getHostKeyCallback() (ssh.HostKeyCallback, error) { + // Read the public key file generated alongside the private key + pubKeyFile := f.hostKeyFile + ".pub" + pubKeyBytes, err := os.ReadFile(pubKeyFile) + if err != nil { + return nil, fmt.Errorf("failed to read host public key: %v", err) + } + + // Parse the public key + pubKey, _, _, _, err := ssh.ParseAuthorizedKey(pubKeyBytes) + if err != nil { + return nil, fmt.Errorf("failed to parse host public key: %v", err) + } + + // Return a callback that verifies the server key matches our known key + return ssh.FixedHostKey(pubKey), nil +} + +// startMaster starts the SeaweedFS master server +func (f *SftpTestFramework) startMaster(config *TestConfig) error { + args := []string{ + "master", + "-ip=127.0.0.1", + "-port=19333", + "-mdir=" + filepath.Join(f.dataDir, "master"), + "-raftBootstrap", + "-peers=none", + } + + cmd := exec.Command(f.weedBinary, args...) + cmd.Dir = f.tempDir + if config.EnableDebug { + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + if err := cmd.Start(); err != nil { + return err + } + f.masterProcess = cmd.Process + return nil +} + +// startVolumeServer starts SeaweedFS volume server +func (f *SftpTestFramework) startVolumeServer(config *TestConfig) error { + args := []string{ + "volume", + "-mserver=" + f.masterAddr, + "-ip=127.0.0.1", + "-port=18080", + "-dir=" + filepath.Join(f.dataDir, "volume"), + fmt.Sprintf("-max=%d", config.NumVolumes), + } + + cmd := exec.Command(f.weedBinary, args...) 
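+	// Run the process from the per-test temp dir so any relative paths it
+	// writes stay inside the sandbox.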
+ cmd.Dir = f.tempDir + if config.EnableDebug { + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + if err := cmd.Start(); err != nil { + return err + } + f.volumeProcess = cmd.Process + return nil +} + +// startFiler starts the SeaweedFS filer server +func (f *SftpTestFramework) startFiler(config *TestConfig) error { + args := []string{ + "filer", + "-master=" + f.masterAddr, + "-ip=127.0.0.1", + "-port=18888", + } + + cmd := exec.Command(f.weedBinary, args...) + cmd.Dir = f.tempDir + if config.EnableDebug { + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + if err := cmd.Start(); err != nil { + return err + } + f.filerProcess = cmd.Process + return nil +} + +// startSftpServer starts the SeaweedFS SFTP server +func (f *SftpTestFramework) startSftpServer(config *TestConfig) error { + args := []string{ + "sftp", + "-filer=" + f.filerAddr, + "-ip.bind=127.0.0.1", + "-port=12022", + "-sshPrivateKey=" + f.hostKeyFile, + "-userStoreFile=" + f.userStoreFile, + } + + cmd := exec.Command(f.weedBinary, args...) + cmd.Dir = f.tempDir + if config.EnableDebug { + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + if err := cmd.Start(); err != nil { + return err + } + f.sftpProcess = cmd.Process + return nil +} + +// waitForService waits for a service to be available +func (f *SftpTestFramework) waitForService(addr string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + conn, err := net.DialTimeout("tcp", addr, 1*time.Second) + if err == nil { + conn.Close() + return nil + } + time.Sleep(100 * time.Millisecond) + } + return fmt.Errorf("service at %s not ready within timeout", addr) +} + +// findWeedBinary locates the weed binary +// Prefers local build over system-installed weed to ensure we test the latest code +func findWeedBinary() string { + // Get the directory where this source file is located + // This ensures we find the locally built weed binary first + _, thisFile, _, ok := runtime.Caller(0) + if ok { + thisDir := filepath.Dir(thisFile) + // From test/sftp/, the weed binary should be at ../../weed/weed + candidates := []string{ + filepath.Join(thisDir, "../../weed/weed"), + filepath.Join(thisDir, "../weed/weed"), + } + for _, candidate := range candidates { + if _, err := os.Stat(candidate); err == nil { + abs, _ := filepath.Abs(candidate) + return abs + } + } + } + + // Try relative paths from current working directory + cwd, _ := os.Getwd() + candidates := []string{ + filepath.Join(cwd, "../../weed/weed"), + filepath.Join(cwd, "../weed/weed"), + filepath.Join(cwd, "./weed"), + } + for _, candidate := range candidates { + if _, err := os.Stat(candidate); err == nil { + abs, _ := filepath.Abs(candidate) + return abs + } + } + + // Fallback to PATH only if local build not found + if path, err := exec.LookPath("weed"); err == nil { + return path + } + + // Default fallback + return "weed" +} + +// findTestDataPath locates the testdata directory +func findTestDataPath() string { + // Get the directory where this source file is located + _, thisFile, _, ok := runtime.Caller(0) + if ok { + thisDir := filepath.Dir(thisFile) + testDataPath := filepath.Join(thisDir, "testdata") + if _, err := os.Stat(testDataPath); err == nil { + return testDataPath + } + } + + // Try relative paths from current working directory + cwd, _ := os.Getwd() + candidates := []string{ + filepath.Join(cwd, "testdata"), + filepath.Join(cwd, "../sftp/testdata"), + filepath.Join(cwd, "test/sftp/testdata"), + } + + for _, candidate := range candidates { + if 
_, err := os.Stat(candidate); err == nil { + return candidate + } + } + + return "./testdata" +} + diff --git a/test/sftp/go.mod b/test/sftp/go.mod new file mode 100644 index 000000000..34d9053a8 --- /dev/null +++ b/test/sftp/go.mod @@ -0,0 +1,17 @@ +module seaweedfs-sftp-tests + +go 1.24.0 + +require ( + github.com/pkg/sftp v1.13.7 + github.com/stretchr/testify v1.10.0 + golang.org/x/crypto v0.45.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/kr/fs v0.1.0 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + golang.org/x/sys v0.38.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/test/sftp/go.sum b/test/sftp/go.sum new file mode 100644 index 000000000..112e6f88a --- /dev/null +++ b/test/sftp/go.sum @@ -0,0 +1,64 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/kr/fs v0.1.0 h1:Jskdu9ieNAYnjxsi0LbQp1ulIKZV1LAFgK1tWhpZgl8= +github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= +github.com/pkg/sftp v1.13.7 h1:uv+I3nNJvlKZIQGSr8JVQLNHFU9YhhNpvC14Y6KgmSM= +github.com/pkg/sftp v1.13.7/go.mod h1:KMKI0t3T6hfA+lTR/ssZdunHo+uwq7ghoN09/FSu3DY= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= +golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= +golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync 
v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/test/sftp/testdata/userstore.json b/test/sftp/testdata/userstore.json new file mode 100644 index 000000000..540a9486d --- /dev/null +++ b/test/sftp/testdata/userstore.json @@ -0,0 +1,36 @@ +[ + { + "Username": "admin", + "Password": "adminpassword", + "PublicKeys": [], + "HomeDir": "/", + "Permissions": { + "/": ["*"] + }, + "Uid": 0, + "Gid": 0 + }, + { + "Username": "testuser", + "Password": "testuserpassword", + "PublicKeys": [], + "HomeDir": "/sftp/testuser", 
+ "Permissions": { + "/sftp/testuser": ["*"] + }, + "Uid": 1001, + "Gid": 1001 + }, + { + "Username": "readonly", + "Password": "readonlypassword", + "PublicKeys": [], + "HomeDir": "/public", + "Permissions": { + "/public": ["read", "list"] + }, + "Uid": 1002, + "Gid": 1002 + } +] + diff --git a/weed/sftpd/sftp_file_writer.go b/weed/sftpd/sftp_file_writer.go index 0a662d021..fed60eec0 100644 --- a/weed/sftpd/sftp_file_writer.go +++ b/weed/sftpd/sftp_file_writer.go @@ -72,6 +72,7 @@ func (l listerat) ListAt(ls []os.FileInfo, offset int64) (int, error) { type SeaweedSftpFileWriter struct { fs SftpServer req *sftp.Request + absPath string // Absolute path after HomeDir translation mu sync.Mutex tmpFile *os.File permissions os.FileMode @@ -105,6 +106,6 @@ func (w *SeaweedSftpFileWriter) Close() error { return err } - // Stream the file instead of loading it - return w.fs.putFile(w.req.Filepath, w.tmpFile, w.fs.user) + // Stream the file to the absolute path (after HomeDir translation) + return w.fs.putFile(w.absPath, w.tmpFile, w.fs.user) } diff --git a/weed/sftpd/sftp_filer.go b/weed/sftpd/sftp_filer.go index 9baaf41d7..eb196cc28 100644 --- a/weed/sftpd/sftp_filer.go +++ b/weed/sftpd/sftp_filer.go @@ -100,18 +100,26 @@ func (fs *SftpServer) withTimeoutContext(fn func(ctx context.Context) error) err // ==================== Command Dispatcher ==================== func (fs *SftpServer) dispatchCmd(r *sftp.Request) error { - glog.V(0).Infof("Dispatch: %s %s", r.Method, r.Filepath) + absPath, err := fs.toAbsolutePath(r.Filepath) + if err != nil { + return err + } + glog.V(1).Infof("Dispatch: %s %s (absolute: %s)", r.Method, r.Filepath, absPath) switch r.Method { case "Remove": - return fs.removeEntry(r) + return fs.removeEntry(absPath) case "Rename": - return fs.renameEntry(r) + absTarget, err := fs.toAbsolutePath(r.Target) + if err != nil { + return err + } + return fs.renameEntry(absPath, absTarget) case "Mkdir": - return fs.makeDir(r) + return fs.makeDir(absPath) case "Rmdir": - return fs.removeDir(r) + return fs.removeDir(absPath) case "Setstat": - return fs.setFileStat(r) + return fs.setFileStatWithRequest(absPath, r) default: return fmt.Errorf("unsupported: %s", r.Method) } @@ -120,10 +128,14 @@ func (fs *SftpServer) dispatchCmd(r *sftp.Request) error { // ==================== File Operations ==================== func (fs *SftpServer) readFile(r *sftp.Request) (io.ReaderAt, error) { - if err := fs.checkFilePermission(r.Filepath, "read"); err != nil { + absPath, err := fs.toAbsolutePath(r.Filepath) + if err != nil { + return nil, err + } + if err := fs.checkFilePermission(absPath, "read"); err != nil { return nil, err } - entry, err := fs.getEntry(r.Filepath) + entry, err := fs.getEntry(absPath) if err != nil { return nil, err } @@ -131,7 +143,11 @@ func (fs *SftpServer) readFile(r *sftp.Request) (io.ReaderAt, error) { } func (fs *SftpServer) newFileWriter(r *sftp.Request) (io.WriterAt, error) { - dir, _ := util.FullPath(r.Filepath).DirAndName() + absPath, err := fs.toAbsolutePath(r.Filepath) + if err != nil { + return nil, err + } + dir, _ := util.FullPath(absPath).DirAndName() if err := fs.checkFilePermission(dir, "write"); err != nil { glog.Errorf("Permission denied for %s", dir) return nil, err @@ -145,6 +161,7 @@ func (fs *SftpServer) newFileWriter(r *sftp.Request) (io.WriterAt, error) { return &SeaweedSftpFileWriter{ fs: *fs, req: r, + absPath: absPath, tmpFile: tmpFile, permissions: 0644, uid: fs.user.Uid, @@ -153,16 +170,20 @@ func (fs *SftpServer) newFileWriter(r *sftp.Request) 
(io.WriterAt, error) { }, nil } -func (fs *SftpServer) removeEntry(r *sftp.Request) error { - return fs.deleteEntry(r.Filepath, false) +func (fs *SftpServer) removeEntry(absPath string) error { + return fs.deleteEntry(absPath, false) } -func (fs *SftpServer) renameEntry(r *sftp.Request) error { - if err := fs.checkFilePermission(r.Filepath, "rename"); err != nil { +func (fs *SftpServer) renameEntry(absPath, absTarget string) error { + if err := fs.checkFilePermission(absPath, "rename"); err != nil { + return err + } + targetDir, _ := util.FullPath(absTarget).DirAndName() + if err := fs.checkFilePermission(targetDir, "write"); err != nil { return err } - oldDir, oldName := util.FullPath(r.Filepath).DirAndName() - newDir, newName := util.FullPath(r.Target).DirAndName() + oldDir, oldName := util.FullPath(absPath).DirAndName() + newDir, newName := util.FullPath(absTarget).DirAndName() return fs.callWithClient(false, func(ctx context.Context, client filer_pb.SeaweedFilerClient) error { _, err := client.AtomicRenameEntry(ctx, &filer_pb.AtomicRenameEntryRequest{ OldDirectory: oldDir, OldName: oldName, @@ -172,15 +193,15 @@ func (fs *SftpServer) renameEntry(r *sftp.Request) error { }) } -func (fs *SftpServer) setFileStat(r *sftp.Request) error { - if err := fs.checkFilePermission(r.Filepath, "write"); err != nil { +func (fs *SftpServer) setFileStatWithRequest(absPath string, r *sftp.Request) error { + if err := fs.checkFilePermission(absPath, "write"); err != nil { return err } - entry, err := fs.getEntry(r.Filepath) + entry, err := fs.getEntry(absPath) if err != nil { return err } - dir, _ := util.FullPath(r.Filepath).DirAndName() + dir, _ := util.FullPath(absPath).DirAndName() // apply attrs if r.AttrFlags().Permissions { entry.Attributes.FileMode = uint32(r.Attributes().FileMode()) @@ -201,18 +222,22 @@ func (fs *SftpServer) setFileStat(r *sftp.Request) error { // ==================== Directory Operations ==================== func (fs *SftpServer) listDir(r *sftp.Request) (sftp.ListerAt, error) { - if err := fs.checkFilePermission(r.Filepath, "list"); err != nil { + absPath, err := fs.toAbsolutePath(r.Filepath) + if err != nil { + return nil, err + } + if err := fs.checkFilePermission(absPath, "list"); err != nil { return nil, err } if r.Method == "Stat" || r.Method == "Lstat" { - entry, err := fs.getEntry(r.Filepath) + entry, err := fs.getEntry(absPath) if err != nil { return nil, err } fi := &EnhancedFileInfo{FileInfo: FileInfoFromEntry(entry), uid: entry.Attributes.Uid, gid: entry.Attributes.Gid} return listerat([]os.FileInfo{fi}), nil } - return fs.listAllPages(r.Filepath) + return fs.listAllPages(absPath) } func (fs *SftpServer) listAllPages(dirPath string) (sftp.ListerAt, error) { @@ -259,18 +284,19 @@ func (fs *SftpServer) fetchDirectoryPage(dirPath, start string) ([]os.FileInfo, } // makeDir creates a new directory with proper permissions. 
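+// To create a directory the user needs write permission on the parent,
+// mirroring newFileWriter; absPath has already been translated by
+// toAbsolutePath.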
-func (fs *SftpServer) makeDir(r *sftp.Request) error { +func (fs *SftpServer) makeDir(absPath string) error { if fs.user == nil { return fmt.Errorf("cannot create directory: no user info") } - dir, name := util.FullPath(r.Filepath).DirAndName() - if err := fs.checkFilePermission(r.Filepath, "mkdir"); err != nil { + dir, name := util.FullPath(absPath).DirAndName() + if err := fs.checkFilePermission(dir, "write"); err != nil { return err } // default mode and ownership err := filer_pb.Mkdir(context.Background(), fs, string(dir), name, func(entry *filer_pb.Entry) { mode := uint32(0755 | os.ModeDir) - if strings.HasPrefix(r.Filepath, fs.user.HomeDir) { + // Defensive check: all paths should be under HomeDir after toAbsolutePath translation + if absPath == fs.user.HomeDir || strings.HasPrefix(absPath, fs.user.HomeDir+"/") { mode = uint32(0700 | os.ModeDir) } entry.Attributes.FileMode = mode @@ -288,8 +314,8 @@ func (fs *SftpServer) makeDir(r *sftp.Request) error { } // removeDir deletes a directory. -func (fs *SftpServer) removeDir(r *sftp.Request) error { - return fs.deleteEntry(r.Filepath, false) +func (fs *SftpServer) removeDir(absPath string) error { + return fs.deleteEntry(absPath, false) } func (fs *SftpServer) putFile(filepath string, reader io.Reader, user *user.User) error { diff --git a/weed/sftpd/sftp_server.go b/weed/sftpd/sftp_server.go index f158aeb64..e53098e6b 100644 --- a/weed/sftpd/sftp_server.go +++ b/weed/sftpd/sftp_server.go @@ -6,6 +6,8 @@ import ( "fmt" "io" "os" + "path" + "strings" "time" "github.com/pkg/sftp" @@ -37,6 +39,28 @@ func NewSftpServer(filerAddr pb.ServerAddress, grpcDialOption grpc.DialOption, d } } +// toAbsolutePath translates a user-relative path to an absolute filer path. +// When a user has HomeDir="/sftp/user", their view of "/" maps to "/sftp/user". +// This implements chroot-like behavior where the user's home directory +// becomes their root. +func (fs *SftpServer) toAbsolutePath(userPath string) (string, error) { + // If user has root as home directory, no translation needed + if fs.user.HomeDir == "" || fs.user.HomeDir == "/" { + return path.Clean(userPath), nil + } + + // Concatenate home directory with user path, then clean to resolve any ".." components + p := path.Join(fs.user.HomeDir, strings.TrimPrefix(userPath, "/")) + + // Security check: ensure the final path is within the home directory. + // This prevents path traversal attacks like `../..` that could escape the chroot jail. + if !strings.HasPrefix(p, fs.user.HomeDir+"/") && p != fs.user.HomeDir { + return "", fmt.Errorf("path traversal attempt: %s resolves to %s which is outside home dir %s", userPath, p, fs.user.HomeDir) + } + + return p, nil +} + // Fileread is invoked for “get” requests. 
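+// The user-relative request path is translated via toAbsolutePath inside
+// readFile before any filer call.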
func (fs *SftpServer) Fileread(req *sftp.Request) (io.ReaderAt, error) { return fs.readFile(req) diff --git a/weed/sftpd/sftp_server_test.go b/weed/sftpd/sftp_server_test.go new file mode 100644 index 000000000..0af94ca14 --- /dev/null +++ b/weed/sftpd/sftp_server_test.go @@ -0,0 +1,103 @@ +package sftpd + +import ( + "testing" + + "github.com/seaweedfs/seaweedfs/weed/sftpd/user" + "github.com/stretchr/testify/assert" +) + +func stringPtr(s string) *string { + return &s +} + +func TestToAbsolutePath(t *testing.T) { + tests := []struct { + name string + homeDir *string // Use pointer to distinguish between unset and empty + userPath string + expected string + expectError bool + }{ + { + name: "normal path", + userPath: "/foo.txt", + expected: "/sftp/testuser/foo.txt", + }, + { + name: "root path", + userPath: "/", + expected: "/sftp/testuser", + }, + { + name: "path with dot", + userPath: "/./foo.txt", + expected: "/sftp/testuser/foo.txt", + }, + { + name: "path traversal attempts", + userPath: "/../foo.txt", + expectError: true, + }, + { + name: "path traversal attempts 2", + userPath: "../../foo.txt", + expectError: true, + }, + { + name: "path traversal attempts 3", + userPath: "/subdir/../../foo.txt", + expectError: true, + }, + { + name: "empty path", + userPath: "", + expected: "/sftp/testuser", + }, + { + name: "multiple slashes", + userPath: "//foo.txt", + expected: "/sftp/testuser/foo.txt", + }, + { + name: "trailing slash", + userPath: "/foo/", + expected: "/sftp/testuser/foo", + }, + { + name: "empty HomeDir passthrough", + homeDir: stringPtr(""), + userPath: "/foo.txt", + expected: "/foo.txt", + }, + { + name: "root HomeDir passthrough", + homeDir: stringPtr("/"), + userPath: "/foo.txt", + expected: "/foo.txt", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + homeDir := "/sftp/testuser" // default + if tt.homeDir != nil { + homeDir = *tt.homeDir + } + + fs := &SftpServer{ + user: &user.User{ + HomeDir: homeDir, + }, + } + + got, err := fs.toAbsolutePath(tt.userPath) + if tt.expectError { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expected, got) + } + }) + } +} diff --git a/weed/sftpd/sftp_service.go b/weed/sftpd/sftp_service.go index e50bd87ba..4d21815a9 100644 --- a/weed/sftpd/sftp_service.go +++ b/weed/sftpd/sftp_service.go @@ -284,8 +284,8 @@ func (s *SFTPService) handleChannel(newChannel ssh.NewChannel, fs *SftpServer) { // handleSFTP starts the SFTP server on the SSH channel. 
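+// The session starts at the virtual root "/", which toAbsolutePath maps to
+// the user's HomeDir on every request.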
func (s *SFTPService) handleSFTP(channel ssh.Channel, fs *SftpServer) { - // Create server options with initial working directory set to user's home - serverOptions := sftp.WithStartDirectory(fs.user.HomeDir) + // Start at virtual root "/" - toAbsolutePath translates this to the user's HomeDir + serverOptions := sftp.WithStartDirectory("/") server := sftp.NewRequestServer(channel, sftp.Handlers{ FileGet: fs, FilePut: fs, diff --git a/weed/sftpd/user/filestore.go b/weed/sftpd/user/filestore.go index c522a388a..4c372aa76 100644 --- a/weed/sftpd/user/filestore.go +++ b/weed/sftpd/user/filestore.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "os" + "path" "sync" "golang.org/x/crypto/ssh" @@ -99,6 +100,10 @@ func (s *FileStore) loadUsers() error { user.PublicKeys[i] = string(pubKey.Marshal()) } } + // Clean HomeDir to handle trailing slashes and normalize path + if user.HomeDir != "" { + user.HomeDir = path.Clean(user.HomeDir) + } s.users[user.Username] = user } -- cgit v1.2.3 From 268cc84e8c8629c4824d4cc30c79cc8dac0a5142 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Wed, 3 Dec 2025 18:53:06 -0800 Subject: [helm] Fix liveness/readiness probe scheme path in templates (#7616) Fix the templates to read scheme from httpGet.scheme instead of the probe level, matching the structure defined in values.yaml. This ensures that changing *.livenessProbe.httpGet.scheme or *.readinessProbe.httpGet.scheme in values.yaml now correctly affects the rendered manifests. Affected components: master, filer, volume, s3, all-in-one Fixes #7615 --- .github/workflows/helm_ci.yml | 74 ++++++++++++++++++++++ k8s/charts/seaweedfs/Chart.yaml | 2 +- .../all-in-one/all-in-one-deployment.yaml | 4 +- .../templates/filer/filer-statefulset.yaml | 4 +- .../templates/master/master-statefulset.yaml | 4 +- .../seaweedfs/templates/s3/s3-deployment.yaml | 4 +- .../templates/volume/volume-statefulset.yaml | 4 +- k8s/charts/seaweedfs/values.yaml | 4 +- 8 files changed, 87 insertions(+), 13 deletions(-) diff --git a/.github/workflows/helm_ci.yml b/.github/workflows/helm_ci.yml index f936ff445..ea971aec1 100644 --- a/.github/workflows/helm_ci.yml +++ b/.github/workflows/helm_ci.yml @@ -44,6 +44,80 @@ jobs: - name: Run chart-testing (lint) run: ct lint --target-branch ${{ github.event.repository.default_branch }} --all --validate-maintainers=false --chart-dirs k8s/charts + - name: Verify template rendering + run: | + set -e + CHART_DIR="k8s/charts/seaweedfs" + + echo "=== Testing default configuration ===" + helm template test $CHART_DIR > /tmp/default.yaml + echo "✓ Default configuration renders successfully" + + echo "=== Testing with S3 enabled ===" + helm template test $CHART_DIR --set s3.enabled=true > /tmp/s3.yaml + grep -q "kind: Deployment" /tmp/s3.yaml && grep -q "seaweedfs-s3" /tmp/s3.yaml + echo "✓ S3 deployment renders correctly" + + echo "=== Testing with all-in-one mode ===" + helm template test $CHART_DIR --set allInOne.enabled=true > /tmp/allinone.yaml + grep -q "seaweedfs-all-in-one" /tmp/allinone.yaml + echo "✓ All-in-one deployment renders correctly" + + echo "=== Testing with security enabled ===" + helm template test $CHART_DIR --set global.enableSecurity=true > /tmp/security.yaml + grep -q "security-config" /tmp/security.yaml + echo "✓ Security configuration renders correctly" + + echo "=== Testing with monitoring enabled ===" + helm template test $CHART_DIR \ + --set global.monitoring.enabled=true \ + --set global.monitoring.gatewayHost=prometheus \ + --set global.monitoring.gatewayPort=9091 > /tmp/monitoring.yaml 
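+          # Rendering without error is the assertion here; this configuration has no grep check.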
+ echo "✓ Monitoring configuration renders correctly" + + echo "=== Testing with PVC storage ===" + helm template test $CHART_DIR \ + --set master.data.type=persistentVolumeClaim \ + --set master.data.size=10Gi \ + --set master.data.storageClass=standard > /tmp/pvc.yaml + grep -q "PersistentVolumeClaim" /tmp/pvc.yaml + echo "✓ PVC configuration renders correctly" + + echo "=== Testing with custom replicas ===" + helm template test $CHART_DIR \ + --set master.replicas=3 \ + --set filer.replicas=2 \ + --set volume.replicas=3 > /tmp/replicas.yaml + echo "✓ Custom replicas configuration renders correctly" + + echo "=== Testing filer with S3 gateway ===" + helm template test $CHART_DIR \ + --set filer.s3.enabled=true \ + --set filer.s3.enableAuth=true > /tmp/filer-s3.yaml + echo "✓ Filer S3 gateway renders correctly" + + echo "=== Testing SFTP enabled ===" + helm template test $CHART_DIR --set sftp.enabled=true > /tmp/sftp.yaml + grep -q "seaweedfs-sftp" /tmp/sftp.yaml + echo "✓ SFTP deployment renders correctly" + + echo "=== Testing ingress configurations ===" + helm template test $CHART_DIR \ + --set master.ingress.enabled=true \ + --set filer.ingress.enabled=true \ + --set s3.enabled=true \ + --set s3.ingress.enabled=true > /tmp/ingress.yaml + grep -q "kind: Ingress" /tmp/ingress.yaml + echo "✓ Ingress configurations render correctly" + + echo "=== Testing COSI driver ===" + helm template test $CHART_DIR --set cosi.enabled=true > /tmp/cosi.yaml + grep -q "seaweedfs-cosi" /tmp/cosi.yaml + echo "✓ COSI driver renders correctly" + + echo "" + echo "✅ All template rendering tests passed!" + - name: Create kind cluster uses: helm/kind-action@v1.13.0 diff --git a/k8s/charts/seaweedfs/Chart.yaml b/k8s/charts/seaweedfs/Chart.yaml index 379f67890..421b85175 100644 --- a/k8s/charts/seaweedfs/Chart.yaml +++ b/k8s/charts/seaweedfs/Chart.yaml @@ -3,4 +3,4 @@ description: SeaweedFS name: seaweedfs appVersion: "4.01" # Dev note: Trigger a helm chart release by `git tag -a helm-` -version: 4.0.401 \ No newline at end of file +version: 4.0.401 diff --git a/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml b/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml index 8700a8a69..6f176ae19 100644 --- a/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml +++ b/k8s/charts/seaweedfs/templates/all-in-one/all-in-one-deployment.yaml @@ -352,7 +352,7 @@ spec: httpGet: path: {{ .Values.allInOne.readinessProbe.httpGet.path }} port: {{ .Values.master.port }} - scheme: {{ .Values.allInOne.readinessProbe.scheme }} + scheme: {{ .Values.allInOne.readinessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.allInOne.readinessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.allInOne.readinessProbe.periodSeconds }} successThreshold: {{ .Values.allInOne.readinessProbe.successThreshold }} @@ -364,7 +364,7 @@ spec: httpGet: path: {{ .Values.allInOne.livenessProbe.httpGet.path }} port: {{ .Values.master.port }} - scheme: {{ .Values.allInOne.livenessProbe.scheme }} + scheme: {{ .Values.allInOne.livenessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.allInOne.livenessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.allInOne.livenessProbe.periodSeconds }} successThreshold: {{ .Values.allInOne.livenessProbe.successThreshold }} diff --git a/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml b/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml index 5aeccfa02..af82bd5e0 100644 --- a/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml +++ 
b/k8s/charts/seaweedfs/templates/filer/filer-statefulset.yaml @@ -289,7 +289,7 @@ spec: httpGet: path: {{ .Values.filer.readinessProbe.httpGet.path }} port: {{ .Values.filer.port }} - scheme: {{ .Values.filer.readinessProbe.scheme }} + scheme: {{ .Values.filer.readinessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.filer.readinessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.filer.readinessProbe.periodSeconds }} successThreshold: {{ .Values.filer.readinessProbe.successThreshold }} @@ -301,7 +301,7 @@ spec: httpGet: path: {{ .Values.filer.livenessProbe.httpGet.path }} port: {{ .Values.filer.port }} - scheme: {{ .Values.filer.livenessProbe.scheme }} + scheme: {{ .Values.filer.livenessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.filer.livenessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.filer.livenessProbe.periodSeconds }} successThreshold: {{ .Values.filer.livenessProbe.successThreshold }} diff --git a/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml b/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml index 704a33b80..a70673454 100644 --- a/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml +++ b/k8s/charts/seaweedfs/templates/master/master-statefulset.yaml @@ -235,7 +235,7 @@ spec: httpGet: path: {{ .Values.master.readinessProbe.httpGet.path }} port: {{ .Values.master.port }} - scheme: {{ .Values.master.readinessProbe.scheme }} + scheme: {{ .Values.master.readinessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.master.readinessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.master.readinessProbe.periodSeconds }} successThreshold: {{ .Values.master.readinessProbe.successThreshold }} @@ -247,7 +247,7 @@ spec: httpGet: path: {{ .Values.master.livenessProbe.httpGet.path }} port: {{ .Values.master.port }} - scheme: {{ .Values.master.livenessProbe.scheme }} + scheme: {{ .Values.master.livenessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.master.livenessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.master.livenessProbe.periodSeconds }} successThreshold: {{ .Values.master.livenessProbe.successThreshold }} diff --git a/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml b/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml index 0c6d52c3e..830e1d787 100644 --- a/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml +++ b/k8s/charts/seaweedfs/templates/s3/s3-deployment.yaml @@ -204,7 +204,7 @@ spec: httpGet: path: {{ .Values.s3.readinessProbe.httpGet.path }} port: {{ .Values.s3.port }} - scheme: {{ .Values.s3.readinessProbe.scheme }} + scheme: {{ .Values.s3.readinessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.s3.readinessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.s3.readinessProbe.periodSeconds }} successThreshold: {{ .Values.s3.readinessProbe.successThreshold }} @@ -216,7 +216,7 @@ spec: httpGet: path: {{ .Values.s3.livenessProbe.httpGet.path }} port: {{ .Values.s3.port }} - scheme: {{ .Values.s3.livenessProbe.scheme }} + scheme: {{ .Values.s3.livenessProbe.httpGet.scheme }} initialDelaySeconds: {{ .Values.s3.livenessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.s3.livenessProbe.periodSeconds }} successThreshold: {{ .Values.s3.livenessProbe.successThreshold }} diff --git a/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml b/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml index 29a035a2b..1a8964a55 100644 --- a/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml +++ 
b/k8s/charts/seaweedfs/templates/volume/volume-statefulset.yaml @@ -251,7 +251,7 @@ spec: httpGet: path: {{ $volume.readinessProbe.httpGet.path }} port: {{ $volume.port }} - scheme: {{ $volume.readinessProbe.scheme }} + scheme: {{ $volume.readinessProbe.httpGet.scheme }} initialDelaySeconds: {{ $volume.readinessProbe.initialDelaySeconds }} periodSeconds: {{ $volume.readinessProbe.periodSeconds }} successThreshold: {{ $volume.readinessProbe.successThreshold }} @@ -263,7 +263,7 @@ spec: httpGet: path: {{ $volume.livenessProbe.httpGet.path }} port: {{ $volume.port }} - scheme: {{ $volume.livenessProbe.scheme }} + scheme: {{ $volume.livenessProbe.httpGet.scheme }} initialDelaySeconds: {{ $volume.livenessProbe.initialDelaySeconds }} periodSeconds: {{ $volume.livenessProbe.periodSeconds }} successThreshold: {{ $volume.livenessProbe.successThreshold }} diff --git a/k8s/charts/seaweedfs/values.yaml b/k8s/charts/seaweedfs/values.yaml index 547b05479..520323dce 100644 --- a/k8s/charts/seaweedfs/values.yaml +++ b/k8s/charts/seaweedfs/values.yaml @@ -1133,7 +1133,7 @@ allInOne: httpGet: path: /cluster/status port: 9333 - scheme: HTTP + scheme: HTTP initialDelaySeconds: 10 periodSeconds: 15 successThreshold: 1 @@ -1145,7 +1145,7 @@ allInOne: httpGet: path: /cluster/status port: 9333 - scheme: HTTP + scheme: HTTP initialDelaySeconds: 20 periodSeconds: 30 successThreshold: 1 -- cgit v1.2.3 From 39ba19eea6d47a5d35c67064d560fb569c6c5baf Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Wed, 3 Dec 2025 21:12:19 -0800 Subject: filer: async empty folder cleanup via metadata events (#7614) * filer: async empty folder cleanup via metadata events Implements asynchronous empty folder cleanup when files are deleted in S3. Key changes: 1. EmptyFolderCleaner - New component that handles folder cleanup: - Uses consistent hashing (LockRing) to determine folder ownership - Each filer owns specific folders, avoiding duplicate cleanup work - Debounces delete events (10s delay) to batch multiple deletes - Caches rough folder counts to skip unnecessary checks - Cancels pending cleanup when new files are created - Handles both file and subdirectory deletions 2. Integration with metadata events: - Listens to both local and remote filer metadata events - Processes create/delete/rename events to track folder state - Only processes folders under /buckets/<bucket>/... 3. Removed synchronous empty folder cleanup from S3 handlers: - DeleteObjectHandler no longer calls DoDeleteEmptyParentDirectories - DeleteMultipleObjectsHandler no longer tracks/cleans directories - Cleanup now happens asynchronously via metadata events Benefits: - Non-blocking: S3 delete requests return immediately - Coordinated: Only one filer (the owner) cleans each folder - Efficient: Batching and caching reduce unnecessary checks - Event-driven: Folder deletion triggers parent folder check automatically * filer: add CleanupQueue data structure for deduplicated folder cleanup CleanupQueue uses a linked list for FIFO ordering and a hashmap for O(1) deduplication.
Processing is triggered when: - Queue size reaches maxSize (default 1000), OR - Oldest item exceeds maxAge (default 10 minutes) Key features: - O(1) Add, Remove, Pop, Contains operations - Duplicate folders are ignored (keeps original position/time) - Testable with injectable time function - Thread-safe with mutex protection * filer: use CleanupQueue for empty folder cleanup Replace timer-per-folder approach with queue-based processing: - Use CleanupQueue for deduplication and ordered processing - Process queue when full (1000 items) or oldest item exceeds 10 minutes - Background processor checks queue every 10 seconds - Remove from queue on create events to cancel pending cleanup Benefits: - Bounded memory: queue has max size, not unlimited timers - Efficient: O(1) add/remove/contains operations - Batch processing: handle many folders efficiently - Better for high-volume delete scenarios * filer: CleanupQueue.Add moves duplicate to back with updated time When adding a folder that already exists in the queue: - Remove it from its current position - Add it to the back of the queue - Update the queue time to current time This ensures that folders with recent delete activity are processed later, giving more time for additional deletes to occur. * filer: CleanupQueue uses event time and inserts in sorted order Changes: - Add() now takes eventTime parameter instead of using current time - Insert items in time-sorted order (oldest at front) to handle out-of-order events - When updating duplicate with newer time, reposition to maintain sort order - Ignore updates with older time (keep existing later time) This ensures proper ordering when processing events from distributed filers where event arrival order may not match event occurrence order. * filer: remove unused CleanupQueue functions (SetNowFunc, GetAll) Removed test-only functions: - SetNowFunc: tests now use real time with past event times - GetAll: tests now use Pop() to verify order Kept functions used in production: - Peek: used in filer_notify_read.go - OldestAge: used in empty_folder_cleaner.go logging * filer: initialize cache entry on first delete/create event Previously, roughCount was only updated if the cache entry already existed, but entries were only created during executeCleanup. This meant delete/create events before the first cleanup didn't track the count. Now create the cache entry on first event, so roughCount properly tracks all changes from the start. * filer: skip adding to cleanup queue if roughCount > 0 If the cached roughCount indicates there are still items in the folder, don't bother adding it to the cleanup queue. This avoids unnecessary queue entries and reduces wasted cleanup checks. * filer: don't create cache entry on create event Only update roughCount if the folder is already being tracked. New folders don't need tracking until we see a delete event. * filer: move empty folder cleanup to its own package - Created weed/filer/empty_folder_cleanup package - Defined FilerOperations interface to break circular dependency - Added CountDirectoryEntries method to Filer - Exported IsUnderPath and IsUnderBucketPath helper functions * filer: make isUnderPath and isUnderBucketPath private These helpers are only used within the empty_folder_cleanup package. 
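For illustration, a minimal sketch of the ownership rule described above. It mirrors hashKeyToServer from this patch: hash the folder path, take the absolute value, and index into the current peer list. The peer addresses and folder path are made up; in a running filer the list comes from the Dlm LockRing snapshot.

    package main

    import (
    	"fmt"

    	"github.com/seaweedfs/seaweedfs/weed/pb"
    	"github.com/seaweedfs/seaweedfs/weed/util"
    )

    func main() {
    	// Snapshot of peer filers, as LockRing.GetSnapshot() would report it.
    	servers := []pb.ServerAddress{"filer1:8888", "filer2:8888", "filer3:8888"}

    	// Hash the folder path onto exactly one peer; only that peer runs cleanup.
    	x := util.HashStringToLong("/buckets/mybucket/photos")
    	if x < 0 {
    		x = -x
    	}
    	fmt.Println("cleanup owner:", servers[x%int64(len(servers))])
    }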
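A similar sketch of the CleanupQueue semantics, using only the exported API added below; the bucket paths and the 11-minute offset are illustrative:

    package main

    import (
    	"fmt"
    	"time"

    	"github.com/seaweedfs/seaweedfs/weed/filer/empty_folder_cleanup"
    )

    func main() {
    	q := empty_folder_cleanup.NewCleanupQueue(1000, 10*time.Minute)
    	base := time.Now().Add(-11 * time.Minute) // events that occurred a while ago

    	// Out-of-order arrival: items stay sorted by event time, oldest first.
    	q.Add("/buckets/b1/photos", base.Add(2*time.Second))
    	q.Add("/buckets/b1/logs", base.Add(1*time.Second))

    	// A duplicate with a newer event time is repositioned, not added twice.
    	q.Add("/buckets/b1/photos", base.Add(5*time.Second))

    	// A create event in a folder cancels its pending cleanup.
    	q.Remove("/buckets/b1/logs")

    	fmt.Println(q.ShouldProcess()) // true: the oldest entry exceeds maxAge
    	folder, ok := q.Pop()
    	fmt.Println(folder, ok) // /buckets/b1/photos true
    }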
--- weed/filer/empty_folder_cleanup/cleanup_queue.go | 206 ++++++++ .../empty_folder_cleanup/cleanup_queue_test.go | 370 ++++++++++++++ .../empty_folder_cleanup/empty_folder_cleaner.go | 436 ++++++++++++++++ .../empty_folder_cleaner_test.go | 569 +++++++++++++++++++++ weed/filer/filer.go | 8 + weed/filer/filer_notify.go | 39 ++ weed/filer/filer_on_meta_event.go | 39 ++ weed/filer/filer_search.go | 13 + weed/s3api/s3api_object_handlers_delete.go | 57 +-- 9 files changed, 1685 insertions(+), 52 deletions(-) create mode 100644 weed/filer/empty_folder_cleanup/cleanup_queue.go create mode 100644 weed/filer/empty_folder_cleanup/cleanup_queue_test.go create mode 100644 weed/filer/empty_folder_cleanup/empty_folder_cleaner.go create mode 100644 weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go diff --git a/weed/filer/empty_folder_cleanup/cleanup_queue.go b/weed/filer/empty_folder_cleanup/cleanup_queue.go new file mode 100644 index 000000000..66889e930 --- /dev/null +++ b/weed/filer/empty_folder_cleanup/cleanup_queue.go @@ -0,0 +1,206 @@ +package empty_folder_cleanup + +import ( + "container/list" + "sync" + "time" +) + +// CleanupQueue manages a deduplicated queue of folders pending cleanup. +// It uses a doubly-linked list ordered by event time (oldest at front) and a map for O(1) deduplication. +// Processing is triggered when: +// - Queue size reaches maxSize, OR +// - Oldest item exceeds maxAge +type CleanupQueue struct { + mu sync.Mutex + items *list.List // Linked list of *queueItem ordered by time (front = oldest) + itemsMap map[string]*list.Element // folder -> list element for O(1) lookup + maxSize int // Max queue size before triggering cleanup + maxAge time.Duration // Max age before triggering cleanup +} + +// queueItem represents an item in the cleanup queue +type queueItem struct { + folder string + queueTime time.Time +} + +// NewCleanupQueue creates a new CleanupQueue with the specified limits +func NewCleanupQueue(maxSize int, maxAge time.Duration) *CleanupQueue { + return &CleanupQueue{ + items: list.New(), + itemsMap: make(map[string]*list.Element), + maxSize: maxSize, + maxAge: maxAge, + } +} + +// Add adds a folder to the queue with the specified event time. +// The item is inserted in time-sorted order (oldest at front) to handle out-of-order events. +// If folder already exists with an older time, the time is updated and position adjusted. +// Returns true if the folder was newly added, false if it was updated. 
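+//
+// A usage sketch (t0 is an illustrative time.Time value):
+//
+//	q := NewCleanupQueue(1000, 10*time.Minute)
+//	q.Add("/buckets/b1/a", t0)                  // new folder: returns true
+//	q.Add("/buckets/b1/a", t0.Add(time.Second)) // newer duplicate: repositioned, returns false
+//	q.Add("/buckets/b1/a", t0)                  // older duplicate: ignored, returns false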
+func (q *CleanupQueue) Add(folder string, eventTime time.Time) bool { + q.mu.Lock() + defer q.mu.Unlock() + + // Check if folder already exists + if elem, exists := q.itemsMap[folder]; exists { + existingItem := elem.Value.(*queueItem) + // Only update if new event is later + if eventTime.After(existingItem.queueTime) { + // Remove from current position + q.items.Remove(elem) + // Re-insert with new time in sorted position + newElem := q.insertSorted(folder, eventTime) + q.itemsMap[folder] = newElem + } + return false + } + + // Insert new folder in sorted position + elem := q.insertSorted(folder, eventTime) + q.itemsMap[folder] = elem + return true +} + +// insertSorted inserts an item in the correct position to maintain time ordering (oldest at front) +func (q *CleanupQueue) insertSorted(folder string, eventTime time.Time) *list.Element { + item := &queueItem{ + folder: folder, + queueTime: eventTime, + } + + // Find the correct position (insert before the first item with a later time) + for elem := q.items.Back(); elem != nil; elem = elem.Prev() { + existingItem := elem.Value.(*queueItem) + if !eventTime.Before(existingItem.queueTime) { + // Insert after this element + return q.items.InsertAfter(item, elem) + } + } + + // This item is the oldest, insert at front + return q.items.PushFront(item) +} + +// Remove removes a specific folder from the queue (e.g., when a file is created). +// Returns true if the folder was found and removed. +func (q *CleanupQueue) Remove(folder string) bool { + q.mu.Lock() + defer q.mu.Unlock() + + elem, exists := q.itemsMap[folder] + if !exists { + return false + } + + q.items.Remove(elem) + delete(q.itemsMap, folder) + return true +} + +// ShouldProcess returns true if the queue should be processed. +// This is true when: +// - Queue size >= maxSize, OR +// - Oldest item age > maxAge +func (q *CleanupQueue) ShouldProcess() bool { + q.mu.Lock() + defer q.mu.Unlock() + + return q.shouldProcessLocked() +} + +// shouldProcessLocked checks if processing is needed (caller must hold lock) +func (q *CleanupQueue) shouldProcessLocked() bool { + if q.items.Len() == 0 { + return false + } + + // Check if queue is full + if q.items.Len() >= q.maxSize { + return true + } + + // Check if oldest item exceeds max age + front := q.items.Front() + if front != nil { + item := front.Value.(*queueItem) + if time.Since(item.queueTime) > q.maxAge { + return true + } + } + + return false +} + +// Pop removes and returns the oldest folder from the queue. +// Returns the folder and true if an item was available, or empty string and false if queue is empty. +func (q *CleanupQueue) Pop() (string, bool) { + q.mu.Lock() + defer q.mu.Unlock() + + front := q.items.Front() + if front == nil { + return "", false + } + + item := front.Value.(*queueItem) + q.items.Remove(front) + delete(q.itemsMap, item.folder) + + return item.folder, true +} + +// Peek returns the oldest folder without removing it. +// Returns the folder and queue time if available, or empty values if queue is empty. +func (q *CleanupQueue) Peek() (folder string, queueTime time.Time, ok bool) { + q.mu.Lock() + defer q.mu.Unlock() + + front := q.items.Front() + if front == nil { + return "", time.Time{}, false + } + + item := front.Value.(*queueItem) + return item.folder, item.queueTime, true +} + +// Len returns the current queue size. +func (q *CleanupQueue) Len() int { + q.mu.Lock() + defer q.mu.Unlock() + return q.items.Len() +} + +// Contains checks if a folder is in the queue. 
+func (q *CleanupQueue) Contains(folder string) bool { + q.mu.Lock() + defer q.mu.Unlock() + _, exists := q.itemsMap[folder] + return exists +} + +// Clear removes all items from the queue. +func (q *CleanupQueue) Clear() { + q.mu.Lock() + defer q.mu.Unlock() + + q.items.Init() + q.itemsMap = make(map[string]*list.Element) +} + +// OldestAge returns the age of the oldest item in the queue, or 0 if empty. +func (q *CleanupQueue) OldestAge() time.Duration { + q.mu.Lock() + defer q.mu.Unlock() + + front := q.items.Front() + if front == nil { + return 0 + } + + item := front.Value.(*queueItem) + return time.Since(item.queueTime) +} + diff --git a/weed/filer/empty_folder_cleanup/cleanup_queue_test.go b/weed/filer/empty_folder_cleanup/cleanup_queue_test.go new file mode 100644 index 000000000..eda1c3633 --- /dev/null +++ b/weed/filer/empty_folder_cleanup/cleanup_queue_test.go @@ -0,0 +1,370 @@ +package empty_folder_cleanup + +import ( + "testing" + "time" +) + +func TestCleanupQueue_Add(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + now := time.Now() + + // Add first item + if !q.Add("/buckets/b1/folder1", now) { + t.Error("expected Add to return true for new item") + } + if q.Len() != 1 { + t.Errorf("expected len 1, got %d", q.Len()) + } + + // Add second item with later time + if !q.Add("/buckets/b1/folder2", now.Add(1*time.Second)) { + t.Error("expected Add to return true for new item") + } + if q.Len() != 2 { + t.Errorf("expected len 2, got %d", q.Len()) + } + + // Add duplicate with newer time - should update and reposition + if q.Add("/buckets/b1/folder1", now.Add(2*time.Second)) { + t.Error("expected Add to return false for existing item") + } + if q.Len() != 2 { + t.Errorf("expected len 2 after duplicate, got %d", q.Len()) + } + + // folder1 should now be at the back (newer time) - verify by popping + folder1, _ := q.Pop() + folder2, _ := q.Pop() + if folder1 != "/buckets/b1/folder2" || folder2 != "/buckets/b1/folder1" { + t.Errorf("expected folder1 to be moved to back, got %s, %s", folder1, folder2) + } +} + +func TestCleanupQueue_Add_OutOfOrder(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + baseTime := time.Now() + + // Add items out of order + q.Add("/buckets/b1/folder3", baseTime.Add(3*time.Second)) + q.Add("/buckets/b1/folder1", baseTime.Add(1*time.Second)) + q.Add("/buckets/b1/folder2", baseTime.Add(2*time.Second)) + + // Items should be in time order (oldest first) - verify by popping + expected := []string{"/buckets/b1/folder1", "/buckets/b1/folder2", "/buckets/b1/folder3"} + for i, exp := range expected { + folder, ok := q.Pop() + if !ok || folder != exp { + t.Errorf("at index %d: expected %s, got %s", i, exp, folder) + } + } +} + +func TestCleanupQueue_Add_DuplicateWithOlderTime(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + baseTime := time.Now() + + // Add folder at t=5 + q.Add("/buckets/b1/folder1", baseTime.Add(5*time.Second)) + + // Try to add same folder with older time - should NOT update + q.Add("/buckets/b1/folder1", baseTime.Add(2*time.Second)) + + // Time should remain at t=5 + _, queueTime, _ := q.Peek() + if queueTime != baseTime.Add(5*time.Second) { + t.Errorf("expected time to remain unchanged, got %v", queueTime) + } +} + +func TestCleanupQueue_Remove(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + now := time.Now() + + q.Add("/buckets/b1/folder1", now) + q.Add("/buckets/b1/folder2", now.Add(1*time.Second)) + q.Add("/buckets/b1/folder3", now.Add(2*time.Second)) + + // Remove middle item + if 
!q.Remove("/buckets/b1/folder2") { + t.Error("expected Remove to return true for existing item") + } + if q.Len() != 2 { + t.Errorf("expected len 2, got %d", q.Len()) + } + if q.Contains("/buckets/b1/folder2") { + t.Error("removed item should not be in queue") + } + + // Remove non-existent item + if q.Remove("/buckets/b1/nonexistent") { + t.Error("expected Remove to return false for non-existent item") + } + + // Verify order is preserved by popping + folder1, _ := q.Pop() + folder3, _ := q.Pop() + if folder1 != "/buckets/b1/folder1" || folder3 != "/buckets/b1/folder3" { + t.Errorf("unexpected order: %s, %s", folder1, folder3) + } +} + +func TestCleanupQueue_Pop(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + now := time.Now() + + // Pop from empty queue + folder, ok := q.Pop() + if ok { + t.Error("expected Pop to return false for empty queue") + } + if folder != "" { + t.Errorf("expected empty folder, got %s", folder) + } + + // Add items and pop in order + q.Add("/buckets/b1/folder1", now) + q.Add("/buckets/b1/folder2", now.Add(1*time.Second)) + q.Add("/buckets/b1/folder3", now.Add(2*time.Second)) + + folder, ok = q.Pop() + if !ok || folder != "/buckets/b1/folder1" { + t.Errorf("expected folder1, got %s (ok=%v)", folder, ok) + } + + folder, ok = q.Pop() + if !ok || folder != "/buckets/b1/folder2" { + t.Errorf("expected folder2, got %s (ok=%v)", folder, ok) + } + + folder, ok = q.Pop() + if !ok || folder != "/buckets/b1/folder3" { + t.Errorf("expected folder3, got %s (ok=%v)", folder, ok) + } + + // Queue should be empty now + if q.Len() != 0 { + t.Errorf("expected empty queue, got len %d", q.Len()) + } +} + +func TestCleanupQueue_Peek(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + now := time.Now() + + // Peek empty queue + folder, _, ok := q.Peek() + if ok { + t.Error("expected Peek to return false for empty queue") + } + + // Add item and peek + q.Add("/buckets/b1/folder1", now) + folder, queueTime, ok := q.Peek() + if !ok || folder != "/buckets/b1/folder1" { + t.Errorf("expected folder1, got %s (ok=%v)", folder, ok) + } + if queueTime != now { + t.Errorf("expected queue time %v, got %v", now, queueTime) + } + + // Peek should not remove item + if q.Len() != 1 { + t.Errorf("Peek should not remove item, len=%d", q.Len()) + } +} + +func TestCleanupQueue_Contains(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + now := time.Now() + + q.Add("/buckets/b1/folder1", now) + + if !q.Contains("/buckets/b1/folder1") { + t.Error("expected Contains to return true") + } + if q.Contains("/buckets/b1/folder2") { + t.Error("expected Contains to return false for non-existent") + } +} + +func TestCleanupQueue_ShouldProcess_MaxSize(t *testing.T) { + q := NewCleanupQueue(3, 10*time.Minute) + now := time.Now() + + // Empty queue + if q.ShouldProcess() { + t.Error("empty queue should not need processing") + } + + // Add items below max + q.Add("/buckets/b1/folder1", now) + q.Add("/buckets/b1/folder2", now.Add(1*time.Second)) + if q.ShouldProcess() { + t.Error("queue below max should not need processing") + } + + // Add item to reach max + q.Add("/buckets/b1/folder3", now.Add(2*time.Second)) + if !q.ShouldProcess() { + t.Error("queue at max should need processing") + } +} + +func TestCleanupQueue_ShouldProcess_MaxAge(t *testing.T) { + q := NewCleanupQueue(100, 100*time.Millisecond) // Short max age for testing + + // Add item with old event time + oldTime := time.Now().Add(-1 * time.Second) // 1 second ago + q.Add("/buckets/b1/folder1", oldTime) + + // Item is older 
than maxAge, should need processing + if !q.ShouldProcess() { + t.Error("old item should trigger processing") + } + + // Clear and add fresh item + q.Clear() + q.Add("/buckets/b1/folder2", time.Now()) + + // Fresh item should not trigger processing + if q.ShouldProcess() { + t.Error("fresh item should not trigger processing") + } +} + +func TestCleanupQueue_Clear(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + now := time.Now() + + q.Add("/buckets/b1/folder1", now) + q.Add("/buckets/b1/folder2", now.Add(1*time.Second)) + q.Add("/buckets/b1/folder3", now.Add(2*time.Second)) + + q.Clear() + + if q.Len() != 0 { + t.Errorf("expected empty queue after Clear, got len %d", q.Len()) + } + if q.Contains("/buckets/b1/folder1") { + t.Error("queue should not contain items after Clear") + } +} + +func TestCleanupQueue_OldestAge(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + + // Empty queue + if q.OldestAge() != 0 { + t.Error("empty queue should have zero oldest age") + } + + // Add item with time in the past + oldTime := time.Now().Add(-5 * time.Minute) + q.Add("/buckets/b1/folder1", oldTime) + + // Age should be approximately 5 minutes + age := q.OldestAge() + if age < 4*time.Minute || age > 6*time.Minute { + t.Errorf("expected ~5m age, got %v", age) + } +} + +func TestCleanupQueue_TimeOrder(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + baseTime := time.Now() + + // Add items in order + items := []string{ + "/buckets/b1/a", + "/buckets/b1/b", + "/buckets/b1/c", + "/buckets/b1/d", + "/buckets/b1/e", + } + for i, item := range items { + q.Add(item, baseTime.Add(time.Duration(i)*time.Second)) + } + + // Pop should return in time order + for i, expected := range items { + got, ok := q.Pop() + if !ok { + t.Errorf("Pop %d: expected item, got empty", i) + } + if got != expected { + t.Errorf("Pop %d: expected %s, got %s", i, expected, got) + } + } +} + +func TestCleanupQueue_DuplicateWithNewerTime(t *testing.T) { + q := NewCleanupQueue(100, 10*time.Minute) + baseTime := time.Now() + + // Add items + q.Add("/buckets/b1/folder1", baseTime) + q.Add("/buckets/b1/folder2", baseTime.Add(1*time.Second)) + q.Add("/buckets/b1/folder3", baseTime.Add(2*time.Second)) + + // Add duplicate with newer time - should update and reposition + q.Add("/buckets/b1/folder1", baseTime.Add(3*time.Second)) + + // folder1 should now be at the back (newest time) - verify by popping + expected := []string{"/buckets/b1/folder2", "/buckets/b1/folder3", "/buckets/b1/folder1"} + for i, exp := range expected { + folder, ok := q.Pop() + if !ok || folder != exp { + t.Errorf("at index %d: expected %s, got %s", i, exp, folder) + } + } +} + +func TestCleanupQueue_Concurrent(t *testing.T) { + q := NewCleanupQueue(1000, 10*time.Minute) + done := make(chan bool) + now := time.Now() + + // Concurrent adds + go func() { + for i := 0; i < 100; i++ { + q.Add("/buckets/b1/folder"+string(rune('A'+i%26)), now.Add(time.Duration(i)*time.Millisecond)) + } + done <- true + }() + + // Concurrent removes + go func() { + for i := 0; i < 50; i++ { + q.Remove("/buckets/b1/folder" + string(rune('A'+i%26))) + } + done <- true + }() + + // Concurrent pops + go func() { + for i := 0; i < 30; i++ { + q.Pop() + } + done <- true + }() + + // Concurrent reads + go func() { + for i := 0; i < 100; i++ { + q.Len() + q.Contains("/buckets/b1/folderA") + q.ShouldProcess() + } + done <- true + }() + + // Wait for all goroutines + for i := 0; i < 4; i++ { + <-done + } + + // Just verify no panic occurred and queue is in consistent state 
+ _ = q.Len() +} + diff --git a/weed/filer/empty_folder_cleanup/empty_folder_cleaner.go b/weed/filer/empty_folder_cleanup/empty_folder_cleaner.go new file mode 100644 index 000000000..70856aaf1 --- /dev/null +++ b/weed/filer/empty_folder_cleanup/empty_folder_cleaner.go @@ -0,0 +1,436 @@ +package empty_folder_cleanup + +import ( + "context" + "strings" + "sync" + "time" + + "github.com/seaweedfs/seaweedfs/weed/cluster/lock_manager" + "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/pb" + "github.com/seaweedfs/seaweedfs/weed/util" +) + +const ( + DefaultMaxCountCheck = 1000 + DefaultCacheExpiry = 5 * time.Minute + DefaultQueueMaxSize = 1000 + DefaultQueueMaxAge = 10 * time.Minute + DefaultProcessorSleep = 10 * time.Second // How often to check queue +) + +// FilerOperations defines the filer operations needed by EmptyFolderCleaner +type FilerOperations interface { + CountDirectoryEntries(ctx context.Context, dirPath util.FullPath, limit int) (count int, err error) + DeleteEntryMetaAndData(ctx context.Context, p util.FullPath, isRecursive, ignoreRecursiveError, shouldDeleteChunks, isFromOtherCluster bool, signatures []int32, ifNotModifiedAfter int64) error +} + +// folderState tracks the state of a folder for empty folder cleanup +type folderState struct { + roughCount int // Cached rough count (up to maxCountCheck) + lastAddTime time.Time // Last time an item was added + lastDelTime time.Time // Last time an item was deleted + lastCheck time.Time // Last time we checked the actual count +} + +// EmptyFolderCleaner handles asynchronous cleanup of empty folders +// Each filer owns specific folders via consistent hashing based on the peer filer list +type EmptyFolderCleaner struct { + filer FilerOperations + lockRing *lock_manager.LockRing + host pb.ServerAddress + + // Folder state tracking + mu sync.RWMutex + folderCounts map[string]*folderState // Rough count cache + + // Cleanup queue (thread-safe, has its own lock) + cleanupQueue *CleanupQueue + + // Configuration + maxCountCheck int // Max items to count (1000) + cacheExpiry time.Duration // How long to keep cache entries + processorSleep time.Duration // How often processor checks queue + bucketPath string // e.g., "/buckets" + + // Control + enabled bool + stopCh chan struct{} +} + +// NewEmptyFolderCleaner creates a new EmptyFolderCleaner +func NewEmptyFolderCleaner(filer FilerOperations, lockRing *lock_manager.LockRing, host pb.ServerAddress, bucketPath string) *EmptyFolderCleaner { + efc := &EmptyFolderCleaner{ + filer: filer, + lockRing: lockRing, + host: host, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(DefaultQueueMaxSize, DefaultQueueMaxAge), + maxCountCheck: DefaultMaxCountCheck, + cacheExpiry: DefaultCacheExpiry, + processorSleep: DefaultProcessorSleep, + bucketPath: bucketPath, + enabled: true, + stopCh: make(chan struct{}), + } + go efc.cacheEvictionLoop() + go efc.cleanupProcessor() + return efc +} + +// SetEnabled enables or disables the cleaner +func (efc *EmptyFolderCleaner) SetEnabled(enabled bool) { + efc.mu.Lock() + defer efc.mu.Unlock() + efc.enabled = enabled +} + +// IsEnabled returns whether the cleaner is enabled +func (efc *EmptyFolderCleaner) IsEnabled() bool { + efc.mu.RLock() + defer efc.mu.RUnlock() + return efc.enabled +} + +// ownsFolder checks if this filer owns the folder via consistent hashing +func (efc *EmptyFolderCleaner) ownsFolder(folder string) bool { + servers := efc.lockRing.GetSnapshot() + if len(servers) <= 1 { + return true // 
Single filer case + } + return efc.hashKeyToServer(folder, servers) == efc.host +} + +// hashKeyToServer uses consistent hashing to map a folder to a server +func (efc *EmptyFolderCleaner) hashKeyToServer(key string, servers []pb.ServerAddress) pb.ServerAddress { + if len(servers) == 0 { + return "" + } + x := util.HashStringToLong(key) + if x < 0 { + x = -x + } + x = x % int64(len(servers)) + return servers[x] +} + +// OnDeleteEvent is called when a file or directory is deleted +// Both file and directory deletions count towards making the parent folder empty +// eventTime is the time when the delete event occurred (for proper ordering) +func (efc *EmptyFolderCleaner) OnDeleteEvent(directory string, entryName string, isDirectory bool, eventTime time.Time) { + // Skip if not under bucket path (must be at least /buckets/<bucket>/...) + if efc.bucketPath != "" && !isUnderBucketPath(directory, efc.bucketPath) { + return + } + + // Check if we own this folder + if !efc.ownsFolder(directory) { + glog.V(4).Infof("EmptyFolderCleaner: not owner of %s, skipping", directory) + return + } + + efc.mu.Lock() + defer efc.mu.Unlock() + + // Check enabled inside lock to avoid race with Stop() + if !efc.enabled { + return + } + + glog.V(3).Infof("EmptyFolderCleaner: delete event in %s/%s (isDir=%v)", directory, entryName, isDirectory) + + // Update cached count (create entry if needed) + state, exists := efc.folderCounts[directory] + if !exists { + state = &folderState{} + efc.folderCounts[directory] = state + } + if state.roughCount > 0 { + state.roughCount-- + } + state.lastDelTime = eventTime + + // Only add to cleanup queue if roughCount suggests folder might be empty + if state.roughCount > 0 { + glog.V(3).Infof("EmptyFolderCleaner: skipping queue for %s, roughCount=%d", directory, state.roughCount) + return + } + + // Add to cleanup queue with event time (handles out-of-order events) + if efc.cleanupQueue.Add(directory, eventTime) { + glog.V(3).Infof("EmptyFolderCleaner: queued %s for cleanup", directory) + } +} + +// OnCreateEvent is called when a file or directory is created +// Both file and directory creations cancel pending cleanup for the parent folder +func (efc *EmptyFolderCleaner) OnCreateEvent(directory string, entryName string, isDirectory bool) { + // Skip if not under bucket path (must be at least /buckets/<bucket>/...)
+ if efc.bucketPath != "" && !isUnderBucketPath(directory, efc.bucketPath) { + return + } + + efc.mu.Lock() + defer efc.mu.Unlock() + + // Check enabled inside lock to avoid race with Stop() + if !efc.enabled { + return + } + + // Update cached count only if already tracked (no need to track new folders) + if state, exists := efc.folderCounts[directory]; exists { + state.roughCount++ + state.lastAddTime = time.Now() + } + + // Remove from cleanup queue (cancel pending cleanup) + if efc.cleanupQueue.Remove(directory) { + glog.V(3).Infof("EmptyFolderCleaner: cancelled cleanup for %s due to new entry", directory) + } +} + +// cleanupProcessor runs in background and processes the cleanup queue +func (efc *EmptyFolderCleaner) cleanupProcessor() { + ticker := time.NewTicker(efc.processorSleep) + defer ticker.Stop() + + for { + select { + case <-efc.stopCh: + return + case <-ticker.C: + efc.processCleanupQueue() + } + } +} + +// processCleanupQueue processes items from the cleanup queue +func (efc *EmptyFolderCleaner) processCleanupQueue() { + // Check if we should process + if !efc.cleanupQueue.ShouldProcess() { + return + } + + glog.V(3).Infof("EmptyFolderCleaner: processing cleanup queue (len=%d, age=%v)", + efc.cleanupQueue.Len(), efc.cleanupQueue.OldestAge()) + + // Process all items that are ready + for efc.cleanupQueue.Len() > 0 { + // Check if still enabled + if !efc.IsEnabled() { + return + } + + // Pop the oldest item + folder, ok := efc.cleanupQueue.Pop() + if !ok { + break + } + + // Execute cleanup for this folder + efc.executeCleanup(folder) + + // If queue is no longer full and oldest item is not old enough, stop processing + if !efc.cleanupQueue.ShouldProcess() { + break + } + } +} + +// executeCleanup performs the actual cleanup of an empty folder +func (efc *EmptyFolderCleaner) executeCleanup(folder string) { + efc.mu.Lock() + + // Quick check: if we have cached count and it's > 0, skip + if state, exists := efc.folderCounts[folder]; exists { + if state.roughCount > 0 { + glog.V(3).Infof("EmptyFolderCleaner: skipping %s, cached count=%d", folder, state.roughCount) + efc.mu.Unlock() + return + } + // If there was an add after our delete, skip + if !state.lastAddTime.IsZero() && state.lastAddTime.After(state.lastDelTime) { + glog.V(3).Infof("EmptyFolderCleaner: skipping %s, add happened after delete", folder) + efc.mu.Unlock() + return + } + } + efc.mu.Unlock() + + // Re-check ownership (topology might have changed) + if !efc.ownsFolder(folder) { + glog.V(3).Infof("EmptyFolderCleaner: no longer owner of %s, skipping", folder) + return + } + + // Check if folder is actually empty (count up to maxCountCheck) + ctx := context.Background() + count, err := efc.countItems(ctx, folder) + if err != nil { + glog.V(2).Infof("EmptyFolderCleaner: error counting items in %s: %v", folder, err) + return + } + + efc.mu.Lock() + // Update cache + if _, exists := efc.folderCounts[folder]; !exists { + efc.folderCounts[folder] = &folderState{} + } + efc.folderCounts[folder].roughCount = count + efc.folderCounts[folder].lastCheck = time.Now() + efc.mu.Unlock() + + if count > 0 { + glog.V(3).Infof("EmptyFolderCleaner: folder %s has %d items, not empty", folder, count) + return + } + + // Delete the empty folder + glog.V(2).Infof("EmptyFolderCleaner: deleting empty folder %s", folder) + if err := efc.deleteFolder(ctx, folder); err != nil { + glog.V(2).Infof("EmptyFolderCleaner: failed to delete empty folder %s: %v", folder, err) + return + } + + // Clean up cache entry + efc.mu.Lock() + 
delete(efc.folderCounts, folder) + efc.mu.Unlock() + + // Note: No need to recursively check parent folder here. + // The deletion of this folder will generate a metadata event, + // which will trigger OnDeleteEvent for the parent folder. +} + +// countItems counts items in a folder (up to maxCountCheck) +func (efc *EmptyFolderCleaner) countItems(ctx context.Context, folder string) (int, error) { + return efc.filer.CountDirectoryEntries(ctx, util.FullPath(folder), efc.maxCountCheck) +} + +// deleteFolder deletes an empty folder +func (efc *EmptyFolderCleaner) deleteFolder(ctx context.Context, folder string) error { + return efc.filer.DeleteEntryMetaAndData(ctx, util.FullPath(folder), false, false, false, false, nil, 0) +} + +// isUnderPath checks if child is under parent path +func isUnderPath(child, parent string) bool { + if parent == "" || parent == "/" { + return true + } + // Ensure parent ends without slash for proper prefix matching + if len(parent) > 0 && parent[len(parent)-1] == '/' { + parent = parent[:len(parent)-1] + } + // Child must start with parent and then have a / or be exactly parent + if len(child) < len(parent) { + return false + } + if child[:len(parent)] != parent { + return false + } + if len(child) == len(parent) { + return true + } + return child[len(parent)] == '/' +} + +// isUnderBucketPath checks if directory is inside a bucket (under /buckets/<bucket>/...) +// This ensures we only clean up folders inside buckets, not the buckets themselves +func isUnderBucketPath(directory, bucketPath string) bool { + if bucketPath == "" { + return true + } + // Ensure bucketPath ends without slash + if len(bucketPath) > 0 && bucketPath[len(bucketPath)-1] == '/' { + bucketPath = bucketPath[:len(bucketPath)-1] + } + // Directory must be under bucketPath + if !isUnderPath(directory, bucketPath) { + return false + } + // Directory must be at least /buckets/<bucket>/<folder> + // i.e., depth must be at least bucketPath depth + 2 + // For /buckets (depth 1), we need at least /buckets/mybucket/folder (depth 3) + bucketPathDepth := strings.Count(bucketPath, "/") + directoryDepth := strings.Count(directory, "/") + return directoryDepth >= bucketPathDepth+2 +} + +// cacheEvictionLoop periodically removes stale entries from folderCounts +func (efc *EmptyFolderCleaner) cacheEvictionLoop() { + ticker := time.NewTicker(efc.cacheExpiry) + defer ticker.Stop() + + for { + select { + case <-efc.stopCh: + return + case <-ticker.C: + efc.evictStaleCacheEntries() + } + } +} + +// evictStaleCacheEntries removes cache entries that haven't been accessed recently +func (efc *EmptyFolderCleaner) evictStaleCacheEntries() { + efc.mu.Lock() + defer efc.mu.Unlock() + + now := time.Now() + expiredCount := 0 + for folder, state := range efc.folderCounts { + // Skip if folder is in cleanup queue + if efc.cleanupQueue.Contains(folder) { + continue + } + + // Find the most recent activity time for this folder + lastActivity := state.lastCheck + if state.lastAddTime.After(lastActivity) { + lastActivity = state.lastAddTime + } + if state.lastDelTime.After(lastActivity) { + lastActivity = state.lastDelTime + } + + // Evict if no activity within cache expiry period + if now.Sub(lastActivity) > efc.cacheExpiry { + delete(efc.folderCounts, folder) + expiredCount++ + } + } + + if expiredCount > 0 { + glog.V(3).Infof("EmptyFolderCleaner: evicted %d stale cache entries", expiredCount) + } +} + +// Stop stops the cleaner and cancels all pending tasks +func (efc *EmptyFolderCleaner) Stop() { + close(efc.stopCh) + + efc.mu.Lock() + defer
efc.mu.Unlock() + + efc.enabled = false + efc.cleanupQueue.Clear() + efc.folderCounts = make(map[string]*folderState) // Clear cache on stop +} + +// GetPendingCleanupCount returns the number of pending cleanup tasks (for testing) +func (efc *EmptyFolderCleaner) GetPendingCleanupCount() int { + return efc.cleanupQueue.Len() +} + +// GetCachedFolderCount returns the cached count for a folder (for testing) +func (efc *EmptyFolderCleaner) GetCachedFolderCount(folder string) (int, bool) { + efc.mu.RLock() + defer efc.mu.RUnlock() + if state, exists := efc.folderCounts[folder]; exists { + return state.roughCount, true + } + return 0, false +} + diff --git a/weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go b/weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go new file mode 100644 index 000000000..fbc05ccf8 --- /dev/null +++ b/weed/filer/empty_folder_cleanup/empty_folder_cleaner_test.go @@ -0,0 +1,569 @@ +package empty_folder_cleanup + +import ( + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/weed/cluster/lock_manager" + "github.com/seaweedfs/seaweedfs/weed/pb" +) + +func Test_isUnderPath(t *testing.T) { + tests := []struct { + name string + child string + parent string + expected bool + }{ + {"child under parent", "/buckets/mybucket/folder/file.txt", "/buckets", true}, + {"child is parent", "/buckets", "/buckets", true}, + {"child not under parent", "/other/path", "/buckets", false}, + {"empty parent", "/any/path", "", true}, + {"root parent", "/any/path", "/", true}, + {"parent with trailing slash", "/buckets/mybucket", "/buckets/", true}, + {"similar prefix but not under", "/buckets-other/file", "/buckets", false}, + {"deeply nested", "/buckets/a/b/c/d/e/f", "/buckets/a/b", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := isUnderPath(tt.child, tt.parent) + if result != tt.expected { + t.Errorf("isUnderPath(%q, %q) = %v, want %v", tt.child, tt.parent, result, tt.expected) + } + }) + } +} + +func Test_isUnderBucketPath(t *testing.T) { + tests := []struct { + name string + directory string + bucketPath string + expected bool + }{ + // Should NOT process - bucket path itself + {"bucket path itself", "/buckets", "/buckets", false}, + // Should NOT process - bucket directory (immediate child) + {"bucket directory", "/buckets/mybucket", "/buckets", false}, + // Should process - folder inside bucket + {"folder in bucket", "/buckets/mybucket/folder", "/buckets", true}, + // Should process - nested folder + {"nested folder", "/buckets/mybucket/a/b/c", "/buckets", true}, + // Should NOT process - outside buckets + {"outside buckets", "/other/path", "/buckets", false}, + // Empty bucket path allows all + {"empty bucket path", "/any/path", "", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := isUnderBucketPath(tt.directory, tt.bucketPath) + if result != tt.expected { + t.Errorf("isUnderBucketPath(%q, %q) = %v, want %v", tt.directory, tt.bucketPath, result, tt.expected) + } + }) + } +} + +func TestEmptyFolderCleaner_ownsFolder(t *testing.T) { + // Create a LockRing with multiple servers + lockRing := lock_manager.NewLockRing(5 * time.Second) + + servers := []pb.ServerAddress{ + "filer1:8888", + "filer2:8888", + "filer3:8888", + } + lockRing.SetSnapshot(servers) + + // Create cleaner for filer1 + cleaner1 := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + } + + // Create cleaner for filer2 + cleaner2 := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer2:8888", + } + + 
// Create cleaner for filer3 + cleaner3 := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer3:8888", + } + + // Test that exactly one filer owns each folder + testFolders := []string{ + "/buckets/mybucket/folder1", + "/buckets/mybucket/folder2", + "/buckets/mybucket/folder3", + "/buckets/mybucket/a/b/c", + "/buckets/otherbucket/x", + } + + for _, folder := range testFolders { + ownCount := 0 + if cleaner1.ownsFolder(folder) { + ownCount++ + } + if cleaner2.ownsFolder(folder) { + ownCount++ + } + if cleaner3.ownsFolder(folder) { + ownCount++ + } + + if ownCount != 1 { + t.Errorf("folder %q owned by %d filers, expected exactly 1", folder, ownCount) + } + } +} + +func TestEmptyFolderCleaner_ownsFolder_singleServer(t *testing.T) { + // Create a LockRing with a single server + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + } + + // Single filer should own all folders + testFolders := []string{ + "/buckets/mybucket/folder1", + "/buckets/mybucket/folder2", + "/buckets/otherbucket/x", + } + + for _, folder := range testFolders { + if !cleaner.ownsFolder(folder) { + t.Errorf("single filer should own folder %q", folder) + } + } +} + +func TestEmptyFolderCleaner_ownsFolder_emptyRing(t *testing.T) { + // Create an empty LockRing + lockRing := lock_manager.NewLockRing(5 * time.Second) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + } + + // With empty ring, should own all folders + if !cleaner.ownsFolder("/buckets/mybucket/folder") { + t.Error("should own folder with empty ring") + } +} + +func TestEmptyFolderCleaner_OnCreateEvent_cancelsCleanup(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + folder := "/buckets/mybucket/testfolder" + now := time.Now() + + // Simulate delete event + cleaner.OnDeleteEvent(folder, "file.txt", false, now) + + // Check that cleanup is queued + if cleaner.GetPendingCleanupCount() != 1 { + t.Errorf("expected 1 pending cleanup, got %d", cleaner.GetPendingCleanupCount()) + } + + // Simulate create event + cleaner.OnCreateEvent(folder, "newfile.txt", false) + + // Check that cleanup is cancelled + if cleaner.GetPendingCleanupCount() != 0 { + t.Errorf("expected 0 pending cleanups after create, got %d", cleaner.GetPendingCleanupCount()) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_OnDeleteEvent_deduplication(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + folder := "/buckets/mybucket/testfolder" + now := time.Now() + + // Simulate multiple delete events for same folder + for i := 0; i < 5; i++ { + cleaner.OnDeleteEvent(folder, "file"+string(rune('0'+i))+".txt", false, now.Add(time.Duration(i)*time.Second)) + } + + // Check that only 1 cleanup is queued (deduplicated) + if 
cleaner.GetPendingCleanupCount() != 1 { + t.Errorf("expected 1 pending cleanup after deduplication, got %d", cleaner.GetPendingCleanupCount()) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_OnDeleteEvent_multipleFolders(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + now := time.Now() + + // Delete files in different folders + cleaner.OnDeleteEvent("/buckets/mybucket/folder1", "file.txt", false, now) + cleaner.OnDeleteEvent("/buckets/mybucket/folder2", "file.txt", false, now.Add(1*time.Second)) + cleaner.OnDeleteEvent("/buckets/mybucket/folder3", "file.txt", false, now.Add(2*time.Second)) + + // Each folder should be queued + if cleaner.GetPendingCleanupCount() != 3 { + t.Errorf("expected 3 pending cleanups, got %d", cleaner.GetPendingCleanupCount()) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_OnDeleteEvent_notOwner(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888", "filer2:8888"}) + + // Create cleaner for filer that doesn't own the folder + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + now := time.Now() + + // Try many folders, looking for one that filer1 doesn't own + foundNonOwned := false + for i := 0; i < 100; i++ { + folder := "/buckets/mybucket/folder" + string(rune('0'+i%10)) + string(rune('0'+i/10)) + if !cleaner.ownsFolder(folder) { + // This folder is not owned by filer1 + cleaner.OnDeleteEvent(folder, "file.txt", false, now) + if cleaner.GetPendingCleanupCount() != 0 { + t.Errorf("non-owner should not queue cleanup for folder %s", folder) + } + foundNonOwned = true + break + } + } + + if !foundNonOwned { + t.Skip("could not find a folder not owned by filer1") + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_OnDeleteEvent_disabled(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: false, // Disabled + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + folder := "/buckets/mybucket/testfolder" + now := time.Now() + + // Simulate delete event + cleaner.OnDeleteEvent(folder, "file.txt", false, now) + + // Check that no cleanup is queued when disabled + if cleaner.GetPendingCleanupCount() != 0 { + t.Errorf("disabled cleaner should not queue cleanup, got %d", cleaner.GetPendingCleanupCount()) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_OnDeleteEvent_directoryDeletion(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: 
make(chan struct{}), + } + + folder := "/buckets/mybucket/testfolder" + now := time.Now() + + // Simulate directory delete event - should trigger cleanup + // because subdirectory deletion also makes parent potentially empty + cleaner.OnDeleteEvent(folder, "subdir", true, now) + + // Check that cleanup IS queued for directory deletion + if cleaner.GetPendingCleanupCount() != 1 { + t.Errorf("directory deletion should trigger cleanup, got %d", cleaner.GetPendingCleanupCount()) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_cachedCounts(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + folder := "/buckets/mybucket/testfolder" + + // Initialize cached count + cleaner.folderCounts[folder] = &folderState{roughCount: 5} + + // Simulate create events + cleaner.OnCreateEvent(folder, "newfile1.txt", false) + cleaner.OnCreateEvent(folder, "newfile2.txt", false) + + // Check cached count increased + count, exists := cleaner.GetCachedFolderCount(folder) + if !exists { + t.Error("cached folder count should exist") + } + if count != 7 { + t.Errorf("expected cached count 7, got %d", count) + } + + // Simulate delete events + now := time.Now() + cleaner.OnDeleteEvent(folder, "file1.txt", false, now) + cleaner.OnDeleteEvent(folder, "file2.txt", false, now.Add(1*time.Second)) + + // Check cached count decreased + count, exists = cleaner.GetCachedFolderCount(folder) + if !exists { + t.Error("cached folder count should exist") + } + if count != 5 { + t.Errorf("expected cached count 5, got %d", count) + } + + cleaner.Stop() +} + +func TestEmptyFolderCleaner_Stop(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + stopCh: make(chan struct{}), + } + + now := time.Now() + + // Queue some cleanups + cleaner.OnDeleteEvent("/buckets/mybucket/folder1", "file1.txt", false, now) + cleaner.OnDeleteEvent("/buckets/mybucket/folder2", "file2.txt", false, now.Add(1*time.Second)) + cleaner.OnDeleteEvent("/buckets/mybucket/folder3", "file3.txt", false, now.Add(2*time.Second)) + + // Verify cleanups are queued + if cleaner.GetPendingCleanupCount() < 1 { + t.Error("expected at least 1 pending cleanup before stop") + } + + // Stop the cleaner + cleaner.Stop() + + // Verify all cleanups are cancelled + if cleaner.GetPendingCleanupCount() != 0 { + t.Errorf("expected 0 pending cleanups after stop, got %d", cleaner.GetPendingCleanupCount()) + } +} + +func TestEmptyFolderCleaner_cacheEviction(t *testing.T) { + lockRing := lock_manager.NewLockRing(5 * time.Second) + lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"}) + + cleaner := &EmptyFolderCleaner{ + lockRing: lockRing, + host: "filer1:8888", + bucketPath: "/buckets", + enabled: true, + folderCounts: make(map[string]*folderState), + cleanupQueue: NewCleanupQueue(1000, 10*time.Minute), + cacheExpiry: 100 * time.Millisecond, // Short expiry for testing + stopCh: make(chan struct{}), + } + + folder1 := "/buckets/mybucket/folder1" + 
+
+func TestEmptyFolderCleaner_cacheEviction(t *testing.T) {
+	lockRing := lock_manager.NewLockRing(5 * time.Second)
+	lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+	cleaner := &EmptyFolderCleaner{
+		lockRing:     lockRing,
+		host:         "filer1:8888",
+		bucketPath:   "/buckets",
+		enabled:      true,
+		folderCounts: make(map[string]*folderState),
+		cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+		cacheExpiry:  100 * time.Millisecond, // Short expiry for testing
+		stopCh:       make(chan struct{}),
+	}
+
+	folder1 := "/buckets/mybucket/folder1"
+	folder2 := "/buckets/mybucket/folder2"
+	folder3 := "/buckets/mybucket/folder3"
+
+	// Add some cache entries with old timestamps
+	oldTime := time.Now().Add(-1 * time.Hour)
+	cleaner.folderCounts[folder1] = &folderState{roughCount: 5, lastCheck: oldTime}
+	cleaner.folderCounts[folder2] = &folderState{roughCount: 3, lastCheck: oldTime}
+	// folder3 has recent activity
+	cleaner.folderCounts[folder3] = &folderState{roughCount: 2, lastCheck: time.Now()}
+
+	// Verify all entries exist
+	if len(cleaner.folderCounts) != 3 {
+		t.Errorf("expected 3 cache entries, got %d", len(cleaner.folderCounts))
+	}
+
+	// Run eviction
+	cleaner.evictStaleCacheEntries()
+
+	// Verify stale entries are evicted
+	if len(cleaner.folderCounts) != 1 {
+		t.Errorf("expected 1 cache entry after eviction, got %d", len(cleaner.folderCounts))
+	}
+
+	// Verify the recent entry still exists
+	if _, exists := cleaner.folderCounts[folder3]; !exists {
+		t.Error("expected folder3 to still exist in cache")
+	}
+
+	// Verify stale entries are removed
+	if _, exists := cleaner.folderCounts[folder1]; exists {
+		t.Error("expected folder1 to be evicted")
+	}
+	if _, exists := cleaner.folderCounts[folder2]; exists {
+		t.Error("expected folder2 to be evicted")
+	}
+
+	cleaner.Stop()
+}
+
+func TestEmptyFolderCleaner_cacheEviction_skipsEntriesInQueue(t *testing.T) {
+	lockRing := lock_manager.NewLockRing(5 * time.Second)
+	lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+	cleaner := &EmptyFolderCleaner{
+		lockRing:     lockRing,
+		host:         "filer1:8888",
+		bucketPath:   "/buckets",
+		enabled:      true,
+		folderCounts: make(map[string]*folderState),
+		cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+		cacheExpiry:  100 * time.Millisecond,
+		stopCh:       make(chan struct{}),
+	}
+
+	folder := "/buckets/mybucket/folder"
+	oldTime := time.Now().Add(-1 * time.Hour)
+
+	// Add a stale cache entry
+	cleaner.folderCounts[folder] = &folderState{roughCount: 0, lastCheck: oldTime}
+	// Also add to cleanup queue
+	cleaner.cleanupQueue.Add(folder, time.Now())
+
+	// Run eviction
+	cleaner.evictStaleCacheEntries()
+
+	// Verify entry is NOT evicted because it's in cleanup queue
+	if _, exists := cleaner.folderCounts[folder]; !exists {
+		t.Error("expected folder to still exist in cache (is in cleanup queue)")
+	}
+
+	cleaner.Stop()
+}
+
+func TestEmptyFolderCleaner_queueFIFOOrder(t *testing.T) {
+	lockRing := lock_manager.NewLockRing(5 * time.Second)
+	lockRing.SetSnapshot([]pb.ServerAddress{"filer1:8888"})
+
+	cleaner := &EmptyFolderCleaner{
+		lockRing:     lockRing,
+		host:         "filer1:8888",
+		bucketPath:   "/buckets",
+		enabled:      true,
+		folderCounts: make(map[string]*folderState),
+		cleanupQueue: NewCleanupQueue(1000, 10*time.Minute),
+		stopCh:       make(chan struct{}),
+	}
+
+	now := time.Now()
+
+	// Add folders in order
+	folders := []string{
+		"/buckets/mybucket/folder1",
+		"/buckets/mybucket/folder2",
+		"/buckets/mybucket/folder3",
+	}
+	for i, folder := range folders {
+		cleaner.OnDeleteEvent(folder, "file.txt", false, now.Add(time.Duration(i)*time.Second))
+	}
+
+	// Verify queue length
+	if cleaner.GetPendingCleanupCount() != 3 {
+		t.Errorf("expected 3 queued folders, got %d", cleaner.GetPendingCleanupCount())
+	}
+
+	// Verify time-sorted order by popping
+	for i, expected := range folders {
+		folder, ok := cleaner.cleanupQueue.Pop()
+		if !ok || folder != expected {
+			t.Errorf("expected folder %s at index %d, got %s", expected, i, folder)
+		}
+	}
+
+	cleaner.Stop()
+}
+
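All of the tests above pin the LockRing snapshot to a single filer, which guarantees that `filer1:8888` owns every folder under test; in a multi-filer cluster only the ring owner of a given folder acts on it. The sketch below illustrates the general idea of ring-based ownership with a plain hash over the snapshot. It is an assumption for illustration only, not the actual `LockRing` algorithm, and `ownerOf` is a hypothetical name:

```go
package main

import (
	"fmt"
	"hash/fnv"
)

// ownerOf sketches hash-based ownership: hash the folder path and pick
// one server from the ring snapshot. With a single-entry snapshot, as in
// the tests above, that server owns every folder.
func ownerOf(folder string, servers []string) string {
	h := fnv.New32a()
	h.Write([]byte(folder))
	return servers[int(h.Sum32())%len(servers)]
}

func main() {
	servers := []string{"filer1:8888"} // single-filer snapshot, as in the tests
	fmt.Println(ownerOf("/buckets/mybucket/testfolder", servers)) // always filer1:8888
}
```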
diff --git a/weed/filer/filer.go b/weed/filer/filer.go
index f9f3d4fb2..382eb644f 100644
--- a/weed/filer/filer.go
+++ b/weed/filer/filer.go
@@ -11,6 +11,7 @@ import (
 	"github.com/seaweedfs/seaweedfs/weed/s3api/s3bucket"
 
 	"github.com/seaweedfs/seaweedfs/weed/cluster/lock_manager"
+	"github.com/seaweedfs/seaweedfs/weed/filer/empty_folder_cleanup"
 
 	"github.com/seaweedfs/seaweedfs/weed/cluster"
 	"github.com/seaweedfs/seaweedfs/weed/pb"
@@ -56,6 +57,7 @@ type Filer struct {
 	MaxFilenameLength  uint32
 	deletionQuit       chan struct{}
 	DeletionRetryQueue *DeletionRetryQueue
+	EmptyFolderCleaner *empty_folder_cleanup.EmptyFolderCleaner
 }
 
 func NewFiler(masters pb.ServerDiscovery, grpcDialOption grpc.DialOption, filerHost pb.ServerAddress, filerGroup string, collection string, replication string, dataCenter string, maxFilenameLength uint32, notifyFn func()) *Filer {
@@ -116,6 +118,9 @@ func (f *Filer) AggregateFromPeers(self pb.ServerAddress, existingNodes []*maste
 	f.Dlm.LockRing.SetSnapshot(snapshot)
 	glog.V(0).Infof("%s aggregate from peers %+v", self, snapshot)
 
+	// Initialize the empty folder cleaner using the same LockRing as Dlm for consistent hashing
+	f.EmptyFolderCleaner = empty_folder_cleanup.NewEmptyFolderCleaner(f, f.Dlm.LockRing, self, f.DirBucketsPath)
+
 	f.MetaAggregator = NewMetaAggregator(f, self, f.GrpcDialOption)
 	f.MasterClient.SetOnPeerUpdateFn(func(update *master_pb.ClusterNodeUpdate, startFrom time.Time) {
 		if update.NodeType != cluster.FilerType {
@@ -506,6 +511,9 @@ func (f *Filer) IsDirectoryEmpty(ctx context.Context, dirPath util.FullPath) (bo
 
 func (f *Filer) Shutdown() {
 	close(f.deletionQuit)
+	if f.EmptyFolderCleaner != nil {
+		f.EmptyFolderCleaner.Stop()
+	}
 	f.LocalMetaLogBuffer.ShutdownLogBuffer()
 	f.Store.Shutdown()
 }
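The wiring above creates the cleaner only after the peer snapshot is known and stops it from `Shutdown` behind a nil check. The cleaner's own loop is not shown in this patch, so the following is only a generic sketch of the stop-channel lifecycle such a background worker typically uses:

```go
package main

import (
	"fmt"
	"time"
)

// worker sketches the assumed stop-channel pattern: construction starts a
// background goroutine, and Stop() closes the channel to end it cleanly.
type worker struct{ stopCh chan struct{} }

func newWorker() *worker {
	w := &worker{stopCh: make(chan struct{})}
	go w.loop()
	return w
}

func (w *worker) loop() {
	ticker := time.NewTicker(10 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-w.stopCh:
			return // Stop() was called; exit cleanly
		case <-ticker.C:
			// periodic work, e.g. draining a cleanup queue
		}
	}
}

func (w *worker) Stop() { close(w.stopCh) }

func main() {
	w := newWorker()
	time.Sleep(30 * time.Millisecond)
	w.Stop()
	fmt.Println("stopped")
}
```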
diff --git a/weed/filer/filer_notify.go b/weed/filer/filer_notify.go
index 845a0678e..45c9b070f 100644
--- a/weed/filer/filer_notify.go
+++ b/weed/filer/filer_notify.go
@@ -66,6 +66,10 @@ func (f *Filer) NotifyUpdateEvent(ctx context.Context, oldEntry, newEntry *Entry
 
 	f.logMetaEvent(ctx, fullpath, eventNotification)
 
+	// Trigger empty folder cleanup for local events
+	// Remote events are handled via MetaAggregator.onMetadataChangeEvent
+	f.triggerLocalEmptyFolderCleanup(oldEntry, newEntry)
+
 }
 
 func (f *Filer) logMetaEvent(ctx context.Context, fullpath string, eventNotification *filer_pb.EventNotification) {
@@ -89,6 +93,41 @@ func (f *Filer) logMetaEvent(ctx context.Context, fullpath string, eventNotifica
 
 }
 
+// triggerLocalEmptyFolderCleanup triggers empty folder cleanup for local events
+// This is needed because onMetadataChangeEvent is only called for remote peer events
+func (f *Filer) triggerLocalEmptyFolderCleanup(oldEntry, newEntry *Entry) {
+	if f.EmptyFolderCleaner == nil || !f.EmptyFolderCleaner.IsEnabled() {
+		return
+	}
+
+	eventTime := time.Now()
+
+	// Handle delete events (oldEntry exists, newEntry is nil)
+	if oldEntry != nil && newEntry == nil {
+		dir, name := oldEntry.FullPath.DirAndName()
+		f.EmptyFolderCleaner.OnDeleteEvent(dir, name, oldEntry.IsDirectory(), eventTime)
+	}
+
+	// Handle create events (oldEntry is nil, newEntry exists)
+	if oldEntry == nil && newEntry != nil {
+		dir, name := newEntry.FullPath.DirAndName()
+		f.EmptyFolderCleaner.OnCreateEvent(dir, name, newEntry.IsDirectory())
+	}
+
+	// Handle rename/move events (both exist but paths differ)
+	if oldEntry != nil && newEntry != nil {
+		oldDir, oldName := oldEntry.FullPath.DirAndName()
+		newDir, newName := newEntry.FullPath.DirAndName()
+
+		if oldDir != newDir || oldName != newName {
+			// Treat old location as delete
+			f.EmptyFolderCleaner.OnDeleteEvent(oldDir, oldName, oldEntry.IsDirectory(), eventTime)
+			// Treat new location as create
+			f.EmptyFolderCleaner.OnCreateEvent(newDir, newName, newEntry.IsDirectory())
+		}
+	}
+}
+
 func (f *Filer) logFlushFunc(logBuffer *log_buffer.LogBuffer, startTime, stopTime time.Time, buf []byte, minOffset, maxOffset int64) {
 
 	if len(buf) == 0 {
diff --git a/weed/filer/filer_on_meta_event.go b/weed/filer/filer_on_meta_event.go
index acbf4aa47..4ee80b3a6 100644
--- a/weed/filer/filer_on_meta_event.go
+++ b/weed/filer/filer_on_meta_event.go
@@ -2,6 +2,7 @@ package filer
 
 import (
 	"bytes"
+	"time"
 
 	"github.com/seaweedfs/seaweedfs/weed/glog"
 	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
@@ -13,6 +14,7 @@ func (f *Filer) onMetadataChangeEvent(event *filer_pb.SubscribeMetadataResponse)
 	f.maybeReloadFilerConfiguration(event)
 	f.maybeReloadRemoteStorageConfigurationAndMapping(event)
 	f.onBucketEvents(event)
+	f.onEmptyFolderCleanupEvents(event)
 }
 
 func (f *Filer) onBucketEvents(event *filer_pb.SubscribeMetadataResponse) {
@@ -32,6 +34,43 @@ func (f *Filer) onBucketEvents(event *filer_pb.SubscribeMetadataResponse) {
 	}
 }
 
+// onEmptyFolderCleanupEvents handles create/delete events for empty folder cleanup
+func (f *Filer) onEmptyFolderCleanupEvents(event *filer_pb.SubscribeMetadataResponse) {
+	if f.EmptyFolderCleaner == nil || !f.EmptyFolderCleaner.IsEnabled() {
+		return
+	}
+
+	message := event.EventNotification
+	directory := event.Directory
+	eventTime := time.Unix(0, event.TsNs)
+
+	// Handle delete events - trigger folder cleanup check
+	if filer_pb.IsDelete(event) && message.OldEntry != nil {
+		f.EmptyFolderCleaner.OnDeleteEvent(directory, message.OldEntry.Name, message.OldEntry.IsDirectory, eventTime)
+	}
+
+	// Handle create events - cancel pending cleanup for the folder
+	if filer_pb.IsCreate(event) && message.NewEntry != nil {
+		f.EmptyFolderCleaner.OnCreateEvent(directory, message.NewEntry.Name, message.NewEntry.IsDirectory)
+	}
+
+	// Handle rename/move events
+	if filer_pb.IsRename(event) {
+		// Treat the old location as a delete
+		if message.OldEntry != nil {
+			f.EmptyFolderCleaner.OnDeleteEvent(directory, message.OldEntry.Name, message.OldEntry.IsDirectory, eventTime)
+		}
+		// Treat the new location as a create
+		if message.NewEntry != nil {
+			newDir := message.NewParentPath
+			if newDir == "" {
+				newDir = directory
+			}
+			f.EmptyFolderCleaner.OnCreateEvent(newDir, message.NewEntry.Name, message.NewEntry.IsDirectory)
+		}
+	}
+}
+
 func (f *Filer) maybeReloadFilerConfiguration(event *filer_pb.SubscribeMetadataResponse) {
 	if DirectoryEtcSeaweedFS != event.Directory {
 		if DirectoryEtcSeaweedFS != event.EventNotification.NewParentPath {
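Both the local hook in filer_notify.go and the remote-subscription hook above reduce every metadata event to the same three cases, with a rename handled as a delete at the old path plus a create at the new one. A compact sketch of that classification, mirroring the nil checks in `triggerLocalEmptyFolderCleanup` (`classify` is an illustrative name, not part of the patch):

```go
package main

import "fmt"

// classify mirrors the nil-checks used by triggerLocalEmptyFolderCleanup:
// the presence or absence of the old and new entries determines the event
// kind. Simplified; the real code also compares paths to detect renames.
func classify(hasOld, hasNew bool) string {
	switch {
	case hasOld && !hasNew:
		return "delete"
	case !hasOld && hasNew:
		return "create"
	case hasOld && hasNew:
		return "rename/update" // handled as delete(old) + create(new) when paths differ
	default:
		return "noop"
	}
}

func main() {
	fmt.Println(classify(true, false)) // delete
	fmt.Println(classify(false, true)) // create
	fmt.Println(classify(true, true))  // rename/update
}
```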
diff --git a/weed/filer/filer_search.go b/weed/filer/filer_search.go
index 294fc0e7f..e6366e82f 100644
--- a/weed/filer/filer_search.go
+++ b/weed/filer/filer_search.go
@@ -41,6 +41,19 @@ func (f *Filer) ListDirectoryEntries(ctx context.Context, p util.FullPath, start
 	return entries, hasMore, err
 }
 
+// CountDirectoryEntries counts entries in a directory up to limit
+func (f *Filer) CountDirectoryEntries(ctx context.Context, p util.FullPath, limit int) (count int, err error) {
+	entries, hasMore, err := f.ListDirectoryEntries(ctx, p, "", false, int64(limit), "", "", "")
+	if err != nil {
+		return 0, err
+	}
+	count = len(entries)
+	if hasMore {
+		count = limit // At least this many
+	}
+	return count, nil
+}
+
 // For now, prefix and namePattern are mutually exclusive
 func (f *Filer) StreamListDirectoryEntries(ctx context.Context, p util.FullPath, startFileName string, inclusive bool,
 	limit int64, prefix string, namePattern string, namePatternExclude string, eachEntryFunc ListEachEntryFunc) (lastFileName string, err error) {
 
 	if strings.HasSuffix(string(p), "/") && len(p) > 1 {
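Because listing stops after `limit` entries, `CountDirectoryEntries` bounds the cost of an emptiness probe no matter how large the folder is: with `limit = 1`, deciding whether a folder is empty reads at most one entry. A hypothetical caller (the helper name `isFolderEmpty` is not part of this patch):

```go
package filer

import (
	"context"

	"github.com/seaweedfs/seaweedfs/weed/util"
)

// isFolderEmpty probes a folder with bounded work by capping the listing
// at a single entry via the CountDirectoryEntries helper added above.
func isFolderEmpty(ctx context.Context, f *Filer, folder string) (bool, error) {
	count, err := f.CountDirectoryEntries(ctx, util.FullPath(folder), 1)
	if err != nil {
		return false, err
	}
	return count == 0, nil
}
```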
diff --git a/weed/s3api/s3api_object_handlers_delete.go b/weed/s3api/s3api_object_handlers_delete.go
index f779a6edc..6e373bb4e 100644
--- a/weed/s3api/s3api_object_handlers_delete.go
+++ b/weed/s3api/s3api_object_handlers_delete.go
@@ -1,12 +1,10 @@
 package s3api
 
 import (
-	"context"
 	"encoding/xml"
 	"fmt"
 	"io"
 	"net/http"
-	"slices"
 	"strings"
 
 	"github.com/seaweedfs/seaweedfs/weed/filer"
@@ -127,22 +125,9 @@ func (s3a *S3ApiServer) DeleteObjectHandler(w http.ResponseWriter, r *http.Reque
 	dir, name := target.DirAndName()
 
 	err := s3a.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
-		// Use operation context that won't be cancelled if request terminates
-		// This ensures deletion completes atomically to avoid inconsistent state
-		opCtx := context.WithoutCancel(r.Context())
-
-		if err := doDeleteEntry(client, dir, name, true, false); err != nil {
-			return err
-		}
-
-		// Cleanup empty directories
-		if !s3a.option.AllowEmptyFolder && strings.LastIndex(object, "/") > 0 {
-			bucketPath := fmt.Sprintf("%s/%s", s3a.option.BucketsPath, bucket)
-			// Recursively delete empty parent directories, stop at bucket path
-			filer_pb.DoDeleteEmptyParentDirectories(opCtx, client, util.FullPath(dir), util.FullPath(bucketPath), nil)
-		}
-
-		return nil
+		return doDeleteEntry(client, dir, name, true, false)
+		// Note: Empty folder cleanup is now handled asynchronously by EmptyFolderCleaner
+		// which listens to metadata events and uses consistent hashing for coordination
 	})
 	if err != nil {
 		s3err.WriteErrorResponse(w, r, s3err.ErrInternalError)
@@ -222,8 +207,6 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h
 	var deleteErrors []DeleteError
 	var auditLog *s3err.AccessLog
 
-	directoriesWithDeletion := make(map[string]bool)
-
 	if s3err.Logger != nil {
 		auditLog = s3err.GetAccessLog(r, http.StatusNoContent, s3err.ErrNone)
 	}
@@ -245,10 +228,6 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h
 	versioningConfigured := (versioningState != "")
 
 	s3a.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
-		// Use operation context that won't be cancelled if request terminates
-		// This ensures batch deletion completes atomically to avoid inconsistent state
-		opCtx := context.WithoutCancel(r.Context())
-
 		// delete file entries
 		for _, object := range deleteObjects.Objects {
 			if object.Key == "" {
@@ -357,10 +336,6 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h
 
 			err := doDeleteEntry(client, parentDirectoryPath, entryName, isDeleteData, isRecursive)
 			if err == nil {
-				// Track directory for empty directory cleanup
-				if !s3a.option.AllowEmptyFolder {
-					directoriesWithDeletion[parentDirectoryPath] = true
-				}
 				deletedObjects = append(deletedObjects, object)
 			} else if strings.Contains(err.Error(), filer.MsgFailDelNonEmptyFolder) {
 				deletedObjects = append(deletedObjects, object)
@@ -380,30 +355,8 @@ func (s3a *S3ApiServer) DeleteMultipleObjectsHandler(w http.ResponseWriter, r *h
 			}
 		}
 
-		// Cleanup empty directories - optimize by processing deepest first
-		if !s3a.option.AllowEmptyFolder && len(directoriesWithDeletion) > 0 {
-			bucketPath := fmt.Sprintf("%s/%s", s3a.option.BucketsPath, bucket)
-
-			// Collect and sort directories by depth (deepest first) to avoid redundant checks
-			var allDirs []string
-			for dirPath := range directoriesWithDeletion {
-				allDirs = append(allDirs, dirPath)
-			}
-			// Sort by depth (deeper directories first)
-			slices.SortFunc(allDirs, func(a, b string) int {
-				return strings.Count(b, "/") - strings.Count(a, "/")
-			})
-
-			// Track already-checked directories to avoid redundant work
-			checked := make(map[string]bool)
-			for _, dirPath := range allDirs {
-				if !checked[dirPath] {
-					// Recursively delete empty parent directories, stop at bucket path
-					// Mark this directory and all its parents as checked during recursion
-					filer_pb.DoDeleteEmptyParentDirectories(opCtx, client, util.FullPath(dirPath), util.FullPath(bucketPath), checked)
-				}
-			}
-		}
+		// Note: Empty folder cleanup is now handled asynchronously by EmptyFolderCleaner
+		// which listens to metadata events and uses consistent hashing for coordination
 
 		return nil
 	})
-- 
cgit v1.2.3


From 8d110b29ddfd9b9cdb504a4380106b2b287155ca Mon Sep 17 00:00:00 2001
From: chrislu
Date: Thu, 4 Dec 2025 10:40:01 -0800
Subject: fmt

---
 .github/workflows/container_release_unified.yml      | 1 +
 .github/workflows/sftp-tests.yml                      | 1 +
 test/sftp/README.md                                   | 1 +
 test/sftp/testdata/userstore.json                     | 1 +
 weed/filer/empty_folder_cleanup/cleanup_queue.go      | 1 +
 weed/filer/empty_folder_cleanup/cleanup_queue_test.go | 1 +
 6 files changed, 6 insertions(+)

diff --git a/.github/workflows/container_release_unified.yml b/.github/workflows/container_release_unified.yml
index eb8df9834..c7aa648fd 100644
--- a/.github/workflows/container_release_unified.yml
+++ b/.github/workflows/container_release_unified.yml
@@ -223,3 +223,4 @@ jobs:
 
           echo "✓ Successfully copied ${{ matrix.variant }} to Docker Hub"
 
+
diff --git a/.github/workflows/sftp-tests.yml b/.github/workflows/sftp-tests.yml
index d2ec47eb4..80a1b9929 100644
--- a/.github/workflows/sftp-tests.yml
+++ b/.github/workflows/sftp-tests.yml
@@ -90,3 +90,4 @@ jobs:
           echo "| testuser | /sftp/testuser | Home directory only |" >> $GITHUB_STEP_SUMMARY
           echo "| readonly | /public | Read-only |" >> $GITHUB_STEP_SUMMARY
 
+
diff --git a/test/sftp/README.md b/test/sftp/README.md
index e2908f166..17b5e67c7 100644
--- a/test/sftp/README.md
+++ b/test/sftp/README.md
@@ -89,3 +89,4 @@ To debug test failures:
 config.EnableDebug = true
 ```
 
+
diff --git a/test/sftp/testdata/userstore.json b/test/sftp/testdata/userstore.json
index 540a9486d..66d78dd1d 100644
--- a/test/sftp/testdata/userstore.json
+++ b/test/sftp/testdata/userstore.json
@@ -34,3 +34,4 @@
   }
 ]
 
+
diff --git a/weed/filer/empty_folder_cleanup/cleanup_queue.go b/weed/filer/empty_folder_cleanup/cleanup_queue.go
index 66889e930..f92af389d 100644
--- a/weed/filer/empty_folder_cleanup/cleanup_queue.go
+++ b/weed/filer/empty_folder_cleanup/cleanup_queue.go
@@ -204,3 +204,4 @@ func (q *CleanupQueue) OldestAge() time.Duration {
 	return time.Since(item.queueTime)
 }
 
+
diff --git a/weed/filer/empty_folder_cleanup/cleanup_queue_test.go b/weed/filer/empty_folder_cleanup/cleanup_queue_test.go
index eda1c3633..2effa3138 100644
--- a/weed/filer/empty_folder_cleanup/cleanup_queue_test.go
+++ b/weed/filer/empty_folder_cleanup/cleanup_queue_test.go
@@ -368,3 +368,4 @@ func TestCleanupQueue_Concurrent(t *testing.T) {
 	_ = q.Len()
 }
 
+
-- 
cgit v1.2.3
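For reference, the synchronous batch-cleanup path deleted above sorted candidate directories deepest-first so that empty children were pruned before their parents were examined. The comparator removed from DeleteMultipleObjectsHandler behaves like this runnable example:

```go
package main

import (
	"fmt"
	"slices"
	"strings"
)

func main() {
	// Same comparator as the removed code: more path separators means a
	// deeper directory, and deeper directories sort first.
	allDirs := []string{
		"/buckets/b/a",
		"/buckets/b/a/x/y",
		"/buckets/b/a/x",
	}
	slices.SortFunc(allDirs, func(a, b string) int {
		return strings.Count(b, "/") - strings.Count(a, "/")
	})
	fmt.Println(allDirs) // [/buckets/b/a/x/y /buckets/b/a/x /buckets/b/a]
}
```

The asynchronous EmptyFolderCleaner makes this per-request traversal unnecessary: each delete event queues at most one folder, and parents are revisited as their own delete events arrive.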