From d80d8be01208ffe82861b8aa50d4baf599fdfa02 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Sat, 13 Dec 2025 12:10:15 -0800 Subject: fix(s3): start KeepConnectedToMaster for filer discovery with filerGroup (#7732) Fixes #7721 When S3 server is configured with a filerGroup, it creates a MasterClient to enable dynamic filer discovery. However, the KeepConnectedToMaster() background goroutine was never started, causing GetMaster() to block indefinitely in WaitUntilConnected(). This resulted in the log message: WaitUntilConnected still waiting for master connection (attempt N)... being logged repeatedly every ~20 seconds. The fix adds the missing 'go masterClient.KeepConnectedToMaster(ctx)' call to properly establish the connection to master servers. Also adds unit tests to verify WaitUntilConnected respects context cancellation. --- weed/s3api/s3api_server.go | 2 + weed/wdclient/masterclient_test.go | 104 +++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 weed/wdclient/masterclient_test.go (limited to 'weed') diff --git a/weed/s3api/s3api_server.go b/weed/s3api/s3api_server.go index 94321814e..4a8368409 100644 --- a/weed/s3api/s3api_server.go +++ b/weed/s3api/s3api_server.go @@ -115,6 +115,8 @@ func NewS3ApiServerWithStore(router *mux.Router, option *S3ApiServerOption, expl masterMap[fmt.Sprintf("master%d", i)] = addr } masterClient := wdclient.NewMasterClient(option.GrpcDialOption, option.FilerGroup, cluster.S3Type, "", "", "", *pb.NewServiceDiscoveryFromMap(masterMap)) + // Start the master client connection loop - required for GetMaster() to work + go masterClient.KeepConnectedToMaster(context.Background()) filerClient = wdclient.NewFilerClient(option.Filers, option.GrpcDialOption, option.DataCenter, &wdclient.FilerClientOption{ MasterClient: masterClient, diff --git a/weed/wdclient/masterclient_test.go b/weed/wdclient/masterclient_test.go new file mode 100644 index 000000000..d88de40c1 --- /dev/null +++ b/weed/wdclient/masterclient_test.go @@ -0,0 +1,104 @@ +package wdclient + +import ( + "context" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/weed/pb" + "google.golang.org/grpc" +) + +// TestWaitUntilConnectedWithoutKeepConnected verifies that WaitUntilConnected +// respects context cancellation when KeepConnectedToMaster is not running. +// This tests the fix for https://github.com/seaweedfs/seaweedfs/issues/7721 +func TestWaitUntilConnectedWithoutKeepConnected(t *testing.T) { + mc := NewMasterClient(grpc.EmptyDialOption{}, "test-group", "test-client", "", "", "", pb.ServerDiscovery{}) + + // Without KeepConnectedToMaster running, WaitUntilConnected should + // respect context cancellation and not block forever + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer cancel() + + start := time.Now() + mc.WaitUntilConnected(ctx) + elapsed := time.Since(start) + + // Should have returned due to context timeout, not blocked forever + if elapsed > 200*time.Millisecond { + t.Errorf("WaitUntilConnected blocked for %v, expected to return on context timeout", elapsed) + } + + // GetMaster should return empty since we never connected + master := mc.getCurrentMaster() + if master != "" { + t.Errorf("Expected empty master, got %s", master) + } +} + +// TestWaitUntilConnectedReturnsImmediatelyWhenConnected verifies that +// WaitUntilConnected returns immediately when a master is already set. +func TestWaitUntilConnectedReturnsImmediatelyWhenConnected(t *testing.T) { + mc := NewMasterClient(grpc.EmptyDialOption{}, "test-group", "test-client", "", "", "", pb.ServerDiscovery{}) + + // Simulate that KeepConnectedToMaster has already established a connection + mc.setCurrentMaster("localhost:9333") + + ctx := context.Background() + start := time.Now() + mc.WaitUntilConnected(ctx) + elapsed := time.Since(start) + + // Should return almost immediately (< 10ms) + if elapsed > 10*time.Millisecond { + t.Errorf("WaitUntilConnected took %v when master was already set, expected immediate return", elapsed) + } + + // Verify master is returned + master := mc.getCurrentMaster() + if master != "localhost:9333" { + t.Errorf("Expected master localhost:9333, got %s", master) + } +} + +// TestGetMasterRespectsContextCancellation verifies that GetMaster +// respects context cancellation and doesn't block forever. +func TestGetMasterRespectsContextCancellation(t *testing.T) { + mc := NewMasterClient(grpc.EmptyDialOption{}, "test-group", "test-client", "", "", "", pb.ServerDiscovery{}) + + // GetMaster calls WaitUntilConnected internally + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer cancel() + + start := time.Now() + master := mc.GetMaster(ctx) + elapsed := time.Since(start) + + // Should return on context timeout + if elapsed > 200*time.Millisecond { + t.Errorf("GetMaster blocked for %v, expected to return on context timeout", elapsed) + } + + // Master should be empty since we never connected + if master != "" { + t.Errorf("Expected empty master when not connected, got %s", master) + } +} + +// TestMasterClientFilerGroupLogging verifies the FilerGroup is properly set +// and would be logged correctly (regression test for issue #7721 log message format) +func TestMasterClientFilerGroupLogging(t *testing.T) { + filerGroup := "filer_1" + clientType := "s3" + + mc := NewMasterClient(grpc.EmptyDialOption{}, filerGroup, clientType, "", "", "", pb.ServerDiscovery{}) + + if mc.FilerGroup != filerGroup { + t.Errorf("Expected FilerGroup %s, got %s", filerGroup, mc.FilerGroup) + } + + if mc.clientType != clientType { + t.Errorf("Expected clientType %s, got %s", clientType, mc.clientType) + } +} + -- cgit v1.2.3