aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorwyang <wings.wyang@gmail.com>2024-11-04 15:20:48 +0800
committerGitHub <noreply@github.com>2024-11-03 23:20:48 -0800
commita7973ed7d1e9b507d2f3982a7b433a20f5c19cc1 (patch)
treeae54c3fbf35fcdaa43d43530190dc2a54391e956
parent0f2c3648dc274c045d7049e744bb04849b47cd95 (diff)
downloadseaweedfs-a7973ed7d1e9b507d2f3982a7b433a20f5c19cc1.tar.xz
seaweedfs-a7973ed7d1e9b507d2f3982a7b433a20f5c19cc1.zip
fix deadlock hang when broadcast to clients (#6184)
fix deadlock when broadcast to clients when master thransfer leader, the old master will disconnect with all filers and volumeServers, if the cluster is a big , the broadcast messages may be more big than the max of the channel len 100, then if the KeepConnect was not listen on the channel in disconnect, it will deadlock. and the whole cluster will not serve!
-rw-r--r--weed/server/master_grpc_server.go17
-rw-r--r--weed/stats/metrics.go9
2 files changed, 21 insertions, 5 deletions
diff --git a/weed/server/master_grpc_server.go b/weed/server/master_grpc_server.go
index 256a4be52..dcf279e1d 100644
--- a/weed/server/master_grpc_server.go
+++ b/weed/server/master_grpc_server.go
@@ -4,12 +4,13 @@ import (
"context"
"errors"
"fmt"
- "github.com/google/uuid"
- "github.com/seaweedfs/seaweedfs/weed/cluster"
"net"
"sort"
"time"
+ "github.com/google/uuid"
+ "github.com/seaweedfs/seaweedfs/weed/cluster"
+
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/stats"
"github.com/seaweedfs/seaweedfs/weed/storage/backend"
@@ -89,7 +90,7 @@ func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServ
glog.V(0).Infof("unregister disconnected volume server %s:%d", dn.Ip, dn.Port)
ms.UnRegisterUuids(dn.Ip, dn.Port)
- if len(message.DeletedVids) > 0 || len(message.DeletedEcVids) > 0 {
+ if ms.Topo.IsLeader() && (len(message.DeletedVids) > 0 || len(message.DeletedEcVids) > 0) {
ms.broadcastToClients(&master_pb.KeepConnectedResponse{VolumeLocation: message})
}
}
@@ -338,8 +339,14 @@ func (ms *MasterServer) KeepConnected(stream master_pb.Seaweed_KeepConnectedServ
func (ms *MasterServer) broadcastToClients(message *master_pb.KeepConnectedResponse) {
ms.clientChansLock.RLock()
- for _, ch := range ms.clientChans {
- ch <- message
+ for client, ch := range ms.clientChans {
+ select {
+ case ch <- message:
+ glog.V(4).Infof("send message to %s", client)
+ default:
+ stats.MasterBroadcastToFullErrorCounter.Inc()
+ glog.Errorf("broadcastToClients %s message full", client)
+ }
}
ms.clientChansLock.RUnlock()
}
diff --git a/weed/stats/metrics.go b/weed/stats/metrics.go
index 93f80c1f4..9f84a3e70 100644
--- a/weed/stats/metrics.go
+++ b/weed/stats/metrics.go
@@ -94,6 +94,14 @@ var (
Help: "Counter of master pick for write error",
})
+ MasterBroadcastToFullErrorCounter = prometheus.NewCounter(
+ prometheus.CounterOpts{
+ Namespace: Namespace,
+ Subsystem: "master",
+ Name: "broadcast_to_full",
+ Help: "Counter of master broadcast send to full message channel err",
+ })
+
MasterLeaderChangeCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
@@ -314,6 +322,7 @@ func init() {
Gather.MustRegister(MasterReplicaPlacementMismatch)
Gather.MustRegister(MasterVolumeLayoutWritable)
Gather.MustRegister(MasterVolumeLayoutCrowded)
+ Gather.MustRegister(MasterBroadcastToFullErrorCounter)
Gather.MustRegister(FilerRequestCounter)
Gather.MustRegister(FilerHandlerCounter)