diff options
| author | wyang <wings.wyang@gmail.com> | 2024-11-04 15:20:48 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-11-03 23:20:48 -0800 |
| commit | a7973ed7d1e9b507d2f3982a7b433a20f5c19cc1 (patch) | |
| tree | ae54c3fbf35fcdaa43d43530190dc2a54391e956 | |
| parent | 0f2c3648dc274c045d7049e744bb04849b47cd95 (diff) | |
| download | seaweedfs-a7973ed7d1e9b507d2f3982a7b433a20f5c19cc1.tar.xz seaweedfs-a7973ed7d1e9b507d2f3982a7b433a20f5c19cc1.zip | |
fix deadlock hang when broadcasting to clients (#6184)
fix deadlock when broadcasting to clients
when the master transfers leadership, the old master disconnects from all
filers and volume servers. If the cluster is big, the number of broadcast
messages may exceed the channel's maximum length of 100; then, if
KeepConnected is not listening on the channel during the disconnect, the
broadcast will deadlock, and the whole cluster will stop serving!
| -rw-r--r-- | weed/server/master_grpc_server.go | 17 | ||||
| -rw-r--r-- | weed/stats/metrics.go | 9 |
2 files changed, 21 insertions, 5 deletions
diff --git a/weed/server/master_grpc_server.go b/weed/server/master_grpc_server.go index 256a4be52..dcf279e1d 100644 --- a/weed/server/master_grpc_server.go +++ b/weed/server/master_grpc_server.go @@ -4,12 +4,13 @@ import ( "context" "errors" "fmt" - "github.com/google/uuid" - "github.com/seaweedfs/seaweedfs/weed/cluster" "net" "sort" "time" + "github.com/google/uuid" + "github.com/seaweedfs/seaweedfs/weed/cluster" + "github.com/seaweedfs/seaweedfs/weed/pb" "github.com/seaweedfs/seaweedfs/weed/stats" "github.com/seaweedfs/seaweedfs/weed/storage/backend" @@ -89,7 +90,7 @@ func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServ glog.V(0).Infof("unregister disconnected volume server %s:%d", dn.Ip, dn.Port) ms.UnRegisterUuids(dn.Ip, dn.Port) - if len(message.DeletedVids) > 0 || len(message.DeletedEcVids) > 0 { + if ms.Topo.IsLeader() && (len(message.DeletedVids) > 0 || len(message.DeletedEcVids) > 0) { ms.broadcastToClients(&master_pb.KeepConnectedResponse{VolumeLocation: message}) } } @@ -338,8 +339,14 @@ func (ms *MasterServer) KeepConnected(stream master_pb.Seaweed_KeepConnectedServ func (ms *MasterServer) broadcastToClients(message *master_pb.KeepConnectedResponse) { ms.clientChansLock.RLock() - for _, ch := range ms.clientChans { - ch <- message + for client, ch := range ms.clientChans { + select { + case ch <- message: + glog.V(4).Infof("send message to %s", client) + default: + stats.MasterBroadcastToFullErrorCounter.Inc() + glog.Errorf("broadcastToClients %s message full", client) + } } ms.clientChansLock.RUnlock() } diff --git a/weed/stats/metrics.go b/weed/stats/metrics.go index 93f80c1f4..9f84a3e70 100644 --- a/weed/stats/metrics.go +++ b/weed/stats/metrics.go @@ -94,6 +94,14 @@ var ( Help: "Counter of master pick for write error", }) + MasterBroadcastToFullErrorCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "master", + Name: "broadcast_to_full", + Help: "Counter of master broadcast 
send to full message channel err", + }) + MasterLeaderChangeCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: Namespace, @@ -314,6 +322,7 @@ func init() { Gather.MustRegister(MasterReplicaPlacementMismatch) Gather.MustRegister(MasterVolumeLayoutWritable) Gather.MustRegister(MasterVolumeLayoutCrowded) + Gather.MustRegister(MasterBroadcastToFullErrorCounter) Gather.MustRegister(FilerRequestCounter) Gather.MustRegister(FilerHandlerCounter) |
