diff options
Diffstat (limited to 'weed/shell/command_volume_balance.go')
| -rw-r--r-- | weed/shell/command_volume_balance.go | 238 |
1 files changed, 162 insertions, 76 deletions
diff --git a/weed/shell/command_volume_balance.go b/weed/shell/command_volume_balance.go index 69e3c7fd9..53222ca29 100644 --- a/weed/shell/command_volume_balance.go +++ b/weed/shell/command_volume_balance.go @@ -4,6 +4,7 @@ import ( "context" "flag" "fmt" + "github.com/chrislusf/seaweedfs/weed/storage/super_block" "io" "os" "sort" @@ -39,14 +40,15 @@ func (c *commandVolumeBalance) Help() string { } func balanceWritableVolumes(){ - idealWritableVolumes = totalWritableVolumes / numVolumeServers + idealWritableVolumeRatio = totalWritableVolumes / totalNumberOfMaxVolumes for hasMovedOneVolume { - sort all volume servers ordered by the number of local writable volumes - pick the volume server A with the lowest number of writable volumes x - pick the volume server B with the highest number of writable volumes y - if y > idealWritableVolumes and x +1 <= idealWritableVolumes { - if B has a writable volume id v that A does not have { - move writable volume v from A to B + sort all volume servers ordered by the localWritableVolumeRatio = localWritableVolumes to localVolumeMax + pick the volume server B with the highest localWritableVolumeRatio y + for any the volume server A with the number of writable volumes x + 1 <= idealWritableVolumeRatio * localVolumeMax { + if y > localWritableVolumeRatio { + if B has a writable volume id v that A does not have, and satisfy v replication requirements { + move writable volume v from A to B + } } } } @@ -81,38 +83,33 @@ func (c *commandVolumeBalance) Do(args []string, commandEnv *CommandEnv, writer return err } - typeToNodes := collectVolumeServersByType(resp.TopologyInfo, *dc) + volumeServers := collectVolumeServersByDc(resp.TopologyInfo, *dc) + volumeReplicas, _ := collectVolumeReplicaLocations(resp) - for maxVolumeCount, volumeServers := range typeToNodes { - if len(volumeServers) < 2 { - fmt.Printf("only 1 node is configured max %d volumes, skipping balancing\n", maxVolumeCount) - continue + if *collection == "EACH_COLLECTION" { + collections, err := ListCollectionNames(commandEnv, true, false) + if err != nil { + return err } - if *collection == "EACH_COLLECTION" { - collections, err := ListCollectionNames(commandEnv, true, false) - if err != nil { - return err - } - for _, c := range collections { - if err = balanceVolumeServers(commandEnv, volumeServers, resp.VolumeSizeLimitMb*1024*1024, c, *applyBalancing); err != nil { - return err - } - } - } else if *collection == "ALL_COLLECTIONS" { - if err = balanceVolumeServers(commandEnv, volumeServers, resp.VolumeSizeLimitMb*1024*1024, "ALL_COLLECTIONS", *applyBalancing); err != nil { - return err - } - } else { - if err = balanceVolumeServers(commandEnv, volumeServers, resp.VolumeSizeLimitMb*1024*1024, *collection, *applyBalancing); err != nil { + for _, c := range collections { + if err = balanceVolumeServers(commandEnv, volumeReplicas, volumeServers, resp.VolumeSizeLimitMb*1024*1024, c, *applyBalancing); err != nil { return err } } - + } else if *collection == "ALL_COLLECTIONS" { + if err = balanceVolumeServers(commandEnv, volumeReplicas, volumeServers, resp.VolumeSizeLimitMb*1024*1024, "ALL_COLLECTIONS", *applyBalancing); err != nil { + return err + } + } else { + if err = balanceVolumeServers(commandEnv, volumeReplicas, volumeServers, resp.VolumeSizeLimitMb*1024*1024, *collection, *applyBalancing); err != nil { + return err + } } + return nil } -func balanceVolumeServers(commandEnv *CommandEnv, nodes []*Node, volumeSizeLimit uint64, collection string, applyBalancing bool) error { +func balanceVolumeServers(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, volumeSizeLimit uint64, collection string, applyBalancing bool) error { // balance writable volumes for _, n := range nodes { @@ -125,7 +122,7 @@ func balanceVolumeServers(commandEnv *CommandEnv, nodes []*Node, volumeSizeLimit return !v.ReadOnly && v.Size < volumeSizeLimit }) } - if err := balanceSelectedVolume(commandEnv, nodes, sortWritableVolumes, applyBalancing); err != nil { + if err := balanceSelectedVolume(commandEnv, volumeReplicas, nodes, sortWritableVolumes, applyBalancing); err != nil { return err } @@ -140,22 +137,21 @@ func balanceVolumeServers(commandEnv *CommandEnv, nodes []*Node, volumeSizeLimit return v.ReadOnly || v.Size >= volumeSizeLimit }) } - if err := balanceSelectedVolume(commandEnv, nodes, sortReadOnlyVolumes, applyBalancing); err != nil { + if err := balanceSelectedVolume(commandEnv, volumeReplicas, nodes, sortReadOnlyVolumes, applyBalancing); err != nil { return err } return nil } -func collectVolumeServersByType(t *master_pb.TopologyInfo, selectedDataCenter string) (typeToNodes map[uint64][]*Node) { - typeToNodes = make(map[uint64][]*Node) +func collectVolumeServersByDc(t *master_pb.TopologyInfo, selectedDataCenter string) (nodes []*Node) { for _, dc := range t.DataCenterInfos { if selectedDataCenter != "" && dc.Id != selectedDataCenter { continue } for _, r := range dc.RackInfos { for _, dn := range r.DataNodeInfos { - typeToNodes[dn.MaxVolumeCount] = append(typeToNodes[dn.MaxVolumeCount], &Node{ + nodes = append(nodes, &Node{ info: dn, dc: dc.Id, rack: r.Id, @@ -173,6 +169,23 @@ type Node struct { rack string } +func (n *Node) localVolumeRatio() float64 { + return divide(len(n.selectedVolumes), int(n.info.MaxVolumeCount)) +} + +func (n *Node) localVolumeNextRatio() float64 { + return divide(len(n.selectedVolumes)+1, int(n.info.MaxVolumeCount)) +} + +func (n *Node) selectVolumes(fn func(v *master_pb.VolumeInformationMessage) bool) { + n.selectedVolumes = make(map[uint32]*master_pb.VolumeInformationMessage) + for _, v := range n.info.VolumeInfos { + if fn(v) { + n.selectedVolumes[v.Id] = v + } + } +} + func sortWritableVolumes(volumes []*master_pb.VolumeInformationMessage) { sort.Slice(volumes, func(i, j int) bool { return volumes[i].Size < volumes[j].Size @@ -185,73 +198,146 @@ func sortReadOnlyVolumes(volumes []*master_pb.VolumeInformationMessage) { }) } -func balanceSelectedVolume(commandEnv *CommandEnv, nodes []*Node, sortCandidatesFn func(volumes []*master_pb.VolumeInformationMessage), applyBalancing bool) error { - selectedVolumeCount := 0 +func balanceSelectedVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, sortCandidatesFn func(volumes []*master_pb.VolumeInformationMessage), applyBalancing bool) (err error) { + selectedVolumeCount, volumeMaxCount := 0, 0 for _, dn := range nodes { selectedVolumeCount += len(dn.selectedVolumes) + volumeMaxCount += int(dn.info.MaxVolumeCount) } - idealSelectedVolumes := ceilDivide(selectedVolumeCount, len(nodes)) + idealVolumeRatio := divide(selectedVolumeCount, volumeMaxCount) - hasMove := true + hasMoved := true - for hasMove { - hasMove = false + for hasMoved { + hasMoved = false sort.Slice(nodes, func(i, j int) bool { - // TODO sort by free volume slots??? - return len(nodes[i].selectedVolumes) < len(nodes[j].selectedVolumes) + return nodes[i].localVolumeRatio() < nodes[j].localVolumeRatio() }) - emptyNode, fullNode := nodes[0], nodes[len(nodes)-1] - if len(fullNode.selectedVolumes) > idealSelectedVolumes && len(emptyNode.selectedVolumes)+1 <= idealSelectedVolumes { - // sort the volumes to move - var candidateVolumes []*master_pb.VolumeInformationMessage - for _, v := range fullNode.selectedVolumes { - candidateVolumes = append(candidateVolumes, v) + fullNode := nodes[len(nodes)-1] + var candidateVolumes []*master_pb.VolumeInformationMessage + for _, v := range fullNode.selectedVolumes { + candidateVolumes = append(candidateVolumes, v) + } + sortCandidatesFn(candidateVolumes) + + for i := 0; i < len(nodes)-1; i++ { + emptyNode := nodes[i] + if !(fullNode.localVolumeRatio() > idealVolumeRatio && emptyNode.localVolumeNextRatio() <= idealVolumeRatio) { + // no more volume servers with empty slots + break } - sortCandidatesFn(candidateVolumes) - - for _, v := range candidateVolumes { - if v.ReplicaPlacement > 0 { - if fullNode.dc != emptyNode.dc && fullNode.rack != emptyNode.rack { - // TODO this logic is too simple, but should work most of the time - // Need a correct algorithm to handle all different cases - continue - } - } - if _, found := emptyNode.selectedVolumes[v.Id]; !found { - if err := moveVolume(commandEnv, v, fullNode, emptyNode, applyBalancing); err == nil { - delete(fullNode.selectedVolumes, v.Id) - emptyNode.selectedVolumes[v.Id] = v - hasMove = true - break - } else { - return err - } - } + hasMoved, err = attemptToMoveOneVolume(commandEnv, volumeReplicas, fullNode, candidateVolumes, emptyNode, applyBalancing) + if err != nil { + return + } + if hasMoved { + // moved one volume + break } } } return nil } -func moveVolume(commandEnv *CommandEnv, v *master_pb.VolumeInformationMessage, fullNode *Node, emptyNode *Node, applyBalancing bool) error { +func attemptToMoveOneVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, fullNode *Node, candidateVolumes []*master_pb.VolumeInformationMessage, emptyNode *Node, applyBalancing bool) (hasMoved bool, err error) { + + for _, v := range candidateVolumes { + hasMoved, err = maybeMoveOneVolume(commandEnv, volumeReplicas, fullNode, v, emptyNode, applyBalancing) + if err != nil { + return + } + if hasMoved { + break + } + } + return +} + +func maybeMoveOneVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, fullNode *Node, candidateVolume *master_pb.VolumeInformationMessage, emptyNode *Node, applyChange bool) (hasMoved bool, err error) { + + if candidateVolume.ReplicaPlacement > 0 { + replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(candidateVolume.ReplicaPlacement)) + if !isGoodMove(replicaPlacement, volumeReplicas[candidateVolume.Id], fullNode, emptyNode) { + return false, nil + } + } + if _, found := emptyNode.selectedVolumes[candidateVolume.Id]; !found { + if err = moveVolume(commandEnv, candidateVolume, fullNode, emptyNode, applyChange); err == nil { + adjustAfterMove(candidateVolume, volumeReplicas, fullNode, emptyNode) + return true, nil + } else { + return + } + } + return +} + +func moveVolume(commandEnv *CommandEnv, v *master_pb.VolumeInformationMessage, fullNode *Node, emptyNode *Node, applyChange bool) error { collectionPrefix := v.Collection + "_" if v.Collection == "" { collectionPrefix = "" } fmt.Fprintf(os.Stdout, "moving volume %s%d %s => %s\n", collectionPrefix, v.Id, fullNode.info.Id, emptyNode.info.Id) - if applyBalancing { + if applyChange { return LiveMoveVolume(commandEnv.option.GrpcDialOption, needle.VolumeId(v.Id), fullNode.info.Id, emptyNode.info.Id, 5*time.Second) } return nil } -func (node *Node) selectVolumes(fn func(v *master_pb.VolumeInformationMessage) bool) { - node.selectedVolumes = make(map[uint32]*master_pb.VolumeInformationMessage) - for _, v := range node.info.VolumeInfos { - if fn(v) { - node.selectedVolumes[v.Id] = v +func isGoodMove(placement *super_block.ReplicaPlacement, existingReplicas []*VolumeReplica, sourceNode, targetNode *Node) bool { + for _, replica := range existingReplicas { + if replica.location.dataNode.Id == targetNode.info.Id && + replica.location.rack == targetNode.rack && + replica.location.dc == targetNode.dc { + // never move to existing nodes + return false + } + } + dcs, racks := make(map[string]bool), make(map[string]int) + for _, replica := range existingReplicas { + if replica.location.dataNode.Id != sourceNode.info.Id { + dcs[replica.location.DataCenter()] = true + racks[replica.location.Rack()]++ + } + } + + dcs[targetNode.dc] = true + racks[fmt.Sprintf("%s %s", targetNode.dc, targetNode.rack)]++ + + if len(dcs) > placement.DiffDataCenterCount+1 { + return false + } + + if len(racks) > placement.DiffRackCount+placement.DiffDataCenterCount+1 { + return false + } + + for _, sameRackCount := range racks { + if sameRackCount > placement.SameRackCount+1 { + return false + } + } + + return true + +} + +func adjustAfterMove(v *master_pb.VolumeInformationMessage, volumeReplicas map[uint32][]*VolumeReplica, fullNode *Node, emptyNode *Node) { + delete(fullNode.selectedVolumes, v.Id) + if emptyNode.selectedVolumes != nil { + emptyNode.selectedVolumes[v.Id] = v + } + existingReplicas := volumeReplicas[v.Id] + for _, replica := range existingReplicas { + if replica.location.dataNode.Id == fullNode.info.Id && + replica.location.rack == fullNode.rack && + replica.location.dc == fullNode.dc { + replica.location.dc = emptyNode.dc + replica.location.rack = emptyNode.rack + replica.location.dataNode = emptyNode.info + return } } } |
