author     Chris Lu <chris.lu@gmail.com>    2020-09-11 00:29:25 -0700
committer  Chris Lu <chris.lu@gmail.com>    2020-09-11 00:29:25 -0700
commit     e60b2117c3bf36f9a5ff4a1cf1b9216124546f8b (patch)
tree       aabca8ef6dc26400b4daa7adba133c12d2f4a34f
parent     89a62e8007e50f61e80c925d694e051ad220a516 (diff)
shell: volume balance follows replica placement
-rw-r--r--   weed/shell/command_volume_balance.go           63
-rw-r--r--   weed/shell/command_volume_balance_test.go      139
-rw-r--r--   weed/shell/command_volume_fix_replication.go    29
3 files changed, 207 insertions, 24 deletions
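
This commit makes volume.balance respect a volume's replica placement before moving a copy. As a quick refresher, here is a minimal sketch of how the three-digit "xyz" replication string maps to placement counts, assuming the usual SeaweedFS convention (x = copies in other data centers, y = copies in other racks of the same data center, z = copies on other servers in the same rack); the real decoding lives in weed/storage/super_block, and the type below is only a stand-in, not the project's code:

// Illustrative stand-in for super_block.ReplicaPlacement.
package main

import "fmt"

type replicaPlacement struct {
	DiffDataCenterCount int // x
	DiffRackCount       int // y
	SameRackCount       int // z
}

func parsePlacement(s string) replicaPlacement {
	// assumes a well-formed three-digit string such as "001" or "010"
	return replicaPlacement{
		DiffDataCenterCount: int(s[0] - '0'),
		DiffRackCount:       int(s[1] - '0'),
		SameRackCount:       int(s[2] - '0'),
	}
}

func main() {
	// one extra copy on a different rack in the same data center
	fmt.Printf("%+v\n", parsePlacement("010"))
}
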
diff --git a/weed/shell/command_volume_balance.go b/weed/shell/command_volume_balance.go
index 005236806..4f96c3e08 100644
--- a/weed/shell/command_volume_balance.go
+++ b/weed/shell/command_volume_balance.go
@@ -4,6 +4,7 @@ import (
"context"
"flag"
"fmt"
+ "github.com/chrislusf/seaweedfs/weed/storage/super_block"
"io"
"os"
"sort"
@@ -83,6 +84,7 @@ func (c *commandVolumeBalance) Do(args []string, commandEnv *CommandEnv, writer
}
typeToNodes := collectVolumeServersByType(resp.TopologyInfo, *dc)
+ volumeReplicas, _ := collectVolumeReplicaLocations(resp)
for maxVolumeCount, volumeServers := range typeToNodes {
if len(volumeServers) < 2 {
@@ -95,16 +97,16 @@ func (c *commandVolumeBalance) Do(args []string, commandEnv *CommandEnv, writer
return err
}
for _, c := range collections {
- if err = balanceVolumeServers(commandEnv, volumeServers, resp.VolumeSizeLimitMb*1024*1024, c, *applyBalancing); err != nil {
+ if err = balanceVolumeServers(commandEnv, volumeReplicas, volumeServers, resp.VolumeSizeLimitMb*1024*1024, c, *applyBalancing); err != nil {
return err
}
}
} else if *collection == "ALL_COLLECTIONS" {
- if err = balanceVolumeServers(commandEnv, volumeServers, resp.VolumeSizeLimitMb*1024*1024, "ALL_COLLECTIONS", *applyBalancing); err != nil {
+ if err = balanceVolumeServers(commandEnv, volumeReplicas, volumeServers, resp.VolumeSizeLimitMb*1024*1024, "ALL_COLLECTIONS", *applyBalancing); err != nil {
return err
}
} else {
- if err = balanceVolumeServers(commandEnv, volumeServers, resp.VolumeSizeLimitMb*1024*1024, *collection, *applyBalancing); err != nil {
+ if err = balanceVolumeServers(commandEnv, volumeReplicas, volumeServers, resp.VolumeSizeLimitMb*1024*1024, *collection, *applyBalancing); err != nil {
return err
}
}
@@ -113,7 +115,7 @@ func (c *commandVolumeBalance) Do(args []string, commandEnv *CommandEnv, writer
return nil
}
-func balanceVolumeServers(commandEnv *CommandEnv, nodes []*Node, volumeSizeLimit uint64, collection string, applyBalancing bool) error {
+func balanceVolumeServers(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, volumeSizeLimit uint64, collection string, applyBalancing bool) error {
// balance writable volumes
for _, n := range nodes {
@@ -126,7 +128,7 @@ func balanceVolumeServers(commandEnv *CommandEnv, nodes []*Node, volumeSizeLimit
return !v.ReadOnly && v.Size < volumeSizeLimit
})
}
- if err := balanceSelectedVolume(commandEnv, nodes, sortWritableVolumes, applyBalancing); err != nil {
+ if err := balanceSelectedVolume(commandEnv, volumeReplicas, nodes, sortWritableVolumes, applyBalancing); err != nil {
return err
}
@@ -141,7 +143,7 @@ func balanceVolumeServers(commandEnv *CommandEnv, nodes []*Node, volumeSizeLimit
return v.ReadOnly || v.Size >= volumeSizeLimit
})
}
- if err := balanceSelectedVolume(commandEnv, nodes, sortReadOnlyVolumes, applyBalancing); err != nil {
+ if err := balanceSelectedVolume(commandEnv, volumeReplicas, nodes, sortReadOnlyVolumes, applyBalancing); err != nil {
return err
}
@@ -186,7 +188,7 @@ func sortReadOnlyVolumes(volumes []*master_pb.VolumeInformationMessage) {
})
}
-func balanceSelectedVolume(commandEnv *CommandEnv, nodes []*Node, sortCandidatesFn func(volumes []*master_pb.VolumeInformationMessage), applyBalancing bool) (err error) {
+func balanceSelectedVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, sortCandidatesFn func(volumes []*master_pb.VolumeInformationMessage), applyBalancing bool) (err error) {
selectedVolumeCount := 0
for _, dn := range nodes {
selectedVolumeCount += len(dn.selectedVolumes)
@@ -216,7 +218,7 @@ func balanceSelectedVolume(commandEnv *CommandEnv, nodes []*Node, sortCandidates
// no more volume servers with empty slots
break
}
- hasMoved, err = attemptToMoveOneVolume(commandEnv, fullNode, candidateVolumes, emptyNode, applyBalancing)
+ hasMoved, err = attemptToMoveOneVolume(commandEnv, volumeReplicas, fullNode, candidateVolumes, emptyNode, applyBalancing)
if err != nil {
return
}
@@ -229,13 +231,12 @@ func balanceSelectedVolume(commandEnv *CommandEnv, nodes []*Node, sortCandidates
return nil
}
-func attemptToMoveOneVolume(commandEnv *CommandEnv, fullNode *Node, candidateVolumes []*master_pb.VolumeInformationMessage, emptyNode *Node, applyBalancing bool) (hasMoved bool, err error) {
+func attemptToMoveOneVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, fullNode *Node, candidateVolumes []*master_pb.VolumeInformationMessage, emptyNode *Node, applyBalancing bool) (hasMoved bool, err error) {
for _, v := range candidateVolumes {
if v.ReplicaPlacement > 0 {
- if fullNode.dc != emptyNode.dc && fullNode.rack != emptyNode.rack {
- // TODO this logic is too simple, but should work most of the time
- // Need a correct algorithm to handle all different cases
+ replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(v.ReplicaPlacement))
+ if !isGoodMove(replicaPlacement, volumeReplicas[v.Id], fullNode, emptyNode) {
continue
}
}
@@ -273,3 +274,41 @@ func (node *Node) selectVolumes(fn func(v *master_pb.VolumeInformationMessage) b
}
}
}
+
+func isGoodMove(placement *super_block.ReplicaPlacement, existingReplicas []*VolumeReplica, sourceNode, targetNode *Node) bool {
+ for _, replica := range existingReplicas {
+ if replica.location.dataNode.Id == targetNode.info.Id &&
+ replica.location.rack == targetNode.rack &&
+ replica.location.dc == targetNode.dc {
+ // never move to existing nodes
+ return false
+ }
+ }
+ dcs, racks := make(map[string]bool), make(map[string]int)
+ for _, replica := range existingReplicas {
+ if replica.location.dataNode.Id != sourceNode.info.Id {
+ dcs[replica.location.DataCenter()] = true
+ racks[replica.location.Rack()]++
+ }
+ }
+
+ dcs[targetNode.dc] = true
+ racks[fmt.Sprintf("%s %s", targetNode.dc, targetNode.rack)]++
+
+ if len(dcs) > placement.DiffDataCenterCount+1 {
+ return false
+ }
+
+ if len(racks) > placement.DiffRackCount+1 {
+ return false
+ }
+
+ for _, sameRackCount := range racks {
+ if sameRackCount > placement.SameRackCount+1 {
+ return false
+ }
+ }
+
+ return true
+
+}
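
The new isGoodMove helper above first refuses to move a copy onto a node that already holds a replica, then counts the data centers and racks that the remaining replicas plus the move target would occupy, requiring each count to stay within the placement limit plus one (the plus one covers the copy being moved). A hedged sketch of exercising it from inside package shell, reusing the data of the "010 move all to the same rack" test case in the next file; checkExampleMove is a made-up name, not part of this commit:

package shell

import (
	"fmt"

	"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
	"github.com/chrislusf/seaweedfs/weed/storage/super_block"
)

// checkExampleMove (hypothetical) walks one placement check by hand:
// with replication "010" and copies on racks r1 and r2, moving the r2 copy
// onto rack r1 would put both copies in the same rack, so isGoodMove says no.
func checkExampleMove() {
	placement, _ := super_block.NewReplicaPlacementFromString("010")
	replicas := []*VolumeReplica{
		{location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}}},
		{location: &location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}}},
	}
	source := &Node{info: &master_pb.DataNodeInfo{Id: "dn2"}, dc: "dc1", rack: "r2"}
	target := &Node{info: &master_pb.DataNodeInfo{Id: "dn3"}, dc: "dc1", rack: "r1"}
	// prints false: both copies would land in rack r1, tripping the SameRackCount check
	fmt.Println(isGoodMove(placement, replicas, source, target))
}
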
diff --git a/weed/shell/command_volume_balance_test.go b/weed/shell/command_volume_balance_test.go
new file mode 100644
index 000000000..7919d60c0
--- /dev/null
+++ b/weed/shell/command_volume_balance_test.go
@@ -0,0 +1,139 @@
+package shell
+
+import (
+ "testing"
+
+ "github.com/chrislusf/seaweedfs/weed/pb/master_pb"
+ "github.com/chrislusf/seaweedfs/weed/storage/super_block"
+)
+
+type testMoveCase struct {
+ name string
+ replication string
+ replicas []*VolumeReplica
+ sourceLocation location
+ targetLocation location
+ expected bool
+}
+
+func TestIsGoodMove(t *testing.T) {
+
+ var tests = []testMoveCase{
+ {
+ name: "test move to the same node",
+ replication: "001",
+ replicas: []*VolumeReplica{
+ {
+ location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+ },
+ {
+ location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+ },
+ },
+ sourceLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+ targetLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+ expected: false,
+ },
+
+ {
+ name: "test move to the same rack, but existing node",
+ replication: "001",
+ replicas: []*VolumeReplica{
+ {
+ location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+ },
+ {
+ location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+ },
+ },
+ sourceLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+ targetLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+ expected: false,
+ },
+
+ {
+ name: "test move to the same rack, a new node",
+ replication: "001",
+ replicas: []*VolumeReplica{
+ {
+ location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+ },
+ {
+ location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+ },
+ },
+ sourceLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn2"}},
+ targetLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn3"}},
+ expected: true,
+ },
+
+ {
+ name: "test 010 move all to the same rack",
+ replication: "010",
+ replicas: []*VolumeReplica{
+ {
+ location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+ },
+ {
+ location: &location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+ },
+ },
+ sourceLocation: location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+ targetLocation: location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn3"}},
+ expected: false,
+ },
+
+ {
+ name: "test 010 move to spread racks",
+ replication: "010",
+ replicas: []*VolumeReplica{
+ {
+ location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+ },
+ {
+ location: &location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+ },
+ },
+ sourceLocation: location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+ targetLocation: location{"dc1", "r3", &master_pb.DataNodeInfo{Id: "dn3"}},
+ expected: true,
+ },
+
+ {
+ name: "test 010 move to the same rack, a different node",
+ replication: "010",
+ replicas: []*VolumeReplica{
+ {
+ location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}},
+ },
+ {
+ location: &location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+ },
+ },
+ sourceLocation: location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn2"}},
+ targetLocation: location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "dn3"}},
+ expected: true,
+ },
+
+ }
+
+ for _, tt := range tests {
+ replicaPlacement, _ := super_block.NewReplicaPlacementFromString(tt.replication)
+ println("replication:", tt.replication, "expected", tt.expected, "name:", tt.name)
+ sourceNode := &Node{
+ info: tt.sourceLocation.dataNode,
+ dc: tt.sourceLocation.dc,
+ rack: tt.sourceLocation.rack,
+ }
+ targetNode := &Node{
+ info: tt.targetLocation.dataNode,
+ dc: tt.targetLocation.dc,
+ rack: tt.targetLocation.rack,
+ }
+ if isGoodMove(replicaPlacement, tt.replicas, sourceNode, targetNode) != tt.expected {
+ t.Errorf("%s: expect %v move from %v to %s, replication:%v",
+ tt.name, tt.expected, tt.sourceLocation, tt.targetLocation, tt.replication)
+ }
+ }
+
+}
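
The table above only covers single data center layouts ("001" and "010"). A possible extra case for a cross data center policy, sketched in the same pattern; TestIsGoodMoveAcrossDataCenters is a hypothetical addition, not part of this commit:

package shell

import (
	"testing"

	"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
	"github.com/chrislusf/seaweedfs/weed/storage/super_block"
)

// TestIsGoodMoveAcrossDataCenters (hypothetical): with replication "100" and
// copies in dc1 and dc2, pulling the dc2 copy onto a dc1 node would collapse
// the volume into a single data center, so the move should be rejected.
func TestIsGoodMoveAcrossDataCenters(t *testing.T) {
	placement, _ := super_block.NewReplicaPlacementFromString("100")
	replicas := []*VolumeReplica{
		{location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "dn1"}}},
		{location: &location{"dc2", "r2", &master_pb.DataNodeInfo{Id: "dn2"}}},
	}
	source := &Node{info: &master_pb.DataNodeInfo{Id: "dn2"}, dc: "dc2", rack: "r2"}
	target := &Node{info: &master_pb.DataNodeInfo{Id: "dn3"}, dc: "dc1", rack: "r2"}
	if isGoodMove(placement, replicas, source, target) {
		t.Errorf("expected the move into a single data center to be rejected under replication 100")
	}
}
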
diff --git a/weed/shell/command_volume_fix_replication.go b/weed/shell/command_volume_fix_replication.go
index 061a58891..b32ccaaab 100644
--- a/weed/shell/command_volume_fix_replication.go
+++ b/weed/shell/command_volume_fix_replication.go
@@ -66,18 +66,7 @@ func (c *commandVolumeFixReplication) Do(args []string, commandEnv *CommandEnv,
// find all volumes that needs replication
// collect all data nodes
- volumeReplicas := make(map[uint32][]*VolumeReplica)
- var allLocations []location
- eachDataNode(resp.TopologyInfo, func(dc string, rack RackId, dn *master_pb.DataNodeInfo) {
- loc := newLocation(dc, string(rack), dn)
- for _, v := range dn.VolumeInfos {
- volumeReplicas[v.Id] = append(volumeReplicas[v.Id], &VolumeReplica{
- location: &loc,
- info: v,
- })
- }
- allLocations = append(allLocations, loc)
- })
+ volumeReplicas, allLocations := collectVolumeReplicaLocations(resp)
if len(allLocations) == 0 {
return fmt.Errorf("no data nodes at all")
@@ -111,6 +100,22 @@ func (c *commandVolumeFixReplication) Do(args []string, commandEnv *CommandEnv,
}
+func collectVolumeReplicaLocations(resp *master_pb.VolumeListResponse) (map[uint32][]*VolumeReplica, []location) {
+ volumeReplicas := make(map[uint32][]*VolumeReplica)
+ var allLocations []location
+ eachDataNode(resp.TopologyInfo, func(dc string, rack RackId, dn *master_pb.DataNodeInfo) {
+ loc := newLocation(dc, string(rack), dn)
+ for _, v := range dn.VolumeInfos {
+ volumeReplicas[v.Id] = append(volumeReplicas[v.Id], &VolumeReplica{
+ location: &loc,
+ info: v,
+ })
+ }
+ allLocations = append(allLocations, loc)
+ })
+ return volumeReplicas, allLocations
+}
+
func (c *commandVolumeFixReplication) fixOverReplicatedVolumes(commandEnv *CommandEnv, writer io.Writer, takeAction bool, overReplicatedVolumeIds []uint32, volumeReplicas map[uint32][]*VolumeReplica, allLocations []location) error {
for _, vid := range overReplicatedVolumeIds {
replicas := volumeReplicas[vid]
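
With the collection logic extracted into collectVolumeReplicaLocations, volume.fix.replication and the updated volume.balance now share the same view of where each replica lives. A minimal sketch of reusing it, assuming a volume list response already fetched from the master; printReplicaCounts is a hypothetical helper, not part of this commit:

package shell

import (
	"fmt"

	"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
)

// printReplicaCounts (hypothetical) reports how many replicas each volume id
// currently has, given the master's volume list response.
func printReplicaCounts(resp *master_pb.VolumeListResponse) {
	volumeReplicas, allLocations := collectVolumeReplicaLocations(resp)
	fmt.Printf("%d data node locations\n", len(allLocations))
	for vid, replicas := range volumeReplicas {
		fmt.Printf("volume %d has %d replica(s)\n", vid, len(replicas))
	}
}
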