aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Lu <chris.lu@gmail.com>2019-05-25 02:02:44 -0700
committerChris Lu <chris.lu@gmail.com>2019-05-25 02:02:44 -0700
commitf0e6574d5ed03446b9b221653b20618c0e11b381 (patch)
treed54d1ce0fd6254ee9dc0a3b050e336d74a68885e
parent6f4b09b6a46f63eaebbbc23198d3ec73754ed11d (diff)
downloadseaweedfs-f0e6574d5ed03446b9b221653b20618c0e11b381.tar.xz
seaweedfs-f0e6574d5ed03446b9b221653b20618c0e11b381.zip
allocate ec shards to volume servers
-rw-r--r--weed/shell/command_ec_encode.go154
-rw-r--r--weed/storage/erasure_coding/ec_encoder.go7
-rw-r--r--weed/storage/erasure_coding/ec_test.go4
-rw-r--r--weed/storage/erasure_coding/ec_volume_info.go13
-rw-r--r--weed/topology/data_node_ec.go4
-rw-r--r--weed/topology/topology_ec.go4
6 files changed, 172 insertions, 14 deletions
diff --git a/weed/shell/command_ec_encode.go b/weed/shell/command_ec_encode.go
index 80a0ccf5c..4647c2507 100644
--- a/weed/shell/command_ec_encode.go
+++ b/weed/shell/command_ec_encode.go
@@ -5,10 +5,15 @@ import (
"flag"
"fmt"
"io"
+ "sort"
+ "sync"
"github.com/chrislusf/seaweedfs/weed/operation"
+ "github.com/chrislusf/seaweedfs/weed/pb/master_pb"
"github.com/chrislusf/seaweedfs/weed/pb/volume_server_pb"
+ "github.com/chrislusf/seaweedfs/weed/storage/erasure_coding"
"github.com/chrislusf/seaweedfs/weed/storage/needle"
+ "github.com/chrislusf/seaweedfs/weed/wdclient"
"google.golang.org/grpc"
)
@@ -53,18 +58,28 @@ func (c *commandEcEncode) Do(args []string, commandEnv *commandEnv, writer io.Wr
ctx := context.Background()
+ // find volume location
locations := commandEnv.masterClient.GetLocations(uint32(*volumeId))
-
if len(locations) == 0 {
return fmt.Errorf("volume %d not found", *volumeId)
}
- err = generateEcSlices(ctx, commandEnv.option.GrpcDialOption, needle.VolumeId(*volumeId), locations[0].Url)
+ // generate ec shards
+ err = generateEcShards(ctx, commandEnv.option.GrpcDialOption, needle.VolumeId(*volumeId), locations[0].Url)
+ if err != nil {
+ return fmt.Errorf("generate ec shards for volume %d on %s: %v", *volumeId, locations[0].Url, err)
+ }
+
+ // balance the ec shards to current cluster
+ err = balanceEcShards(ctx, commandEnv, needle.VolumeId(*volumeId), locations[0])
+ if err != nil {
+ return fmt.Errorf("balance ec shards for volume %d on %s: %v", *volumeId, locations[0].Url, err)
+ }
return err
}
-func generateEcSlices(ctx context.Context, grpcDialOption grpc.DialOption, volumeId needle.VolumeId, sourceVolumeServer string) error {
+func generateEcShards(ctx context.Context, grpcDialOption grpc.DialOption, volumeId needle.VolumeId, sourceVolumeServer string) error {
err := operation.WithVolumeServerClient(sourceVolumeServer, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
_, genErr := volumeServerClient.VolumeEcGenerateSlices(ctx, &volume_server_pb.VolumeEcGenerateSlicesRequest{
@@ -76,3 +91,136 @@ func generateEcSlices(ctx context.Context, grpcDialOption grpc.DialOption, volum
return err
}
+
+// balanceEcShards plans where the erasure coding shards of volumeId should
+// live and asks the chosen volume servers to copy them from existingLocation.
+//
+// It fetches the topology from the master, keeps the data nodes that report at
+// least one free ec shard slot, prefers the nodes with the most free slots
+// (at most erasure_coding.TotalShardsCount of them), and spreads the shards
+// over those nodes with balancedEcDistribution.
+//
+// It returns an error when the topology cannot be listed, when the cluster has
+// fewer than erasure_coding.TotalShardsCount free shard slots, or when copying
+// the shards fails.
+func balanceEcShards(ctx context.Context, commandEnv *commandEnv, volumeId needle.VolumeId, existingLocation wdclient.Location) (err error) {
+
+	// list all possible locations
+	var resp *master_pb.VolumeListResponse
+	err = commandEnv.masterClient.WithClient(ctx, func(client master_pb.SeaweedClient) error {
+		var listErr error
+		resp, listErr = client.VolumeList(ctx, &master_pb.VolumeListRequest{})
+		return listErr
+	})
+	if err != nil {
+		return err
+	}
+
+	// collect the volume servers that still have at least one free ec shard slot
+	var allDataNodes []*master_pb.DataNodeInfo
+	var totalFreeEcSlots int
+	eachDataNode(resp.TopologyInfo, func(dn *master_pb.DataNodeInfo) {
+		if freeEcSlots := countFreeShardSlots(dn); freeEcSlots > 0 {
+			allDataNodes = append(allDataNodes, dn)
+			totalFreeEcSlots += freeEcSlots
+		}
+	})
+	if totalFreeEcSlots < erasure_coding.TotalShardsCount {
+		return fmt.Errorf("not enough free ec shard slots. only %d left", totalFreeEcSlots)
+	}
+
+	// servers with the most free slots come first
+	sort.Slice(allDataNodes, func(i, j int) bool {
+		return countFreeShardSlots(allDataNodes[j]) < countFreeShardSlots(allDataNodes[i])
+	})
+	// one shard per server is the widest possible spread, so more servers
+	// than shards are never needed
+	if len(allDataNodes) > erasure_coding.TotalShardsCount {
+		allDataNodes = allDataNodes[:erasure_coding.TotalShardsCount]
+	}
+
+	// calculate how many shards to allocate for these servers
+	allocated := balancedEcDistribution(allDataNodes)
+
+	// ask the data nodes to copy from the source volume server
+	err = parallelCopyEcShardsFromSource(ctx, commandEnv.option.GrpcDialOption, allDataNodes, allocated, volumeId, existingLocation)
+	if err != nil {
+		// propagate the failure; returning nil here would report success
+		// even though the shards were not copied
+		return err
+	}
+
+	// ask the source volume server to clean up copied ec shards
+
+	// ask the source volume server to delete the original volume
+
+	return nil
+
+}
+
+// parallelCopyEcShardsFromSource asks every target server with a positive
+// allocation to copy its share of the ec shards of volumeId from
+// existingLocation, one goroutine per server.
+//
+// Shard ids are handed out as consecutive ranges starting at 0: targetServers[i]
+// receives allocated[i] shard ids following the ranges of the servers before it.
+// allocated must have at least len(targetServers) entries.
+//
+// The function waits for all copies to finish and returns the error of the
+// first server (in targetServers order) that failed, or nil if all succeeded.
+func parallelCopyEcShardsFromSource(ctx context.Context, grpcDialOption grpc.DialOption,
+	targetServers []*master_pb.DataNodeInfo, allocated []int,
+	volumeId needle.VolumeId, existingLocation wdclient.Location) (err error) {
+
+	// Each goroutine writes only its own slot, so no two goroutines ever
+	// touch the same variable and no extra locking is needed.
+	copyErrs := make([]error, len(targetServers))
+
+	// parallelize
+	var wg sync.WaitGroup
+	startFromShardId := 0
+	for i, server := range targetServers {
+		if allocated[i] <= 0 {
+			continue
+		}
+
+		wg.Add(1)
+		go func(slot int, server *master_pb.DataNodeInfo, startFromShardId int, shardCount int) {
+			defer wg.Done()
+			copyErrs[slot] = oneServerCopyEcShardsFromSource(ctx, grpcDialOption, server, startFromShardId, shardCount, volumeId, existingLocation)
+		}(i, server, startFromShardId, allocated[i])
+		startFromShardId += allocated[i]
+	}
+	wg.Wait()
+
+	// report the first failure in a deterministic order
+	for _, copyErr := range copyErrs {
+		if copyErr != nil {
+			return copyErr
+		}
+	}
+
+	return nil
+}
+
+// oneServerCopyEcShardsFromSource handles the shards destined for a single
+// target server: shard ids startFromShardId up to, but not including,
+// startFromShardId+shardCount of volumeId.
+//
+// When the target server is the source itself nothing is done. Otherwise the
+// planned copy of each shard is printed to standard output; no data is
+// transferred by this function as written. It always returns nil.
+func oneServerCopyEcShardsFromSource(ctx context.Context, grpcDialOption grpc.DialOption,
+	targetServer *master_pb.DataNodeInfo, startFromShardId int, shardCount int,
+	volumeId needle.VolumeId, existingLocation wdclient.Location) (err error) {
+
+	// the source server already holds every shard
+	if existingLocation.Url == targetServer.Id {
+		return nil
+	}
+
+	endShardId := startFromShardId + shardCount
+	for shardId := startFromShardId; shardId < endShardId; shardId++ {
+		fmt.Printf("copy %d.%d %s => %s\n", volumeId, shardId, existingLocation.Url, targetServer.Id)
+	}
+
+	return nil
+}
+// balancedEcDistribution decides how many ec shards each server should hold.
+//
+// Shards are dealt out round-robin in the order of servers, one shard per
+// server per round, skipping servers whose free shard slots (as reported by
+// countFreeShardSlots) are used up. The result has one entry per server and
+// sums to erasure_coding.TotalShardsCount when the servers have enough free
+// slots in total; otherwise it sums to the number of free slots available.
+func balancedEcDistribution(servers []*master_pb.DataNodeInfo) (allocated []int) {
+	freeSlots := make([]int, len(servers))
+	allocated = make([]int, len(servers))
+	for i, server := range servers {
+		freeSlots[i] = countFreeShardSlots(server)
+	}
+
+	allocatedCount := 0
+	for allocatedCount < erasure_coding.TotalShardsCount {
+		progressed := false
+		for i := range servers {
+			if allocatedCount >= erasure_coding.TotalShardsCount {
+				break
+			}
+			if freeSlots[i]-allocated[i] > 0 {
+				allocated[i]++
+				allocatedCount++
+				progressed = true
+			}
+		}
+		// A full round that placed nothing means every server is full (or
+		// there are no servers); stop instead of looping forever.
+		if !progressed {
+			break
+		}
+	}
+
+	return allocated
+}
+
+// eachDataNode calls fn once for every data node in topo, walking the data
+// centers, then the racks of each data center, then the data nodes of each
+// rack, in the order they appear in the topology.
+func eachDataNode(topo *master_pb.TopologyInfo, fn func(*master_pb.DataNodeInfo)) {
+	for _, dataCenter := range topo.DataCenterInfos {
+		for _, rackInfo := range dataCenter.RackInfos {
+			for _, node := range rackInfo.DataNodeInfos {
+				fn(node)
+			}
+		}
+	}
+}
+
+// countShards returns the total number of ec shards described by
+// ecShardInfos, summing the shard ids set in each entry's EcIndexBits.
+func countShards(ecShardInfos []*master_pb.VolumeEcShardInformationMessage) (count int) {
+	for _, info := range ecShardInfos {
+		count += erasure_coding.ShardBits(info.EcIndexBits).ShardIdCount()
+	}
+	return count
+}
+
+// countFreeShardSlots estimates how many more ec shards dn can accept: ten
+// shard slots for every free volume slot, minus the ec shards dn already
+// reports. The result can be zero or negative.
+func countFreeShardSlots(dn *master_pb.DataNodeInfo) (count int) {
+	// NOTE(review): 10 equals erasure_coding.DataShardsCount — confirm whether
+	// the constant was intended here.
+	const shardSlotsPerFreeVolume = 10
+	capacity := int(dn.FreeVolumeCount) * shardSlotsPerFreeVolume
+	return capacity - countShards(dn.EcShardInfos)
+}
diff --git a/weed/storage/erasure_coding/ec_encoder.go b/weed/storage/erasure_coding/ec_encoder.go
index dbfe5858b..da0cfcde8 100644
--- a/weed/storage/erasure_coding/ec_encoder.go
+++ b/weed/storage/erasure_coding/ec_encoder.go
@@ -15,6 +15,7 @@ import (
const (
DataShardsCount = 10
ParityShardsCount = 4
+ TotalShardsCount = DataShardsCount + ParityShardsCount
ErasureCodingLargeBlockSize = 1024 * 1024 * 1024 // 1GB
ErasureCodingSmallBlockSize = 1024 * 1024 // 1MB
)
@@ -93,7 +94,7 @@ func encodeData(file *os.File, enc reedsolomon.Encoder, startOffset, blockSize i
}
func openEcFiles(baseFileName string, forRead bool) (files []*os.File, err error) {
- for i := 0; i < DataShardsCount+ParityShardsCount; i++ {
+ for i := 0; i < TotalShardsCount; i++ {
fname := baseFileName + ToExt(i)
openOption := os.O_TRUNC | os.O_CREATE | os.O_WRONLY
if forRead {
@@ -138,7 +139,7 @@ func encodeDataOneBatch(file *os.File, enc reedsolomon.Encoder, startOffset, blo
return err
}
- for i := 0; i < DataShardsCount+ParityShardsCount; i++ {
+ for i := 0; i < TotalShardsCount; i++ {
_, err := outputs[i].Write(buffers[i])
if err != nil {
return err
@@ -154,7 +155,7 @@ func encodeDatFile(remainingSize int64, err error, baseFileName string, bufferSi
if err != nil {
return fmt.Errorf("failed to create encoder: %v", err)
}
- buffers := make([][]byte, DataShardsCount+ParityShardsCount)
+ buffers := make([][]byte, TotalShardsCount)
outputs, err := openEcFiles(baseFileName, false)
defer closeEcFiles(outputs)
if err != nil {
diff --git a/weed/storage/erasure_coding/ec_test.go b/weed/storage/erasure_coding/ec_test.go
index 625f4e9a6..ecf73ac96 100644
--- a/weed/storage/erasure_coding/ec_test.go
+++ b/weed/storage/erasure_coding/ec_test.go
@@ -153,9 +153,9 @@ func readFromOtherEcFiles(ecFiles []*os.File, ecFileIndex int, ecFileOffset int6
return nil, fmt.Errorf("failed to create encoder: %v", err)
}
- bufs := make([][]byte, DataShardsCount+ParityShardsCount)
+ bufs := make([][]byte, TotalShardsCount)
for i := 0; i < DataShardsCount; {
- n := int(rand.Int31n(DataShardsCount + ParityShardsCount))
+ n := int(rand.Int31n(TotalShardsCount))
if n == ecFileIndex || bufs[n] != nil {
continue
}
diff --git a/weed/storage/erasure_coding/ec_volume_info.go b/weed/storage/erasure_coding/ec_volume_info.go
index c26269158..ef8cc4ed4 100644
--- a/weed/storage/erasure_coding/ec_volume_info.go
+++ b/weed/storage/erasure_coding/ec_volume_info.go
@@ -36,6 +36,10 @@ func (ecInfo *EcVolumeInfo) ShardIds() (ret []ShardId) {
return ecInfo.ShardBits.ShardIds()
}
+// ShardIdCount returns the number of shard ids recorded in this volume
+// info's ShardBits.
+func (ecInfo *EcVolumeInfo) ShardIdCount() (count int) {
+	count = ecInfo.ShardBits.ShardIdCount()
+	return count
+}
+
func (ecInfo *EcVolumeInfo) Minus(other *EcVolumeInfo) (*EcVolumeInfo) {
ret := &EcVolumeInfo{
VolumeId: ecInfo.VolumeId,
@@ -69,7 +73,7 @@ func (b ShardBits) HasShardId(id ShardId) bool {
}
func (b ShardBits) ShardIds() (ret []ShardId) {
- for i := ShardId(0); i < DataShardsCount+ParityShardsCount; i++ {
+ for i := ShardId(0); i < TotalShardsCount; i++ {
if b.HasShardId(i) {
ret = append(ret, i)
}
@@ -77,6 +81,13 @@ func (b ShardBits) ShardIds() (ret []ShardId) {
return
}
+// ShardIdCount returns the number of bits set in b, i.e. how many shard ids
+// it contains.
+func (b ShardBits) ShardIdCount() (count int) {
+	// x & (x - 1) clears the lowest set bit, so the loop runs once per set bit.
+	for remaining := b; remaining > 0; remaining &= remaining - 1 {
+		count++
+	}
+	return count
+}
+
func (b ShardBits) Minus(other ShardBits) (ShardBits) {
return b &^ other
}
diff --git a/weed/topology/data_node_ec.go b/weed/topology/data_node_ec.go
index e8ead5511..63c8f2127 100644
--- a/weed/topology/data_node_ec.go
+++ b/weed/topology/data_node_ec.go
@@ -30,11 +30,11 @@ func (dn *DataNode) UpdateEcShards(actualShards []*erasure_coding.EcVolumeInfo)
} else {
// found, but maybe the actual shard could be missing
a := actualEcShards.Minus(ecShards)
- if len(a.ShardIds()) > 0 {
+ if a.ShardIdCount() > 0 {
newShards = append(newShards, a)
}
d := ecShards.Minus(actualEcShards)
- if len(d.ShardIds()) > 0 {
+ if d.ShardIdCount() > 0 {
deletedShards = append(deletedShards, d)
}
}
diff --git a/weed/topology/topology_ec.go b/weed/topology/topology_ec.go
index eb52b44b4..050a0b901 100644
--- a/weed/topology/topology_ec.go
+++ b/weed/topology/topology_ec.go
@@ -7,11 +7,9 @@ import (
"github.com/chrislusf/seaweedfs/weed/storage/needle"
)
-const shardCount = erasure_coding.DataShardsCount + erasure_coding.ParityShardsCount
-
type EcShardLocations struct {
Collection string
- locations [shardCount][]*DataNode
+ locations [erasure_coding.TotalShardsCount][]*DataNode
}
func (t *Topology) SyncDataNodeEcShards(shardInfos []*master_pb.VolumeEcShardInformationMessage, dn *DataNode) (newShards, deletedShards []*erasure_coding.EcVolumeInfo) {