| author | HongyanShen <763987993@qq.com> | 2020-03-11 12:55:24 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-03-11 12:55:24 +0800 |
| commit | 03529fc0c29072f6f26e11ffbd7229cf92dc71ce (patch) | |
| tree | ed8833386a712c850dcef0815509774681a6ab56 | /weed/topology |
| parent | 0fca1ae776783b37481549df40f477b7d9248d3c (diff) | |
| parent | 60f5f05c78a2918d5219c925cea5847759281a2c (diff) | |
| download | seaweedfs-03529fc0c29072f6f26e11ffbd7229cf92dc71ce.tar.xz, seaweedfs-03529fc0c29072f6f26e11ffbd7229cf92dc71ce.zip | |
Merge pull request #1 from chrislusf/master
sync
Diffstat (limited to 'weed/topology')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | weed/topology/collection.go | 4 |
| -rw-r--r-- | weed/topology/data_center.go | 1 |
| -rw-r--r-- | weed/topology/data_node.go | 31 |
| -rw-r--r-- | weed/topology/node.go | 97 |
| -rw-r--r-- | weed/topology/rack.go | 1 |
| -rw-r--r-- | weed/topology/store_replicate.go | 182 |
| -rw-r--r-- | weed/topology/topology.go | 15 |
| -rw-r--r-- | weed/topology/topology_event_handling.go | 1 |
| -rw-r--r-- | weed/topology/topology_map.go | 1 |
| -rw-r--r-- | weed/topology/topology_test.go | 5 |
| -rw-r--r-- | weed/topology/topology_vacuum.go | 40 |
| -rw-r--r-- | weed/topology/volume_growth.go | 24 |
| -rw-r--r-- | weed/topology/volume_growth_test.go | 212 |
| -rw-r--r-- | weed/topology/volume_layout.go | 5 |
14 files changed, 460 insertions(+), 159 deletions(-)
```diff
diff --git a/weed/topology/collection.go b/weed/topology/collection.go
index f6b728ec9..7a611d904 100644
--- a/weed/topology/collection.go
+++ b/weed/topology/collection.go
@@ -3,8 +3,8 @@ package topology
 import (
 	"fmt"
 
-	"github.com/chrislusf/seaweedfs/weed/storage"
 	"github.com/chrislusf/seaweedfs/weed/storage/needle"
+	"github.com/chrislusf/seaweedfs/weed/storage/super_block"
 	"github.com/chrislusf/seaweedfs/weed/util"
 )
 
@@ -24,7 +24,7 @@ func (c *Collection) String() string {
 	return fmt.Sprintf("Name:%s, volumeSizeLimit:%d, storageType2VolumeLayout:%v", c.Name, c.volumeSizeLimit, c.storageType2VolumeLayout)
 }
 
-func (c *Collection) GetOrCreateVolumeLayout(rp *storage.ReplicaPlacement, ttl *needle.TTL) *VolumeLayout {
+func (c *Collection) GetOrCreateVolumeLayout(rp *super_block.ReplicaPlacement, ttl *needle.TTL) *VolumeLayout {
 	keyString := rp.String()
 	if ttl != nil {
 		keyString += ttl.String()
```

```diff
diff --git a/weed/topology/data_center.go b/weed/topology/data_center.go
index 640cb1937..dc3accb71 100644
--- a/weed/topology/data_center.go
+++ b/weed/topology/data_center.go
@@ -48,6 +48,7 @@ func (dc *DataCenter) ToDataCenterInfo() *master_pb.DataCenterInfo {
 		MaxVolumeCount:    uint64(dc.GetMaxVolumeCount()),
 		FreeVolumeCount:   uint64(dc.FreeSpace()),
 		ActiveVolumeCount: uint64(dc.GetActiveVolumeCount()),
+		RemoteVolumeCount: uint64(dc.GetRemoteVolumeCount()),
 	}
 	for _, c := range dc.Children() {
 		rack := c.(*Rack)
```

```diff
diff --git a/weed/topology/data_node.go b/weed/topology/data_node.go
index 3e72ccdbf..617341e54 100644
--- a/weed/topology/data_node.go
+++ b/weed/topology/data_node.go
@@ -2,14 +2,13 @@ package topology
 import (
 	"fmt"
+	"strconv"
 	"sync"
 
 	"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
 	"github.com/chrislusf/seaweedfs/weed/storage/erasure_coding"
 	"github.com/chrislusf/seaweedfs/weed/storage/needle"
 
-	"strconv"
-
 	"github.com/chrislusf/seaweedfs/weed/glog"
 	"github.com/chrislusf/seaweedfs/weed/storage"
 )
@@ -44,15 +43,26 @@ func (dn *DataNode) String() string {
 func (dn *DataNode) AddOrUpdateVolume(v storage.VolumeInfo) (isNew bool) {
 	dn.Lock()
 	defer dn.Unlock()
-	if _, ok := dn.volumes[v.Id]; !ok {
+	if oldV, ok := dn.volumes[v.Id]; !ok {
 		dn.volumes[v.Id] = v
 		dn.UpAdjustVolumeCountDelta(1)
+		if v.IsRemote() {
+			dn.UpAdjustRemoteVolumeCountDelta(1)
+		}
 		if !v.ReadOnly {
 			dn.UpAdjustActiveVolumeCountDelta(1)
 		}
 		dn.UpAdjustMaxVolumeId(v.Id)
 		isNew = true
 	} else {
+		if oldV.IsRemote() != v.IsRemote() {
+			if v.IsRemote() {
+				dn.UpAdjustRemoteVolumeCountDelta(1)
+			}
+			if oldV.IsRemote() {
+				dn.UpAdjustRemoteVolumeCountDelta(-1)
+			}
+		}
 		dn.volumes[v.Id] = v
 	}
 	return
@@ -70,7 +80,12 @@ func (dn *DataNode) UpdateVolumes(actualVolumes []storage.VolumeInfo) (newVolume
 			delete(dn.volumes, vid)
 			deletedVolumes = append(deletedVolumes, v)
 			dn.UpAdjustVolumeCountDelta(-1)
-			dn.UpAdjustActiveVolumeCountDelta(-1)
+			if v.IsRemote() {
+				dn.UpAdjustRemoteVolumeCountDelta(-1)
+			}
+			if !v.ReadOnly {
+				dn.UpAdjustActiveVolumeCountDelta(-1)
+			}
 		}
 	}
 	dn.Unlock()
@@ -88,7 +103,12 @@ func (dn *DataNode) DeltaUpdateVolumes(newlVolumes, deletedVolumes []storage.Vol
 	for _, v := range deletedVolumes {
 		delete(dn.volumes, v.Id)
 		dn.UpAdjustVolumeCountDelta(-1)
-		dn.UpAdjustActiveVolumeCountDelta(-1)
+		if v.IsRemote() {
+			dn.UpAdjustRemoteVolumeCountDelta(-1)
+		}
+		if !v.ReadOnly {
+			dn.UpAdjustActiveVolumeCountDelta(-1)
+		}
 	}
 	dn.Unlock()
 	for _, v := range newlVolumes {
@@ -160,6 +180,7 @@ func (dn *DataNode) ToDataNodeInfo() *master_pb.DataNodeInfo {
 		MaxVolumeCount:    uint64(dn.GetMaxVolumeCount()),
 		FreeVolumeCount:   uint64(dn.FreeSpace()),
 		ActiveVolumeCount: uint64(dn.GetActiveVolumeCount()),
+		RemoteVolumeCount: uint64(dn.GetRemoteVolumeCount()),
 	}
 	for _, v := range dn.GetVolumes() {
 		m.VolumeInfos = append(m.VolumeInfos, v.ToVolumeInformationMessage())
```
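The data_node.go changes above keep a new `RemoteVolumeCount` in sync whenever a volume is added, updated, or removed, and each `UpAdjust*Delta` call walks up the topology tree (the implementation lands in node.go below). A minimal standalone sketch of that upward-propagation pattern, with illustrative names rather than SeaweedFS code:

```go
package main

import "fmt"

type node struct {
	name              string
	remoteVolumeCount int64
	parent            *node
}

// upAdjust mirrors the UpAdjustRemoteVolumeCountDelta idea: apply the delta
// locally, then recurse to the parent so every ancestor stays consistent.
func (n *node) upAdjust(delta int64) {
	n.remoteVolumeCount += delta
	if n.parent != nil {
		n.parent.upAdjust(delta)
	}
}

func main() {
	topo := &node{name: "topo"}
	dc := &node{name: "dc1", parent: topo}
	dn := &node{name: "server1", parent: dc}

	dn.upAdjust(1)  // a volume becomes remote (tiered to cloud)
	dn.upAdjust(-1) // and is brought back local
	fmt.Println(topo.remoteVolumeCount, dc.remoteVolumeCount, dn.remoteVolumeCount) // 0 0 0
}
```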
```diff
diff --git a/weed/topology/node.go b/weed/topology/node.go
index b2808f589..ceeb96d60 100644
--- a/weed/topology/node.go
+++ b/weed/topology/node.go
@@ -20,6 +20,7 @@ type Node interface {
 	ReserveOneVolume(r int64) (*DataNode, error)
 	UpAdjustMaxVolumeCountDelta(maxVolumeCountDelta int64)
 	UpAdjustVolumeCountDelta(volumeCountDelta int64)
+	UpAdjustRemoteVolumeCountDelta(remoteVolumeCountDelta int64)
 	UpAdjustEcShardCountDelta(ecShardCountDelta int64)
 	UpAdjustActiveVolumeCountDelta(activeVolumeCountDelta int64)
 	UpAdjustMaxVolumeId(vid needle.VolumeId)
@@ -27,6 +28,7 @@ type Node interface {
 	GetVolumeCount() int64
 	GetEcShardCount() int64
 	GetActiveVolumeCount() int64
+	GetRemoteVolumeCount() int64
 	GetMaxVolumeCount() int64
 	GetMaxVolumeId() needle.VolumeId
 	SetParent(Node)
@@ -44,6 +46,7 @@ type Node interface {
 }
 type NodeImpl struct {
 	volumeCount       int64
+	remoteVolumeCount int64
 	activeVolumeCount int64
 	ecShardCount      int64
 	maxVolumeCount    int64
@@ -59,56 +62,64 @@ type NodeImpl struct {
 }
 
 // the first node must satisfy filterFirstNodeFn(), the rest nodes must have one free slot
-func (n *NodeImpl) RandomlyPickNodes(numberOfNodes int, filterFirstNodeFn func(dn Node) error) (firstNode Node, restNodes []Node, err error) {
-	candidates := make([]Node, 0, len(n.children))
+func (n *NodeImpl) PickNodesByWeight(numberOfNodes int, filterFirstNodeFn func(dn Node) error) (firstNode Node, restNodes []Node, err error) {
+	var totalWeights int64
 	var errs []string
 	n.RLock()
+	candidates := make([]Node, 0, len(n.children))
+	candidatesWeights := make([]int64, 0, len(n.children))
+	//pick nodes which has enough free volumes as candidates, and use free volumes number as node weight.
 	for _, node := range n.children {
-		if err := filterFirstNodeFn(node); err == nil {
-			candidates = append(candidates, node)
-		} else {
-			errs = append(errs, string(node.Id())+":"+err.Error())
+		if node.FreeSpace() <= 0 {
+			continue
 		}
+		totalWeights += node.FreeSpace()
+		candidates = append(candidates, node)
+		candidatesWeights = append(candidatesWeights, node.FreeSpace())
 	}
 	n.RUnlock()
-	if len(candidates) == 0 {
-		return nil, nil, errors.New("No matching data node found! \n" + strings.Join(errs, "\n"))
+	if len(candidates) < numberOfNodes {
+		glog.V(2).Infoln(n.Id(), "failed to pick", numberOfNodes, "from ", len(candidates), "node candidates")
+		return nil, nil, errors.New("No enough data node found!")
 	}
-	firstNode = candidates[rand.Intn(len(candidates))]
-	glog.V(2).Infoln(n.Id(), "picked main node:", firstNode.Id())
-	restNodes = make([]Node, numberOfNodes-1)
-	candidates = candidates[:0]
-	n.RLock()
-	for _, node := range n.children {
-		if node.Id() == firstNode.Id() {
-			continue
-		}
-		if node.FreeSpace() <= 0 {
-			continue
+
+	//pick nodes randomly by weights, the node picked earlier has higher final weights
+	sortedCandidates := make([]Node, 0, len(candidates))
+	for i := 0; i < len(candidates); i++ {
+		weightsInterval := rand.Int63n(totalWeights)
+		lastWeights := int64(0)
+		for k, weights := range candidatesWeights {
+			if (weightsInterval >= lastWeights) && (weightsInterval < lastWeights+weights) {
+				sortedCandidates = append(sortedCandidates, candidates[k])
+				candidatesWeights[k] = 0
+				totalWeights -= weights
+				break
+			}
+			lastWeights += weights
 		}
-		glog.V(2).Infoln("select rest node candidate:", node.Id())
-		candidates = append(candidates, node)
 	}
-	n.RUnlock()
-	glog.V(2).Infoln(n.Id(), "picking", numberOfNodes-1, "from rest", len(candidates), "node candidates")
-	ret := len(restNodes) == 0
-	for k, node := range candidates {
-		if k < len(restNodes) {
-			restNodes[k] = node
-			if k == len(restNodes)-1 {
-				ret = true
+
+	restNodes = make([]Node, 0, numberOfNodes-1)
+	ret := false
+	n.RLock()
+	for k, node := range sortedCandidates {
+		if err := filterFirstNodeFn(node); err == nil {
+			firstNode = node
+			if k >= numberOfNodes-1 {
+				restNodes = sortedCandidates[:numberOfNodes-1]
+			} else {
+				restNodes = append(restNodes, sortedCandidates[:k]...)
+				restNodes = append(restNodes, sortedCandidates[k+1:numberOfNodes]...)
 			}
+			ret = true
+			break
 		} else {
-			r := rand.Intn(k + 1)
-			if r < len(restNodes) {
-				restNodes[r] = node
-			}
+			errs = append(errs, string(node.Id())+":"+err.Error())
 		}
 	}
+	n.RUnlock()
 	if !ret {
-		glog.V(2).Infoln(n.Id(), "failed to pick", numberOfNodes-1, "from rest", len(candidates), "node candidates")
-		err = errors.New("No enough data node found!")
+		return nil, nil, errors.New("No matching data node found! \n" + strings.Join(errs, "\n"))
 	}
 	return
 }
@@ -132,10 +143,11 @@ func (n *NodeImpl) Id() NodeId {
 	return n.id
 }
 func (n *NodeImpl) FreeSpace() int64 {
+	freeVolumeSlotCount := n.maxVolumeCount + n.remoteVolumeCount - n.volumeCount
 	if n.ecShardCount > 0 {
-		return n.maxVolumeCount - n.volumeCount - n.ecShardCount/erasure_coding.DataShardsCount - 1
+		freeVolumeSlotCount = freeVolumeSlotCount - n.ecShardCount/erasure_coding.DataShardsCount - 1
 	}
-	return n.maxVolumeCount - n.volumeCount
+	return freeVolumeSlotCount
 }
 func (n *NodeImpl) SetParent(node Node) {
 	n.parent = node
@@ -191,6 +203,12 @@ func (n *NodeImpl) UpAdjustVolumeCountDelta(volumeCountDelta int64) { //can be n
 		n.parent.UpAdjustVolumeCountDelta(volumeCountDelta)
 	}
 }
+func (n *NodeImpl) UpAdjustRemoteVolumeCountDelta(remoteVolumeCountDelta int64) { //can be negative
+	atomic.AddInt64(&n.remoteVolumeCount, remoteVolumeCountDelta)
+	if n.parent != nil {
+		n.parent.UpAdjustRemoteVolumeCountDelta(remoteVolumeCountDelta)
+	}
+}
 func (n *NodeImpl) UpAdjustEcShardCountDelta(ecShardCountDelta int64) { //can be negative
 	atomic.AddInt64(&n.ecShardCount, ecShardCountDelta)
 	if n.parent != nil {
@@ -220,6 +238,9 @@ func (n *NodeImpl) GetVolumeCount() int64 {
 func (n *NodeImpl) GetEcShardCount() int64 {
 	return n.ecShardCount
 }
+func (n *NodeImpl) GetRemoteVolumeCount() int64 {
+	return n.remoteVolumeCount
+}
 func (n *NodeImpl) GetActiveVolumeCount() int64 {
 	return n.activeVolumeCount
 }
@@ -235,6 +256,7 @@ func (n *NodeImpl) LinkChildNode(node Node) {
 	n.UpAdjustMaxVolumeCountDelta(node.GetMaxVolumeCount())
 	n.UpAdjustMaxVolumeId(node.GetMaxVolumeId())
 	n.UpAdjustVolumeCountDelta(node.GetVolumeCount())
+	n.UpAdjustRemoteVolumeCountDelta(node.GetRemoteVolumeCount())
 	n.UpAdjustEcShardCountDelta(node.GetEcShardCount())
 	n.UpAdjustActiveVolumeCountDelta(node.GetActiveVolumeCount())
 	node.SetParent(n)
@@ -250,6 +272,7 @@ func (n *NodeImpl) UnlinkChildNode(nodeId NodeId) {
 	node.SetParent(nil)
 	delete(n.children, node.Id())
 	n.UpAdjustVolumeCountDelta(-node.GetVolumeCount())
+	n.UpAdjustRemoteVolumeCountDelta(-node.GetRemoteVolumeCount())
 	n.UpAdjustEcShardCountDelta(-node.GetEcShardCount())
 	n.UpAdjustActiveVolumeCountDelta(-node.GetActiveVolumeCount())
 	n.UpAdjustMaxVolumeCountDelta(-node.GetMaxVolumeCount())
```

```diff
diff --git a/weed/topology/rack.go b/weed/topology/rack.go
index 932c1a804..1921c0c05 100644
--- a/weed/topology/rack.go
+++ b/weed/topology/rack.go
@@ -67,6 +67,7 @@ func (r *Rack) ToRackInfo() *master_pb.RackInfo {
 		MaxVolumeCount:    uint64(r.GetMaxVolumeCount()),
 		FreeVolumeCount:   uint64(r.FreeSpace()),
 		ActiveVolumeCount: uint64(r.GetActiveVolumeCount()),
+		RemoteVolumeCount: uint64(r.GetRemoteVolumeCount()),
 	}
 	for _, c := range r.Children() {
 		dn := c.(*DataNode)
```
```diff
diff --git a/weed/topology/store_replicate.go b/weed/topology/store_replicate.go
index d21c4d210..8c4996d45 100644
--- a/weed/topology/store_replicate.go
+++ b/weed/topology/store_replicate.go
@@ -1,7 +1,6 @@ package topology
 import (
-	"bytes"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -25,58 +24,60 @@ func ReplicatedWrite(masterNode string, s *storage.Store,
 	//check JWT
 	jwt := security.GetJwt(r)
 
+	var remoteLocations []operation.Location
+	if r.FormValue("type") != "replicate" {
+		remoteLocations, err = getWritableRemoteReplications(s, volumeId, masterNode)
+		if err != nil {
+			glog.V(0).Infoln(err)
+			return
+		}
+	}
+
 	size, isUnchanged, err = s.WriteVolumeNeedle(volumeId, n)
 	if err != nil {
 		err = fmt.Errorf("failed to write to local disk: %v", err)
+		glog.V(0).Infoln(err)
 		return
 	}
 
-	needToReplicate := !s.HasVolume(volumeId)
-	needToReplicate = needToReplicate || s.GetVolume(volumeId).NeedToReplicate()
-	if !needToReplicate {
-		needToReplicate = s.GetVolume(volumeId).NeedToReplicate()
-	}
-	if needToReplicate { //send to other replica locations
-		if r.FormValue("type") != "replicate" {
-
-			if err = distributedOperation(masterNode, s, volumeId, func(location operation.Location) error {
-				u := url.URL{
-					Scheme: "http",
-					Host:   location.Url,
-					Path:   r.URL.Path,
-				}
-				q := url.Values{
-					"type": {"replicate"},
-					"ttl":  {n.Ttl.String()},
-				}
-				if n.LastModified > 0 {
-					q.Set("ts", strconv.FormatUint(n.LastModified, 10))
-				}
-				if n.IsChunkedManifest() {
-					q.Set("cm", "true")
-				}
-				u.RawQuery = q.Encode()
-
-				pairMap := make(map[string]string)
-				if n.HasPairs() {
-					tmpMap := make(map[string]string)
-					err := json.Unmarshal(n.Pairs, &tmpMap)
-					if err != nil {
-						glog.V(0).Infoln("Unmarshal pairs error:", err)
-					}
-					for k, v := range tmpMap {
-						pairMap[needle.PairNamePrefix+k] = v
-					}
-				}
-
-				_, err := operation.Upload(u.String(),
-					string(n.Name), bytes.NewReader(n.Data), n.IsGzipped(), string(n.Mime),
-					pairMap, jwt)
-				return err
-			}); err != nil {
-				size = 0
-				err = fmt.Errorf("failed to write to replicas for volume %d: %v", volumeId, err)
-			}
+	if len(remoteLocations) > 0 { //send to other replica locations
+		if err = distributedOperation(remoteLocations, s, func(location operation.Location) error {
+			u := url.URL{
+				Scheme: "http",
+				Host:   location.Url,
+				Path:   r.URL.Path,
+			}
+			q := url.Values{
+				"type": {"replicate"},
+				"ttl":  {n.Ttl.String()},
+			}
+			if n.LastModified > 0 {
+				q.Set("ts", strconv.FormatUint(n.LastModified, 10))
+			}
+			if n.IsChunkedManifest() {
+				q.Set("cm", "true")
+			}
+			u.RawQuery = q.Encode()
+
+			pairMap := make(map[string]string)
+			if n.HasPairs() {
+				tmpMap := make(map[string]string)
+				err := json.Unmarshal(n.Pairs, &tmpMap)
+				if err != nil {
+					glog.V(0).Infoln("Unmarshal pairs error:", err)
+				}
+				for k, v := range tmpMap {
+					pairMap[needle.PairNamePrefix+k] = v
+				}
+			}
+
+			// volume server do not know about encryption
+			_, err := operation.UploadData(u.String(), string(n.Name), false, n.Data, n.IsGzipped(), string(n.Mime), pairMap, jwt)
+			return err
+		}); err != nil {
+			size = 0
+			err = fmt.Errorf("failed to write to replicas for volume %d: %v", volumeId, err)
+			glog.V(0).Infoln(err)
 		}
 	}
 	return
@@ -84,31 +85,34 @@ func ReplicatedWrite(masterNode string, s *storage.Store,
 
 func ReplicatedDelete(masterNode string, store *storage.Store,
 	volumeId needle.VolumeId, n *needle.Needle,
-	r *http.Request) (uint32, error) {
+	r *http.Request) (size uint32, err error) {
 
 	//check JWT
 	jwt := security.GetJwt(r)
 
-	ret, err := store.DeleteVolumeNeedle(volumeId, n)
+	var remoteLocations []operation.Location
+	if r.FormValue("type") != "replicate" {
+		remoteLocations, err = getWritableRemoteReplications(store, volumeId, masterNode)
+		if err != nil {
+			glog.V(0).Infoln(err)
+			return
+		}
+	}
+
+	size, err = store.DeleteVolumeNeedle(volumeId, n)
 	if err != nil {
 		glog.V(0).Infoln("delete error:", err)
-		return ret, err
+		return
 	}
 
-	needToReplicate := !store.HasVolume(volumeId)
-	if !needToReplicate && ret > 0 {
-		needToReplicate = store.GetVolume(volumeId).NeedToReplicate()
-	}
-	if needToReplicate { //send to other replica locations
-		if r.FormValue("type") != "replicate" {
-			if err = distributedOperation(masterNode, store, volumeId, func(location operation.Location) error {
-				return util.Delete("http://"+location.Url+r.URL.Path+"?type=replicate", string(jwt))
-			}); err != nil {
-				ret = 0
-			}
+	if len(remoteLocations) > 0 { //send to other replica locations
+		if err = distributedOperation(remoteLocations, store, func(location operation.Location) error {
+			return util.Delete("http://"+location.Url+r.URL.Path+"?type=replicate", string(jwt))
+		}); err != nil {
+			size = 0
 		}
 	}
-	return ret, err
+	return
 }
 
 type DistributedOperationResult map[string]error
@@ -131,32 +135,44 @@ type RemoteResult struct {
 	Error error
 }
 
-func distributedOperation(masterNode string, store *storage.Store, volumeId needle.VolumeId, op func(location operation.Location) error) error {
-	if lookupResult, lookupErr := operation.Lookup(masterNode, volumeId.String()); lookupErr == nil {
-		length := 0
-		selfUrl := (store.Ip + ":" + strconv.Itoa(store.Port))
-		results := make(chan RemoteResult)
-		for _, location := range lookupResult.Locations {
-			if location.Url != selfUrl {
-				length++
-				go func(location operation.Location, results chan RemoteResult) {
-					results <- RemoteResult{location.Url, op(location)}
-				}(location, results)
-			}
-		}
-		ret := DistributedOperationResult(make(map[string]error))
-		for i := 0; i < length; i++ {
-			result := <-results
-			ret[result.Host] = result.Error
-		}
-		if volume := store.GetVolume(volumeId); volume != nil {
-			if length+1 < volume.ReplicaPlacement.GetCopyCount() {
-				return fmt.Errorf("replicating opetations [%d] is less than volume's replication copy count [%d]", length+1, volume.ReplicaPlacement.GetCopyCount())
-			}
-		}
-		return ret.Error()
-	} else {
-		glog.V(0).Infoln()
-		return fmt.Errorf("Failed to lookup for %d: %v", volumeId, lookupErr)
-	}
+func distributedOperation(locations []operation.Location, store *storage.Store, op func(location operation.Location) error) error {
+	length := len(locations)
+	results := make(chan RemoteResult)
+	for _, location := range locations {
+		go func(location operation.Location, results chan RemoteResult) {
+			results <- RemoteResult{location.Url, op(location)}
+		}(location, results)
+	}
+	ret := DistributedOperationResult(make(map[string]error))
+	for i := 0; i < length; i++ {
+		result := <-results
+		ret[result.Host] = result.Error
+	}
+
+	return ret.Error()
+}
+
+func getWritableRemoteReplications(s *storage.Store, volumeId needle.VolumeId, masterNode string) (
+	remoteLocations []operation.Location, err error) {
+	copyCount := s.GetVolume(volumeId).ReplicaPlacement.GetCopyCount()
+	if copyCount > 1 {
+		if lookupResult, lookupErr := operation.Lookup(masterNode, volumeId.String()); lookupErr == nil {
+			if len(lookupResult.Locations) < copyCount {
+				err = fmt.Errorf("replicating opetations [%d] is less than volume %d replication copy count [%d]",
+					len(lookupResult.Locations), volumeId, copyCount)
+				return
+			}
+			selfUrl := s.Ip + ":" + strconv.Itoa(s.Port)
+			for _, location := range lookupResult.Locations {
+				if location.Url != selfUrl {
+					remoteLocations = append(remoteLocations, location)
+				}
+			}
+		} else {
+			err = fmt.Errorf("failed to lookup for %d: %v", volumeId, lookupErr)
+			return
+		}
+	}
+
+	return
+}
```
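store_replicate.go now resolves the writable replica locations up front (`getWritableRemoteReplications`) and hands the list to `distributedOperation`, which fans out one goroutine per location and merges the per-host errors. A minimal sketch of that fan-out/collect pattern (illustrative names, not the SeaweedFS API; `errors.Join` needs Go 1.20+):

```go
package main

import (
	"errors"
	"fmt"
)

type result struct {
	host string
	err  error
}

// distributed runs op against every host concurrently, then reads exactly
// one result per host from the channel and merges the failures.
func distributed(hosts []string, op func(host string) error) error {
	results := make(chan result)
	for _, h := range hosts {
		go func(h string) {
			results <- result{h, op(h)}
		}(h)
	}
	var errs []error
	for range hosts {
		r := <-results
		if r.err != nil {
			errs = append(errs, fmt.Errorf("%s: %w", r.host, r.err))
		}
	}
	return errors.Join(errs...)
}

func main() {
	err := distributed([]string{"replica-a:8080", "replica-b:8080"}, func(host string) error {
		if host == "replica-b:8080" {
			return errors.New("connection refused")
		}
		return nil
	})
	fmt.Println(err)
}
```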
```diff
diff --git a/weed/topology/topology.go b/weed/topology/topology.go
index b7ebe8af5..fbf998707 100644
--- a/weed/topology/topology.go
+++ b/weed/topology/topology.go
@@ -7,11 +7,13 @@ import (
 	"sync"
 
 	"github.com/chrislusf/raft"
+	"github.com/chrislusf/seaweedfs/weed/glog"
 	"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
 	"github.com/chrislusf/seaweedfs/weed/sequence"
 	"github.com/chrislusf/seaweedfs/weed/storage"
 	"github.com/chrislusf/seaweedfs/weed/storage/needle"
+	"github.com/chrislusf/seaweedfs/weed/storage/super_block"
 	"github.com/chrislusf/seaweedfs/weed/util"
 )
@@ -58,7 +60,12 @@ func NewTopology(id string, seq sequence.Sequencer, volumeSizeLimit uint64, puls
 
 func (t *Topology) IsLeader() bool {
 	if t.RaftServer != nil {
-		return t.RaftServer.State() == raft.Leader
+		if t.RaftServer.State() == raft.Leader {
+			return true
+		}
+		if t.RaftServer.Leader() == "" {
+			return true
+		}
 	}
 	return false
 }
@@ -73,7 +80,7 @@ func (t *Topology) Leader() (string, error) {
 
 	if l == "" {
 		// We are a single node cluster, we are the leader
-		return t.RaftServer.Name(), errors.New("Raft Server not initialized!")
+		return t.RaftServer.Name(), nil
 	}
 
 	return l, nil
@@ -129,7 +136,7 @@ func (t *Topology) PickForWrite(count uint64, option *VolumeGrowOption) (string,
 	return needle.NewFileId(*vid, fileId, rand.Uint32()).String(), count, datanodes.Head(), nil
 }
 
-func (t *Topology) GetVolumeLayout(collectionName string, rp *storage.ReplicaPlacement, ttl *needle.TTL) *VolumeLayout {
+func (t *Topology) GetVolumeLayout(collectionName string, rp *super_block.ReplicaPlacement, ttl *needle.TTL) *VolumeLayout {
 	return t.collectionMap.Get(collectionName, func() interface{} {
 		return NewCollection(collectionName, t.volumeSizeLimit)
 	}).(*Collection).GetOrCreateVolumeLayout(rp, ttl)
@@ -150,7 +157,7 @@ func (t *Topology) ListCollections(includeNormalVolumes, includeEcVolumes bool)
 		t.ecShardMapLock.RUnlock()
 	}
 
-	for k, _ := range mapOfCollections {
+	for k := range mapOfCollections {
 		ret = append(ret, k)
 	}
 	return ret
```

```diff
diff --git a/weed/topology/topology_event_handling.go b/weed/topology/topology_event_handling.go
index 041351492..068bd401e 100644
--- a/weed/topology/topology_event_handling.go
+++ b/weed/topology/topology_event_handling.go
@@ -59,6 +59,7 @@ func (t *Topology) UnRegisterDataNode(dn *DataNode) {
 		vl.SetVolumeUnavailable(dn, v.Id)
 	}
 	dn.UpAdjustVolumeCountDelta(-dn.GetVolumeCount())
+	dn.UpAdjustRemoteVolumeCountDelta(-dn.GetRemoteVolumeCount())
 	dn.UpAdjustActiveVolumeCountDelta(-dn.GetActiveVolumeCount())
 	dn.UpAdjustMaxVolumeCountDelta(-dn.GetMaxVolumeCount())
 	if dn.Parent() != nil {
```

```diff
diff --git a/weed/topology/topology_map.go b/weed/topology/topology_map.go
index 0ad30f12e..73c55d77d 100644
--- a/weed/topology/topology_map.go
+++ b/weed/topology/topology_map.go
@@ -85,6 +85,7 @@ func (t *Topology) ToTopologyInfo() *master_pb.TopologyInfo {
 		MaxVolumeCount:    uint64(t.GetMaxVolumeCount()),
 		FreeVolumeCount:   uint64(t.FreeSpace()),
 		ActiveVolumeCount: uint64(t.GetActiveVolumeCount()),
+		RemoteVolumeCount: uint64(t.GetRemoteVolumeCount()),
 	}
 	for _, c := range t.Children() {
 		dc := c.(*DataCenter)
```

```diff
diff --git a/weed/topology/topology_test.go b/weed/topology/topology_test.go
index 8f79ad684..e7676ccf7 100644
--- a/weed/topology/topology_test.go
+++ b/weed/topology/topology_test.go
@@ -5,6 +5,7 @@ import (
 	"github.com/chrislusf/seaweedfs/weed/sequence"
 	"github.com/chrislusf/seaweedfs/weed/storage"
 	"github.com/chrislusf/seaweedfs/weed/storage/needle"
+	"github.com/chrislusf/seaweedfs/weed/storage/super_block"
 	"testing"
 )
@@ -94,7 +95,7 @@ func TestHandlingVolumeServerHeartbeat(t *testing.T) {
 		[]*master_pb.VolumeShortInformationMessage{newVolumeShortMessage},
 		nil,
 		dn)
-	rp, _ := storage.NewReplicaPlacementFromString("000")
+	rp, _ := super_block.NewReplicaPlacementFromString("000")
 	layout := topo.GetVolumeLayout("", rp, needle.EMPTY_TTL)
 	assert(t, "writables after repeated add", len(layout.writables), volumeCount)
@@ -154,7 +155,7 @@ func TestAddRemoveVolume(t *testing.T) {
 		DeletedByteCount: 45,
 		ReadOnly:         false,
 		Version:          needle.CurrentVersion,
-		ReplicaPlacement: &storage.ReplicaPlacement{},
+		ReplicaPlacement: &super_block.ReplicaPlacement{},
 		Ttl:              needle.EMPTY_TTL,
 	}
```
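The `IsLeader` change in topology.go above also treats a node as leader when no leader is known at all, so a single-node cluster acts as its own leader instead of failing; the `Leader()` fix likewise stops returning an error in that case. A toy sketch of the decision (the `raftState` type is a stand-in, not the chrislusf/raft API):

```go
package main

import "fmt"

type raftState struct {
	state  string // "leader", "follower", ...
	leader string // "" when no leader is known
}

func isLeader(r *raftState) bool {
	if r == nil {
		return false
	}
	if r.state == "leader" {
		return true
	}
	// single-node cluster: nobody was elected, so act as the leader
	return r.leader == ""
}

func main() {
	fmt.Println(isLeader(&raftState{state: "follower", leader: ""}))          // true
	fmt.Println(isLeader(&raftState{state: "follower", leader: "master-2"})) // false
}
```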
```diff
diff --git a/weed/topology/topology_vacuum.go b/weed/topology/topology_vacuum.go
index ff32f1874..ca626e973 100644
--- a/weed/topology/topology_vacuum.go
+++ b/weed/topology/topology_vacuum.go
@@ -13,8 +13,10 @@ import (
 	"github.com/chrislusf/seaweedfs/weed/pb/volume_server_pb"
 )
 
-func batchVacuumVolumeCheck(grpcDialOption grpc.DialOption, vl *VolumeLayout, vid needle.VolumeId, locationlist *VolumeLocationList, garbageThreshold float64) bool {
-	ch := make(chan bool, locationlist.Length())
+func batchVacuumVolumeCheck(grpcDialOption grpc.DialOption, vl *VolumeLayout, vid needle.VolumeId,
+	locationlist *VolumeLocationList, garbageThreshold float64) (*VolumeLocationList, bool) {
+	ch := make(chan int, locationlist.Length())
+	errCount := int32(0)
 	for index, dn := range locationlist.list {
 		go func(index int, url string, vid needle.VolumeId) {
 			err := operation.WithVolumeServerClient(url, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
@@ -22,11 +24,15 @@ func batchVacuumVolumeCheck(grpcDialOption grpc.DialOption, vl *VolumeLayout, vi
 					VolumeId: uint32(vid),
 				})
 				if err != nil {
-					ch <- false
+					atomic.AddInt32(&errCount, 1)
+					ch <- -1
 					return err
 				}
-				isNeeded := resp.GarbageRatio > garbageThreshold
-				ch <- isNeeded
+				if resp.GarbageRatio >= garbageThreshold {
+					ch <- index
+				} else {
+					ch <- -1
+				}
 				return nil
 			})
 			if err != nil {
@@ -34,18 +40,21 @@ func batchVacuumVolumeCheck(grpcDialOption grpc.DialOption, vl *VolumeLayout, vi
 			}
 		}(index, dn.Url(), vid)
 	}
-	isCheckSuccess := true
+	vacuumLocationList := NewVolumeLocationList()
 	for range locationlist.list {
 		select {
-		case canVacuum := <-ch:
-			isCheckSuccess = isCheckSuccess && canVacuum
+		case index := <-ch:
+			if index != -1 {
+				vacuumLocationList.list = append(vacuumLocationList.list, locationlist.list[index])
+			}
 		case <-time.After(30 * time.Minute):
-			return false
+			return vacuumLocationList, false
 		}
 	}
-	return isCheckSuccess
+	return vacuumLocationList, errCount == 0 && len(vacuumLocationList.list) > 0
 }
 
-func batchVacuumVolumeCompact(grpcDialOption grpc.DialOption, vl *VolumeLayout, vid needle.VolumeId, locationlist *VolumeLocationList, preallocate int64) bool {
+func batchVacuumVolumeCompact(grpcDialOption grpc.DialOption, vl *VolumeLayout, vid needle.VolumeId,
+	locationlist *VolumeLocationList, preallocate int64) bool {
 	vl.accessLock.Lock()
 	vl.removeFromWritable(vid)
 	vl.accessLock.Unlock()
@@ -163,11 +172,12 @@ func vacuumOneVolumeLayout(grpcDialOption grpc.DialOption, volumeLayout *VolumeL
 		}
 
 		glog.V(2).Infof("check vacuum on collection:%s volume:%d", c.Name, vid)
-		if batchVacuumVolumeCheck(grpcDialOption, volumeLayout, vid, locationList, garbageThreshold) {
-			if batchVacuumVolumeCompact(grpcDialOption, volumeLayout, vid, locationList, preallocate) {
-				batchVacuumVolumeCommit(grpcDialOption, volumeLayout, vid, locationList)
+		if vacuumLocationList, needVacuum := batchVacuumVolumeCheck(
+			grpcDialOption, volumeLayout, vid, locationList, garbageThreshold); needVacuum {
+			if batchVacuumVolumeCompact(grpcDialOption, volumeLayout, vid, vacuumLocationList, preallocate) {
+				batchVacuumVolumeCommit(grpcDialOption, volumeLayout, vid, vacuumLocationList)
 			} else {
-				batchVacuumVolumeCleanup(grpcDialOption, volumeLayout, vid, vacuumLocationList)
+				batchVacuumVolumeCleanup(grpcDialOption, volumeLayout, vid, vacuumLocationList)
 			}
 		}
 	}
```
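topology_vacuum.go turns `batchVacuumVolumeCheck` from an all-or-nothing boolean into a per-replica check: each location reports its garbage ratio concurrently, replicas at or above the threshold are collected into a new location list, and vacuum proceeds only when no check errored and at least one replica qualifies. A simplified sketch with stand-in types (not the gRPC client code above):

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// checkVacuum fans out one goroutine per replica URL; each sends back its
// index if the replica needs vacuum, or -1 otherwise. Errors are counted so
// a partial failure vetoes the whole batch.
func checkVacuum(urls []string, garbageRatio func(url string) (float64, error), threshold float64) ([]string, bool) {
	ch := make(chan int, len(urls))
	var errCount int32
	for i, u := range urls {
		go func(i int, u string) {
			ratio, err := garbageRatio(u)
			if err != nil {
				atomic.AddInt32(&errCount, 1)
				ch <- -1
				return
			}
			if ratio >= threshold {
				ch <- i // this replica needs vacuum
			} else {
				ch <- -1
			}
		}(i, u)
	}
	var vacuum []string
	for range urls {
		if idx := <-ch; idx != -1 {
			vacuum = append(vacuum, urls[idx])
		}
	}
	return vacuum, atomic.LoadInt32(&errCount) == 0 && len(vacuum) > 0
}

func main() {
	locations, ok := checkVacuum(
		[]string{"vs1:8080", "vs2:8080"},
		func(url string) (float64, error) { return 0.4, nil },
		0.3)
	fmt.Println(locations, ok) // both replicas qualify
}
```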
```diff
diff --git a/weed/topology/volume_growth.go b/weed/topology/volume_growth.go
index 636eb2260..446c88f60 100644
--- a/weed/topology/volume_growth.go
+++ b/weed/topology/volume_growth.go
@@ -6,6 +6,9 @@ import (
 	"sync"
 
 	"github.com/chrislusf/seaweedfs/weed/storage/needle"
+	"github.com/chrislusf/seaweedfs/weed/storage/super_block"
+	"github.com/chrislusf/seaweedfs/weed/util"
+
 	"google.golang.org/grpc"
 
 	"github.com/chrislusf/seaweedfs/weed/glog"
@@ -22,7 +25,7 @@ This package is created to resolve these replica placement issues:
 
 type VolumeGrowOption struct {
 	Collection       string
-	ReplicaPlacement *storage.ReplicaPlacement
+	ReplicaPlacement *super_block.ReplicaPlacement
 	Ttl              *needle.TTL
 	Prealloacte      int64
 	DataCenter       string
@@ -46,15 +49,20 @@ func NewDefaultVolumeGrowth() *VolumeGrowth {
 // one replication type may need rp.GetCopyCount() actual volumes
 // given copyCount, how many logical volumes to create
 func (vg *VolumeGrowth) findVolumeCount(copyCount int) (count int) {
+	v := util.GetViper()
+	v.SetDefault("master.volume_growth.copy_1", 7)
+	v.SetDefault("master.volume_growth.copy_2", 6)
+	v.SetDefault("master.volume_growth.copy_3", 3)
+	v.SetDefault("master.volume_growth.copy_other", 1)
 	switch copyCount {
 	case 1:
-		count = 7
+		count = v.GetInt("master.volume_growth.copy_1")
 	case 2:
-		count = 6
+		count = v.GetInt("master.volume_growth.copy_2")
 	case 3:
-		count = 3
+		count = v.GetInt("master.volume_growth.copy_3")
 	default:
-		count = 1
+		count = v.GetInt("master.volume_growth.copy_other")
 	}
 	return
 }
@@ -104,7 +112,7 @@ func (vg *VolumeGrowth) findAndGrow(grpcDialOption grpc.DialOption, topo *Topolo
 func (vg *VolumeGrowth) findEmptySlotsForOneVolume(topo *Topology, option *VolumeGrowOption) (servers []*DataNode, err error) {
 	//find main datacenter and other data centers
 	rp := option.ReplicaPlacement
-	mainDataCenter, otherDataCenters, dc_err := topo.RandomlyPickNodes(rp.DiffDataCenterCount+1, func(node Node) error {
+	mainDataCenter, otherDataCenters, dc_err := topo.PickNodesByWeight(rp.DiffDataCenterCount+1, func(node Node) error {
 		if option.DataCenter != "" && node.IsDataCenter() && node.Id() != NodeId(option.DataCenter) {
 			return fmt.Errorf("Not matching preferred data center:%s", option.DataCenter)
 		}
@@ -136,7 +144,7 @@ func (vg *VolumeGrowth) findEmptySlotsForOneVolume(topo *Topology, option *Volum
 	}
 
 	//find main rack and other racks
-	mainRack, otherRacks, rackErr := mainDataCenter.(*DataCenter).RandomlyPickNodes(rp.DiffRackCount+1, func(node Node) error {
+	mainRack, otherRacks, rackErr := mainDataCenter.(*DataCenter).PickNodesByWeight(rp.DiffRackCount+1, func(node Node) error {
 		if option.Rack != "" && node.IsRack() && node.Id() != NodeId(option.Rack) {
 			return fmt.Errorf("Not matching preferred rack:%s", option.Rack)
 		}
@@ -163,7 +171,7 @@ func (vg *VolumeGrowth) findEmptySlotsForOneVolume(topo *Topology, option *Volum
 	}
 
 	//find main rack and other racks
-	mainServer, otherServers, serverErr := mainRack.(*Rack).RandomlyPickNodes(rp.SameRackCount+1, func(node Node) error {
+	mainServer, otherServers, serverErr := mainRack.(*Rack).PickNodesByWeight(rp.SameRackCount+1, func(node Node) error {
 		if option.DataNode != "" && node.IsDataNode() && node.Id() != NodeId(option.DataNode) {
 			return fmt.Errorf("Not matching preferred data node:%s", option.DataNode)
 		}
```
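volume_growth.go makes the per-copy-count growth counts configurable through viper instead of hard-coding 7/6/3/1; `SetDefault` preserves the old behavior when no configuration is supplied. A small sketch of the pattern, assuming the spf13/viper API that `util.GetViper()` presumably wraps:

```go
package main

import (
	"fmt"

	"github.com/spf13/viper"
)

// findVolumeCount mirrors the changed switch above: defaults are registered
// first, so an operator-provided key such as "master.volume_growth.copy_2"
// overrides the built-in count.
func findVolumeCount(v *viper.Viper, copyCount int) int {
	v.SetDefault("master.volume_growth.copy_1", 7)
	v.SetDefault("master.volume_growth.copy_2", 6)
	v.SetDefault("master.volume_growth.copy_3", 3)
	v.SetDefault("master.volume_growth.copy_other", 1)
	switch copyCount {
	case 1:
		return v.GetInt("master.volume_growth.copy_1")
	case 2:
		return v.GetInt("master.volume_growth.copy_2")
	case 3:
		return v.GetInt("master.volume_growth.copy_3")
	default:
		return v.GetInt("master.volume_growth.copy_other")
	}
}

func main() {
	v := viper.New()
	v.Set("master.volume_growth.copy_2", 4) // simulate an operator override
	fmt.Println(findVolumeCount(v, 2))      // 4
	fmt.Println(findVolumeCount(v, 1))      // 7 (default)
}
```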
"github.com/chrislusf/seaweedfs/weed/storage/needle" + "github.com/chrislusf/seaweedfs/weed/storage/super_block" ) var topologyLayout = ` @@ -113,7 +114,7 @@ func setup(topologyLayout string) *Topology { func TestFindEmptySlotsForOneVolume(t *testing.T) { topo := setup(topologyLayout) vg := NewDefaultVolumeGrowth() - rp, _ := storage.NewReplicaPlacementFromString("002") + rp, _ := super_block.NewReplicaPlacementFromString("002") volumeGrowOption := &VolumeGrowOption{ Collection: "", ReplicaPlacement: rp, @@ -130,3 +131,212 @@ func TestFindEmptySlotsForOneVolume(t *testing.T) { fmt.Println("assigned node :", server.Id()) } } + +var topologyLayout2 = ` +{ + "dc1":{ + "rack1":{ + "server111":{ + "volumes":[ + {"id":1, "size":12312}, + {"id":2, "size":12312}, + {"id":3, "size":12312} + ], + "limit":300 + }, + "server112":{ + "volumes":[ + {"id":4, "size":12312}, + {"id":5, "size":12312}, + {"id":6, "size":12312} + ], + "limit":300 + }, + "server113":{ + "volumes":[], + "limit":300 + }, + "server114":{ + "volumes":[], + "limit":300 + }, + "server115":{ + "volumes":[], + "limit":300 + }, + "server116":{ + "volumes":[], + "limit":300 + } + }, + "rack2":{ + "server121":{ + "volumes":[ + {"id":4, "size":12312}, + {"id":5, "size":12312}, + {"id":6, "size":12312} + ], + "limit":300 + }, + "server122":{ + "volumes":[], + "limit":300 + }, + "server123":{ + "volumes":[ + {"id":2, "size":12312}, + {"id":3, "size":12312}, + {"id":4, "size":12312} + ], + "limit":300 + }, + "server124":{ + "volumes":[], + "limit":300 + }, + "server125":{ + "volumes":[], + "limit":300 + }, + "server126":{ + "volumes":[], + "limit":300 + } + }, + "rack3":{ + "server131":{ + "volumes":[], + "limit":300 + }, + "server132":{ + "volumes":[], + "limit":300 + }, + "server133":{ + "volumes":[], + "limit":300 + }, + "server134":{ + "volumes":[], + "limit":300 + }, + "server135":{ + "volumes":[], + "limit":300 + }, + "server136":{ + "volumes":[], + "limit":300 + } + } + } +} +` + +func TestReplication011(t *testing.T) { + topo := setup(topologyLayout2) + vg := NewDefaultVolumeGrowth() + rp, _ := super_block.NewReplicaPlacementFromString("011") + volumeGrowOption := &VolumeGrowOption{ + Collection: "MAIL", + ReplicaPlacement: rp, + DataCenter: "dc1", + Rack: "", + DataNode: "", + } + servers, err := vg.findEmptySlotsForOneVolume(topo, volumeGrowOption) + if err != nil { + fmt.Println("finding empty slots error :", err) + t.Fail() + } + for _, server := range servers { + fmt.Println("assigned node :", server.Id()) + } +} + +var topologyLayout3 = ` +{ + "dc1":{ + "rack1":{ + "server111":{ + "volumes":[], + "limit":2000 + } + } + }, + "dc2":{ + "rack2":{ + "server222":{ + "volumes":[], + "limit":2000 + } + } + }, + "dc3":{ + "rack3":{ + "server333":{ + "volumes":[], + "limit":1000 + } + } + }, + "dc4":{ + "rack4":{ + "server444":{ + "volumes":[], + "limit":1000 + } + } + }, + "dc5":{ + "rack5":{ + "server555":{ + "volumes":[], + "limit":500 + } + } + }, + "dc6":{ + "rack6":{ + "server666":{ + "volumes":[], + "limit":500 + } + } + } +} +` + +func TestFindEmptySlotsForOneVolumeScheduleByWeight(t *testing.T) { + topo := setup(topologyLayout3) + vg := NewDefaultVolumeGrowth() + rp, _ := super_block.NewReplicaPlacementFromString("100") + volumeGrowOption := &VolumeGrowOption{ + Collection: "Weight", + ReplicaPlacement: rp, + DataCenter: "", + Rack: "", + DataNode: "", + } + + distribution := map[NodeId]int{} + // assign 1000 volumes + for i := 0; i < 1000; i++ { + servers, err := vg.findEmptySlotsForOneVolume(topo, volumeGrowOption) + if err 
!= nil { + fmt.Println("finding empty slots error :", err) + t.Fail() + } + for _, server := range servers { + // fmt.Println("assigned node :", server.Id()) + if _, ok := distribution[server.id]; !ok { + distribution[server.id] = 0 + } + distribution[server.id] += 1 + } + } + + for k, v := range distribution { + fmt.Printf("%s : %d\n", k, v) + } +} diff --git a/weed/topology/volume_layout.go b/weed/topology/volume_layout.go index 799cbca62..7633b28be 100644 --- a/weed/topology/volume_layout.go +++ b/weed/topology/volume_layout.go @@ -10,11 +10,12 @@ import ( "github.com/chrislusf/seaweedfs/weed/glog" "github.com/chrislusf/seaweedfs/weed/storage" "github.com/chrislusf/seaweedfs/weed/storage/needle" + "github.com/chrislusf/seaweedfs/weed/storage/super_block" ) // mapping from volume to its locations, inverted from server to volume type VolumeLayout struct { - rp *storage.ReplicaPlacement + rp *super_block.ReplicaPlacement ttl *needle.TTL vid2location map[needle.VolumeId]*VolumeLocationList writables []needle.VolumeId // transient array of writable volume id @@ -30,7 +31,7 @@ type VolumeLayoutStats struct { FileCount uint64 } -func NewVolumeLayout(rp *storage.ReplicaPlacement, ttl *needle.TTL, volumeSizeLimit uint64) *VolumeLayout { +func NewVolumeLayout(rp *super_block.ReplicaPlacement, ttl *needle.TTL, volumeSizeLimit uint64) *VolumeLayout { return &VolumeLayout{ rp: rp, ttl: ttl, |
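`TestFindEmptySlotsForOneVolumeScheduleByWeight` assigns 1000 single-copy volumes across servers with limits 2000/2000/1000/1000/500/500 and prints the resulting distribution. A back-of-the-envelope expectation (a hypothetical helper, not part of the test): if picks track free-slot weight, each server's share is roughly proportional to its limit:

```go
package main

import "fmt"

func main() {
	// Server limits from topologyLayout3 above.
	limits := map[string]int64{
		"server111": 2000, "server222": 2000,
		"server333": 1000, "server444": 1000,
		"server555": 500, "server666": 500,
	}
	var total int64
	for _, l := range limits {
		total += l
	}
	// Expected share of 1000 volumes is limit/total, so roughly
	// 286, 286, 143, 143, 71, 71 (free space shrinks slightly as
	// volumes are assigned, so real runs will drift a little).
	for name, l := range limits {
		fmt.Printf("%s: expect ~%d of 1000 volumes\n", name, 1000*l/total)
	}
}
```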
