Diffstat (limited to 'weed/topology')
-rw-r--r--  weed/topology/collection.go                   4
-rw-r--r--  weed/topology/data_center.go                  1
-rw-r--r--  weed/topology/data_node.go                   31
-rw-r--r--  weed/topology/node.go                        97
-rw-r--r--  weed/topology/rack.go                         1
-rw-r--r--  weed/topology/store_replicate.go            182
-rw-r--r--  weed/topology/topology.go                    15
-rw-r--r--  weed/topology/topology_event_handling.go      1
-rw-r--r--  weed/topology/topology_map.go                 1
-rw-r--r--  weed/topology/topology_test.go                5
-rw-r--r--  weed/topology/topology_vacuum.go             40
-rw-r--r--  weed/topology/volume_growth.go               24
-rw-r--r--  weed/topology/volume_growth_test.go         212
-rw-r--r--  weed/topology/volume_layout.go                5
14 files changed, 460 insertions, 159 deletions
diff --git a/weed/topology/collection.go b/weed/topology/collection.go
index f6b728ec9..7a611d904 100644
--- a/weed/topology/collection.go
+++ b/weed/topology/collection.go
@@ -3,8 +3,8 @@ package topology
import (
"fmt"
- "github.com/chrislusf/seaweedfs/weed/storage"
"github.com/chrislusf/seaweedfs/weed/storage/needle"
+ "github.com/chrislusf/seaweedfs/weed/storage/super_block"
"github.com/chrislusf/seaweedfs/weed/util"
)
@@ -24,7 +24,7 @@ func (c *Collection) String() string {
return fmt.Sprintf("Name:%s, volumeSizeLimit:%d, storageType2VolumeLayout:%v", c.Name, c.volumeSizeLimit, c.storageType2VolumeLayout)
}
-func (c *Collection) GetOrCreateVolumeLayout(rp *storage.ReplicaPlacement, ttl *needle.TTL) *VolumeLayout {
+func (c *Collection) GetOrCreateVolumeLayout(rp *super_block.ReplicaPlacement, ttl *needle.TTL) *VolumeLayout {
keyString := rp.String()
if ttl != nil {
keyString += ttl.String()
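
For context, the key that GetOrCreateVolumeLayout uses to cache layouts is just the replica placement string with the TTL string appended when a TTL is set, so volumes with different placements or TTLs land in separate layouts; the change above only moves ReplicaPlacement from the storage package to storage/super_block. A minimal sketch of that key construction, with illustrative values:

    package main

    import "fmt"

    // layoutKey mirrors the key building shown above in GetOrCreateVolumeLayout:
    // the replica placement string, plus the TTL string when a TTL is present.
    func layoutKey(rp string, ttl string) string {
        key := rp
        if ttl != "" {
            key += ttl
        }
        return key
    }

    func main() {
        // A "001" placement (one extra copy on the same rack) with a 3-day TTL
        // keys a different layout than the same placement without a TTL.
        fmt.Println(layoutKey("001", "3d")) // 0013d
        fmt.Println(layoutKey("001", ""))   // 001
    }
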
diff --git a/weed/topology/data_center.go b/weed/topology/data_center.go
index 640cb1937..dc3accb71 100644
--- a/weed/topology/data_center.go
+++ b/weed/topology/data_center.go
@@ -48,6 +48,7 @@ func (dc *DataCenter) ToDataCenterInfo() *master_pb.DataCenterInfo {
MaxVolumeCount: uint64(dc.GetMaxVolumeCount()),
FreeVolumeCount: uint64(dc.FreeSpace()),
ActiveVolumeCount: uint64(dc.GetActiveVolumeCount()),
+ RemoteVolumeCount: uint64(dc.GetRemoteVolumeCount()),
}
for _, c := range dc.Children() {
rack := c.(*Rack)
diff --git a/weed/topology/data_node.go b/weed/topology/data_node.go
index 3e72ccdbf..617341e54 100644
--- a/weed/topology/data_node.go
+++ b/weed/topology/data_node.go
@@ -2,14 +2,13 @@ package topology
import (
"fmt"
+ "strconv"
"sync"
"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
"github.com/chrislusf/seaweedfs/weed/storage/erasure_coding"
"github.com/chrislusf/seaweedfs/weed/storage/needle"
- "strconv"
-
"github.com/chrislusf/seaweedfs/weed/glog"
"github.com/chrislusf/seaweedfs/weed/storage"
)
@@ -44,15 +43,26 @@ func (dn *DataNode) String() string {
func (dn *DataNode) AddOrUpdateVolume(v storage.VolumeInfo) (isNew bool) {
dn.Lock()
defer dn.Unlock()
- if _, ok := dn.volumes[v.Id]; !ok {
+ if oldV, ok := dn.volumes[v.Id]; !ok {
dn.volumes[v.Id] = v
dn.UpAdjustVolumeCountDelta(1)
+ if v.IsRemote() {
+ dn.UpAdjustRemoteVolumeCountDelta(1)
+ }
if !v.ReadOnly {
dn.UpAdjustActiveVolumeCountDelta(1)
}
dn.UpAdjustMaxVolumeId(v.Id)
isNew = true
} else {
+ if oldV.IsRemote() != v.IsRemote() {
+ if v.IsRemote() {
+ dn.UpAdjustRemoteVolumeCountDelta(1)
+ }
+ if oldV.IsRemote() {
+ dn.UpAdjustRemoteVolumeCountDelta(-1)
+ }
+ }
dn.volumes[v.Id] = v
}
return
@@ -70,7 +80,12 @@ func (dn *DataNode) UpdateVolumes(actualVolumes []storage.VolumeInfo) (newVolume
delete(dn.volumes, vid)
deletedVolumes = append(deletedVolumes, v)
dn.UpAdjustVolumeCountDelta(-1)
- dn.UpAdjustActiveVolumeCountDelta(-1)
+ if v.IsRemote() {
+ dn.UpAdjustRemoteVolumeCountDelta(-1)
+ }
+ if !v.ReadOnly {
+ dn.UpAdjustActiveVolumeCountDelta(-1)
+ }
}
}
dn.Unlock()
@@ -88,7 +103,12 @@ func (dn *DataNode) DeltaUpdateVolumes(newlVolumes, deletedVolumes []storage.Vol
for _, v := range deletedVolumes {
delete(dn.volumes, v.Id)
dn.UpAdjustVolumeCountDelta(-1)
- dn.UpAdjustActiveVolumeCountDelta(-1)
+ if v.IsRemote() {
+ dn.UpAdjustRemoteVolumeCountDelta(-1)
+ }
+ if !v.ReadOnly {
+ dn.UpAdjustActiveVolumeCountDelta(-1)
+ }
}
dn.Unlock()
for _, v := range newlVolumes {
@@ -160,6 +180,7 @@ func (dn *DataNode) ToDataNodeInfo() *master_pb.DataNodeInfo {
MaxVolumeCount: uint64(dn.GetMaxVolumeCount()),
FreeVolumeCount: uint64(dn.FreeSpace()),
ActiveVolumeCount: uint64(dn.GetActiveVolumeCount()),
+ RemoteVolumeCount: uint64(dn.GetRemoteVolumeCount()),
}
for _, v := range dn.GetVolumes() {
m.VolumeInfos = append(m.VolumeInfos, v.ToVolumeInformationMessage())
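
The counter bookkeeping above is the subtle part of this change: a brand-new volume bumps the volume, remote, and active counters, while an update to a known volume only reconciles the remote counter when the volume moved between local and remote (tiered) storage. A standalone sketch of that transition logic, with a simplified struct standing in for storage.VolumeInfo:

    package main

    import "fmt"

    // volumeInfo is a stand-in for storage.VolumeInfo; only the fields the
    // counter logic cares about are modeled here.
    type volumeInfo struct {
        id       uint32
        remote   bool // true when the volume data lives on a remote tier
        readOnly bool
    }

    type counters struct {
        volumeCount       int64
        remoteVolumeCount int64
        activeVolumeCount int64
    }

    // addOrUpdate mirrors the delta adjustments in DataNode.AddOrUpdateVolume:
    // new volumes bump every relevant counter, while updates only reconcile the
    // remote counter when the local/remote state changed.
    func (c *counters) addOrUpdate(existing map[uint32]volumeInfo, v volumeInfo) {
        old, ok := existing[v.id]
        if !ok {
            c.volumeCount++
            if v.remote {
                c.remoteVolumeCount++
            }
            if !v.readOnly {
                c.activeVolumeCount++
            }
        } else if old.remote != v.remote {
            if v.remote {
                c.remoteVolumeCount++
            } else {
                c.remoteVolumeCount--
            }
        }
        existing[v.id] = v
    }

    func main() {
        vols := map[uint32]volumeInfo{}
        var c counters
        c.addOrUpdate(vols, volumeInfo{id: 1})               // new local volume
        c.addOrUpdate(vols, volumeInfo{id: 1, remote: true}) // moved to a remote tier
        fmt.Printf("%+v\n", c)                               // {volumeCount:1 remoteVolumeCount:1 activeVolumeCount:1}
    }
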
diff --git a/weed/topology/node.go b/weed/topology/node.go
index b2808f589..ceeb96d60 100644
--- a/weed/topology/node.go
+++ b/weed/topology/node.go
@@ -20,6 +20,7 @@ type Node interface {
ReserveOneVolume(r int64) (*DataNode, error)
UpAdjustMaxVolumeCountDelta(maxVolumeCountDelta int64)
UpAdjustVolumeCountDelta(volumeCountDelta int64)
+ UpAdjustRemoteVolumeCountDelta(remoteVolumeCountDelta int64)
UpAdjustEcShardCountDelta(ecShardCountDelta int64)
UpAdjustActiveVolumeCountDelta(activeVolumeCountDelta int64)
UpAdjustMaxVolumeId(vid needle.VolumeId)
@@ -27,6 +28,7 @@ type Node interface {
GetVolumeCount() int64
GetEcShardCount() int64
GetActiveVolumeCount() int64
+ GetRemoteVolumeCount() int64
GetMaxVolumeCount() int64
GetMaxVolumeId() needle.VolumeId
SetParent(Node)
@@ -44,6 +46,7 @@ type Node interface {
}
type NodeImpl struct {
volumeCount int64
+ remoteVolumeCount int64
activeVolumeCount int64
ecShardCount int64
maxVolumeCount int64
@@ -59,56 +62,64 @@ type NodeImpl struct {
}
// the first node must satisfy filterFirstNodeFn(), the rest nodes must have one free slot
-func (n *NodeImpl) RandomlyPickNodes(numberOfNodes int, filterFirstNodeFn func(dn Node) error) (firstNode Node, restNodes []Node, err error) {
- candidates := make([]Node, 0, len(n.children))
+func (n *NodeImpl) PickNodesByWeight(numberOfNodes int, filterFirstNodeFn func(dn Node) error) (firstNode Node, restNodes []Node, err error) {
+ var totalWeights int64
var errs []string
n.RLock()
+ candidates := make([]Node, 0, len(n.children))
+ candidatesWeights := make([]int64, 0, len(n.children))
+ //pick nodes which has enough free volumes as candidates, and use free volumes number as node weight.
for _, node := range n.children {
- if err := filterFirstNodeFn(node); err == nil {
- candidates = append(candidates, node)
- } else {
- errs = append(errs, string(node.Id())+":"+err.Error())
+ if node.FreeSpace() <= 0 {
+ continue
}
+ totalWeights += node.FreeSpace()
+ candidates = append(candidates, node)
+ candidatesWeights = append(candidatesWeights, node.FreeSpace())
}
n.RUnlock()
- if len(candidates) == 0 {
- return nil, nil, errors.New("No matching data node found! \n" + strings.Join(errs, "\n"))
+ if len(candidates) < numberOfNodes {
+ glog.V(2).Infoln(n.Id(), "failed to pick", numberOfNodes, "from ", len(candidates), "node candidates")
+ return nil, nil, errors.New("No enough data node found!")
}
- firstNode = candidates[rand.Intn(len(candidates))]
- glog.V(2).Infoln(n.Id(), "picked main node:", firstNode.Id())
- restNodes = make([]Node, numberOfNodes-1)
- candidates = candidates[:0]
- n.RLock()
- for _, node := range n.children {
- if node.Id() == firstNode.Id() {
- continue
- }
- if node.FreeSpace() <= 0 {
- continue
+ //pick nodes randomly by weights, the node picked earlier has higher final weights
+ sortedCandidates := make([]Node, 0, len(candidates))
+ for i := 0; i < len(candidates); i++ {
+ weightsInterval := rand.Int63n(totalWeights)
+ lastWeights := int64(0)
+ for k, weights := range candidatesWeights {
+ if (weightsInterval >= lastWeights) && (weightsInterval < lastWeights+weights) {
+ sortedCandidates = append(sortedCandidates, candidates[k])
+ candidatesWeights[k] = 0
+ totalWeights -= weights
+ break
+ }
+ lastWeights += weights
}
- glog.V(2).Infoln("select rest node candidate:", node.Id())
- candidates = append(candidates, node)
}
- n.RUnlock()
- glog.V(2).Infoln(n.Id(), "picking", numberOfNodes-1, "from rest", len(candidates), "node candidates")
- ret := len(restNodes) == 0
- for k, node := range candidates {
- if k < len(restNodes) {
- restNodes[k] = node
- if k == len(restNodes)-1 {
- ret = true
+
+ restNodes = make([]Node, 0, numberOfNodes-1)
+ ret := false
+ n.RLock()
+ for k, node := range sortedCandidates {
+ if err := filterFirstNodeFn(node); err == nil {
+ firstNode = node
+ if k >= numberOfNodes-1 {
+ restNodes = sortedCandidates[:numberOfNodes-1]
+ } else {
+ restNodes = append(restNodes, sortedCandidates[:k]...)
+ restNodes = append(restNodes, sortedCandidates[k+1:numberOfNodes]...)
}
+ ret = true
+ break
} else {
- r := rand.Intn(k + 1)
- if r < len(restNodes) {
- restNodes[r] = node
- }
+ errs = append(errs, string(node.Id())+":"+err.Error())
}
}
+ n.RUnlock()
if !ret {
- glog.V(2).Infoln(n.Id(), "failed to pick", numberOfNodes-1, "from rest", len(candidates), "node candidates")
- err = errors.New("No enough data node found!")
+ return nil, nil, errors.New("No matching data node found! \n" + strings.Join(errs, "\n"))
}
return
}
@@ -132,10 +143,11 @@ func (n *NodeImpl) Id() NodeId {
return n.id
}
func (n *NodeImpl) FreeSpace() int64 {
+ freeVolumeSlotCount := n.maxVolumeCount + n.remoteVolumeCount - n.volumeCount
if n.ecShardCount > 0 {
- return n.maxVolumeCount - n.volumeCount - n.ecShardCount/erasure_coding.DataShardsCount - 1
+ freeVolumeSlotCount = freeVolumeSlotCount - n.ecShardCount/erasure_coding.DataShardsCount - 1
}
- return n.maxVolumeCount - n.volumeCount
+ return freeVolumeSlotCount
}
func (n *NodeImpl) SetParent(node Node) {
n.parent = node
@@ -191,6 +203,12 @@ func (n *NodeImpl) UpAdjustVolumeCountDelta(volumeCountDelta int64) { //can be n
n.parent.UpAdjustVolumeCountDelta(volumeCountDelta)
}
}
+func (n *NodeImpl) UpAdjustRemoteVolumeCountDelta(remoteVolumeCountDelta int64) { //can be negative
+ atomic.AddInt64(&n.remoteVolumeCount, remoteVolumeCountDelta)
+ if n.parent != nil {
+ n.parent.UpAdjustRemoteVolumeCountDelta(remoteVolumeCountDelta)
+ }
+}
func (n *NodeImpl) UpAdjustEcShardCountDelta(ecShardCountDelta int64) { //can be negative
atomic.AddInt64(&n.ecShardCount, ecShardCountDelta)
if n.parent != nil {
@@ -220,6 +238,9 @@ func (n *NodeImpl) GetVolumeCount() int64 {
func (n *NodeImpl) GetEcShardCount() int64 {
return n.ecShardCount
}
+func (n *NodeImpl) GetRemoteVolumeCount() int64 {
+ return n.remoteVolumeCount
+}
func (n *NodeImpl) GetActiveVolumeCount() int64 {
return n.activeVolumeCount
}
@@ -235,6 +256,7 @@ func (n *NodeImpl) LinkChildNode(node Node) {
n.UpAdjustMaxVolumeCountDelta(node.GetMaxVolumeCount())
n.UpAdjustMaxVolumeId(node.GetMaxVolumeId())
n.UpAdjustVolumeCountDelta(node.GetVolumeCount())
+ n.UpAdjustRemoteVolumeCountDelta(node.GetRemoteVolumeCount())
n.UpAdjustEcShardCountDelta(node.GetEcShardCount())
n.UpAdjustActiveVolumeCountDelta(node.GetActiveVolumeCount())
node.SetParent(n)
@@ -250,6 +272,7 @@ func (n *NodeImpl) UnlinkChildNode(nodeId NodeId) {
node.SetParent(nil)
delete(n.children, node.Id())
n.UpAdjustVolumeCountDelta(-node.GetVolumeCount())
+ n.UpAdjustRemoteVolumeCountDelta(-node.GetRemoteVolumeCount())
n.UpAdjustEcShardCountDelta(-node.GetEcShardCount())
n.UpAdjustActiveVolumeCountDelta(-node.GetActiveVolumeCount())
n.UpAdjustMaxVolumeCountDelta(-node.GetMaxVolumeCount())
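
PickNodesByWeight replaces the old uniform random pick with a weighted one: every child with free space becomes a candidate weighted by its free volume slots, the candidates are ordered by repeated weighted draws without replacement, and the first ordered node that passes filterFirstNodeFn becomes the main node, with the following numberOfNodes-1 ordered nodes as the rest. (FreeSpace itself also changes: remoteVolumeCount is added back, since remote, tiered volumes no longer consume local slots.) A minimal sketch of the weighted draw-without-replacement step, using plain string IDs instead of the Node interface:

    package main

    import (
        "fmt"
        "math/rand"
    )

    // weightedShuffle returns the candidates reordered by repeated weighted
    // random draws: an item is picked with probability proportional to its
    // remaining weight, then its weight is zeroed and removed from the total,
    // mirroring the loop in NodeImpl.PickNodesByWeight. All weights must be
    // positive (the caller filters out nodes with no free space).
    func weightedShuffle(ids []string, weights []int64) []string {
        var total int64
        for _, w := range weights {
            total += w
        }
        ordered := make([]string, 0, len(ids))
        for i := 0; i < len(ids); i++ {
            target := rand.Int63n(total)
            last := int64(0)
            for k, w := range weights {
                if target >= last && target < last+w {
                    ordered = append(ordered, ids[k])
                    weights[k] = 0
                    total -= w
                    break
                }
                last += w
            }
        }
        return ordered
    }

    func main() {
        // Hypothetical data centers with free-slot counts as weights: across
        // repeated runs the 2000-slot nodes come first roughly twice as often
        // as the 1000-slot one.
        ids := []string{"dc1", "dc2", "dc3"}
        weights := []int64{2000, 2000, 1000}
        fmt.Println(weightedShuffle(ids, weights))
    }
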
diff --git a/weed/topology/rack.go b/weed/topology/rack.go
index 932c1a804..1921c0c05 100644
--- a/weed/topology/rack.go
+++ b/weed/topology/rack.go
@@ -67,6 +67,7 @@ func (r *Rack) ToRackInfo() *master_pb.RackInfo {
MaxVolumeCount: uint64(r.GetMaxVolumeCount()),
FreeVolumeCount: uint64(r.FreeSpace()),
ActiveVolumeCount: uint64(r.GetActiveVolumeCount()),
+ RemoteVolumeCount: uint64(r.GetRemoteVolumeCount()),
}
for _, c := range r.Children() {
dn := c.(*DataNode)
diff --git a/weed/topology/store_replicate.go b/weed/topology/store_replicate.go
index d21c4d210..8c4996d45 100644
--- a/weed/topology/store_replicate.go
+++ b/weed/topology/store_replicate.go
@@ -1,7 +1,6 @@
package topology
import (
- "bytes"
"encoding/json"
"errors"
"fmt"
@@ -25,58 +24,60 @@ func ReplicatedWrite(masterNode string, s *storage.Store,
//check JWT
jwt := security.GetJwt(r)
+ var remoteLocations []operation.Location
+ if r.FormValue("type") != "replicate" {
+ remoteLocations, err = getWritableRemoteReplications(s, volumeId, masterNode)
+ if err != nil {
+ glog.V(0).Infoln(err)
+ return
+ }
+ }
+
size, isUnchanged, err = s.WriteVolumeNeedle(volumeId, n)
if err != nil {
err = fmt.Errorf("failed to write to local disk: %v", err)
+ glog.V(0).Infoln(err)
return
}
- needToReplicate := !s.HasVolume(volumeId)
- needToReplicate = needToReplicate || s.GetVolume(volumeId).NeedToReplicate()
- if !needToReplicate {
- needToReplicate = s.GetVolume(volumeId).NeedToReplicate()
- }
- if needToReplicate { //send to other replica locations
- if r.FormValue("type") != "replicate" {
-
- if err = distributedOperation(masterNode, s, volumeId, func(location operation.Location) error {
- u := url.URL{
- Scheme: "http",
- Host: location.Url,
- Path: r.URL.Path,
- }
- q := url.Values{
- "type": {"replicate"},
- "ttl": {n.Ttl.String()},
- }
- if n.LastModified > 0 {
- q.Set("ts", strconv.FormatUint(n.LastModified, 10))
- }
- if n.IsChunkedManifest() {
- q.Set("cm", "true")
+ if len(remoteLocations) > 0 { //send to other replica locations
+ if err = distributedOperation(remoteLocations, s, func(location operation.Location) error {
+ u := url.URL{
+ Scheme: "http",
+ Host: location.Url,
+ Path: r.URL.Path,
+ }
+ q := url.Values{
+ "type": {"replicate"},
+ "ttl": {n.Ttl.String()},
+ }
+ if n.LastModified > 0 {
+ q.Set("ts", strconv.FormatUint(n.LastModified, 10))
+ }
+ if n.IsChunkedManifest() {
+ q.Set("cm", "true")
+ }
+ u.RawQuery = q.Encode()
+
+ pairMap := make(map[string]string)
+ if n.HasPairs() {
+ tmpMap := make(map[string]string)
+ err := json.Unmarshal(n.Pairs, &tmpMap)
+ if err != nil {
+ glog.V(0).Infoln("Unmarshal pairs error:", err)
}
- u.RawQuery = q.Encode()
-
- pairMap := make(map[string]string)
- if n.HasPairs() {
- tmpMap := make(map[string]string)
- err := json.Unmarshal(n.Pairs, &tmpMap)
- if err != nil {
- glog.V(0).Infoln("Unmarshal pairs error:", err)
- }
- for k, v := range tmpMap {
- pairMap[needle.PairNamePrefix+k] = v
- }
+ for k, v := range tmpMap {
+ pairMap[needle.PairNamePrefix+k] = v
}
-
- _, err := operation.Upload(u.String(),
- string(n.Name), bytes.NewReader(n.Data), n.IsGzipped(), string(n.Mime),
- pairMap, jwt)
- return err
- }); err != nil {
- size = 0
- err = fmt.Errorf("failed to write to replicas for volume %d: %v", volumeId, err)
}
+
+ // volume server do not know about encryption
+ _, err := operation.UploadData(u.String(), string(n.Name), false, n.Data, n.IsGzipped(), string(n.Mime), pairMap, jwt)
+ return err
+ }); err != nil {
+ size = 0
+ err = fmt.Errorf("failed to write to replicas for volume %d: %v", volumeId, err)
+ glog.V(0).Infoln(err)
}
}
return
@@ -84,31 +85,34 @@ func ReplicatedWrite(masterNode string, s *storage.Store,
func ReplicatedDelete(masterNode string, store *storage.Store,
volumeId needle.VolumeId, n *needle.Needle,
- r *http.Request) (uint32, error) {
+ r *http.Request) (size uint32, err error) {
//check JWT
jwt := security.GetJwt(r)
- ret, err := store.DeleteVolumeNeedle(volumeId, n)
+ var remoteLocations []operation.Location
+ if r.FormValue("type") != "replicate" {
+ remoteLocations, err = getWritableRemoteReplications(store, volumeId, masterNode)
+ if err != nil {
+ glog.V(0).Infoln(err)
+ return
+ }
+ }
+
+ size, err = store.DeleteVolumeNeedle(volumeId, n)
if err != nil {
glog.V(0).Infoln("delete error:", err)
- return ret, err
+ return
}
- needToReplicate := !store.HasVolume(volumeId)
- if !needToReplicate && ret > 0 {
- needToReplicate = store.GetVolume(volumeId).NeedToReplicate()
- }
- if needToReplicate { //send to other replica locations
- if r.FormValue("type") != "replicate" {
- if err = distributedOperation(masterNode, store, volumeId, func(location operation.Location) error {
- return util.Delete("http://"+location.Url+r.URL.Path+"?type=replicate", string(jwt))
- }); err != nil {
- ret = 0
- }
+ if len(remoteLocations) > 0 { //send to other replica locations
+ if err = distributedOperation(remoteLocations, store, func(location operation.Location) error {
+ return util.Delete("http://"+location.Url+r.URL.Path+"?type=replicate", string(jwt))
+ }); err != nil {
+ size = 0
}
}
- return ret, err
+ return
}
type DistributedOperationResult map[string]error
@@ -131,32 +135,44 @@ type RemoteResult struct {
Error error
}
-func distributedOperation(masterNode string, store *storage.Store, volumeId needle.VolumeId, op func(location operation.Location) error) error {
- if lookupResult, lookupErr := operation.Lookup(masterNode, volumeId.String()); lookupErr == nil {
- length := 0
- selfUrl := (store.Ip + ":" + strconv.Itoa(store.Port))
- results := make(chan RemoteResult)
- for _, location := range lookupResult.Locations {
- if location.Url != selfUrl {
- length++
- go func(location operation.Location, results chan RemoteResult) {
- results <- RemoteResult{location.Url, op(location)}
- }(location, results)
+func distributedOperation(locations []operation.Location, store *storage.Store, op func(location operation.Location) error) error {
+ length := len(locations)
+ results := make(chan RemoteResult)
+ for _, location := range locations {
+ go func(location operation.Location, results chan RemoteResult) {
+ results <- RemoteResult{location.Url, op(location)}
+ }(location, results)
+ }
+ ret := DistributedOperationResult(make(map[string]error))
+ for i := 0; i < length; i++ {
+ result := <-results
+ ret[result.Host] = result.Error
+ }
+
+ return ret.Error()
+}
+
+func getWritableRemoteReplications(s *storage.Store, volumeId needle.VolumeId, masterNode string) (
+ remoteLocations []operation.Location, err error) {
+ copyCount := s.GetVolume(volumeId).ReplicaPlacement.GetCopyCount()
+ if copyCount > 1 {
+ if lookupResult, lookupErr := operation.Lookup(masterNode, volumeId.String()); lookupErr == nil {
+ if len(lookupResult.Locations) < copyCount {
+ err = fmt.Errorf("replicating opetations [%d] is less than volume %d replication copy count [%d]",
+ len(lookupResult.Locations), volumeId, copyCount)
+ return
}
- }
- ret := DistributedOperationResult(make(map[string]error))
- for i := 0; i < length; i++ {
- result := <-results
- ret[result.Host] = result.Error
- }
- if volume := store.GetVolume(volumeId); volume != nil {
- if length+1 < volume.ReplicaPlacement.GetCopyCount() {
- return fmt.Errorf("replicating opetations [%d] is less than volume's replication copy count [%d]", length+1, volume.ReplicaPlacement.GetCopyCount())
+ selfUrl := s.Ip + ":" + strconv.Itoa(s.Port)
+ for _, location := range lookupResult.Locations {
+ if location.Url != selfUrl {
+ remoteLocations = append(remoteLocations, location)
+ }
}
+ } else {
+ err = fmt.Errorf("failed to lookup for %d: %v", volumeId, lookupErr)
+ return
}
- return ret.Error()
- } else {
- glog.V(0).Infoln()
- return fmt.Errorf("Failed to lookup for %d: %v", volumeId, lookupErr)
}
+
+ return
}
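
The replication path is now split in two: getWritableRemoteReplications resolves the volume's other locations before the local write (failing fast when the master returns fewer locations than the replica placement requires), and distributedOperation just fans the operation out to that known location list and collects one result per location. A minimal sketch of the fan-out/collect pattern, with a plain callback standing in for the per-location HTTP upload or delete:

    package main

    import (
        "errors"
        "fmt"
    )

    // fanOut runs op against every location concurrently and collects one
    // result per location, mirroring the reworked distributedOperation.
    func fanOut(locations []string, op func(location string) error) error {
        type result struct {
            location string
            err      error
        }
        results := make(chan result, len(locations))
        for _, loc := range locations {
            go func(loc string) {
                results <- result{loc, op(loc)}
            }(loc)
        }
        var failed []string
        for range locations {
            r := <-results
            if r.err != nil {
                failed = append(failed, r.location+": "+r.err.Error())
            }
        }
        if len(failed) > 0 {
            return errors.New("replication failed on: " + fmt.Sprint(failed))
        }
        return nil
    }

    func main() {
        // Hypothetical replica locations; a real op would POST the needle with
        // type=replicate, as ReplicatedWrite does above.
        locations := []string{"10.0.0.2:8080", "10.0.0.3:8080"}
        err := fanOut(locations, func(loc string) error { return nil })
        fmt.Println("replication error:", err)
    }
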
diff --git a/weed/topology/topology.go b/weed/topology/topology.go
index b7ebe8af5..fbf998707 100644
--- a/weed/topology/topology.go
+++ b/weed/topology/topology.go
@@ -7,11 +7,13 @@ import (
"sync"
"github.com/chrislusf/raft"
+
"github.com/chrislusf/seaweedfs/weed/glog"
"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
"github.com/chrislusf/seaweedfs/weed/sequence"
"github.com/chrislusf/seaweedfs/weed/storage"
"github.com/chrislusf/seaweedfs/weed/storage/needle"
+ "github.com/chrislusf/seaweedfs/weed/storage/super_block"
"github.com/chrislusf/seaweedfs/weed/util"
)
@@ -58,7 +60,12 @@ func NewTopology(id string, seq sequence.Sequencer, volumeSizeLimit uint64, puls
func (t *Topology) IsLeader() bool {
if t.RaftServer != nil {
- return t.RaftServer.State() == raft.Leader
+ if t.RaftServer.State() == raft.Leader {
+ return true
+ }
+ if t.RaftServer.Leader() == "" {
+ return true
+ }
}
return false
}
@@ -73,7 +80,7 @@ func (t *Topology) Leader() (string, error) {
if l == "" {
// We are a single node cluster, we are the leader
- return t.RaftServer.Name(), errors.New("Raft Server not initialized!")
+ return t.RaftServer.Name(), nil
}
return l, nil
@@ -129,7 +136,7 @@ func (t *Topology) PickForWrite(count uint64, option *VolumeGrowOption) (string,
return needle.NewFileId(*vid, fileId, rand.Uint32()).String(), count, datanodes.Head(), nil
}
-func (t *Topology) GetVolumeLayout(collectionName string, rp *storage.ReplicaPlacement, ttl *needle.TTL) *VolumeLayout {
+func (t *Topology) GetVolumeLayout(collectionName string, rp *super_block.ReplicaPlacement, ttl *needle.TTL) *VolumeLayout {
return t.collectionMap.Get(collectionName, func() interface{} {
return NewCollection(collectionName, t.volumeSizeLimit)
}).(*Collection).GetOrCreateVolumeLayout(rp, ttl)
@@ -150,7 +157,7 @@ func (t *Topology) ListCollections(includeNormalVolumes, includeEcVolumes bool)
t.ecShardMapLock.RUnlock()
}
- for k, _ := range mapOfCollections {
+ for k := range mapOfCollections {
ret = append(ret, k)
}
return ret
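
The IsLeader change makes a node consider itself leader not only when Raft reports the leader state, but also when no leader is known at all, which covers a single-node or still-bootstrapping cluster; Leader() likewise stops returning an error in that case. A condensed sketch of the new decision, assuming a minimal raft-like stand-in:

    package main

    import "fmt"

    // raftState is a minimal stand-in for the raft server methods used here.
    type raftState struct {
        state  string // e.g. "leader", "follower"
        leader string // empty when no leader is known yet
    }

    // isLeader mirrors the updated Topology.IsLeader: explicit leadership, or
    // no leader known at all (single-node / bootstrapping cluster).
    func isLeader(r *raftState) bool {
        if r == nil {
            return false
        }
        return r.state == "leader" || r.leader == ""
    }

    func main() {
        fmt.Println(isLeader(&raftState{state: "follower", leader: ""}))   // true: no leader elected yet
        fmt.Println(isLeader(&raftState{state: "follower", leader: "m2"})) // false
    }
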
diff --git a/weed/topology/topology_event_handling.go b/weed/topology/topology_event_handling.go
index 041351492..068bd401e 100644
--- a/weed/topology/topology_event_handling.go
+++ b/weed/topology/topology_event_handling.go
@@ -59,6 +59,7 @@ func (t *Topology) UnRegisterDataNode(dn *DataNode) {
vl.SetVolumeUnavailable(dn, v.Id)
}
dn.UpAdjustVolumeCountDelta(-dn.GetVolumeCount())
+ dn.UpAdjustRemoteVolumeCountDelta(-dn.GetRemoteVolumeCount())
dn.UpAdjustActiveVolumeCountDelta(-dn.GetActiveVolumeCount())
dn.UpAdjustMaxVolumeCountDelta(-dn.GetMaxVolumeCount())
if dn.Parent() != nil {
diff --git a/weed/topology/topology_map.go b/weed/topology/topology_map.go
index 0ad30f12e..73c55d77d 100644
--- a/weed/topology/topology_map.go
+++ b/weed/topology/topology_map.go
@@ -85,6 +85,7 @@ func (t *Topology) ToTopologyInfo() *master_pb.TopologyInfo {
MaxVolumeCount: uint64(t.GetMaxVolumeCount()),
FreeVolumeCount: uint64(t.FreeSpace()),
ActiveVolumeCount: uint64(t.GetActiveVolumeCount()),
+ RemoteVolumeCount: uint64(t.GetRemoteVolumeCount()),
}
for _, c := range t.Children() {
dc := c.(*DataCenter)
diff --git a/weed/topology/topology_test.go b/weed/topology/topology_test.go
index 8f79ad684..e7676ccf7 100644
--- a/weed/topology/topology_test.go
+++ b/weed/topology/topology_test.go
@@ -5,6 +5,7 @@ import (
"github.com/chrislusf/seaweedfs/weed/sequence"
"github.com/chrislusf/seaweedfs/weed/storage"
"github.com/chrislusf/seaweedfs/weed/storage/needle"
+ "github.com/chrislusf/seaweedfs/weed/storage/super_block"
"testing"
)
@@ -94,7 +95,7 @@ func TestHandlingVolumeServerHeartbeat(t *testing.T) {
[]*master_pb.VolumeShortInformationMessage{newVolumeShortMessage},
nil,
dn)
- rp, _ := storage.NewReplicaPlacementFromString("000")
+ rp, _ := super_block.NewReplicaPlacementFromString("000")
layout := topo.GetVolumeLayout("", rp, needle.EMPTY_TTL)
assert(t, "writables after repeated add", len(layout.writables), volumeCount)
@@ -154,7 +155,7 @@ func TestAddRemoveVolume(t *testing.T) {
DeletedByteCount: 45,
ReadOnly: false,
Version: needle.CurrentVersion,
- ReplicaPlacement: &storage.ReplicaPlacement{},
+ ReplicaPlacement: &super_block.ReplicaPlacement{},
Ttl: needle.EMPTY_TTL,
}
diff --git a/weed/topology/topology_vacuum.go b/weed/topology/topology_vacuum.go
index ff32f1874..ca626e973 100644
--- a/weed/topology/topology_vacuum.go
+++ b/weed/topology/topology_vacuum.go
@@ -13,8 +13,10 @@ import (
"github.com/chrislusf/seaweedfs/weed/pb/volume_server_pb"
)
-func batchVacuumVolumeCheck(grpcDialOption grpc.DialOption, vl *VolumeLayout, vid needle.VolumeId, locationlist *VolumeLocationList, garbageThreshold float64) bool {
- ch := make(chan bool, locationlist.Length())
+func batchVacuumVolumeCheck(grpcDialOption grpc.DialOption, vl *VolumeLayout, vid needle.VolumeId,
+ locationlist *VolumeLocationList, garbageThreshold float64) (*VolumeLocationList, bool) {
+ ch := make(chan int, locationlist.Length())
+ errCount := int32(0)
for index, dn := range locationlist.list {
go func(index int, url string, vid needle.VolumeId) {
err := operation.WithVolumeServerClient(url, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
@@ -22,11 +24,15 @@ func batchVacuumVolumeCheck(grpcDialOption grpc.DialOption, vl *VolumeLayout, vi
VolumeId: uint32(vid),
})
if err != nil {
- ch <- false
+ atomic.AddInt32(&errCount, 1)
+ ch <- -1
return err
}
- isNeeded := resp.GarbageRatio > garbageThreshold
- ch <- isNeeded
+ if resp.GarbageRatio >= garbageThreshold {
+ ch <- index
+ } else {
+ ch <- -1
+ }
return nil
})
if err != nil {
@@ -34,18 +40,21 @@ func batchVacuumVolumeCheck(grpcDialOption grpc.DialOption, vl *VolumeLayout, vi
}
}(index, dn.Url(), vid)
}
- isCheckSuccess := true
+ vacuumLocationList := NewVolumeLocationList()
for range locationlist.list {
select {
- case canVacuum := <-ch:
- isCheckSuccess = isCheckSuccess && canVacuum
+ case index := <-ch:
+ if index != -1 {
+ vacuumLocationList.list = append(vacuumLocationList.list, locationlist.list[index])
+ }
case <-time.After(30 * time.Minute):
- return false
+ return vacuumLocationList, false
}
}
- return isCheckSuccess
+ return vacuumLocationList, errCount == 0 && len(vacuumLocationList.list) > 0
}
-func batchVacuumVolumeCompact(grpcDialOption grpc.DialOption, vl *VolumeLayout, vid needle.VolumeId, locationlist *VolumeLocationList, preallocate int64) bool {
+func batchVacuumVolumeCompact(grpcDialOption grpc.DialOption, vl *VolumeLayout, vid needle.VolumeId,
+ locationlist *VolumeLocationList, preallocate int64) bool {
vl.accessLock.Lock()
vl.removeFromWritable(vid)
vl.accessLock.Unlock()
@@ -163,11 +172,12 @@ func vacuumOneVolumeLayout(grpcDialOption grpc.DialOption, volumeLayout *VolumeL
}
glog.V(2).Infof("check vacuum on collection:%s volume:%d", c.Name, vid)
- if batchVacuumVolumeCheck(grpcDialOption, volumeLayout, vid, locationList, garbageThreshold) {
- if batchVacuumVolumeCompact(grpcDialOption, volumeLayout, vid, locationList, preallocate) {
- batchVacuumVolumeCommit(grpcDialOption, volumeLayout, vid, locationList)
+ if vacuumLocationList, needVacuum := batchVacuumVolumeCheck(
+ grpcDialOption, volumeLayout, vid, locationList, garbageThreshold); needVacuum {
+ if batchVacuumVolumeCompact(grpcDialOption, volumeLayout, vid, vacuumLocationList, preallocate) {
+ batchVacuumVolumeCommit(grpcDialOption, volumeLayout, vid, vacuumLocationList)
} else {
- batchVacuumVolumeCleanup(grpcDialOption, volumeLayout, vid, locationList)
+ batchVacuumVolumeCleanup(grpcDialOption, volumeLayout, vid, vacuumLocationList)
}
}
}
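
batchVacuumVolumeCheck no longer collapses the replica checks into one boolean: each replica reports its index over the channel when its garbage ratio meets the threshold (or -1 on error or below threshold), and only the replicas that actually need it are compacted and committed. A simplified sketch of collecting that per-replica verdict, with a stubbed garbage-ratio lookup in place of the gRPC VacuumVolumeCheck call:

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    // pickVacuumCandidates returns the indexes of replicas whose garbage ratio
    // meets the threshold, plus whether the check as a whole succeeded (no
    // errors and at least one candidate), mirroring batchVacuumVolumeCheck.
    func pickVacuumCandidates(n int, threshold float64, fetch func(i int) (float64, error)) ([]int, bool) {
        ch := make(chan int, n)
        var errCount int32
        for i := 0; i < n; i++ {
            go func(i int) {
                ratio, err := fetch(i)
                if err != nil {
                    atomic.AddInt32(&errCount, 1)
                    ch <- -1
                    return
                }
                if ratio >= threshold {
                    ch <- i
                } else {
                    ch <- -1
                }
            }(i)
        }
        var candidates []int
        for i := 0; i < n; i++ {
            if idx := <-ch; idx != -1 {
                candidates = append(candidates, idx)
            }
        }
        return candidates, errCount == 0 && len(candidates) > 0
    }

    func main() {
        ratios := []float64{0.5, 0.1, 0.4} // hypothetical per-replica garbage ratios
        fetch := func(i int) (float64, error) { return ratios[i], nil }
        fmt.Println(pickVacuumCandidates(len(ratios), 0.3, fetch)) // [0 2] true (order may vary)
    }
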
diff --git a/weed/topology/volume_growth.go b/weed/topology/volume_growth.go
index 636eb2260..446c88f60 100644
--- a/weed/topology/volume_growth.go
+++ b/weed/topology/volume_growth.go
@@ -6,6 +6,9 @@ import (
"sync"
"github.com/chrislusf/seaweedfs/weed/storage/needle"
+ "github.com/chrislusf/seaweedfs/weed/storage/super_block"
+ "github.com/chrislusf/seaweedfs/weed/util"
+
"google.golang.org/grpc"
"github.com/chrislusf/seaweedfs/weed/glog"
@@ -22,7 +25,7 @@ This package is created to resolve these replica placement issues:
type VolumeGrowOption struct {
Collection string
- ReplicaPlacement *storage.ReplicaPlacement
+ ReplicaPlacement *super_block.ReplicaPlacement
Ttl *needle.TTL
Prealloacte int64
DataCenter string
@@ -46,15 +49,20 @@ func NewDefaultVolumeGrowth() *VolumeGrowth {
// one replication type may need rp.GetCopyCount() actual volumes
// given copyCount, how many logical volumes to create
func (vg *VolumeGrowth) findVolumeCount(copyCount int) (count int) {
+ v := util.GetViper()
+ v.SetDefault("master.volume_growth.copy_1", 7)
+ v.SetDefault("master.volume_growth.copy_2", 6)
+ v.SetDefault("master.volume_growth.copy_3", 3)
+ v.SetDefault("master.volume_growth.copy_other", 1)
switch copyCount {
case 1:
- count = 7
+ count = v.GetInt("master.volume_growth.copy_1")
case 2:
- count = 6
+ count = v.GetInt("master.volume_growth.copy_2")
case 3:
- count = 3
+ count = v.GetInt("master.volume_growth.copy_3")
default:
- count = 1
+ count = v.GetInt("master.volume_growth.copy_other")
}
return
}
@@ -104,7 +112,7 @@ func (vg *VolumeGrowth) findAndGrow(grpcDialOption grpc.DialOption, topo *Topolo
func (vg *VolumeGrowth) findEmptySlotsForOneVolume(topo *Topology, option *VolumeGrowOption) (servers []*DataNode, err error) {
//find main datacenter and other data centers
rp := option.ReplicaPlacement
- mainDataCenter, otherDataCenters, dc_err := topo.RandomlyPickNodes(rp.DiffDataCenterCount+1, func(node Node) error {
+ mainDataCenter, otherDataCenters, dc_err := topo.PickNodesByWeight(rp.DiffDataCenterCount+1, func(node Node) error {
if option.DataCenter != "" && node.IsDataCenter() && node.Id() != NodeId(option.DataCenter) {
return fmt.Errorf("Not matching preferred data center:%s", option.DataCenter)
}
@@ -136,7 +144,7 @@ func (vg *VolumeGrowth) findEmptySlotsForOneVolume(topo *Topology, option *Volum
}
//find main rack and other racks
- mainRack, otherRacks, rackErr := mainDataCenter.(*DataCenter).RandomlyPickNodes(rp.DiffRackCount+1, func(node Node) error {
+ mainRack, otherRacks, rackErr := mainDataCenter.(*DataCenter).PickNodesByWeight(rp.DiffRackCount+1, func(node Node) error {
if option.Rack != "" && node.IsRack() && node.Id() != NodeId(option.Rack) {
return fmt.Errorf("Not matching preferred rack:%s", option.Rack)
}
@@ -163,7 +171,7 @@ func (vg *VolumeGrowth) findEmptySlotsForOneVolume(topo *Topology, option *Volum
}
//find main rack and other racks
- mainServer, otherServers, serverErr := mainRack.(*Rack).RandomlyPickNodes(rp.SameRackCount+1, func(node Node) error {
+ mainServer, otherServers, serverErr := mainRack.(*Rack).PickNodesByWeight(rp.SameRackCount+1, func(node Node) error {
if option.DataNode != "" && node.IsDataNode() && node.Id() != NodeId(option.DataNode) {
return fmt.Errorf("Not matching preferred data node:%s", option.DataNode)
}
diff --git a/weed/topology/volume_growth_test.go b/weed/topology/volume_growth_test.go
index 3573365fd..6ff5be0eb 100644
--- a/weed/topology/volume_growth_test.go
+++ b/weed/topology/volume_growth_test.go
@@ -8,6 +8,7 @@ import (
"github.com/chrislusf/seaweedfs/weed/sequence"
"github.com/chrislusf/seaweedfs/weed/storage"
"github.com/chrislusf/seaweedfs/weed/storage/needle"
+ "github.com/chrislusf/seaweedfs/weed/storage/super_block"
)
var topologyLayout = `
@@ -113,7 +114,7 @@ func setup(topologyLayout string) *Topology {
func TestFindEmptySlotsForOneVolume(t *testing.T) {
topo := setup(topologyLayout)
vg := NewDefaultVolumeGrowth()
- rp, _ := storage.NewReplicaPlacementFromString("002")
+ rp, _ := super_block.NewReplicaPlacementFromString("002")
volumeGrowOption := &VolumeGrowOption{
Collection: "",
ReplicaPlacement: rp,
@@ -130,3 +131,212 @@ func TestFindEmptySlotsForOneVolume(t *testing.T) {
fmt.Println("assigned node :", server.Id())
}
}
+
+var topologyLayout2 = `
+{
+ "dc1":{
+ "rack1":{
+ "server111":{
+ "volumes":[
+ {"id":1, "size":12312},
+ {"id":2, "size":12312},
+ {"id":3, "size":12312}
+ ],
+ "limit":300
+ },
+ "server112":{
+ "volumes":[
+ {"id":4, "size":12312},
+ {"id":5, "size":12312},
+ {"id":6, "size":12312}
+ ],
+ "limit":300
+ },
+ "server113":{
+ "volumes":[],
+ "limit":300
+ },
+ "server114":{
+ "volumes":[],
+ "limit":300
+ },
+ "server115":{
+ "volumes":[],
+ "limit":300
+ },
+ "server116":{
+ "volumes":[],
+ "limit":300
+ }
+ },
+ "rack2":{
+ "server121":{
+ "volumes":[
+ {"id":4, "size":12312},
+ {"id":5, "size":12312},
+ {"id":6, "size":12312}
+ ],
+ "limit":300
+ },
+ "server122":{
+ "volumes":[],
+ "limit":300
+ },
+ "server123":{
+ "volumes":[
+ {"id":2, "size":12312},
+ {"id":3, "size":12312},
+ {"id":4, "size":12312}
+ ],
+ "limit":300
+ },
+ "server124":{
+ "volumes":[],
+ "limit":300
+ },
+ "server125":{
+ "volumes":[],
+ "limit":300
+ },
+ "server126":{
+ "volumes":[],
+ "limit":300
+ }
+ },
+ "rack3":{
+ "server131":{
+ "volumes":[],
+ "limit":300
+ },
+ "server132":{
+ "volumes":[],
+ "limit":300
+ },
+ "server133":{
+ "volumes":[],
+ "limit":300
+ },
+ "server134":{
+ "volumes":[],
+ "limit":300
+ },
+ "server135":{
+ "volumes":[],
+ "limit":300
+ },
+ "server136":{
+ "volumes":[],
+ "limit":300
+ }
+ }
+ }
+}
+`
+
+func TestReplication011(t *testing.T) {
+ topo := setup(topologyLayout2)
+ vg := NewDefaultVolumeGrowth()
+ rp, _ := super_block.NewReplicaPlacementFromString("011")
+ volumeGrowOption := &VolumeGrowOption{
+ Collection: "MAIL",
+ ReplicaPlacement: rp,
+ DataCenter: "dc1",
+ Rack: "",
+ DataNode: "",
+ }
+ servers, err := vg.findEmptySlotsForOneVolume(topo, volumeGrowOption)
+ if err != nil {
+ fmt.Println("finding empty slots error :", err)
+ t.Fail()
+ }
+ for _, server := range servers {
+ fmt.Println("assigned node :", server.Id())
+ }
+}
+
+var topologyLayout3 = `
+{
+ "dc1":{
+ "rack1":{
+ "server111":{
+ "volumes":[],
+ "limit":2000
+ }
+ }
+ },
+ "dc2":{
+ "rack2":{
+ "server222":{
+ "volumes":[],
+ "limit":2000
+ }
+ }
+ },
+ "dc3":{
+ "rack3":{
+ "server333":{
+ "volumes":[],
+ "limit":1000
+ }
+ }
+ },
+ "dc4":{
+ "rack4":{
+ "server444":{
+ "volumes":[],
+ "limit":1000
+ }
+ }
+ },
+ "dc5":{
+ "rack5":{
+ "server555":{
+ "volumes":[],
+ "limit":500
+ }
+ }
+ },
+ "dc6":{
+ "rack6":{
+ "server666":{
+ "volumes":[],
+ "limit":500
+ }
+ }
+ }
+}
+`
+
+func TestFindEmptySlotsForOneVolumeScheduleByWeight(t *testing.T) {
+ topo := setup(topologyLayout3)
+ vg := NewDefaultVolumeGrowth()
+ rp, _ := super_block.NewReplicaPlacementFromString("100")
+ volumeGrowOption := &VolumeGrowOption{
+ Collection: "Weight",
+ ReplicaPlacement: rp,
+ DataCenter: "",
+ Rack: "",
+ DataNode: "",
+ }
+
+ distribution := map[NodeId]int{}
+ // assign 1000 volumes
+ for i := 0; i < 1000; i++ {
+ servers, err := vg.findEmptySlotsForOneVolume(topo, volumeGrowOption)
+ if err != nil {
+ fmt.Println("finding empty slots error :", err)
+ t.Fail()
+ }
+ for _, server := range servers {
+ // fmt.Println("assigned node :", server.Id())
+ if _, ok := distribution[server.id]; !ok {
+ distribution[server.id] = 0
+ }
+ distribution[server.id] += 1
+ }
+ }
+
+ for k, v := range distribution {
+ fmt.Printf("%s : %d\n", k, v)
+ }
+}
diff --git a/weed/topology/volume_layout.go b/weed/topology/volume_layout.go
index 799cbca62..7633b28be 100644
--- a/weed/topology/volume_layout.go
+++ b/weed/topology/volume_layout.go
@@ -10,11 +10,12 @@ import (
"github.com/chrislusf/seaweedfs/weed/glog"
"github.com/chrislusf/seaweedfs/weed/storage"
"github.com/chrislusf/seaweedfs/weed/storage/needle"
+ "github.com/chrislusf/seaweedfs/weed/storage/super_block"
)
// mapping from volume to its locations, inverted from server to volume
type VolumeLayout struct {
- rp *storage.ReplicaPlacement
+ rp *super_block.ReplicaPlacement
ttl *needle.TTL
vid2location map[needle.VolumeId]*VolumeLocationList
writables []needle.VolumeId // transient array of writable volume id
@@ -30,7 +31,7 @@ type VolumeLayoutStats struct {
FileCount uint64
}
-func NewVolumeLayout(rp *storage.ReplicaPlacement, ttl *needle.TTL, volumeSizeLimit uint64) *VolumeLayout {
+func NewVolumeLayout(rp *super_block.ReplicaPlacement, ttl *needle.TTL, volumeSizeLimit uint64) *VolumeLayout {
return &VolumeLayout{
rp: rp,
ttl: ttl,