-rw-r--r--  weed/storage/erasure_coding/1.dat            bin  0 -> 2590912 bytes
-rw-r--r--  weed/storage/erasure_coding/1.idx            bin  0 -> 4768 bytes
-rw-r--r--  weed/storage/erasure_coding/ec_encoder.go     97
-rw-r--r--  weed/storage/erasure_coding/ec_locate.go      72
-rw-r--r--  weed/storage/erasure_coding/ec_test.go       318
-rw-r--r--  weed/storage/needle_map.go                     6
-rw-r--r--  weed/storage/needle_map/btree_map.go           2
-rw-r--r--  weed/storage/needle_map/compact_map.go        49
-rw-r--r--  weed/storage/needle_map/compact_map_test.go    2
-rw-r--r--  weed/storage/needle_map/needle_value.go       13
-rw-r--r--  weed/storage/needle_map/needle_value_map.go    2
11 files changed, 536 insertions, 25 deletions
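
Before the diff itself, a note on the scheme: the encoder stripes a volume's .dat file across DataShardsCount (10) data shards plus ParityShardsCount (4) parity shards, so any 10 of the 14 .ec01–.ec14 files can reproduce the original bytes. A minimal, self-contained sketch of that property using the same github.com/klauspost/reedsolomon library (the shard size and the lost-shard indices here are illustrative only, not the volume layout below):

    package main

    import (
        "bytes"
        "log"

        "github.com/klauspost/reedsolomon"
    )

    func main() {
        // Same 10+4 split as DataShardsCount/ParityShardsCount in the diff below.
        enc, err := reedsolomon.New(10, 4)
        if err != nil {
            log.Fatal(err)
        }

        // 14 equally sized shards: the first 10 hold data, the last 4 receive parity.
        shards := make([][]byte, 14)
        for i := range shards {
            shards[i] = make([]byte, 1024)
        }
        copy(shards[0], "some volume bytes")

        if err := enc.Encode(shards); err != nil {
            log.Fatal(err)
        }

        // Drop any 4 shards; Reconstruct rebuilds them from the remaining 10.
        saved := append([]byte(nil), shards[0]...)
        shards[0], shards[3], shards[7], shards[12] = nil, nil, nil, nil
        if err := enc.Reconstruct(shards); err != nil {
            log.Fatal(err)
        }
        if !bytes.Equal(shards[0], saved) {
            log.Fatal("reconstruction mismatch")
        }
    }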
diff --git a/weed/storage/erasure_coding/1.dat b/weed/storage/erasure_coding/1.dat
new file mode 100644
index 000000000..869427926
Binary files /dev/null and b/weed/storage/erasure_coding/1.dat differ
diff --git a/weed/storage/erasure_coding/1.idx b/weed/storage/erasure_coding/1.idx
new file mode 100644
index 000000000..65a950e64
Binary files /dev/null and b/weed/storage/erasure_coding/1.idx differ
diff --git a/weed/storage/erasure_coding/ec_encoder.go b/weed/storage/erasure_coding/ec_encoder.go
new file mode 100644
index 000000000..4b5205dee
--- /dev/null
+++ b/weed/storage/erasure_coding/ec_encoder.go
@@ -0,0 +1,97 @@
+package erasure_coding
+
+import (
+    "fmt"
+    "io"
+    "os"
+
+    "github.com/chrislusf/seaweedfs/weed/glog"
+    "github.com/klauspost/reedsolomon"
+)
+
+const (
+    DataShardsCount             = 10
+    ParityShardsCount           = 4
+    ErasureCodingLargeBlockSize = 1024 * 1024 * 1024 // 1GB
+    ErasureCodingSmallBlockSize = 1024 * 1024        // 1MB
+)
+
+// encodeData ec-encodes one block row of the .dat file in bufferSize batches;
+// blockSize must be a multiple of the buffer size.
+func encodeData(file *os.File, enc reedsolomon.Encoder, startOffset, blockSize int64, buffers [][]byte, outputs []*os.File) error {
+
+    bufferSize := int64(len(buffers[0]))
+    batchCount := blockSize / bufferSize
+    if blockSize%bufferSize != 0 {
+        glog.Fatalf("unexpected block size %d buffer size %d", blockSize, bufferSize)
+    }
+
+    for b := int64(0); b < batchCount; b++ {
+        err := encodeDataOneBatch(file, enc, startOffset+b*bufferSize, blockSize, buffers, outputs)
+        if err != nil {
+            return err
+        }
+    }
+
+    return nil
+}
+
+// openEcFiles creates (or, for reads, opens) the shard files baseFileName.ec01 .. .ec14.
+func openEcFiles(baseFileName string, forRead bool) (files []*os.File, err error) {
+    for i := 0; i < DataShardsCount+ParityShardsCount; i++ {
+        fname := fmt.Sprintf("%s.ec%02d", baseFileName, i+1)
+        openOption := os.O_TRUNC | os.O_CREATE | os.O_WRONLY
+        if forRead {
+            openOption = os.O_RDONLY
+        }
+        f, err := os.OpenFile(fname, openOption, 0644)
+        if err != nil {
+            return files, fmt.Errorf("failed to open file %s: %v", fname, err)
+        }
+        files = append(files, f)
+    }
+    return
+}
+
+func closeEcFiles(files []*os.File) {
+    for _, f := range files {
+        if f != nil {
+            f.Close()
+        }
+    }
+}
+
+// encodeDataOneBatch reads one buffer-sized slice for each of the 10 data shards,
+// zero-padding a short tail, computes the 4 parity buffers, and appends all 14
+// buffers to their shard files.
+func encodeDataOneBatch(file *os.File, enc reedsolomon.Encoder, startOffset, blockSize int64, buffers [][]byte, outputs []*os.File) error {
+
+    // read data into buffers
+    for i := 0; i < DataShardsCount; i++ {
+        n, err := file.ReadAt(buffers[i], startOffset+blockSize*int64(i))
+        if err != nil {
+            if err != io.EOF {
+                return err
+            }
+        }
+        if n < len(buffers[i]) {
+            for t := len(buffers[i]) - 1; t >= n; t-- {
+                buffers[i][t] = 0
+            }
+        }
+    }
+
+    err := enc.Encode(buffers)
+    if err != nil {
+        return err
+    }
+
+    for i := 0; i < DataShardsCount+ParityShardsCount; i++ {
+        _, err := outputs[i].Write(buffers[i])
+        if err != nil {
+            return err
+        }
+    }
+
+    return nil
+}
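
How the pieces above fit together: a hypothetical driver (encodeVolume is my name, not part of the commit) that shards an existing .dat file using small blocks only; it assumes it sits in the same erasure_coding package, with os and reedsolomon imported. The commit's real flow, including the large-block pass, is encodeDatFile in ec_test.go below.

    // encodeVolume shards baseFileName+".dat" into baseFileName+".ec01" .. ".ec14".
    // Sketch only: small blocks throughout, one batch per block row.
    func encodeVolume(baseFileName string) error {
        file, err := os.OpenFile(baseFileName+".dat", os.O_RDONLY, 0)
        if err != nil {
            return err
        }
        defer file.Close()

        fi, err := file.Stat()
        if err != nil {
            return err
        }

        enc, err := reedsolomon.New(DataShardsCount, ParityShardsCount)
        if err != nil {
            return err
        }

        outputs, err := openEcFiles(baseFileName, false)
        if err != nil {
            return err
        }
        defer closeEcFiles(outputs)

        // One buffer per shard; buffer size equals the block size, so each
        // call to encodeData processes exactly one batch.
        buffers := make([][]byte, DataShardsCount+ParityShardsCount)
        for i := range buffers {
            buffers[i] = make([]byte, ErasureCodingSmallBlockSize)
        }

        // Each row consumes DataShardsCount*ErasureCodingSmallBlockSize bytes of
        // the .dat file; the final partial row is zero-padded by encodeDataOneBatch.
        for processed := int64(0); processed < fi.Size(); processed += ErasureCodingSmallBlockSize * DataShardsCount {
            if err := encodeData(file, enc, processed, ErasureCodingSmallBlockSize, buffers, outputs); err != nil {
                return err
            }
        }
        return nil
    }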
diff --git a/weed/storage/erasure_coding/ec_locate.go b/weed/storage/erasure_coding/ec_locate.go
new file mode 100644
index 000000000..b570f750c
--- /dev/null
+++ b/weed/storage/erasure_coding/ec_locate.go
@@ -0,0 +1,72 @@
+package erasure_coding
+
+// Interval maps one contiguous run of logical .dat bytes onto a single shard block.
+type Interval struct {
+    blockIndex       int
+    innerBlockOffset int64
+    size             uint32
+    isLargeBlock     bool
+}
+
+// locateData translates an (offset, size) range of the original volume into the
+// sequence of shard-block intervals that store it.
+func locateData(largeBlockLength, smallBlockLength int64, datSize int64, offset int64, size uint32) (intervals []Interval) {
+    blockIndex, isLargeBlock, innerBlockOffset := locateOffset(largeBlockLength, smallBlockLength, datSize, offset)
+
+    nLargeBlockRows := int(datSize / (largeBlockLength * DataShardsCount))
+
+    for size > 0 {
+        interval := Interval{
+            blockIndex:       blockIndex,
+            innerBlockOffset: innerBlockOffset,
+            isLargeBlock:     isLargeBlock,
+        }
+
+        blockRemaining := largeBlockLength - innerBlockOffset
+        if !isLargeBlock {
+            blockRemaining = smallBlockLength - innerBlockOffset
+        }
+
+        if int64(size) <= blockRemaining {
+            interval.size = size
+            intervals = append(intervals, interval)
+            return
+        }
+        interval.size = uint32(blockRemaining)
+        intervals = append(intervals, interval)
+
+        size -= interval.size
+        blockIndex++
+        if isLargeBlock && blockIndex == nLargeBlockRows*DataShardsCount {
+            // crossed from the large-block area into the small-block area
+            isLargeBlock = false
+            blockIndex = 0
+        }
+        innerBlockOffset = 0
+    }
+    return
+}
+
+// locateOffset finds the block (and the offset within it) that holds the given .dat offset.
+func locateOffset(largeBlockLength, smallBlockLength int64, datSize int64, offset int64) (blockIndex int, isLargeBlock bool, innerBlockOffset int64) {
+    largeRowSize := largeBlockLength * DataShardsCount
+    nLargeBlockRows := datSize / (largeBlockLength * DataShardsCount)
+
+    // if offset is within the large block area
+    if offset < nLargeBlockRows*largeRowSize {
+        isLargeBlock = true
+        blockIndex, innerBlockOffset = locateOffsetWithinBlocks(largeBlockLength, offset)
+        return
+    }
+
+    isLargeBlock = false
+    offset -= nLargeBlockRows * largeRowSize
+    blockIndex, innerBlockOffset = locateOffsetWithinBlocks(smallBlockLength, offset)
+    return
+}
+
+func locateOffsetWithinBlocks(blockLength int64, offset int64) (blockIndex int, innerBlockOffset int64) {
+    blockIndex = int(offset / blockLength)
+    innerBlockOffset = offset % blockLength
+    return
+}
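
A worked example of the arithmetic, using the block lengths the tests below use (largeBlockLength=10000, smallBlockLength=100); this mirrors the first assertion in TestLocateData:

    // datSize = 10*10000 + 1, so nLargeBlockRows = 100001 / (10000*10) = 1:
    // offsets 0..99999 fall in the large-block area (blockIndex = offset/10000),
    // and offset 100000 is the first byte of the small-block area.
    intervals := locateData(10000, 100, 100001, 100000, 1)
    // => []Interval{{blockIndex: 0, innerBlockOffset: 0, size: 1, isLargeBlock: false}}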
diff --git a/weed/storage/erasure_coding/ec_test.go b/weed/storage/erasure_coding/ec_test.go
new file mode 100644
index 000000000..d631471e9
--- /dev/null
+++ b/weed/storage/erasure_coding/ec_test.go
@@ -0,0 +1,318 @@
+package erasure_coding
+
+import (
+    "bytes"
+    "fmt"
+    "math/rand"
+    "os"
+    "testing"
+
+    "github.com/chrislusf/seaweedfs/weed/storage"
+    "github.com/chrislusf/seaweedfs/weed/storage/needle_map"
+    "github.com/chrislusf/seaweedfs/weed/storage/types"
+    "github.com/klauspost/reedsolomon"
+)
+
+const (
+    largeBlockSize = 10000
+    smallBlockSize = 100
+)
+
+func TestEncodingDecoding(t *testing.T) {
+    bufferSize := 50
+    baseFileName := "1"
+
+    err := generateEcFiles(baseFileName, bufferSize, largeBlockSize, smallBlockSize)
+    if err != nil {
+        t.Fatalf("generateEcFiles: %v", err)
+    }
+
+    err = writeSortedEcxFiles(baseFileName)
+    if err != nil {
+        t.Fatalf("writeSortedEcxFiles: %v", err)
+    }
+
+    err = validateFiles(baseFileName)
+    if err != nil {
+        t.Fatalf("validateFiles: %v", err)
+    }
+
+    removeGeneratedFiles(baseFileName)
+
+}
+
+func generateEcFiles(baseFileName string, bufferSize int, largeBlockSize int64, smallBlockSize int64) error {
+    file, err := os.OpenFile(baseFileName+".dat", os.O_RDONLY, 0)
+    if err != nil {
+        return fmt.Errorf("failed to open dat file: %v", err)
+    }
+    defer file.Close()
+
+    fi, err := file.Stat()
+    if err != nil {
+        return fmt.Errorf("failed to stat dat file: %v", err)
+    }
+    err = encodeDatFile(fi.Size(), baseFileName, bufferSize, largeBlockSize, file, smallBlockSize)
+    if err != nil {
+        return fmt.Errorf("encodeDatFile: %v", err)
+    }
+    return nil
+}
+
+func encodeDatFile(remainingSize int64, baseFileName string, bufferSize int, largeBlockSize int64, file *os.File, smallBlockSize int64) error {
+    var processedSize int64
+    enc, err := reedsolomon.New(DataShardsCount, ParityShardsCount)
+    if err != nil {
+        return fmt.Errorf("failed to create encoder: %v", err)
+    }
+    buffers := make([][]byte, DataShardsCount+ParityShardsCount)
+    outputs, err := openEcFiles(baseFileName, false)
+    if err != nil {
+        return fmt.Errorf("failed to open ec files: %v", err)
+    }
+    defer closeEcFiles(outputs)
+    for i := range buffers {
+        buffers[i] = make([]byte, bufferSize)
+    }
+    for remainingSize > largeBlockSize*DataShardsCount {
+        err = encodeData(file, enc, processedSize, largeBlockSize, buffers, outputs)
+        if err != nil {
+            return fmt.Errorf("failed to encode large chunk data: %v", err)
+        }
+        remainingSize -= largeBlockSize * DataShardsCount
+        processedSize += largeBlockSize * DataShardsCount
+    }
+    for remainingSize > 0 {
+        err = encodeData(file, enc, processedSize, smallBlockSize, buffers, outputs)
+        if err != nil {
+            return fmt.Errorf("failed to encode small chunk data: %v", err)
+        }
+        remainingSize -= smallBlockSize * DataShardsCount
+        processedSize += smallBlockSize * DataShardsCount
+    }
+    return nil
+}
+
+func writeSortedEcxFiles(baseFileName string) error {
+
+    cm, err := readCompactMap(baseFileName)
+    if err != nil {
+        return fmt.Errorf("readCompactMap: %v", err)
+    }
+
+    ecxFile, err := os.OpenFile(baseFileName+".ecx", os.O_TRUNC|os.O_CREATE|os.O_WRONLY, 0644)
+    if err != nil {
+        return fmt.Errorf("failed to open ecx file: %v", err)
+    }
+    defer ecxFile.Close()
+
+    err = cm.AscendingVisit(func(value needle_map.NeedleValue) error {
+        bytes := value.ToBytes()
+        _, writeErr := ecxFile.Write(bytes)
+        return writeErr
+    })
+
+    if err != nil {
+        return fmt.Errorf("failed to write ecx file: %v", err)
+    }
+
+    return nil
+}
+
+func validateFiles(baseFileName string) error {
+    cm, err := readCompactMap(baseFileName)
+    if err != nil {
+        return fmt.Errorf("readCompactMap: %v", err)
+    }
+
+    datFile, err := os.OpenFile(baseFileName+".dat", os.O_RDONLY, 0)
+    if err != nil {
+        return fmt.Errorf("failed to open dat file: %v", err)
+    }
+    defer datFile.Close()
+
+    fi, err := datFile.Stat()
+    if err != nil {
+        return fmt.Errorf("failed to stat dat file: %v", err)
+    }
+
+    ecFiles, err := openEcFiles(baseFileName, true)
+    if err != nil {
+        return fmt.Errorf("failed to open ec files: %v", err)
+    }
+    defer closeEcFiles(ecFiles)
+
+    err = cm.AscendingVisit(func(value needle_map.NeedleValue) error {
+        return assertSame(datFile, fi.Size(), ecFiles, value.Offset, value.Size)
+    })
+    if err != nil {
+        return fmt.Errorf("failed to check ec files: %v", err)
+    }
+    return nil
+}
+
+func readCompactMap(baseFileName string) (*needle_map.CompactMap, error) {
+    indexFile, err := os.OpenFile(baseFileName+".idx", os.O_RDONLY, 0644)
+    if err != nil {
+        return nil, fmt.Errorf("cannot read Volume Index %s.idx: %v", baseFileName, err)
+    }
+    defer indexFile.Close()
+
+    cm := needle_map.NewCompactMap()
+    err = storage.WalkIndexFile(indexFile, func(key types.NeedleId, offset types.Offset, size uint32) error {
+        if !offset.IsZero() && size != types.TombstoneFileSize {
+            cm.Set(key, offset, size)
+        } else {
+            cm.Delete(key)
+        }
+        return nil
+    })
+    return cm, err
+}
+
+func assertSame(datFile *os.File, datSize int64, ecFiles []*os.File, offset types.Offset, size uint32) error {
+
+    data, err := readDatFile(datFile, offset, size)
+    if err != nil {
+        return fmt.Errorf("failed to read dat file: %v", err)
+    }
+
+    ecData, err := readEcFile(datSize, ecFiles, offset, size)
+    if err != nil {
+        return fmt.Errorf("failed to read ec file: %v", err)
+    }
+
+    if !bytes.Equal(data, ecData) {
+        return fmt.Errorf("unexpected data read")
+    }
+
+    return nil
+}
+
+func readDatFile(datFile *os.File, offset types.Offset, size uint32) ([]byte, error) {
+
+    data := make([]byte, size)
+    n, err := datFile.ReadAt(data, offset.ToAcutalOffset())
+    if err != nil {
+        return nil, fmt.Errorf("failed to ReadAt dat file: %v", err)
+    }
+    if n != int(size) {
+        return nil, fmt.Errorf("unexpected read size %d, expected %d", n, size)
+    }
+    return data, nil
+}
+
+func readEcFile(datSize int64, ecFiles []*os.File, offset types.Offset, size uint32) (data []byte, err error) {
+
+    intervals := locateData(largeBlockSize, smallBlockSize, datSize, offset.ToAcutalOffset(), size)
+
+    nLargeBlockRows := int(datSize / (largeBlockSize * DataShardsCount))
+
+    for i, interval := range intervals {
+        if d, e := readOneInterval(interval, ecFiles, nLargeBlockRows); e != nil {
+            return nil, e
+        } else {
+            if i == 0 {
+                data = d
+            } else {
+                data = append(data, d...)
+            }
+        }
+    }
+
+    return data, nil
+}
+
+func readOneInterval(interval Interval, ecFiles []*os.File, nLargeBlockRows int) (data []byte, err error) {
+    ecFileOffset := interval.innerBlockOffset
+    rowIndex := interval.blockIndex / DataShardsCount
+    if interval.isLargeBlock {
+        ecFileOffset += int64(rowIndex) * largeBlockSize
+    } else {
+        ecFileOffset += int64(nLargeBlockRows)*largeBlockSize + int64(rowIndex)*smallBlockSize
+    }
+
+    ecFileIndex := interval.blockIndex % DataShardsCount
+
+    data = make([]byte, interval.size)
+    err = readFromFile(ecFiles[ecFileIndex], data, ecFileOffset)
+    if err != nil {
+        return nil, fmt.Errorf("failed to read ec shard: %v", err)
+    }
+    { // do some ec testing
+        ecData, err := readFromOtherEcFiles(ecFiles, ecFileIndex, ecFileOffset, interval.size)
+        if err != nil {
+            return nil, fmt.Errorf("ec reconstruct error: %v", err)
+        }
+        if !bytes.Equal(data, ecData) {
+            return nil, fmt.Errorf("ec compare error")
+        }
+    }
+    return
+}
+
+func readFromOtherEcFiles(ecFiles []*os.File, ecFileIndex int, ecFileOffset int64, size uint32) (data []byte, err error) {
+    enc, err := reedsolomon.New(DataShardsCount, ParityShardsCount)
+    if err != nil {
+        return nil, fmt.Errorf("failed to create encoder: %v", err)
+    }
+
+    bufs := make([][]byte, DataShardsCount+ParityShardsCount)
+    for i := 0; i < DataShardsCount; {
+        n := int(rand.Int31n(DataShardsCount + ParityShardsCount))
+        if n == ecFileIndex || bufs[n] != nil {
+            continue
+        }
+        bufs[n] = make([]byte, size)
+        i++
+    }
+
+    for i, buf := range bufs {
+        if buf == nil {
+            continue
+        }
+        err = readFromFile(ecFiles[i], buf, ecFileOffset)
+        if err != nil {
+            return
+        }
+    }
+
+    if err = enc.ReconstructData(bufs); err != nil {
+        return nil, err
+    }
+
+    return bufs[ecFileIndex], nil
+}
+
+func readFromFile(file *os.File, data []byte, ecFileOffset int64) (err error) {
+    _, err = file.ReadAt(data, ecFileOffset)
+    return
+}
+
+func removeGeneratedFiles(baseFileName string) {
+    for i := 0; i < DataShardsCount+ParityShardsCount; i++ {
+        fname := fmt.Sprintf("%s.ec%02d", baseFileName, i+1)
+        os.Remove(fname)
+    }
+    os.Remove(baseFileName + ".ecx")
+}
+
+func TestLocateData(t *testing.T) {
+    intervals := locateData(largeBlockSize, smallBlockSize, DataShardsCount*largeBlockSize+1, DataShardsCount*largeBlockSize, 1)
+    if len(intervals) != 1 {
+        t.Errorf("unexpected interval size %d", len(intervals))
+    }
+    if !intervals[0].sameAs(Interval{0, 0, 1, false}) {
+        t.Errorf("unexpected interval %+v", intervals[0])
+    }
+
+    intervals = locateData(largeBlockSize, smallBlockSize, DataShardsCount*largeBlockSize+1, DataShardsCount*largeBlockSize/2+100, DataShardsCount*largeBlockSize+1-DataShardsCount*largeBlockSize/2-100)
+    fmt.Printf("%+v\n", intervals)
+}
+
+func (this Interval) sameAs(that Interval) bool {
+    return this.isLargeBlock == that.isLargeBlock &&
+        this.innerBlockOffset == that.innerBlockOffset &&
+        this.blockIndex == that.blockIndex &&
+        this.size == that.size
+}
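
One detail of readFromOtherEcFiles above worth calling out: it leans on the library's ReconstructData, which rebuilds only missing data shards (nil entries at indices 0..9) from any 10 intact shards, leaving missing parity shards nil. A condensed sketch of that contract as I read it (the function name and its shape are mine, not the library's):

    // reconstructDataShard: given the 14-entry shard slice with nil marking the
    // missing shards and any 10 entries filled with equally sized reads,
    // ReconstructData repopulates the missing *data* shards only; Reconstruct
    // would also rebuild parity.
    func reconstructDataShard(enc reedsolomon.Encoder, bufs [][]byte, target int) ([]byte, error) {
        if err := enc.ReconstructData(bufs); err != nil {
            return nil, err // fewer than 10 shards present, or a length mismatch
        }
        return bufs[target], nil
    }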
diff --git a/weed/storage/needle_map.go b/weed/storage/needle_map.go
index 634685175..f2c88093f 100644
--- a/weed/storage/needle_map.go
+++ b/weed/storage/needle_map.go
@@ -62,10 +62,7 @@ func IdxFileEntry(bytes []byte) (key NeedleId, offset Offset, size uint32) {
     return
 }
 func (nm *baseNeedleMapper) appendToIndexFile(key NeedleId, offset Offset, size uint32) error {
-    bytes := make([]byte, NeedleIdSize+OffsetSize+SizeSize)
-    NeedleIdToBytes(bytes[0:NeedleIdSize], key)
-    OffsetToBytes(bytes[NeedleIdSize:NeedleIdSize+OffsetSize], offset)
-    util.Uint32toBytes(bytes[NeedleIdSize+OffsetSize:NeedleIdSize+OffsetSize+SizeSize], size)
+    bytes := needle_map.ToBytes(key, offset, size)
 
     nm.indexFileAccessLock.Lock()
     defer nm.indexFileAccessLock.Unlock()
@@ -76,6 +73,7 @@ func (nm *baseNeedleMapper) appendToIndexFile(key NeedleId, offset Offset, size
     _, err := nm.indexFile.Write(bytes)
     return err
 }
+
 func (nm *baseNeedleMapper) IndexFileContent() ([]byte, error) {
     nm.indexFileAccessLock.Lock()
     defer nm.indexFileAccessLock.Unlock()
diff --git a/weed/storage/needle_map/btree_map.go b/weed/storage/needle_map/btree_map.go
index 74b4952b7..a26c5e068 100644
--- a/weed/storage/needle_map/btree_map.go
+++ b/weed/storage/needle_map/btree_map.go
@@ -43,7 +43,7 @@ func (cm *BtreeMap) Get(key NeedleId) (*NeedleValue, bool) {
 }
 
 // Visit visits all entries or stop if any error when visiting
-func (cm *BtreeMap) Visit(visit func(NeedleValue) error) (ret error) {
+func (cm *BtreeMap) AscendingVisit(visit func(NeedleValue) error) (ret error) {
     cm.tree.Ascend(func(item btree.Item) bool {
         needle := item.(NeedleValue)
         ret = visit(needle)
diff --git a/weed/storage/needle_map/compact_map.go b/weed/storage/needle_map/compact_map.go
index 193d17e47..8aee51ec9 100644
--- a/weed/storage/needle_map/compact_map.go
+++ b/weed/storage/needle_map/compact_map.go
@@ -244,24 +244,37 @@ func (cm *CompactMap) binarySearchCompactSection(key NeedleId) int {
 }
 
 // Visit visits all entries or stop if any error when visiting
-func (cm *CompactMap) Visit(visit func(NeedleValue) error) error {
+func (cm *CompactMap) AscendingVisit(visit func(NeedleValue) error) error {
     for _, cs := range cm.list {
         cs.RLock()
-        for i, v := range cs.overflow {
-            if err := visit(toNeedleValue(cs.overflowExtra[i], v, cs)); err != nil {
+        var i, j int
+        for i, j = 0, 0; i < len(cs.overflow) && j < len(cs.values) && j < cs.counter; {
+            if cs.overflow[i].Key < cs.values[j].Key {
+                if err := visit(toNeedleValue(cs.overflowExtra[i], cs.overflow[i], cs)); err != nil {
+                    cs.RUnlock()
+                    return err
+                }
+                i++
+            } else if cs.overflow[i].Key == cs.values[j].Key {
+                j++
+            } else {
+                if err := visit(toNeedleValue(cs.valuesExtra[j], cs.values[j], cs)); err != nil {
+                    cs.RUnlock()
+                    return err
+                }
+                j++
+            }
+        }
+        for ; i < len(cs.overflow); i++ {
+            if err := visit(toNeedleValue(cs.overflowExtra[i], cs.overflow[i], cs)); err != nil {
                 cs.RUnlock()
                 return err
             }
         }
-        for i, v := range cs.values {
-            if i >= cs.counter {
-                break
-            }
-            if _, _, found := cs.findOverflowEntry(v.Key); !found {
-                if err := visit(toNeedleValue(cs.valuesExtra[i], v, cs)); err != nil {
-                    cs.RUnlock()
-                    return err
-                }
+        for ; j < len(cs.values) && j < cs.counter; j++ {
+            if err := visit(toNeedleValue(cs.valuesExtra[j], cs.values[j], cs)); err != nil {
+                cs.RUnlock()
+                return err
             }
         }
         cs.RUnlock()
@@ -279,10 +292,10 @@ func toNeedleValue(snve SectionalNeedleValueExtra, snv SectionalNeedleValue, cs
 
 func (nv NeedleValue) toSectionalNeedleValue(cs *CompactSection) (SectionalNeedleValue, SectionalNeedleValueExtra) {
     return SectionalNeedleValue{
-        SectionalNeedleId(nv.Key - cs.start),
-        nv.Offset.OffsetLower,
-        nv.Size,
-    }, SectionalNeedleValueExtra{
-        nv.Offset.OffsetHigher,
-    }
+            SectionalNeedleId(nv.Key - cs.start),
+            nv.Offset.OffsetLower,
+            nv.Size,
+        }, SectionalNeedleValueExtra{
+            nv.Offset.OffsetHigher,
+        }
 }
diff --git a/weed/storage/needle_map/compact_map_test.go b/weed/storage/needle_map/compact_map_test.go
index 4894e59ad..3bad85727 100644
--- a/weed/storage/needle_map/compact_map_test.go
+++ b/weed/storage/needle_map/compact_map_test.go
@@ -19,7 +19,7 @@ func TestOverflow2(t *testing.T) {
     m.Set(NeedleId(150158), ToOffset(8), 3000073)
     m.Set(NeedleId(150162), ToOffset(8), 3000073)
 
-    m.Visit(func(value NeedleValue) error {
+    m.AscendingVisit(func(value NeedleValue) error {
         println("needle key:", value.Key)
         return nil
     })
diff --git a/weed/storage/needle_map/needle_value.go b/weed/storage/needle_map/needle_value.go
index d0c0b9006..ef540b55e 100644
--- a/weed/storage/needle_map/needle_value.go
+++ b/weed/storage/needle_map/needle_value.go
@@ -2,6 +2,7 @@ package needle_map
 
 import (
     . "github.com/chrislusf/seaweedfs/weed/storage/types"
+    "github.com/chrislusf/seaweedfs/weed/util"
     "github.com/google/btree"
 )
 
@@ -15,3 +16,15 @@ func (this NeedleValue) Less(than btree.Item) bool {
     that := than.(NeedleValue)
     return this.Key < that.Key
 }
+
+func (nv NeedleValue) ToBytes() []byte {
+    return ToBytes(nv.Key, nv.Offset, nv.Size)
+}
+
+func ToBytes(key NeedleId, offset Offset, size uint32) []byte {
+    bytes := make([]byte, NeedleIdSize+OffsetSize+SizeSize)
+    NeedleIdToBytes(bytes[0:NeedleIdSize], key)
+    OffsetToBytes(bytes[NeedleIdSize:NeedleIdSize+OffsetSize], offset)
+    util.Uint32toBytes(bytes[NeedleIdSize+OffsetSize:NeedleIdSize+OffsetSize+SizeSize], size)
+    return bytes
+}
diff --git a/weed/storage/needle_map/needle_value_map.go b/weed/storage/needle_map/needle_value_map.go
index acf38a4bf..0a5a00ef7 100644
--- a/weed/storage/needle_map/needle_value_map.go
+++ b/weed/storage/needle_map/needle_value_map.go
@@ -8,5 +8,5 @@ type NeedleValueMap interface {
     Set(key NeedleId, offset Offset, size uint32) (oldOffset Offset, oldSize uint32)
     Delete(key NeedleId) uint32
     Get(key NeedleId) (*NeedleValue, bool)
-    Visit(visit func(NeedleValue) error) error
+    AscendingVisit(visit func(NeedleValue) error) error
 }
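
The AscendingVisit rewrite in compact_map.go replaces the old two-pass scan (which called findOverflowEntry once per value) with a single two-pointer merge over the two key-sorted slices; when keys collide, the values entry is stale and is skipped, and the overflow copy is emitted when its own turn comes. The same pattern in isolation, with simplified types rather than the SeaweedFS ones:

    // mergeVisit walks two ascending-by-Key slices in one ordered pass.
    // On a key collision the overflow entry shadows the base entry,
    // mirroring CompactMap.AscendingVisit above.
    type entry struct {
        Key int
    }

    func mergeVisit(overflow, values []entry, visit func(entry) error) error {
        i, j := 0, 0
        for i < len(overflow) && j < len(values) {
            switch {
            case overflow[i].Key < values[j].Key:
                if err := visit(overflow[i]); err != nil {
                    return err
                }
                i++
            case overflow[i].Key == values[j].Key:
                j++ // stale base entry; the overflow copy is visited when its turn comes
            default:
                if err := visit(values[j]); err != nil {
                    return err
                }
                j++
            }
        }
        for ; i < len(overflow); i++ { // overflow tail
            if err := visit(overflow[i]); err != nil {
                return err
            }
        }
        for ; j < len(values); j++ { // values tail
            if err := visit(values[j]); err != nil {
                return err
            }
        }
        return nil
    }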
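
Finally, needle_value.go's new ToBytes pins down the index entry encoding as key | offset | size. Assuming the default build's sizes (8-byte NeedleId, 4-byte Offset, 4-byte size), an entry is 16 bytes, which is consistent with the 4768-byte 1.idx fixture above (298 entries). A round-trip sketch pairing it with the existing storage.IdxFileEntry parser (imports of weed/storage, weed/storage/needle_map, and weed/storage/types assumed; the values are placeholders):

    // roundTrip shows the new encoder and the existing parser agreeing.
    func roundTrip(key types.NeedleId, offset types.Offset, size uint32) bool {
        b := needle_map.ToBytes(key, offset, size) // NeedleIdSize + OffsetSize + SizeSize bytes
        key2, offset2, size2 := storage.IdxFileEntry(b)
        return key2 == key && offset2 == offset && size2 == size
    }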
