author    Chris Lu <chrislusf@users.noreply.github.com>    2019-08-30 08:52:50 -0700
committer GitHub <noreply@github.com>    2019-08-30 08:52:50 -0700
commit    5f283498c058ba0da5a31517040426e9f8291885 (patch)
tree      67077b66829c6ec9589d47a952833ea5cada71b2
parent    170ee6ef0f9a94504580db5fa8c82e4ef6d50a99 (diff)
parent    f7a0a0e62cad5d625cf1a138c5da3f361c29bdf1 (diff)
Merge pull request #1057 from ingardm/master
new tool based on see_dat to remove duplicate fids
-rw-r--r--  unmaintained/remove_duplicate_fids/remove_duplicate_fids.go | 92
1 file changed, 92 insertions(+), 0 deletions(-)
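A likely way to run the tool, inferred from the flags declared in the file below (the data directory and volume id here are placeholders, not values from the commit):

    go run unmaintained/remove_duplicate_fids/remove_duplicate_fids.go -dir /data/seaweedfs/volume1 -collection "" -volumeId 7

The scan writes a deduplicated copy of the volume's .dat file, named "dat_fixed", into the same -dir.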
diff --git a/unmaintained/remove_duplicate_fids/remove_duplicate_fids.go b/unmaintained/remove_duplicate_fids/remove_duplicate_fids.go
new file mode 100644
index 000000000..5716ffa90
--- /dev/null
+++ b/unmaintained/remove_duplicate_fids/remove_duplicate_fids.go
@@ -0,0 +1,92 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/chrislusf/seaweedfs/weed/glog"
+	"github.com/chrislusf/seaweedfs/weed/storage"
+	"github.com/chrislusf/seaweedfs/weed/storage/needle"
+)
+
+var (
+	volumePath       = flag.String("dir", "/tmp", "data directory to store files")
+	volumeCollection = flag.String("collection", "", "the volume collection name")
+	volumeId         = flag.Int("volumeId", -1, "a volume id. The volume should already exist in the dir. The volume index file should not exist.")
+)
+
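+// Checksum builds a deduplication key from a needle's id and cookie; needles
+// that share this key are treated as duplicates of the same file entry.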
+func Checksum(n *needle.Needle) string {
+	return fmt.Sprintf("%s%x", n.Id, n.Cookie)
+}
+
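+// VolumeFileScanner4SeeDat walks a volume .dat file and copies each unique
+// needle into a new "dat_fixed" file, skipping needles whose checksum has
+// already been seen.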
+type VolumeFileScanner4SeeDat struct {
+	version needle.Version
+	block   storage.SuperBlock
+
+	dir    string
+	hashes map[string]bool
+	dat    *os.File
+}
+
+func (scanner *VolumeFileScanner4SeeDat) VisitSuperBlock(superBlock storage.SuperBlock) error {
+	scanner.version = superBlock.Version()
+	scanner.block = superBlock
+	return nil
+}
+
+func (scanner *VolumeFileScanner4SeeDat) ReadNeedleBody() bool {
+	return true
+}
+
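+// VisitNeedle copies the needle into the fixed .dat file unless a needle with
+// the same id/cookie pair has already been written.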
+func (scanner *VolumeFileScanner4SeeDat) VisitNeedle(n *needle.Needle, offset int64) error {
+
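+	// lazily create the output file next to the source volume and copy the
+	// super block into it before any needles are written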
+	if scanner.dat == nil {
+		newDatFile, err := os.Create(filepath.Join(*volumePath, "dat_fixed"))
+		if err != nil {
+			glog.Fatalf("Write New Volume Data %v", err)
+		}
+		scanner.dat = newDatFile
+		if _, err := scanner.dat.Write(scanner.block.Bytes()); err != nil {
+			glog.Fatalf("Write Super Block %v", err)
+		}
+	}
+
+	checksum := Checksum(n)
+
+	if scanner.hashes[checksum] {
+		glog.V(0).Infof("duplicate checksum:%s fid:%d,%s%x @ offset:%d", checksum, *volumeId, n.Id, n.Cookie, offset)
+		return nil
+	}
+	scanner.hashes[checksum] = true
+
+	_, size, _, err := n.Append(scanner.dat, scanner.version)
+	if err != nil {
+		glog.Fatalf("cannot append needle %s: %v", n.Id, err)
+	}
+	glog.V(2).Infof("appended needle %s, size %d", n.Id, size)
+
+	return nil
+}
+
+func main() {
+	flag.Parse()
+
+	vid := needle.VolumeId(*volumeId)
+
+	outpath, _ := filepath.Abs(filepath.Dir(os.Args[0]))
+
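+	// the "out" directory is created next to the binary; the fixed volume data
+	// itself is written to "dat_fixed" inside -dir (see VisitNeedle)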
+	scanner := &VolumeFileScanner4SeeDat{
+		dir:    filepath.Join(outpath, "out"),
+		hashes: map[string]bool{},
+	}
+
+	if _, err := os.Stat(scanner.dir); err != nil {
+		if err := os.MkdirAll(scanner.dir, os.ModePerm); err != nil {
+			glog.Fatalf("could not create output dir: %s", err)
+		}
+	}
+
+	err := storage.ScanVolumeFile(*volumePath, *volumeCollection, vid, storage.NeedleMapInMemory, scanner)
+	if err != nil {
+		glog.Fatalf("Reading Volume File [ERROR] %s", err)
+	}
+
+}