use stats to determine how much to extract

scott-cotton · scott-cotton · commit 48d0edafebff · 2021-09-17T02:02:03.000+02:00
killer app command for the moment

dupi index <stuff>
dupi extract | awk '{print $1}' | xargs dupi unblot

added -all to unblot
diff --git a/blot.go b/blot.go
@@ -44,8 +44,12 @@ func (b *Blot) Len() int {
 	return len(b.Docs)
 }
 
-func (b *Blot) Next() *Doc {
+func (b *Blot) Next(lim bool) *Doc {
 	n := len(b.Docs)
-	b.Docs = b.Docs[:n+1]
+	if lim {
+		b.Docs = b.Docs[:n+1]
+	} else {
+		b.Docs = append(b.Docs, Doc{})
+	}
 	return &b.Docs[n]
 }
diff --git a/cmd/dupi/extract.go b/cmd/dupi/extract.go
@@ -19,6 +19,8 @@ import (
 	"flag"
 	"fmt"
 	"io"
+	"log"
+	"math"
 	"os"
 
 	"github.com/go-air/dupi"
@@ -28,6 +30,7 @@ type extractCmd struct {
 	subCmd
 	index *dupi.Index
 	json  *bool
+	sigma *float64
 }
 
 func newExtractCmd() *extractCmd {
@@ -37,6 +40,7 @@ func newExtractCmd() *extractCmd {
 		flags: flag.NewFlagSet("extract", flag.ExitOnError)}}
 
 	extract.json = extract.flags.Bool("json", false, "output json")
+	extract.sigma = extract.flags.Float64("sigma", 2.0, "explore blots within σ of average (higher=most probable dups, lower=more volume)")
 	return extract
 }
 
@@ -52,10 +56,14 @@ func (x *extractCmd) Run(args []string) error {
 		return err
 	}
 	defer x.index.Close()
+	st, err := x.index.Stats()
+	if err != nil {
+		log.Fatal(err)
+	}
+	σ := *x.sigma
+	N := int(math.Round(st.BlotMean + σ*st.BlotSigma))
 	query := x.index.StartQuery(dupi.QueryMaxBlot)
-	shape := []dupi.Blot{
-		{Blot: 0, Docs: make([]dupi.Doc, 0, 32)},
-		{Blot: 0, Docs: make([]dupi.Doc, 0, 32)}}
+	shape := []dupi.Blot{{Blot: 0}}
 	for {
 		n, err := query.Next(shape)
 		if err == io.EOF {
@@ -67,6 +75,9 @@ func (x *extractCmd) Run(args []string) error {
 		if n == 0 {
 			return fmt.Errorf("Query.Next gave 0 and no error")
 		}
+		if len(shape[0].Docs) < N {
+			return nil
+		}
 		if *x.json {
 			shp2 := shape
 			j := 0
@@ -98,7 +109,7 @@ func (x *extractCmd) Run(args []string) error {
 			}
 		}
 		for i := range shape {
-			shape[i].Docs = shape[i].Docs[:0]
+			shape[i].Docs = nil
 		}
 	}
 }
diff --git a/cmd/dupi/index.go b/cmd/dupi/index.go
@@ -45,6 +45,7 @@ func newIndexCmd() *indexCmd {
 	index.add = index.flags.Bool("a", false, "add to a given existing index")
 	index.verbose = index.flags.Bool("v", false, "verbose")
 	index.nshat = index.flags.Int("s", 4, "num shatterers")
+	index.shards = index.flags.Int("n", 4, "num shards")
 	return index
 }
 
diff --git a/cmd/dupi/unblot.go b/cmd/dupi/unblot.go
@@ -10,11 +10,14 @@ import (
 
 type unblotCmd struct {
 	subCmd
+	all *bool
 }
 
 func newUnblotCmd() *unblotCmd {
-	return &unblotCmd{
+	cmd := &unblotCmd{
 		subCmd: subCmd{name: "unblot", flags: flag.NewFlagSet("unblot", flag.ExitOnError)}}
+	cmd.all = cmd.flags.Bool("all", false, "output all matches")
+	return cmd
 }
 
 func (ub *unblotCmd) Usage() string {
@@ -54,7 +57,7 @@ func (ub *unblotCmd) Run(args []string) error {
 			m[dat] = append(m[dat], doc)
 		}
 		for k, ds := range m {
-			if len(ds) < 2 {
+			if !*ub.all && len(ds) < 2 {
 				continue
 			}
 			fmt.Printf("text:\n'''\n%s'''\n", k)
diff --git a/query.go b/query.go
@@ -67,11 +67,7 @@ func (q *Query) Get(blot *Blot) error {
 		if err != nil {
 			return err
 		}
-		if !lim && len(blot.Docs) == cap(blot.Docs) {
-			blot.Docs = append(blot.Docs, Doc{})
-			blot.Docs = blot.Docs[:len(blot.Docs)-1]
-		}
-		if err = q.index.docid2Doc(docid, blot.Next()); err != nil {
+		if err = q.index.docid2Doc(docid, blot.Next(lim)); err != nil {
 			return err
 		}
 	}
@@ -97,13 +93,18 @@ func (q *Query) Next(dst []Blot) (n int, err error) {
 			}
 			continue
 		}
+		lim := dstBlot.Docs != nil
 		_, err = q.fillBlot(dstBlot, shardState, state.i)
 		if err != nil {
 			return
 		}
 		if len(dstBlot.Docs) <= 1 {
 			q.advance(shardState, state.i)
-			dstBlot.Docs = dstBlot.Docs[:0]
+			if lim {
+				dstBlot.Docs = dstBlot.Docs[:0]
+			} else {
+				dstBlot.Docs = nil
+			}
 			continue
 		}
 		n++
@@ -116,20 +117,20 @@ func (q *Query) fillBlot(dst *Blot, src *shard.ReadState, srcPos uint32) (int, e
 		docid uint32
 		err   error
 		n     int
+		lim   bool
 	)
 	dst.Blot = uint32(src.Blot)*q.state.n + q.state.i
-	for dst.Len() < dst.Cap() {
+	lim = dst.Docs != nil
+	for !lim || dst.Len() < dst.Cap() {
 		docid, err = src.Next()
 		if err == io.EOF {
 			q.advance(src, srcPos)
 			return n, nil
 		} else if err != nil {
 			return 0, err
-		} else if docid == 0 {
-			continue
 		}
 		n++
-		q.index.docid2Doc(docid, dst.Next())
+		q.index.docid2Doc(docid, dst.Next(lim))
 	}
 	return n, err
 }

Original file line number	Diff line number	Diff line change
`@@ -44,8 +44,12 @@ func (b *Blot) Len() int {`
`44`	`44`	`return len(b.Docs)`
`45`	`45`	`}`
`46`	`46`
`47`		`-func (b *Blot) Next() *Doc {`
	`47`	`+func (b *Blot) Next(lim bool) *Doc {`
`48`	`48`	`n := len(b.Docs)`
`49`		`- b.Docs = b.Docs[:n+1]`
	`49`	`+ if lim {`
	`50`	`+ b.Docs = b.Docs[:n+1]`
	`51`	`+ } else {`
	`52`	`+ b.Docs = append(b.Docs, Doc{})`
	`53`	`+ }`
`50`	`54`	`return &b.Docs[n]`
`51`	`55`	`}`
Original file line number	Diff line number	Diff line change
`@@ -45,6 +45,7 @@ func newIndexCmd() *indexCmd {`
`45`	`45`	`index.add = index.flags.Bool("a", false, "add to a given existing index")`
`46`	`46`	`index.verbose = index.flags.Bool("v", false, "verbose")`
`47`	`47`	`index.nshat = index.flags.Int("s", 4, "num shatterers")`
	`48`	`+ index.shards = index.flags.Int("n", 4, "num shards")`
`48`	`49`	`return index`
`49`	`50`	`}`
`50`	`51`
Original file line number	Diff line number	Diff line change
`@@ -10,11 +10,14 @@ import (`
`10`	`10`
`11`	`11`	`type unblotCmd struct {`
`12`	`12`	`subCmd`
	`13`	`+ all *bool`
`13`	`14`	`}`
`14`	`15`
`15`	`16`	`func newUnblotCmd() *unblotCmd {`
`16`		`- return &unblotCmd{`
	`17`	`+ cmd := &unblotCmd{`
`17`	`18`	`subCmd: subCmd{name: "unblot", flags: flag.NewFlagSet("unblot", flag.ExitOnError)}}`
	`19`	`+ cmd.all = cmd.flags.Bool("all", false, "output all matches")`
	`20`	`+ return cmd`
`18`	`21`	`}`
`19`	`22`
`20`	`23`	`func (ub *unblotCmd) Usage() string {`
`@@ -54,7 +57,7 @@ func (ub *unblotCmd) Run(args []string) error {`
`54`	`57`	`m[dat] = append(m[dat], doc)`
`55`	`58`	`}`
`56`	`59`	`for k, ds := range m {`
`57`		`- if len(ds) < 2 {`
	`60`	`+ if !*ub.all && len(ds) < 2 {`
`58`	`61`	`continue`
`59`	`62`	`}`
`60`	`63`	`fmt.Printf("text:\n'''\n%s'''\n", k)`
Original file line number	Diff line number	Diff line change
`@@ -67,11 +67,7 @@ func (q *Query) Get(blot *Blot) error {`
`67`	`67`	`if err != nil {`
`68`	`68`	`return err`
`69`	`69`	`}`
`70`		`- if !lim && len(blot.Docs) == cap(blot.Docs) {`
`71`		`- blot.Docs = append(blot.Docs, Doc{})`
`72`		`- blot.Docs = blot.Docs[:len(blot.Docs)-1]`
`73`		`- }`
`74`		`- if err = q.index.docid2Doc(docid, blot.Next()); err != nil {`
	`70`	`+ if err = q.index.docid2Doc(docid, blot.Next(lim)); err != nil {`
`75`	`71`	`return err`
`76`	`72`	`}`
`77`	`73`	`}`
`@@ -97,13 +93,18 @@ func (q *Query) Next(dst []Blot) (n int, err error) {`
`97`	`93`	`}`
`98`	`94`	`continue`
`99`	`95`	`}`
	`96`	`+ lim := dstBlot.Docs != nil`
`100`	`97`	`_, err = q.fillBlot(dstBlot, shardState, state.i)`
`101`	`98`	`if err != nil {`
`102`	`99`	`return`
`103`	`100`	`}`
`104`	`101`	`if len(dstBlot.Docs) <= 1 {`
`105`	`102`	`q.advance(shardState, state.i)`
`106`		`- dstBlot.Docs = dstBlot.Docs[:0]`
	`103`	`+ if lim {`
	`104`	`+ dstBlot.Docs = dstBlot.Docs[:0]`
	`105`	`+ } else {`
	`106`	`+ dstBlot.Docs = nil`
	`107`	`+ }`
`107`	`108`	`continue`
`108`	`109`	`}`
`109`	`110`	`n++`
`@@ -116,20 +117,20 @@ func (q *Query) fillBlot(dst *Blot, src *shard.ReadState, srcPos uint32) (int, e`
`116`	`117`	`docid uint32`
`117`	`118`	`err error`
`118`	`119`	`n int`
	`120`	`+ lim bool`
`119`	`121`	`)`
`120`	`122`	`dst.Blot = uint32(src.Blot)*q.state.n + q.state.i`
`121`		`- for dst.Len() < dst.Cap() {`
	`123`	`+ lim = dst.Docs != nil`
	`124`	`+ for !lim || dst.Len() < dst.Cap() {`
`122`	`125`	`docid, err = src.Next()`
`123`	`126`	`if err == io.EOF {`
`124`	`127`	`q.advance(src, srcPos)`
`125`	`128`	`return n, nil`
`126`	`129`	`} else if err != nil {`
`127`	`130`	`return 0, err`
`128`		`- } else if docid == 0 {`
`129`		`- continue`
`130`	`131`	`}`
`131`	`132`	`n++`
`132`		`- q.index.docid2Doc(docid, dst.Next())`
	`133`	`+ q.index.docid2Doc(docid, dst.Next(lim))`
`133`	`134`	`}`
`134`	`135`	`return n, err`
`135`	`136`	`}`