@@ -19,6 +19,8 @@ import (
1919 "flag"
2020 "fmt"
2121 "io"
22+ "log"
23+ "math"
2224 "os"
2325
2426 "github.com/go-air/dupi"
@@ -28,6 +30,7 @@ type extractCmd struct {
2830 subCmd
2931 index * dupi.Index
3032 json * bool
33+ sigma * float64
3134}
3235
3336func newExtractCmd () * extractCmd {
@@ -37,6 +40,7 @@ func newExtractCmd() *extractCmd {
3740 flags : flag .NewFlagSet ("extract" , flag .ExitOnError )}}
3841
3942 extract .json = extract .flags .Bool ("json" , false , "output json" )
43+ extract .sigma = extract .flags .Float64 ("sigma" , 2.0 , "explore blots within σ of average (higher=most probable dups, lower=more volume)" )
4044 return extract
4145}
4246
@@ -52,10 +56,14 @@ func (x *extractCmd) Run(args []string) error {
5256 return err
5357 }
5458 defer x .index .Close ()
59+ st , err := x .index .Stats ()
60+ if err != nil {
61+ log .Fatal (err )
62+ }
63+ σ := * x .sigma
64+ N := int (math .Round (st .BlotMean + σ * st .BlotSigma ))
5565 query := x .index .StartQuery (dupi .QueryMaxBlot )
56- shape := []dupi.Blot {
57- {Blot : 0 , Docs : make ([]dupi.Doc , 0 , 32 )},
58- {Blot : 0 , Docs : make ([]dupi.Doc , 0 , 32 )}}
66+ shape := []dupi.Blot {{Blot : 0 }}
5967 for {
6068 n , err := query .Next (shape )
6169 if err == io .EOF {
@@ -67,6 +75,9 @@ func (x *extractCmd) Run(args []string) error {
6775 if n == 0 {
6876 return fmt .Errorf ("Query.Next gave 0 and no error" )
6977 }
78+ if len (shape [0 ].Docs ) < N {
79+ return nil
80+ }
7081 if * x .json {
7182 shp2 := shape
7283 j := 0
@@ -98,7 +109,7 @@ func (x *extractCmd) Run(args []string) error {
98109 }
99110 }
100111 for i := range shape {
101- shape [i ].Docs = shape [ i ]. Docs [: 0 ]
112+ shape [i ].Docs = nil
102113 }
103114 }
104115}
0 commit comments