Skip to content

Commit 48d0eda

Browse files
committed
use stats to determine how much to extract
Killer app command for the moment:
    dupi index <stuff>
    dupi extract | awk '{print $1}' | xargs dupi unblot
Also added the -all flag to unblot.
1 parent 3f06150 commit 48d0eda

File tree

5 files changed

+38
-18
lines changed

5 files changed

+38
-18
lines changed

blot.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,12 @@ func (b *Blot) Len() int {
4444
return len(b.Docs)
4545
}
4646

47-
func (b *Blot) Next() *Doc {
47+
func (b *Blot) Next(lim bool) *Doc {
4848
n := len(b.Docs)
49-
b.Docs = b.Docs[:n+1]
49+
if lim {
50+
b.Docs = b.Docs[:n+1]
51+
} else {
52+
b.Docs = append(b.Docs, Doc{})
53+
}
5054
return &b.Docs[n]
5155
}

cmd/dupi/extract.go

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ import (
1919
"flag"
2020
"fmt"
2121
"io"
22+
"log"
23+
"math"
2224
"os"
2325

2426
"github.com/go-air/dupi"
@@ -28,6 +30,7 @@ type extractCmd struct {
2830
subCmd
2931
index *dupi.Index
3032
json *bool
33+
sigma *float64
3134
}
3235

3336
func newExtractCmd() *extractCmd {
@@ -37,6 +40,7 @@ func newExtractCmd() *extractCmd {
3740
flags: flag.NewFlagSet("extract", flag.ExitOnError)}}
3841

3942
extract.json = extract.flags.Bool("json", false, "output json")
43+
extract.sigma = extract.flags.Float64("sigma", 2.0, "explore blots within σ of average (higher=most probable dups, lower=more volume)")
4044
return extract
4145
}
4246

@@ -52,10 +56,14 @@ func (x *extractCmd) Run(args []string) error {
5256
return err
5357
}
5458
defer x.index.Close()
59+
st, err := x.index.Stats()
60+
if err != nil {
61+
log.Fatal(err)
62+
}
63+
σ := *x.sigma
64+
N := int(math.Round(st.BlotMean + σ*st.BlotSigma))
5565
query := x.index.StartQuery(dupi.QueryMaxBlot)
56-
shape := []dupi.Blot{
57-
{Blot: 0, Docs: make([]dupi.Doc, 0, 32)},
58-
{Blot: 0, Docs: make([]dupi.Doc, 0, 32)}}
66+
shape := []dupi.Blot{{Blot: 0}}
5967
for {
6068
n, err := query.Next(shape)
6169
if err == io.EOF {
@@ -67,6 +75,9 @@ func (x *extractCmd) Run(args []string) error {
6775
if n == 0 {
6876
return fmt.Errorf("Query.Next gave 0 and no error")
6977
}
78+
if len(shape[0].Docs) < N {
79+
return nil
80+
}
7081
if *x.json {
7182
shp2 := shape
7283
j := 0
@@ -98,7 +109,7 @@ func (x *extractCmd) Run(args []string) error {
98109
}
99110
}
100111
for i := range shape {
101-
shape[i].Docs = shape[i].Docs[:0]
112+
shape[i].Docs = nil
102113
}
103114
}
104115
}

cmd/dupi/index.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ func newIndexCmd() *indexCmd {
4545
index.add = index.flags.Bool("a", false, "add to a given existing index")
4646
index.verbose = index.flags.Bool("v", false, "verbose")
4747
index.nshat = index.flags.Int("s", 4, "num shatterers")
48+
index.shards = index.flags.Int("n", 4, "num shards")
4849
return index
4950
}
5051

cmd/dupi/unblot.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,14 @@ import (
1010

1111
type unblotCmd struct {
1212
subCmd
13+
all *bool
1314
}
1415

1516
func newUnblotCmd() *unblotCmd {
16-
return &unblotCmd{
17+
cmd := &unblotCmd{
1718
subCmd: subCmd{name: "unblot", flags: flag.NewFlagSet("unblot", flag.ExitOnError)}}
19+
cmd.all = cmd.flags.Bool("all", false, "output all matches")
20+
return cmd
1821
}
1922

2023
func (ub *unblotCmd) Usage() string {
@@ -54,7 +57,7 @@ func (ub *unblotCmd) Run(args []string) error {
5457
m[dat] = append(m[dat], doc)
5558
}
5659
for k, ds := range m {
57-
if len(ds) < 2 {
60+
if !*ub.all && len(ds) < 2 {
5861
continue
5962
}
6063
fmt.Printf("text:\n'''\n%s'''\n", k)

query.go

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,7 @@ func (q *Query) Get(blot *Blot) error {
6767
if err != nil {
6868
return err
6969
}
70-
if !lim && len(blot.Docs) == cap(blot.Docs) {
71-
blot.Docs = append(blot.Docs, Doc{})
72-
blot.Docs = blot.Docs[:len(blot.Docs)-1]
73-
}
74-
if err = q.index.docid2Doc(docid, blot.Next()); err != nil {
70+
if err = q.index.docid2Doc(docid, blot.Next(lim)); err != nil {
7571
return err
7672
}
7773
}
@@ -97,13 +93,18 @@ func (q *Query) Next(dst []Blot) (n int, err error) {
9793
}
9894
continue
9995
}
96+
lim := dstBlot.Docs != nil
10097
_, err = q.fillBlot(dstBlot, shardState, state.i)
10198
if err != nil {
10299
return
103100
}
104101
if len(dstBlot.Docs) <= 1 {
105102
q.advance(shardState, state.i)
106-
dstBlot.Docs = dstBlot.Docs[:0]
103+
if lim {
104+
dstBlot.Docs = dstBlot.Docs[:0]
105+
} else {
106+
dstBlot.Docs = nil
107+
}
107108
continue
108109
}
109110
n++
@@ -116,20 +117,20 @@ func (q *Query) fillBlot(dst *Blot, src *shard.ReadState, srcPos uint32) (int, e
116117
docid uint32
117118
err error
118119
n int
120+
lim bool
119121
)
120122
dst.Blot = uint32(src.Blot)*q.state.n + q.state.i
121-
for dst.Len() < dst.Cap() {
123+
lim = dst.Docs != nil
124+
for !lim || dst.Len() < dst.Cap() {
122125
docid, err = src.Next()
123126
if err == io.EOF {
124127
q.advance(src, srcPos)
125128
return n, nil
126129
} else if err != nil {
127130
return 0, err
128-
} else if docid == 0 {
129-
continue
130131
}
131132
n++
132-
q.index.docid2Doc(docid, dst.Next())
133+
q.index.docid2Doc(docid, dst.Next(lim))
133134
}
134135
return n, err
135136
}

0 commit comments

Comments
 (0)