Skip to content

Commit 53be587

Browse files
committed
various bugfixes and improvements
1. only generate tokens for valid UTF-8 2. fix the count in shard.ReadState.Total 3. properly order dupi.Query.Next instead of only round-robin.
1 parent b36e8eb commit 53be587

File tree

10 files changed

+54
-17
lines changed

10 files changed

+54
-17
lines changed

blotter/circ.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
package blotter
1616

1717
import (
18+
"bytes"
1819
"hash"
1920
"hash/fnv"
2021
)
@@ -41,7 +42,7 @@ func (c *Circ) Interleaving() int {
4142
func (c *Circ) Blot(word []byte) uint32 {
4243
fn := c.fn
4344
fn.Reset()
44-
fn.Write(word)
45+
fn.Write(bytes.ToLower(word))
4546
h := fn.Sum32()
4647
c.hash ^= c.hashes[c.i]
4748
c.hashes[c.i] = h

cmd/dupi/extract.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ func (x *extractCmd) Run(args []string) error {
6969
for {
7070
n, err := query.Next(shape)
7171
if err == io.EOF {
72+
if n != 0 {
73+
panic(fmt.Sprintf("next gave EOF but n=%d\n", n))
74+
}
7275
return nil
7376
}
7477
if err != nil {

docs/tutorial.md

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,28 @@ dupi index .
8383

8484
## Extracting Duplicates
8585

86+
The 'extract' verb extracts sets of documents which share a blot.
87+
8688
```
8789
dupi extract
8890
```
8991

92+
These results are fast but noisy due to blot collisions. Extraction
93+
skips any blots which are associated with one or fewer documents.
94+
95+
Some options may be of interest:
96+
97+
```
98+
-b output only blots, one per line
99+
-json output json
100+
-sigma float output only those blots with mean + sigma documents
101+
By default, sigma is 3.0; it represents the standard deviation of
102+
the number of documents associated with a blot. A higher value
103+
outputs less information which is more likely to be associated with
104+
actual duplicate text. A lower value is more thorough (has higher
105+
recall) but has lower precision.
106+
```
107+
90108
## Appending to the index
91109

92110
```
@@ -114,13 +132,10 @@ rudimentary, but here are some examples.
114132
dupi extract -b | xargs dupi unblot
115133
```
116134

117-
Or
135+
### Like
118136

119-
```
120-
dupi blot file | xargs dupi unblot
121-
```
122-
123-
Much nicer, however is the 'like' verb
137+
Dupi provides a 'like' verb which permits finding documents that
138+
are similar to a given one which is not in the index.
124139

125140
```
126141
dupi like file
@@ -137,3 +152,5 @@ documentation using [issues](https://github.com/go-air/dupi/issues) or
137152

138153

139154

155+
156+

fnames.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ func (s *fnames) abs(v uint32) string {
7070
func (s *fnames) addPath(path string) (uint32, error) {
7171
abs, err := filepath.Abs(path)
7272
if err != nil {
73-
fmt.Printf("abs\n")
7473
return 0, err
7574
}
7675
var (

index.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -283,9 +283,7 @@ func (x *Index) qstate(s QueryStrategy) *qstate {
283283
shard := &x.shards[i]
284284
qstate.shardStates[i] = shard.ReadStateAt(0)
285285
}
286-
qstate.blot = uint32(qstate.shardStates[qstate.i].Blot)
287-
qstate.blot *= uint32(qstate.n)
288-
qstate.blot += qstate.i
286+
qstate.setMax()
289287
return qstate
290288
}
291289

internal/shard/index.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ func (x *Index) ReadStateForBlotAt(blot, at uint16) *ReadState {
7373
res.Shard = x.id
7474
res.Blot = blot
7575
res.At = at
76-
res.Total = x.counts[at]
76+
res.Total = x.counts[blot]
7777
res.rdr = x.postFile
7878
return res
7979
}

internal/shard/read_state.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,5 @@ type ReadState struct {
2929
func (s *ReadState) Next() (uint32, error) {
3030
var docid uint32
3131
docid, s.Error = s.Posts.next(s.rdr)
32-
if s.Error != nil {
33-
s.Total++
34-
}
3532
return docid, s.Error
3633
}

lock/file.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ func New(path string) (*File, error) {
2929
return &File{path, f}, nil
3030
}
3131

32+
// Close unlocks and then closes the file, returning any
33+
// error. The file handle is closed whether or not
34+
// unlocking fails with an error.
3235
func (f *File) Close() error {
3336
erru := f.Unlock()
3437
errc := f.handle.Close()

query.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,23 @@ type qstate struct {
2727
shardStates []*shard.ReadState
2828
i uint32
2929
n uint32
30-
blot uint32
3130
nilCount uint32
3231
}
3332

33+
func (s *qstate) setMax() {
34+
var maxTotal, p uint32
35+
for i, ss := range s.shardStates {
36+
if ss == nil {
37+
continue
38+
}
39+
if ss.Total > maxTotal {
40+
maxTotal = ss.Total
41+
p = uint32(i)
42+
}
43+
}
44+
s.i = p
45+
}
46+
3447
var ErrInvalidQueryState = errors.New("query state invalid")
3548

3649
type QueryStrategy int
@@ -144,10 +157,12 @@ func (q *Query) advance(src *shard.ReadState, pos uint32) *shard.ReadState {
144157
if src.At == math.MaxUint16 {
145158

146159
} else if src.Total <= 1 {
160+
//fmt.Printf("read state at %d has %d, exhausted\n", pos, src.Total)
147161

148162
} else {
149163
rs = q.index.shards[pos].ReadStateAt(src.At + 1)
150164
}
151165
q.state.shardStates[pos] = rs
152-
return rs
166+
q.state.setMax()
167+
return q.state.shardStates[q.state.i]
153168
}

token/t.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package token
1818
import (
1919
"fmt"
2020
"unicode"
21+
"unicode/utf8"
2122
)
2223

2324
// Tag represents a value in an enumeration of
@@ -56,6 +57,9 @@ func (t *T) String() string {
5657

5758
// Tokenize is a tokenizer function.
5859
func Tokenize(dst []T, d []byte, offset uint32) []T {
60+
if !utf8.Valid(d) {
61+
return dst
62+
}
5963
inWord := false
6064
var i, j int
6165
var r rune

0 commit comments

Comments
 (0)