Skip to content

Commit e4834fa

Browse files
committed
add verb 'dupi like'
1 parent efa6cc4 commit e4834fa

File tree

12 files changed

+251
-9
lines changed

12 files changed

+251
-9
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,7 @@ Find and query for repeated chunks of text.
2727

2828
[Design Document](docs/design.md)
2929

30+
## Library Reference
31+
32+
33+

cmd/dupi/blot.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
// Copyright 2021 the Dupi authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
115
package main
216

317
import (

cmd/dupi/dupi.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ var scMap = map[string]SubCmd{
3333
"extract": newExtractCmd(),
3434
"blot": newBlotCmd(),
3535
"unblot": newUnblotCmd(),
36-
"inspect": newInspectCmd()}
36+
"inspect": newInspectCmd(),
37+
"like": newLikeCmd()}
3738

3839
// gFlags is the flag set named "dupi". It is created with flag.ExitOnError,
// so a parse failure terminates the process instead of returning an error.
var gFlags = flag.NewFlagSet("dupi", flag.ExitOnError)
3940

cmd/dupi/index.go

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,6 @@ func (x *indexCmd) mkWalkFn(perr *error) func(path string, entry fs.DirEntry, er
106106
return nil
107107
}
108108

109-
if *x.verbose {
110-
log.Printf("doPath %s\n", path)
111-
}
112-
113109
f, e := os.Open(path)
114110
if e != nil {
115111
return e
@@ -121,6 +117,9 @@ func (x *indexCmd) mkWalkFn(perr *error) func(path string, entry fs.DirEntry, er
121117
return err
122118
}
123119
doc := &dupi.Doc{Path: path, Dat: dat, End: uint32(len(dat))}
120+
if *x.verbose {
121+
log.Printf("indexing %s %d:%d\n", path, 0, doc.End)
122+
}
124123
return x.indexer.Add(doc)
125124
}
126125
}

cmd/dupi/inspect.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ import (
2626

2727
type inspectCmd struct {
2828
subCmd
29-
json *bool
29+
json *bool
30+
files *bool
3031
}
3132

3233
func newInspectCmd() *inspectCmd {
@@ -35,7 +36,8 @@ func newInspectCmd() *inspectCmd {
3536
flags: flag.NewFlagSet("inspect", flag.ExitOnError)}
3637
res := &inspectCmd{
3738
subCmd: *sub,
38-
json: sub.flags.Bool("json", false, "output json.")}
39+
json: sub.flags.Bool("json", false, "output json."),
40+
files: sub.flags.Bool("files", false, "output file info.")}
3941
return res
4042
}
4143

cmd/dupi/like.go

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
// Copyright 2021 the Dupi authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package main
16+
17+
import (
18+
"flag"
19+
"fmt"
20+
"io/ioutil"
21+
"log"
22+
"os"
23+
"sort"
24+
25+
"github.com/go-air/dupi"
26+
"github.com/go-air/dupi/token"
27+
)
28+
29+
type likeCmd struct {
30+
subCmd
31+
}
32+
33+
func newLikeCmd() *likeCmd {
34+
sc := &subCmd{flags: flag.NewFlagSet("like", flag.ExitOnError)}
35+
lc := &likeCmd{subCmd: *sc}
36+
return lc
37+
}
38+
39+
func (lc *likeCmd) Usage() string {
40+
return "[file path]"
41+
}
42+
43+
func (lc *likeCmd) Run(args []string) error {
44+
lc.flags.Parse(args)
45+
root := getIndexRoot()
46+
idx, err := dupi.OpenIndex(root)
47+
if err != nil {
48+
log.Fatalf("couldn't open dupi index at '%s': %s", root, err)
49+
}
50+
defer idx.Close()
51+
for _, fname := range args {
52+
if err := doFilename(idx, fname); err != nil {
53+
return err
54+
}
55+
}
56+
return nil
57+
}
58+
59+
// docKey identifies a document region by its path and its start and end
// values. It is comparable, so it can be used as a map key.
type docKey struct {
	Path  string
	start uint32
	end   uint32
}
63+
64+
func dkey(doc *dupi.Doc) docKey {
65+
return docKey{
66+
Path: doc.Path,
67+
start: doc.Start,
68+
end: doc.End}
69+
}
70+
71+
func doFilename(idx *dupi.Index, fname string) error {
72+
f, e := os.Open(fname)
73+
if e != nil {
74+
return e
75+
}
76+
defer f.Close()
77+
dat, err := ioutil.ReadAll(f)
78+
if err != nil {
79+
return err
80+
}
81+
doc := &dupi.Doc{Path: fname, Dat: dat, End: uint32(len(dat))}
82+
blots := idx.BlotDoc(nil, doc)
83+
var toks []token.T
84+
toks = idx.TokenFunc()(nil, doc.Dat, 0)
85+
j := 0
86+
for i := range toks {
87+
if toks[i].Tag != token.Word {
88+
continue
89+
}
90+
toks[j] = toks[i]
91+
j++
92+
}
93+
N := uint32(idx.NumShards()) * (1 << 16)
94+
seqLen := idx.SeqLen()
95+
bm := make(map[uint32][]byte, len(blots))
96+
for i, b := range blots {
97+
beg := toks[i].Pos
98+
end := toks[i+seqLen].Pos + uint32(len(toks[i+seqLen].Lit))
99+
bm[b%N] = dat[beg:end]
100+
}
101+
query := idx.StartQuery(dupi.QueryMaxBlot)
102+
var db dupi.Blot
103+
found := make(map[docKey]int)
104+
for _, blot := range blots {
105+
b := blot % N
106+
db.Blot = b
107+
db.Docs = nil
108+
if err := query.Get(&db); err != nil {
109+
return err
110+
}
111+
for i := range db.Docs {
112+
doc := &db.Docs[i]
113+
dk := dkey(doc)
114+
if found[dk] != 0 {
115+
continue
116+
}
117+
rm, err := idx.FindBlots(bm, doc)
118+
if err != nil {
119+
return err
120+
}
121+
if len(rm) == 0 {
122+
continue
123+
}
124+
found[dk]++
125+
}
126+
}
127+
keys := make([]docKey, 0, len(found))
128+
for k, _ := range found {
129+
keys = append(keys, k)
130+
}
131+
sort.Slice(keys, func(i, j int) bool {
132+
return found[keys[i]] < found[keys[j]]
133+
})
134+
fmt.Printf("like %s:\n", fname)
135+
for _, dk := range keys {
136+
fmt.Printf("\t%s %d:%d\n", dk.Path, dk.start, dk.end)
137+
}
138+
return nil
139+
}

cmd/dupi/unblot.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
// Copyright 2021 the Dupi authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
115
package main
216

317
import (

dmd/adder_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
// Copyright 2021 the Dupi authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
115
package dmd
216

317
import (

doc.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ func (doc *Doc) Load() error {
6262
doc.Dat = make([]byte, doc.End-doc.Start)
6363
_, err = f.ReadAt(doc.Dat, int64(doc.Start))
6464
if err != nil {
65-
return fmt.Errorf("readat len=%d at=%d: %w\n", len(doc.Dat), doc.Start, err)
65+
return fmt.Errorf("readat %s len=%d at=%d: %w\n", doc.Path, len(doc.Dat), doc.Start, err)
6666
}
6767
f.Close()
6868
}

index.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
package dupi
1616

1717
import (
18+
"bytes"
1819
"fmt"
1920
"log"
2021
"math"
@@ -188,6 +189,47 @@ func (x *Index) JoinBlot(shard uint32, sblot uint16) uint32 {
188189
return blot
189190
}
190191

192+
func (x *Index) FindBlots(m map[uint32][]byte, doc *Doc) (map[uint32][]byte, error) {
193+
if doc.Dat == nil {
194+
err := doc.Load()
195+
if err != nil {
196+
return nil, err
197+
}
198+
}
199+
toks := x.TokenFunc()(nil, doc.Dat, doc.Start)
200+
j := 0
201+
for _, tok := range toks {
202+
if tok.Tag != token.Word {
203+
continue
204+
}
205+
toks[j] = tok
206+
j++
207+
}
208+
blotter := x.Blotter()
209+
seqLen := x.SeqLen()
210+
nShard := uint32(x.NumShards())
211+
res := make(map[uint32][]byte)
212+
for i, tok := range toks[:j] {
213+
blot := blotter.Blot(tok.Lit)
214+
if i < seqLen {
215+
continue
216+
}
217+
blot %= nShard * (1 << 16)
218+
if m[blot] == nil {
219+
continue
220+
}
221+
start := toks[i-seqLen].Pos
222+
end := tok.Pos + uint32(len(tok.Lit))
223+
txt := doc.Dat[start:end]
224+
if bytes.Equal(m[blot], txt) {
225+
res[blot] = txt
226+
} else {
227+
//fmt.Printf("not equal: %s %s\n", string(m[blot]), string(txt))
228+
}
229+
}
230+
return res, nil
231+
}
232+
191233
func (x *Index) FindBlot(theBlot uint32, doc *Doc) (start, end uint32, err error) {
192234
if doc.Dat == nil {
193235
err = doc.Load()

0 commit comments

Comments
 (0)