Skip to content

Commit 01713a4

Browse files
committed
added inspect verb
1 parent 370aee0 commit 01713a4

File tree

6 files changed

+145
-14
lines changed

6 files changed

+145
-14
lines changed

cmd/dupi/dupi.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ var scMap = map[string]SubCmd{
3232
"index": newIndexCmd(),
3333
"extract": newExtractCmd(),
3434
"blot": newBlotCmd(),
35-
"unblot": newUnblotCmd()}
35+
"unblot": newUnblotCmd(),
36+
"inspect": newInspectCmd()}
3637

3738
var gFlags = flag.NewFlagSet("dupi", flag.ExitOnError)
3839

cmd/dupi/inspect.go

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,57 @@
1515
package main
1616

1717
import (
18+
"encoding/json"
19+
"flag"
20+
"fmt"
21+
"log"
22+
"os"
23+
1824
"github.com/go-air/dupi"
1925
)
2026

2127
type inspectCmd struct {
2228
subCmd
23-
index *dupi.Index
24-
json *bool
29+
json *bool
30+
}
31+
32+
func newInspectCmd() *inspectCmd {
33+
sub := &subCmd{
34+
name: "inspect",
35+
flags: flag.NewFlagSet("inspect", flag.ExitOnError)}
36+
res := &inspectCmd{
37+
subCmd: *sub,
38+
json: sub.flags.Bool("json", false, "output json.")}
39+
return res
40+
}
41+
42+
func (in *inspectCmd) Usage() string {
43+
return "inspect the root index."
44+
}
45+
46+
func (in *inspectCmd) Run(args []string) error {
47+
var (
48+
err error
49+
idx *dupi.Index
50+
)
51+
in.flags.Parse(args)
52+
idx, err = dupi.OpenIndex(getIndexRoot())
53+
if err != nil {
54+
return err
55+
}
56+
defer idx.Close()
57+
st, err := idx.Stats()
58+
if err != nil {
59+
log.Fatal(err)
60+
}
61+
if *in.json {
62+
d, err := json.MarshalIndent(st, "", "\t")
63+
if err != nil {
64+
log.Fatal(err)
65+
}
66+
os.Stdout.Write(d)
67+
} else {
68+
fmt.Print(st)
69+
}
70+
return nil
2571
}

dmd/t.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ type T struct {
2727
file *os.File
2828
}
2929

30+
// rcdSize is the size in bytes of one fixed-width record in the backing
// file; Lookup decodes each record as three big-endian uint32 values.
const rcdSize = 12
31+
3032
func New(root string) (*T, error) {
3133
res := &T{path: filepath.Join(root, "dmd")}
3234
var err error
@@ -37,20 +39,28 @@ func New(root string) (*T, error) {
3739
return res, nil
3840
}
3941

42+
func (t *T) NumDocs() (uint64, error) {
43+
fi, err := t.file.Stat()
44+
if err != nil {
45+
return 0, err
46+
}
47+
return uint64(fi.Size()) / rcdSize, nil
48+
}
49+
4050
func (t *T) Lookup(did uint32) (fid, start, end uint32, err error) {
4151
f := t.file
42-
_, err = f.Seek(int64(did)*12, 0)
52+
_, err = f.Seek(int64(did)*rcdSize, 0)
4353
if err != nil {
4454
return
4555
}
46-
var buf [12]byte
56+
var buf [rcdSize]byte
4757
_, err = io.ReadFull(f, buf[:])
4858
if err != nil {
4959
return
5060
}
5161
fid = binary.BigEndian.Uint32(buf[0:4])
5262
start = binary.BigEndian.Uint32(buf[4:8])
53-
end = binary.BigEndian.Uint32(buf[8:12])
63+
end = binary.BigEndian.Uint32(buf[8:rcdSize])
5464
return
5565
}
5666

index.go

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package dupi
1717
import (
1818
"fmt"
1919
"log"
20+
"math"
2021
"os"
2122

2223
"github.com/go-air/dupi/blotter"
@@ -95,10 +96,36 @@ func (x *Index) Root() string {
9596
return x.config.IndexRoot
9697
}
9798

99+
func (x *Index) Stats() (*Stats, error) {
100+
var err error
101+
st := &Stats{}
102+
st.Root = x.config.IndexRoot
103+
st.NumBlots = 1 << 16 * uint64(len(x.shards))
104+
st.NumDocs, err = x.dmd.NumDocs()
105+
if err != nil {
106+
return nil, err
107+
}
108+
st.NumPaths = uint64(len(x.fnames.d))
109+
110+
for i := range x.shards {
111+
shrd := &x.shards[i]
112+
st.NumPosts += shrd.NumPosts()
113+
}
114+
st.BlotMean = float64(st.NumPosts) / float64(st.NumBlots)
115+
var sos float64
116+
for i := range x.shards {
117+
shrd := &x.shards[i]
118+
sos += shrd.SosDiffs(st.BlotMean)
119+
}
120+
sos /= float64(st.NumBlots)
121+
st.BlotSigma = math.Sqrt(sos)
122+
return st, nil
123+
}
124+
98125
// TokenFunc returns the tokenizer function built from the index's token
// configuration. It panics if token.FromConfig rejects the configuration.
func (x *Index) TokenFunc() token.TokenizerFunc {
	fn, err := token.FromConfig(&x.config.TokenConfig)
	if err == nil {
		return fn
	}
	// NOTE(review): the original author marks this as "should be
	// impossible, tf created in ctor"; the constructor is not visible
	// here, so confirm before relying on it.
	panic(err)
}
@@ -159,7 +186,6 @@ func (x *Index) JoinBlot(shard uint32, sblot uint16) uint32 {
159186
blot := nsh * uint32(sblot)
160187
blot += shard
161188
return blot
162-
163189
}
164190

165191
func (x *Index) FindBlot(theBlot uint32, doc *Doc) (start, end uint32, err error) {

internal/shard/index.go

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,11 @@ import (
2323
)
2424

2525
// Index is the state of a single shard. Its three tables each hold
// 1<<16 entries.
type Index struct {
	id uint32
	// path is the base path of the shard's files; readIix opens
	// path + ".iix".
	path string
	// heads holds one int64 per entry.
	// NOTE(review): presumably offsets into postFile — confirm.
	heads [1 << 16]int64
	// counts holds one count per entry; Count returns counts[blot] and
	// NumPosts sums the whole table.
	counts [1 << 16]uint32
	// perm holds one uint16 per entry.
	// NOTE(review): looks like a permutation of the entries — confirm.
	perm     [1 << 16]uint16
	postFile *os.File
}
3433

@@ -91,6 +90,27 @@ func (x *Index) Count(blot uint32) uint32 {
9190
return x.counts[blot]
9291
}
9392

93+
func (x *Index) NumPosts() uint64 {
94+
var ttl uint64
95+
for _, ct := range x.counts {
96+
ttl += uint64(ct)
97+
}
98+
return ttl
99+
}
100+
101+
func (x *Index) NumBlots() uint64 {
102+
return 1 << 16
103+
}
104+
105+
func (x *Index) SosDiffs(avg float64) float64 {
106+
var ttl float64
107+
for _, ct := range x.counts {
108+
d := avg - float64(ct)
109+
ttl += d * d
110+
}
111+
return ttl
112+
}
113+
94114
func (x *Index) readIix() error {
95115
f, err := os.Open(fmt.Sprintf("%s.iix", x.path))
96116
if err != nil {

stats.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package dupi
2+
3+
import "fmt"
4+
5+
// Stats is a summary of an index, suitable for printing or for JSON
// encoding.
type Stats struct {
	// Root is the index root the statistics were read from.
	Root string
	// NumDocs is the number of documents.
	NumDocs uint64
	// NumPaths is the number of nodes in the path tree.
	NumPaths uint64
	// NumPosts is the total number of posts.
	NumPosts uint64
	// NumBlots is the total number of blots.
	NumBlots uint64
	// BlotMean is the mean per blot (NumPosts / NumBlots).
	BlotMean float64
	// BlotSigma is the standard deviation of the per-blot counts.
	BlotSigma float64
}

// stFmt is the layout of the plain-text report produced by String; its
// verbs are filled in field-declaration order.
const stFmt = `dupi index at %s:
- %d docs
- %d nodes in path tree
- %d posts
- %d blots
- %.2f mean docs per blot
- %.2f sigma (std deviation)
`

// String renders the statistics as a multi-line, human-readable report
// ending in a newline.
func (st *Stats) String() string {
	fields := []interface{}{
		st.Root,
		st.NumDocs,
		st.NumPaths,
		st.NumPosts,
		st.NumBlots,
		st.BlotMean,
		st.BlotSigma,
	}
	return fmt.Sprintf(stFmt, fields...)
}

0 commit comments

Comments
 (0)