Skip to content

Commit ecb5d4a

Browse files
committed
chore: init
0 parents  commit ecb5d4a

File tree

143 files changed

+82933
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

143 files changed

+82933
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
.env
2+
.vscode
3+
__debug_bin*

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2025 Julian Claus
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

Makefile

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Pinned golangci-lint release and the location it is installed to.
GOLANGCILINT_VERSION := v2.5.0
GO_PATH := $(shell go env GOPATH)/bin
GOLANGCILINT_BIN := $(GO_PATH)/golangci-lint

# None of these targets produce a file of the same name.
.PHONY: all lint fmt test docs

# Default target: lint, format and test.
all: lint fmt test

# Install golangci-lint into $(GO_PATH).
# NOTE: this is a file target without prerequisites, so Make only runs the
# recipe when the binary does not exist yet; the version check below is not
# re-evaluated for a binary that is already installed.
$(GOLANGCILINT_BIN):
	@if ! test -x $(GOLANGCILINT_BIN) || ! $(GOLANGCILINT_BIN) --version | grep -q $(GOLANGCILINT_VERSION); then \
		curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(GO_PATH) $(GOLANGCILINT_VERSION); \
	fi

# Format all packages.
fmt: $(GOLANGCILINT_BIN)
	$(GOLANGCILINT_BIN) fmt ./... -v

# Lint all packages and apply automatic fixes. Extra flags can be passed via
# LINT_FLAGS.
lint: $(GOLANGCILINT_BIN)
	@$(GOLANGCILINT_BIN) run $(LINT_FLAGS) ./... --fix -v

# Run the short test suite.
test:
	go test -v ./... -short

# Build the web front end.
docs:
	cd web && npm run build

README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# entityscrape
2+
3+
`entityscrape` is a live demo of the Go module
4+
[entitydebs](https://github.com/ndabAP/entitydebs), a social science tool to
5+
programmatically analyze entities in non-fictional texts. In particular, it's
6+
well-suited to extract the sentiment for an entity using dependency parsing.
7+
Tokenization is highly customizable and supports the Google Cloud Natural
8+
Language API out-of-the-box.
9+
10+
The live demo offers insights into the following questions:
11+
12+
- How do politicians describe their country in governmental speeches?
13+
- Which current topics correlate with celebrities?
14+
- What are the most common root verbs used in different music genres?
15+
16+
Visit the [live demo](https://ndabap.github.io/entityscrape/).
17+
18+
## Author
19+
20+
[Julian Claus](https://www.julian-claus.de) and contributors.
21+
22+
## License
23+
24+
MIT

cases/conduct.go

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
package cases
2+
3+
import (
4+
"bufio"
5+
"bytes"
6+
"context"
7+
"errors"
8+
"log/slog"
9+
"math/rand/v2"
10+
"os"
11+
"strings"
12+
"unicode"
13+
14+
"github.com/ndabAP/entitydebs"
15+
"github.com/ndabAP/entitydebs/tokenize"
16+
"github.com/ndabAP/entitydebs/tokenize/nlp"
17+
"github.com/ndabAP/entityscrape/sbd"
18+
"github.com/ndabAP/entityscrape/translator"
19+
"golang.org/x/text/language"
20+
)
21+
22+
func (study study[samples, aggregated]) Conduct(ctx context.Context) error {
23+
slog.Debug("processing subjects", "n", len(study.Subjects))
24+
25+
translator := translator.NewGoogle(ctx, GoogleCloudSvcAccountKey)
26+
for subject, analyses := range study.Subjects {
27+
select {
28+
case <-ctx.Done():
29+
return ctx.Err()
30+
default:
31+
}
32+
33+
slog.Debug("processing analyses", "subject", subject)
34+
var (
35+
entity = analyses.Entity
36+
ext = analyses.Ext
37+
feats = analyses.Feats
38+
filenames = analyses.Filenames
39+
fuzzyMatching = analyses.FuzzyMatching
40+
lang = analyses.Language
41+
parser = analyses.Parser
42+
)
43+
tokenizer := nlp.New(GoogleCloudSvcAccountKey, lang.String())
44+
frames, err := study.frames(
45+
ctx,
46+
entity,
47+
filenames,
48+
parser,
49+
fuzzyMatching,
50+
tokenizer,
51+
feats,
52+
lang,
53+
)
54+
if err != nil {
55+
return err
56+
}
57+
slog.Debug("analysis done")
58+
59+
slog.Debug("collecting samples")
60+
samples := study.collect(frames)
61+
slog.Debug("sample collection done")
62+
63+
slog.Debug("aggregating samples")
64+
aggregated := study.aggregate(samples)
65+
slog.Debug("aggregation done")
66+
67+
slog.Debug("reporting aggregation")
68+
translator := func(w []string) ([]string, error) {
69+
switch lang {
70+
case language.English:
71+
slog.Debug("skipping translation for English")
72+
return w, nil
73+
default:
74+
}
75+
76+
return translator.Translate(w, lang, language.English)
77+
}
78+
if err := func() error {
79+
pref := strings.Map(func(r rune) rune {
80+
switch {
81+
case r >= 'a' && r <= 'z':
82+
return r
83+
case r >= 'A' && r <= 'Z':
84+
return unicode.ToLower(r)
85+
case r == ' ', r == '-':
86+
return '_'
87+
default:
88+
return -1
89+
}
90+
}, subject)
91+
writer, err := study.store.NewWriter(pref, ext)
92+
if err != nil {
93+
return err
94+
}
95+
//nolint:errcheck
96+
defer writer.Close()
97+
98+
if err := study.report(aggregated, translator, writer); err != nil {
99+
return err
100+
}
101+
102+
return nil
103+
}(); err != nil {
104+
return err
105+
}
106+
slog.Debug("reporting done", "subject", subject)
107+
}
108+
109+
return nil
110+
}
111+
112+
func (study study[samples, aggregated]) frames(
113+
ctx context.Context,
114+
entity,
115+
filenames []string,
116+
parser Parser,
117+
fuzzyMatching bool,
118+
tokenizer tokenize.Tokenizer,
119+
feats tokenize.Features,
120+
lang language.Tag,
121+
) (
122+
entitydebs.Frames,
123+
error,
124+
) {
125+
slog.Debug("parsing files", "n", len(filenames))
126+
if fuzzyMatching {
127+
slog.Debug("fuzzy matching enabled")
128+
}
129+
130+
var (
131+
texts = make([]string, 0, len(filenames))
132+
133+
textChan = make(chan []byte, 50)
134+
errChan = make(chan error, 1)
135+
)
136+
137+
// Consumer
138+
go func() {
139+
defer close(errChan)
140+
141+
for text := range textChan {
142+
n := rand.Uint64N(100)
143+
if n >= SampleRate {
144+
continue
145+
}
146+
147+
t := string(text)
148+
149+
if fuzzyMatching {
150+
var (
151+
buf = new(bytes.Buffer)
152+
153+
c = make(chan string, 50)
154+
done = make(chan struct{}, 1)
155+
)
156+
157+
// Consumer
158+
study.fuzzyMatch(c, entity, buf, done)
159+
// Producer
160+
sbd.Tokenize(lang, t, c)
161+
close(c)
162+
163+
<-done
164+
if buf.Len() > 0 {
165+
texts = append(texts, strings.TrimSpace(buf.String()))
166+
}
167+
168+
continue
169+
}
170+
171+
// No fuzzy matching.
172+
texts = append(texts, strings.TrimSpace(t))
173+
}
174+
}()
175+
// Producer
176+
go func() {
177+
defer close(textChan)
178+
179+
for _, filename := range filenames {
180+
file, err := os.Open(filename)
181+
if err != nil {
182+
errChan <- err
183+
return
184+
}
185+
for err := range parser(file, textChan) {
186+
if errors.Is(err, bufio.ErrTooLong) {
187+
continue
188+
}
189+
errChan <- err
190+
}
191+
//nolint:errcheck
192+
_ = file.Close()
193+
}
194+
}()
195+
196+
select {
197+
case <-ctx.Done():
198+
return entitydebs.Frames{}, ctx.Err()
199+
200+
case err := <-errChan:
201+
if err != nil {
202+
return entitydebs.Frames{}, err
203+
}
204+
}
205+
slog.Debug("texts sampled and parsed", "n", len(texts))
206+
207+
slog.Debug("generating frames")
208+
src := entitydebs.NewSource(entity, texts)
209+
return src.Frames(ctx, tokenizer, feats, entitydebs.NFKC)
210+
}

cases/corpus.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
package cases
2+
3+
import (
4+
"io/fs"
5+
"path/filepath"
6+
)
7+
8+
// corpusDir is the path to the corpus directory, as set by SetCorpusRootDir.
9+
var corpusDir string
10+
11+
func SetCorpusRootDir(base string) {
12+
corpusDir = filepath.Join(base, "corpus")
13+
}
14+
15+
func GetCorpusRootDir() string {
16+
return corpusDir
17+
}
18+
19+
func WalkCorpus(corpus string, fn func(filename string) error) error {
20+
root := filepath.Join(corpusDir, corpus)
21+
err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
22+
if err != nil {
23+
return err
24+
}
25+
if d.IsDir() {
26+
return nil
27+
}
28+
switch filepath.Ext(path) {
29+
case ".gitignore", ".gitkeep":
30+
return nil
31+
}
32+
return fn(path)
33+
})
34+
return err
35+
}

cases/fuzzy_match.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package cases
2+
3+
import (
4+
"bytes"
5+
"strings"
6+
)
7+
8+
// fuzzyMatch reads sentences from channel c, applies fuzzy matching against
9+
// entity, and writes matching sentences into buf. When done, it signals
10+
// completion by closing the done channel.
11+
func (study study[samples, aggregated]) fuzzyMatch(
12+
c chan string,
13+
entity []string,
14+
buf *bytes.Buffer,
15+
done chan struct{},
16+
) {
17+
go func() {
18+
defer close(done)
19+
20+
for s := range c {
21+
// Size
22+
if len(s) < 3 {
23+
continue
24+
}
25+
// Filters
26+
switch {
27+
case strings.Contains(s, "http"), strings.Contains(s, ".com"), strings.Contains(s, "www"):
28+
continue
29+
default:
30+
}
31+
32+
// Entity
33+
for _, e := range entity {
34+
if !strings.Contains(s, e) {
35+
continue
36+
}
37+
38+
// Concatenate sentence.
39+
buf.WriteString(s)
40+
break
41+
}
42+
}
43+
}()
44+
}

0 commit comments

Comments
 (0)