Skip to content

Commit 884acce

Browse files
committed
stem takes in account ii suffix (close #238)
1 parent 3b06115 commit 884acce

5 files changed

Lines changed: 253 additions & 244 deletions

File tree

Makefile

Lines changed: 35 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,24 @@
1+
PROJ_NAME = gnparser
2+
13
VERSION = $(shell git describe --tags)
24
VER = $(shell git describe --tags --abbrev=0)
35
DATE = $(shell date -u '+%Y-%m-%d_%H:%M:%S%Z')
46

5-
FLAG_MODULE = GO111MODULE=on
6-
FLAGS_SHARED = $(FLAG_MODULE) GOARCH=amd64
77
NO_C = CGO_ENABLED=0
8+
FLAGS_SHARED = GOARCH=amd64
89
FLAGS_LINUX = $(FLAGS_SHARED) GOOS=linux
910
FLAGS_MAC = $(FLAGS_SHARED) GOOS=darwin
10-
FLAGS_MAC_ARM = GO111MODULE=on $GOARCH=arm64 GOOS=darwin
11+
# Environment for macOS/arm64 builds. GOARCH must be written without a leading
# `$`: make parses `$GOARCH` as `$(G)` followed by `OARCH`, which would leave
# GOARCH unset and silently build for the host architecture instead.
FLAGS_MAC_ARM = GOARCH=arm64 GOOS=darwin
1112
FLAGS_WIN = $(FLAGS_SHARED) GOOS=windows
12-
FLAGS_LD=-ldflags "-s -w -X github.com/gnames/gnparser.Build=${DATE} \
13-
-X github.com/gnames/gnparser.Version=${VERSION}"
13+
FLAGS_LD=-ldflags "-s -w -X github.com/gnames/$(PROJ_NAME).Build=$(DATE) \
14+
-X github.com/gnames/$(PROJ_NAME).Version=$(VERSION)"
15+
FLAGS_REL = -trimpath -ldflags "-s -w \
16+
-X github.com/gnames/$(PROJ_NAME).Build=$(DATE)"
17+
1418
GOCMD = go
1519
GOBUILD = $(GOCMD) build $(FLAGS_LD)
1620
GOINSTALL = $(GOCMD) install $(FLAGS_LD)
21+
GORELEASE = $(GOCMD) build $(FLAGS_REL)
1722
GOCLEAN = $(GOCMD) clean
1823
GOGET = $(GOCMD) get
1924

@@ -24,7 +29,7 @@ CLIB_DIR ?= "."
2429
all: install
2530

2631
test: deps install
27-
$(FLAG_MODULE) go test -race ./...
32+
$(FLAG_MODULE) go test -shuffle=on -race -count=1 ./...
2833

2934
test-build: deps build
3035

@@ -33,7 +38,7 @@ deps:
3338

3439
tools: deps
3540
@echo Installing tools from tools.go
36-
@cat gnparser/tools.go | grep _ | awk -F'"' '{print $$2}' | xargs -tI % go install %
41+
@cat $(PROJ_NAME)/tools.go | grep _ | awk -F'"' '{print $$2}' | xargs -tI % go install %
3742

3843
peg:
3944
cd ent/parser; \
@@ -53,55 +58,60 @@ asset:
5358
$(FLAGS_SHARED) go run -tags=dev assets_gen.go
5459

5560
build: peg
56-
cd gnparser; \
61+
cd $(PROJ_NAME); \
5762
$(GOCLEAN); \
5863
$(NO_C) $(GOBUILD) -o $(BUILD_DIR)
5964

65+
buildrel: peg
66+
cd $(PROJ_NAME); \
67+
$(GOCLEAN); \
68+
$(NO_C) $(GORELEASE) -o $(BUILD_DIR)
69+
6070
install: peg
61-
cd gnparser; \
71+
cd $(PROJ_NAME); \
6272
$(GOCLEAN); \
6373
$(NO_C) $(GOINSTALL)
6474

6575
release: peg dockerhub
66-
cd gnparser; \
76+
cd $(PROJ_NAME); \
6777
$(GOCLEAN); \
6878
$(FLAGS_LINUX) $(NO_C) $(GOBUILD); \
69-
tar zcf $(RELEASE_DIR)/gnparser-$(VER)-linux.tar.gz gnparser; \
79+
tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-linux.tar.gz $(PROJ_NAME); \
7080
$(GOCLEAN); \
7181
$(FLAGS_MAC) $(NO_C) $(GOBUILD); \
72-
tar zcf $(RELEASE_DIR)/gnparser-$(VER)-mac.tar.gz gnparser; \
82+
tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-mac.tar.gz $(PROJ_NAME); \
7383
$(GOCLEAN); \
7484
$(FLAGS_MAC_ARM) $(NO_C) $(GOBUILD); \
75-
tar zcf $(RELEASE_DIR)/gnparser-$(VER)-mac-arm64.tar.gz gnparser; \
85+
tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-mac-arm64.tar.gz $(PROJ_NAME); \
7686
$(GOCLEAN); \
7787
$(FLAGS_WIN) $(NO_C) $(GOBUILD); \
78-
zip -9 $(RELEASE_DIR)/gnparser-$(VER)-win-64.zip gnparser.exe; \
88+
zip -9 $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-win-64.zip $(PROJ_NAME).exe; \
7989
$(GOCLEAN);
8090

8191
dc: asset build
8292
docker-compose build;
8393

8494
docker: build
85-
docker build -t gnames/gognparser:latest -t gnames/gognparser:$(VERSION) .; \
86-
cd gnparser; \
95+
docker build -t gnames/go$(PROJ_NAME):latest -t gnames/go$(PROJ_NAME):$(VERSION) .; \
96+
cd $(PROJ_NAME); \
8797
$(GOCLEAN);
8898

8999
dockerhub: docker
90-
docker push gnames/gognparser; \
91-
docker push gnames/gognparser:$(VERSION)
100+
docker push gnames/go$(PROJ_NAME); \
101+
docker push gnames/go$(PROJ_NAME):$(VERSION)
92102

93103
clib_darwin: peg
94104
cd binding; \
95105
$(GOCLEAN); \
96-
CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/libgnparser_arm64.so; \
97-
CGO_ENABLED=1 GOOS=darwin GOARCH=amd64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/libgnparser_amd64.so; \
98-
rm libgnparser_amd64.h; \
99-
mv libgnparser_arm64.h libgnparser.h; \
100-
lipo -create -output $(CLIB_DIR)/libgnparser.so $(CLIB_DIR)/libgnparser_arm64.so $(CLIB_DIR)/libgnparser_amd64.so;
106+
CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME)_arm64.so; \
107+
CGO_ENABLED=1 GOOS=darwin GOARCH=amd64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME)_amd64.so; \
108+
rm lib$(PROJ_NAME)_amd64.h; \
109+
mv lib$(PROJ_NAME)_arm64.h lib$(PROJ_NAME).h; \
110+
lipo -create -output $(CLIB_DIR)/lib$(PROJ_NAME).so $(CLIB_DIR)/lib$(PROJ_NAME)_arm64.so $(CLIB_DIR)/lib$(PROJ_NAME)_amd64.so;
101111

102112
clib: peg
103113
cd binding; \
104-
$(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/libgnparser.so;
114+
$(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME).so;
105115

106116
quality:
107117
cd tools;\
@@ -110,7 +120,7 @@ quality:
110120

111121
.PHONY: man
112122
man: ronn
113-
@ronn ./man/gnparser.1.ronn --style=dark
123+
@ronn ./man/$(PROJ_NAME).1.ronn --style=dark
114124

115125
.PHONY: ronn
116126
ronn:

ent/stemmer/stemmer.go

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -11,48 +11,48 @@
1111
//
1212
// It has the feature that it stems each word to two forms, noun and verb. For example,
1313
//
14-
// NOUN VERB
15-
// ---- ----
16-
// aquila aquil aquila
17-
// portat portat porta
18-
// portis port por
14+
// NOUN VERB
15+
// ---- ----
16+
// aquila aquil aquila
17+
// portat portat porta
18+
// portis port por
1919
//
2020
// Here (slightly reformatted) are the rules of the stemmer,
2121
//
2222
// 1. (start)
2323
//
24-
// 2. Convert all occurrences of the letters 'j' or 'v' to 'i' or 'u',
24+
// 2. Convert all occurrences of the letters 'j' or 'v' to 'i' or 'u',
2525
// respectively.
2626
//
27-
// 3. If the word ends in '-que' then
28-
// if the word is on the list shown in Figure 4, then
29-
// write the original word to both the noun-based and verb-based
30-
// stem dictionaries and go to 8.
31-
// else remove '-que'
27+
// 3. If the word ends in '-que' then
28+
// if the word is on the list shown in Figure 4, then
29+
// write the original word to both the noun-based and verb-based
30+
// stem dictionaries and go to 8.
31+
// else remove '-que'
3232
//
3333
// [Figure 4 was
3434
//
35-
// atque quoque neque itaque absque apsque abusque adaeque adusque denique
36-
// deque susque oblique peraeque plenisque quandoque quisque quaeque
37-
// cuiusque cuique quemque quamque quaque quique quorumque quarumque
38-
// quibusque quosque quasque quotusquisque quousque ubique undique usque
39-
// uterque utique utroque utribique torque coque concoque contorque
40-
// detorque decoque excoque extorque obtorque optorque retorque recoque
41-
// attorque incoque intorque praetorque]
35+
// atque quoque neque itaque absque apsque abusque adaeque adusque denique
36+
// deque susque oblique peraeque plenisque quandoque quisque quaeque
37+
// cuiusque cuique quemque quamque quaque quique quorumque quarumque
38+
// quibusque quosque quasque quotusquisque quousque ubique undique usque
39+
// uterque utique utroque utribique torque coque concoque contorque
40+
// detorque decoque excoque extorque obtorque optorque retorque recoque
41+
// attorque incoque intorque praetorque]
4242
//
43-
// 4. Match the end of the word against the suffix list show in Figure 6(a),
43+
// 4. Match the end of the word against the suffix list shown in Figure 6(a),
4444
// removing the longest matching suffix, (if any).
4545
//
4646
// [Figure 6(a) was
4747
//
48-
// -ibus -ius -ae -am -as -em -es -ia
49-
// -is -nt -os -ud -um -us -a -e
50-
// -i -o -u]
48+
// -ibus -ius -ae -am -as -em -es -ia
49+
// -is -nt -os -ud -um -us -a -e
50+
// -i -o -u]
5151
//
52-
// 5. If the resulting stem contains at least two characters then write this stem
52+
// 5. If the resulting stem contains at least two characters then write this stem
5353
// to the noun-based stem dictionary.
5454
//
55-
// 6. Match the end of the word against the suffix list show in Figure 6(b),
55+
// 6. Match the end of the word against the suffix list shown in Figure 6(b),
5656
// identifying the longest matching suffix, (if any).
5757
//
5858
// [Figure 6(b) was
@@ -64,22 +64,24 @@
6464
//
6565
// If any of the following suffixes are found then convert them as shown:
6666
//
67-
// '-iuntur', '-erunt', '-untur', '-iunt', and '-unt', to '-i';
68-
// '-beris', '-bor', and '-bo' to '-bi';
69-
// '-ero' to '-eri'
67+
// '-iuntur', '-erunt', '-untur', '-iunt', and '-unt', to '-i';
68+
// '-beris', '-bor', and '-bo' to '-bi';
69+
// '-ero' to '-eri'
7070
//
7171
// else remove the suffix in the normal way.
7272
//
73-
// 7. If the resulting stem contains at least two characters then write this stem
73+
// 7. If the resulting stem contains at least two characters then write this stem
7474
// to the verb-based stem dictionary.
7575
//
7676
// 8. (end)
7777
//
78+
// Addendum: the suffix '-ii' is added to the Figure 6(a) suffix list used in Step 4.
7879
package stemmer
7980

8081
import (
81-
"github.com/gnames/gnparser/ent/str"
8282
"strings"
83+
84+
"github.com/gnames/gnparser/ent/str"
8385
)
8486

8587
var empty = struct{}{}
@@ -105,7 +107,7 @@ var nounSuffixes = []string{
105107
"ibus", "ius", "ae", "am", "as",
106108
"em", "es", "ia", "is",
107109
"nt", "os", "ud", "um", "us",
108-
"a", "e", "i", "o", "u",
110+
"a", "e", "ii", "i", "o", "u",
109111
}
110112

111113
// StemmedWord is the output of stemming algorithm applied to a word.
@@ -123,12 +125,11 @@ type StemmedWord struct {
123125
// epithet.
124126
// It assumes the following properties of a string:
125127
//
126-
// 1. There are no empty spaces over any side of a string.
127-
// 2. All spaces within the string are single.
128-
// 3. All characters in the string are ASCII with exception of the
129-
// hybrid sign.
130-
// 4. The string always starts with a capitalized word.
131-
//
128+
// 1. The string has no leading or trailing spaces.
129+
// 2. All spaces within the string are single.
130+
// 3. All characters in the string are ASCII with exception of the
131+
// hybrid sign.
132+
// 4. The string always starts with a capitalized word.
132133
func StemCanonical(c string) string {
133134
graftChimeraFormulaParts := strings.Split(c, " + ")
134135
for gci, gcv := range graftChimeraFormulaParts {

0 commit comments

Comments
 (0)