Skip to content

Commit b02e40c

Browse files
committed
add exceptions to annotations (close #53)
1 parent 51131ef commit b02e40c

6 files changed

Lines changed: 3628 additions & 3519 deletions

File tree

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,13 @@
22

33
## Unreleased
44

5-
## [v1.3.1]
5+
## [v1.3.2]
66

77
- Add [#182]: support `Do`, `Oo`, `Nu` 2-letter genera.
8+
- Add [#53]: exceptions to annotations (`Bottaria nudum` for example).
9+
10+
## [v1.3.1]
11+
812
- Add [#180]: Zenodo DOI.
913

1014
## [v1.3.0]

ent/internal/preprocess/preprocess.go

Lines changed: 62 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,57 @@ import (
1313
var hybridCharRe1 = regexp.MustCompile(`(^)[Xx](\p{Lu})`)
1414
var hybridCharRe2 = regexp.MustCompile(`(\s|^)[Xx](\s|$)`)
1515

16+
// VirusException maps the first word of a name to an epithet that looks like
// a virus-related term. IsException reports true when a name starts with one
// of these keys and one of its later words equals the paired value; Preprocess
// then skips the IsVirus check for that name.
var VirusException = map[string]string{
17+
"Aspilota": "vector",
18+
"Bembidion": "satellites",
19+
"Bolivina": "prion",
20+
"Ceylonesmus": "vector",
21+
"Cryptops": "vector",
22+
"Culex": "vector",
23+
"Dasyproctus": "cevirus",
24+
"Desmoxytes": "vector",
25+
"Dicathais": "vector",
26+
"Erateina": "satellites",
27+
"Euragallia": "prion",
28+
"Exochus": "virus",
29+
"Hilara": "vector",
30+
"Ithomeis": "satellites",
31+
"Microgoneplax": "prion",
32+
"Neoaemula": "vector",
33+
"Nephodia": "satellites",
34+
"Ophion": "virus",
35+
"Psenulus": "trevirus",
36+
"Tidabius": "vector",
37+
}
38+
39+
// AnnotationException maps the first word of a name to an epithet that would
// otherwise be treated as an annotation (for example `Bottaria nudum`).
// When IsException matches a name against this map, Annotation returns the
// full input length instead of searching for annotation text.
var AnnotationException = map[string]string{
40+
"Acrostichum": "nudum",
41+
"Adiantum": "nudum",
42+
"Africanthion": "nudum",
43+
"Agathidium": "nudum",
44+
"Aphaniosoma": "nudum",
45+
"Aspidium": "nudum",
46+
"Athyrium": "nudum",
47+
"Blechnum": "nudum",
48+
"Bottaria": "nudum",
49+
"Gnathopleustes": "den",
50+
"Lycopodium": "nudum",
51+
"Nephrodium": "nudum",
52+
"Paralvinella": "dela",
53+
"Polypodium": "nudum",
54+
"Polystichum": "nudum",
55+
"Psilotum": "nudum",
56+
"Ruteloryctes": "bis",
57+
"Selenops": "ab",
58+
"Tortolena": "dela",
59+
"Trachyphloeosoma": "nudum",
60+
"Zodarion": "van",
61+
}
62+
63+
// NoParseException maps the first word of a name to an epithet for names that
// Preprocess must not leave marked as NoParse: when IsException matches a name
// against this map, Preprocess resets NoParse to false after the NoParse check.
var NoParseException = map[string]string{
64+
"Navicula": "bacterium",
65+
}
66+
1667
var notesRe = regexp.MustCompile(
1768
`(?i)\s+(environmental|samples|species\s+group|species\s+complex|clade|group|author)\b.*$`,
1869
)
@@ -68,14 +119,17 @@ func Preprocess(bs []byte) *Preprocessor {
68119
}
69120
i := len(bs)
70121
name := string(bs)
71-
if !VirusLikeName(name) {
122+
if !IsException(name, VirusException) {
72123
pr.Virus = IsVirus(bs[0:i])
73124
}
74125
if pr.Virus {
75126
pr.NoParse = true
76127
return pr
77128
}
78129
pr.NoParse = NoParse(bs[0:i])
130+
if IsException(name, NoParseException) {
131+
pr.NoParse = false
132+
}
79133
if pr.NoParse {
80134
return pr
81135
}
@@ -96,50 +150,14 @@ func Preprocess(bs []byte) *Preprocessor {
96150
return pr
97151
}
98152

99-
// LikeVirus takes a string and checks it against known species that can
100-
// easily be mistaken for viruses. If the string belongs to one of such species
101-
// returns true.
102-
// The following names are covered:
103-
// Aspilota vector Belokobylskij, 2007
104-
// Ceylonesmus vector Chamberlin, 1941
105-
// Cryptops (Cryptops) vector Chamberlin, 1939
106-
// Culex vector Dyar & Knab, 1906
107-
// Dasyproctus cevirus Leclercq, 1963
108-
// Desmoxytes vector (Chamberlin, 1941)
109-
// Dicathais vector Thornley, 1952
110-
// Euragallia prion Kramer, 1976
111-
// Exochus virus Gauld & Sithole, 2002
112-
// Hilara vector Miller, 1923
113-
// Microgoneplax prion Castro, 2007
114-
// Neoaemula vector Mackinnon, Hiller, Long & Marshall, 2008
115-
// Ophion virus Gauld & Mitchell, 1981
116-
// Psenulus trevirus Leclercq, 1961
117-
// Tidabius vector Chamberlin, 1931
118-
func VirusLikeName(name string) bool {
119-
names := map[string]string{
120-
"Aspilota": "vector",
121-
"Ceylonesmus": "vector",
122-
"Cryptops": "vector",
123-
"Culex": "vector",
124-
"Dasyproctus": "cevirus",
125-
"Desmoxytes": "vector",
126-
"Dicathais": "vector",
127-
"Euragallia": "prion",
128-
"Exochus": "virus",
129-
"Hilara": "vector",
130-
"Microgoneplax": "prion",
131-
"Neoaemula": "vector",
132-
"Ophion": "virus",
133-
"Psenulus": "trevirus",
134-
"Tidabius": "vector",
135-
}
153+
func IsException(name string, names map[string]string) bool {
136154
words := strings.Fields(name)
137155
if len(words) < 2 {
138156
return false
139157
}
140158
if epithet, ok := names[words[0]]; ok {
141159
for _, w := range words[1:] {
142-
if strings.HasPrefix(w, epithet) {
160+
if w == epithet {
143161
return true
144162
}
145163
}
@@ -161,6 +179,9 @@ func NormalizeHybridChar(bs []byte) []byte {
161179
// input.
162180
func Annotation(bs []byte) int {
163181
i := len(bs)
182+
if IsException(string(bs), AnnotationException) {
183+
return i
184+
}
164185
regexps := []*regexp.Regexp{
165186
notesRe, taxonConceptsRe1, taxonConceptsRe2, taxonConceptsRe3,
166187
nomenConceptsRe, lastWordJunkRe, stopWordsRe,
@@ -178,8 +199,8 @@ func Annotation(bs []byte) int {
178199
// `Anthurium Trustees of the British Museum` should not.
179200
cultivarRankLoc := cultivarRankRe.FindIndex(bs[0:i])
180201
ofLoc := ofWordRe.FindIndex(bs[0:i])
181-
if( len(ofLoc) > 0 && ofLoc[0] < i &&
182-
(len(cultivarRankLoc) == 0 || cultivarRankLoc[0] > ofLoc[0])) {
202+
if len(ofLoc) > 0 && ofLoc[0] < i &&
203+
(len(cultivarRankLoc) == 0 || cultivarRankLoc[0] > ofLoc[0]) {
183204
i = ofLoc[0]
184205
}
185206

ent/internal/preprocess/preprocess_test.go

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,50 @@ func TestPreprocess(t *testing.T) {
113113
[]byte(v.out), v.msg)
114114
}
115115
})
116+
t.Run("NoParseLikeName", func(t *testing.T) {
117+
data := []struct {
118+
msg string
119+
name string
120+
likeAnnotation bool
121+
}{
122+
{"name", "Navicula bacterium", true},
123+
}
124+
for _, v := range data {
125+
assert.Equal(t, ppr.IsException(v.name, ppr.NoParseException), v.likeAnnotation, v.msg)
126+
}
127+
})
128+
t.Run("AnnotationLikeName", func(t *testing.T) {
129+
data := []struct {
130+
msg string
131+
name string
132+
likeAnnotation bool
133+
}{
134+
{"name", "Acrostichum nudum", true},
135+
{"name", "Adiantum nudum", true},
136+
{"name", "Africanthion nudum", true},
137+
{"name", "Agathidium nudum", true},
138+
{"name", "Aphaniosoma nudum", true},
139+
{"name", "Aspidium nudum", true},
140+
{"name", "Athyrium nudum", true},
141+
{"name", "Blechnum nudum", true},
142+
{"name", "Bottaria nudum", true},
143+
{"name", "Gnathopleustes den", true},
144+
{"name", "Lycopodium nudum", true},
145+
{"name", "Nephrodium nudum", true},
146+
{"name", "Paralvinella dela", true},
147+
{"name", "Polypodium nudum", true},
148+
{"name", "Polystichum nudum", true},
149+
{"name", "Psilotum nudum", true},
150+
{"name", "Ruteloryctes bis", true},
151+
{"name", "Selenops ab", true},
152+
{"name", "Tortolena dela", true},
153+
{"name", "Trachyphloeosoma nudum", true},
154+
{"name", "Zodarion van", true},
155+
}
156+
for _, v := range data {
157+
assert.Equal(t, ppr.IsException(v.name, ppr.AnnotationException), v.likeAnnotation, v.msg)
158+
}
159+
})
116160

117161
t.Run("VirusLikeName", func(t *testing.T) {
118162
data := []struct {
@@ -139,7 +183,7 @@ func TestPreprocess(t *testing.T) {
139183
{"name17", "Homo sapiens coronavirus", false},
140184
}
141185
for _, v := range data {
142-
assert.Equal(t, ppr.VirusLikeName(v.name), v.likeVirus, v.msg)
186+
assert.Equal(t, ppr.IsException(v.name, ppr.VirusException), v.likeVirus, v.msg)
143187
}
144188
})
145189

ent/parser/grammar.peg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ CultivarApostrophe <- '\'' / '‘' / '’' / '"' / '“' / '”'
5959

6060
SpeciesEpithet <- !(AuthorEx) Word (_? Authorship)?
6161

62-
Comparison <- 'cf' '.'?
62+
Comparison <- 'cf' '.'? &(SpaceCharEOI)
6363

6464
Rank <- (RankForma / RankVar / RankSsp / RankOther / RankOtherUncommon /
6565
RankAgamo / RankNotho) (_? LowerGreek ('.' / &(SpaceCharEOI)))?

0 commit comments

Comments
 (0)