@@ -13,6 +13,57 @@ import (
// hybridCharRe1 matches an "X" or "x" at the very start of the input when it
// is immediately followed by an uppercase letter; the uppercase letter is
// captured in the second group.
// NOTE(review): presumably used to detect a hybrid sign glued to a genus
// name -- confirm against NormalizeHybridChar.
1313var hybridCharRe1 = regexp .MustCompile (`(^)[Xx](\p{Lu})` )
// hybridCharRe2 matches a standalone "X" or "x": one that is preceded by
// whitespace or the start of the input and followed by whitespace or the end
// of the input.
1414var hybridCharRe2 = regexp .MustCompile (`(\s|^)[Xx](\s|$)` )
1515
// VirusException maps the first word of a name to a word that may follow it
// (for example "vector", "prion" or "virus"). Preprocess passes this table to
// IsException and skips the IsVirus check when the name matches an entry.
// NOTE(review): presumably each entry is a genus/epithet pair of a real
// species whose epithet would otherwise look virus-like -- confirm.
16+ var VirusException = map [string ]string {
17+ "Aspilota" : "vector" ,
18+ "Bembidion" : "satellites" ,
19+ "Bolivina" : "prion" ,
20+ "Ceylonesmus" : "vector" ,
21+ "Cryptops" : "vector" ,
22+ "Culex" : "vector" ,
23+ "Dasyproctus" : "cevirus" ,
24+ "Desmoxytes" : "vector" ,
25+ "Dicathais" : "vector" ,
26+ "Erateina" : "satellites" ,
27+ "Euragallia" : "prion" ,
28+ "Exochus" : "virus" ,
29+ "Hilara" : "vector" ,
30+ "Ithomeis" : "satellites" ,
31+ "Microgoneplax" : "prion" ,
32+ "Neoaemula" : "vector" ,
33+ "Nephodia" : "satellites" ,
34+ "Ophion" : "virus" ,
35+ "Psenulus" : "trevirus" ,
36+ "Tidabius" : "vector" ,
37+ }
38+
// AnnotationException maps the first word of a name to a word that may follow
// it (for example "nudum", "dela" or "van"). Annotation passes this table to
// IsException and, on a match, returns the full input length before any of
// its regular expressions are applied.
// NOTE(review): presumably these epithets collide with annotation or stop
// words that Annotation would otherwise cut at -- confirm.
39+ var AnnotationException = map [string ]string {
40+ "Acrostichum" : "nudum" ,
41+ "Adiantum" : "nudum" ,
42+ "Africanthion" : "nudum" ,
43+ "Agathidium" : "nudum" ,
44+ "Aphaniosoma" : "nudum" ,
45+ "Aspidium" : "nudum" ,
46+ "Athyrium" : "nudum" ,
47+ "Blechnum" : "nudum" ,
48+ "Bottaria" : "nudum" ,
49+ "Gnathopleustes" : "den" ,
50+ "Lycopodium" : "nudum" ,
51+ "Nephrodium" : "nudum" ,
52+ "Paralvinella" : "dela" ,
53+ "Polypodium" : "nudum" ,
54+ "Polystichum" : "nudum" ,
55+ "Psilotum" : "nudum" ,
56+ "Ruteloryctes" : "bis" ,
57+ "Selenops" : "ab" ,
58+ "Tortolena" : "dela" ,
59+ "Trachyphloeosoma" : "nudum" ,
60+ "Zodarion" : "van" ,
61+ }
62+
// NoParseException maps the first word of a name to a word that may follow
// it. Preprocess passes this table to IsException after computing
// pr.NoParse and, on a match, resets pr.NoParse to false.
// NOTE(review): presumably "bacterium" here is a legitimate epithet that the
// NoParse check would otherwise reject -- confirm.
63+ var NoParseException = map [string ]string {
64+ "Navicula" : "bacterium" ,
65+ }
66+
// notesRe matches, case-insensitively, whitespace followed by one of the
// words "environmental", "samples", "species group", "species complex",
// "clade", "group" or "author" at a word boundary, together with everything
// up to the end of the input. It is the first entry in the list of regular
// expressions that Annotation builds.
1667var notesRe = regexp .MustCompile (
1768 `(?i)\s+(environmental|samples|species\s+group|species\s+complex|clade|group|author)\b.*$` ,
1869)
@@ -68,14 +119,17 @@ func Preprocess(bs []byte) *Preprocessor {
68119 }
69120 i := len (bs )
70121 name := string (bs )
71- if ! VirusLikeName (name ) {
122+ if ! IsException (name , VirusException ) {
72123 pr .Virus = IsVirus (bs [0 :i ])
73124 }
74125 if pr .Virus {
75126 pr .NoParse = true
76127 return pr
77128 }
78129 pr .NoParse = NoParse (bs [0 :i ])
130+ if IsException (name , NoParseException ) {
131+ pr .NoParse = false
132+ }
79133 if pr .NoParse {
80134 return pr
81135 }
@@ -96,50 +150,14 @@ func Preprocess(bs []byte) *Preprocessor {
96150 return pr
97151}
98152
99- // LikeVirus takes a string and checks it against known species that can
100- // easily be mistaken for viruses. If the string belongs to one of such species
101- // returns true.
102- // The following names are covered:
103- // Aspilota vector Belokobylskij, 2007
104- // Ceylonesmus vector Chamberlin, 1941
105- // Cryptops (Cryptops) vector Chamberlin, 1939
106- // Culex vector Dyar & Knab, 1906
107- // Dasyproctus cevirus Leclercq, 1963
108- // Desmoxytes vector (Chamberlin, 1941)
109- // Dicathais vector Thornley, 1952
110- // Euragallia prion Kramer, 1976
111- // Exochus virus Gauld & Sithole, 2002
112- // Hilara vector Miller, 1923
113- // Microgoneplax prion Castro, 2007
114- // Neoaemula vector Mackinnon, Hiller, Long & Marshall, 2008
115- // Ophion virus Gauld & Mitchell, 1981
116- // Psenulus trevirus Leclercq, 1961
117- // Tidabius vector Chamberlin, 1931
118- func VirusLikeName (name string ) bool {
119- names := map [string ]string {
120- "Aspilota" : "vector" ,
121- "Ceylonesmus" : "vector" ,
122- "Cryptops" : "vector" ,
123- "Culex" : "vector" ,
124- "Dasyproctus" : "cevirus" ,
125- "Desmoxytes" : "vector" ,
126- "Dicathais" : "vector" ,
127- "Euragallia" : "prion" ,
128- "Exochus" : "virus" ,
129- "Hilara" : "vector" ,
130- "Microgoneplax" : "prion" ,
131- "Neoaemula" : "vector" ,
132- "Ophion" : "virus" ,
133- "Psenulus" : "trevirus" ,
134- "Tidabius" : "vector" ,
135- }
153+ func IsException (name string , names map [string ]string ) bool {
136154 words := strings .Fields (name )
137155 if len (words ) < 2 {
138156 return false
139157 }
140158 if epithet , ok := names [words [0 ]]; ok {
141159 for _ , w := range words [1 :] {
142- if strings . HasPrefix ( w , epithet ) {
160+ if w == epithet {
143161 return true
144162 }
145163 }
@@ -161,6 +179,9 @@ func NormalizeHybridChar(bs []byte) []byte {
161179// input.
162180func Annotation (bs []byte ) int {
163181 i := len (bs )
182+ if IsException (string (bs ), AnnotationException ) {
183+ return i
184+ }
164185 regexps := []* regexp.Regexp {
165186 notesRe , taxonConceptsRe1 , taxonConceptsRe2 , taxonConceptsRe3 ,
166187 nomenConceptsRe , lastWordJunkRe , stopWordsRe ,
@@ -178,8 +199,8 @@ func Annotation(bs []byte) int {
178199 // `Anthurium Trustees of the British Museum` should not.
179200 cultivarRankLoc := cultivarRankRe .FindIndex (bs [0 :i ])
180201 ofLoc := ofWordRe .FindIndex (bs [0 :i ])
181- if ( len (ofLoc ) > 0 && ofLoc [0 ] < i &&
182- (len (cultivarRankLoc ) == 0 || cultivarRankLoc [0 ] > ofLoc [0 ]) ) {
202+ if len (ofLoc ) > 0 && ofLoc [0 ] < i &&
203+ (len (cultivarRankLoc ) == 0 || cultivarRankLoc [0 ] > ofLoc [0 ]) {
183204 i = ofLoc [0 ]
184205 }
185206
0 commit comments