Skip to content

Commit b8af813

Browse files
committed
add capitalize option (close #169)
1 parent 6123ed8 commit b8af813

16 files changed

Lines changed: 146 additions & 17 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
## Unreleased
44

5+
## [v1.2.0]
6+
7+
- Add [#169]: option to capitalize first letter of name-strings.
58
- Add [#166]: support 'fm.' as 'forma'.
69

710
## [v1.1.0]
@@ -261,6 +264,7 @@ array of names instead of a stream.
261264

262265
This document follows [changelog guidelines]
263266

267+
[v1.2.0]: https://github.com/gnames/gnparser/compare/v1.1.0...v1.2.0
264268
[v1.1.0]: https://github.com/gnames/gnparser/compare/v1.0.14...v1.1.0
265269
[v1.0.14]: https://github.com/gnames/gnparser/compare/v1.0.13...v1.0.14
266270
[v1.0.13]: https://github.com/gnames/gnparser/compare/v1.0.12...v1.0.13

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,9 @@ Relevant flags:
344344
: Sets a maximum number of names collected into a batch before processing.
345345
This flag is ignored if parsing mode is set to streaming with ``-s`` flag.
346346

347+
``--capitalize -c``
348+
: Capitalizes the first letter of name-strings.
349+
347350
``--details -d``
348351
: Return more details for a parsed name. This flag is ignored for CSV
349352
formatting.
@@ -393,6 +396,10 @@ gnparser -f pretty "Parus major Linnaeus, 1788"
393396
394397
# to parse a name from the standard input
395398
echo "Parus major Linnaeus, 1788" | gnparser
399+
400+
# to parse a name that is written entirely in lower case
401+
gnparser "parus major" --capitalize
402+
gnparser "parus major" -c
396403
```
397404

398405
To parse a file:

config.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ type Config struct {
3838
// WithNoOrder flag, when true, output and input are in different order.
3939
WithNoOrder bool
4040

41+
// WithCapitalization flag, when true, the first letter of a name-string
42+
// is capitalized, if appropriate.
43+
WithCapitalization bool
44+
4145
// Port to run web-service.
4246
Port int
4347

@@ -134,6 +138,13 @@ func OptWithNoOrder(b bool) Option {
134138
}
135139
}
136140

141+
// OptWithCapitaliation sets the WithCapitalization field.
142+
func OptWithCapitaliation(b bool) Option {
143+
return func(cfg *Config) {
144+
cfg.WithCapitalization = b
145+
}
146+
}
147+
137148
// OptPort sets a port for web-service.
138149
func OptPort(i int) Option {
139150
return func(cfg *Config) {

ent/parsed/warning.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ const (
3838
HybridFormulaIncompleteWarn
3939
HybridFormulaProbIncompleteWarn
4040
HybridNamedWarn
41+
LowCaseWarn
4142
NameApproxWarn
4243
NameComparisonWarn
4344
RankUncommonWarn
@@ -89,6 +90,7 @@ var warningMap = map[Warning]string{
8990
HybridFormulaIncompleteWarn: "Incomplete hybrid formula",
9091
HybridFormulaProbIncompleteWarn: "Probably incomplete hybrid formula",
9192
HybridNamedWarn: "Named hybrid",
93+
LowCaseWarn: "Name starts with low-case character",
9294
NameApproxWarn: "Name is approximate",
9395
NameComparisonWarn: "Name comparison",
9496
RankUncommonWarn: "Uncommon rank",
@@ -149,6 +151,7 @@ var WarningQualityMap = map[Warning]int{
149151
HybridFormulaIncompleteWarn: 4,
150152
HybridFormulaProbIncompleteWarn: 2,
151153
HybridNamedWarn: 2,
154+
LowCaseWarn: 4,
152155
NameApproxWarn: 4,
153156
NameComparisonWarn: 4,
154157
RankUncommonWarn: 3,

ent/parser/ast.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ import (
77

88
"github.com/gnames/gnparser/ent/internal/preprocess"
99

10-
"github.com/gnames/gnparser/ent/internal/str"
1110
"github.com/gnames/gnparser/ent/parsed"
11+
"github.com/gnames/gnparser/ent/str"
1212
"github.com/gnames/gnparser/io/dict"
1313
"github.com/gnames/gnuuid"
1414
"github.com/gnames/tribool"

ent/parser/interfaces.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@ import (
99
type Parser interface {
1010
// PreprocessAndParse takes a scientific name and returns back Abstract
1111
// Syntax Tree of the name-string.
12-
PreprocessAndParse(name, version string, keepHTML bool) ScientificNameNode
12+
PreprocessAndParse(
13+
name, version string,
14+
keepHTML, capitalize bool,
15+
) ScientificNameNode
1316
Debug(name string) []byte
1417
}
1518

ent/parser/name.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ package parser
33
import (
44
"fmt"
55

6-
"github.com/gnames/gnparser/ent/internal/str"
76
"github.com/gnames/gnparser/ent/parsed"
87
"github.com/gnames/gnparser/ent/stemmer"
8+
"github.com/gnames/gnparser/ent/str"
99
)
1010

1111
type canonical struct {

ent/parser/parser.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ import (
77
"fmt"
88

99
"github.com/gnames/gnparser/ent/internal/preprocess"
10-
"github.com/gnames/gnparser/ent/internal/str"
1110
"github.com/gnames/gnparser/ent/parsed"
11+
"github.com/gnames/gnparser/ent/str"
1212
)
1313

1414
// Debug takes a string, parses it, and returns a byte representation of
@@ -38,16 +38,25 @@ func (p *Engine) Debug(s string) []byte {
3838
func (p *Engine) PreprocessAndParse(
3939
s, ver string,
4040
keepHTML bool,
41+
capitalize bool,
4142
) ScientificNameNode {
4243

4344
originalString := s
44-
tagsOrEntities := false
45+
var tagsOrEntities, lowCase bool
4546
if !keepHTML {
4647
s = preprocess.StripTags(s)
4748
if originalString != s {
4849
tagsOrEntities = true
4950
}
5051
}
52+
53+
if capitalize {
54+
s = str.CapitalizeName(s)
55+
if s != originalString {
56+
lowCase = true
57+
}
58+
}
59+
5160
preproc := preprocess.Preprocess([]byte(s))
5261

5362
defer func() {
@@ -74,9 +83,15 @@ func (p *Engine) PreprocessAndParse(
7483

7584
p.Buffer = string(preproc.Body)
7685
p.fullReset()
86+
7787
if tagsOrEntities {
7888
p.addWarn(parsed.HTMLTagsEntitiesWarn)
7989
}
90+
91+
if lowCase {
92+
p.addWarn(parsed.LowCaseWarn)
93+
}
94+
8095
if preproc.Underscore {
8196
p.addWarn(parsed.SpaceNonStandardWarn)
8297
}

ent/parser/parser_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ func TestPreNParse(t *testing.T) {
1818
{"something", ""},
1919
}
2020
for _, v := range testData {
21-
sn := p.PreprocessAndParse(v.name, "test_version", true)
21+
sn := p.PreprocessAndParse(v.name, "test_version", true, false)
2222
parsed := sn.ToOutput(false)
2323
can := parsed.Canonical
2424
msg := v.name
@@ -54,7 +54,7 @@ func TestToOutput(t *testing.T) {
5454
{"something", "", "", false, false},
5555
}
5656
for _, v := range testData {
57-
sn := p.PreprocessAndParse(v.name, "test_version", true)
57+
sn := p.PreprocessAndParse(v.name, "test_version", true, false)
5858
out := sn.ToOutput(v.det)
5959
msg := v.name
6060
if !out.Parsed {
Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Package str provides string functions for parsing scientific names.
1+
// Package str provides functions for manipulating scientific name-strings.
22
package str
33

44
import (
@@ -9,6 +9,28 @@ import (
99
"unicode/utf8"
1010
)
1111

12+
// CapitalizeName returns name with its first character converted to upper
// case. It is useful when the data is known to contain real names (for
// example canonical forms) that were supplied entirely in lower case.
//
// The name is returned unchanged when:
//   - it contains fewer than two characters;
//   - its first character is not a letter, or is already upper case;
//   - it starts with 'x' followed by a space or an upper-case letter
//     (presumably a hybrid sign written as a plain 'x' -- confirm with the
//     parser grammar).
//
// Only the first two runes are decoded. The remainder of the string is
// copied byte-for-byte, so invalid UTF-8 sequences further in the input are
// not rewritten, and no rune slice of the whole string is allocated.
func CapitalizeName(name string) string {
	one, oneSize := utf8.DecodeRuneInString(name)
	rest := name[oneSize:]
	if rest == "" {
		// Empty and single-character strings are left untouched.
		return name
	}
	if unicode.IsUpper(one) || !unicode.IsLetter(one) {
		return name
	}

	two, _ := utf8.DecodeRuneInString(rest)
	if one == 'x' && (two == ' ' || unicode.IsUpper(two)) {
		return name
	}
	return string(unicode.ToUpper(one)) + rest
}
33+
1234
// ToASCII converts a UTF-8 diacritics to corresponding ASCII chars.
1335
func ToASCII(b []byte, m map[rune]string) ([]byte, error) {
1436
tlBuf := bytes.NewBuffer(make([]byte, 0, len(b)*125/100))

0 commit comments

Comments
 (0)