1111//
1212// It has the feature that it stems each word to two forms, noun and verb. For example,
1313//
14- // NOUN VERB
15- // ---- ----
16- // aquila aquil aquila
17- // portat portat porta
18- // portis port por
14+ // NOUN VERB
15+ // ---- ----
16+ // aquila aquil aquila
17+ // portat portat porta
18+ // portis port por
1919//
2020// Here (slightly reformatted) are the rules of the stemmer,
2121//
2222// 1. (start)
2323//
24- // 2. Convert all occurrences of the letters 'j' or 'v' to 'i' or 'u',
24+ // 2. Convert all occurrences of the letters 'j' or 'v' to 'i' or 'u',
2525// respectively.
2626//
27- // 3. If the word ends in '-que' then
28- // if the word is on the list shown in Figure 4, then
29- // write the original word to both the noun-based and verb-based
30- // stem dictionaries and go to 8.
31- // else remove '-que'
27+ // 3. If the word ends in '-que' then
28+ // if the word is on the list shown in Figure 4, then
29+ // write the original word to both the noun-based and verb-based
30+ // stem dictionaries and go to 8.
31+ // else remove '-que'
3232//
3333// [Figure 4 was
3434//
35- // atque quoque neque itaque absque apsque abusque adaeque adusque denique
36- // deque susque oblique peraeque plenisque quandoque quisque quaeque
37- // cuiusque cuique quemque quamque quaque quique quorumque quarumque
38- // quibusque quosque quasque quotusquisque quousque ubique undique usque
39- // uterque utique utroque utribique torque coque concoque contorque
40- // detorque decoque excoque extorque obtorque optorque retorque recoque
41- // attorque incoque intorque praetorque]
35+ // atque quoque neque itaque absque apsque abusque adaeque adusque denique
36+ // deque susque oblique peraeque plenisque quandoque quisque quaeque
37+ // cuiusque cuique quemque quamque quaque quique quorumque quarumque
38+ // quibusque quosque quasque quotusquisque quousque ubique undique usque
39+ // uterque utique utroque utribique torque coque concoque contorque
40+ // detorque decoque excoque extorque obtorque optorque retorque recoque
41+ // attorque incoque intorque praetorque]
4242//
43- // 4. Match the end of the word against the suffix list show in Figure 6(a),
43+ // 4. Match the end of the word against the suffix list shown in Figure 6(a),
4444// removing the longest matching suffix, (if any).
4545//
4646// [Figure 6(a) was
4747//
48- // -ibus -ius -ae -am -as -em -es -ia
49- // -is -nt -os -ud -um -us -a -e
50- // -i -o -u]
48+ // -ibus -ius -ae -am -as -em -es -ia
49+ // -is -nt -os -ud -um -us -a -e
50+ // -i -o -u]
5151//
52- // 5. If the resulting stem contains at least two characters then write this stem
52+ // 5. If the resulting stem contains at least two characters then write this stem
5353// to the noun-based stem dictionary.
5454//
55- // 6. Match the end of the word against the suffix list show in Figure 6(b),
55+ // 6. Match the end of the word against the suffix list shown in Figure 6(b),
5656// identifying the longest matching suffix, (if any).
5757//
5858// [Figure 6(b) was
6464//
6565// If any of the following suffixes are found then convert them as shown:
6666//
67- // '-iuntur', '-erunt', '-untur', '-iunt', and '-unt', to '-i';
68- // '-beris', '-bor', and '-bo' to '-bi';
69- // '-ero' to '-eri'
67+ // '-iuntur', '-erunt', '-untur', '-iunt', and '-unt', to '-i';
68+ // '-beris', '-bor', and '-bo' to '-bi';
69+ // '-ero' to '-eri'
7070//
7171// else remove the suffix in the normal way.
7272//
73- // 7. If the resulting stem contains at least two characters then write this stem
73+ // 7. If the resulting stem contains at least two characters then write this stem
7474// to the verb-based stem dictionary.
7575//
7676// 8. (end)
7777//
78+ // Addendum: this implementation also adds the suffix '-ii' to the list used in Step 4.
7879package stemmer
7980
8081import (
81- "github.com/gnames/gnparser/ent/str"
8282 "strings"
83+
84+ "github.com/gnames/gnparser/ent/str"
8385)
8486
8587var empty = struct {}{}
@@ -105,7 +107,7 @@ var nounSuffixes = []string{
105107 "ibus" , "ius" , "ae" , "am" , "as" ,
106108 "em" , "es" , "ia" , "is" ,
107109 "nt" , "os" , "ud" , "um" , "us" ,
108- "a" , "e" , "i" , "o" , "u" ,
110+ "a" , "e" , "ii" , " i" , "o" , "u" ,
109111}
110112
111113// StemmedWord is the output of stemming algorithm applied to a word.
@@ -123,12 +125,11 @@ type StemmedWord struct {
123125// epithet.
124126// It assumes the following properties of a string:
125127//
126- // 1. There are no empty spaces over any side of a string.
127- // 2. All spaces within the string are single.
128- // 3. All characters in the string are ASCII with exception of the
129- // hybrid sign.
130- // 4. The string always starts with a capitalized word.
131- //
128+ // 1. There are no leading or trailing spaces in the string.
129+ // 2. All spaces within the string are single.
130+ // 3. All characters in the string are ASCII with the exception of the
131+ // hybrid sign.
132+ // 4. The string always starts with a capitalized word.
132133func StemCanonical (c string ) string {
133134 graftChimeraFormulaParts := strings .Split (c , " + " )
134135 for gci , gcv := range graftChimeraFormulaParts {
0 commit comments