Skip to content

Commit 9a3f2be

Browse files
committed
Select Names is working a bit better
1 parent a39c608 commit 9a3f2be

18 files changed

Lines changed: 115 additions & 73 deletions

File tree

536 Bytes
Binary file not shown.

alix-cli/lib/alix-common-1.0.0.jar

-498 Bytes
Binary file not shown.

alix-cli/lib/alix-fr-1.0.0.jar

23 Bytes
Binary file not shown.

alix-cli/lib/alix-util-1.0.0.jar

1 Byte
Binary file not shown.

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/FilterHTML.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ public class FilterHTML extends TokenFilter
8282
PARA.add("h5");
8383
PARA.add("h6");
8484
PARA.add("item");
85+
PARA.add("label");
8586
PARA.add("li");
8687
PARA.add("p");
8788
PARA.add("tr");

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/tokenattributes/CharsAttImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,7 @@ else if (other instanceof Chain) {
480480
if (chain.length() != len)
481481
return false;
482482
char[] test = chain.array();
483-
int start = chain.start();
483+
int start = chain.offset();
484484
for (int i = zero; i < zero + len; i++) {
485485
if (test[start] != chars[i])
486486
return false;

analysis/src/java/com/github/oeuvres/alix/lucene/index/Cli.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ public abstract class Cli
4747
}
4848
}
4949

50+
/** Current path processed */
51+
Path path;
52+
53+
5054
public class AnaCli extends Analyzer
5155
{
5256
/**

analysis/src/java/com/github/oeuvres/alix/lucene/index/Names.java

Lines changed: 58 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,32 @@
4343
public class Names extends Cli implements Callable<Integer>
4444
{
4545
HashMap<Chain, IntMutable> forms = new HashMap<>(16384);
46-
Chain form = new Chain();
46+
final static CharArraySet STOP = new CharArraySet(200, false);
47+
static {
48+
String[] words = new String[]{
49+
"A' B",
50+
"BC",
51+
"Einstellung",
52+
"IB",
53+
"Ib",
54+
"IIA",
55+
"IIa",
56+
"IIB",
57+
"IIb",
58+
"IIIA",
59+
"IIIa",
60+
"IIIB",
61+
"IIIb",
62+
"I-IV",
63+
"Melle",
64+
"Pr",
65+
"The",
66+
};
67+
for (final String word: words) {
68+
STOP.add(word);
69+
}
70+
}
71+
4772

4873

4974
@Override
@@ -56,6 +81,7 @@ public Integer call() throws Exception
5681
paths.sort(null);
5782
for (final Path path: paths) {
5883
System.err.println(path);
84+
this.path = path;
5985
BufferedReader reader = new BufferedReader(
6086
new InputStreamReader(
6187
new FileInputStream(path.toFile())
@@ -81,36 +107,60 @@ public Integer call() throws Exception
81107
private void analyze(final TokenStream tokenStream) throws IOException
82108
{
83109

84-
110+
final Chain form = new Chain();
85111
final CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
86112
final FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
87113
final LemAtt lemAtt = tokenStream.addAttribute(LemAtt.class);
88114
final OrthAtt orthAtt = tokenStream.addAttribute(OrthAtt.class);
89-
90115
final CharsAttImpl testAtt = new CharsAttImpl();
91116
tokenStream.reset();
92117
int words = 0;
118+
Chain glob = new Chain("? ?");
93119
while(tokenStream.incrementToken()) {
94120
final int flags = flagsAtt.getFlags();
95121
final int group = (flags & 0xF0);
96122
if (termAtt.isEmpty()) {
97123
continue; // skip empty position
98124
}
99125
// candidate name, append
100-
if (group == NAME.code) {
126+
if (
127+
group == NAME.code
128+
&& flags != NAMEspec.code
129+
&& flags != NAMEplace.code
130+
&& flags != NAMEorg.code
131+
&& !Char.isDigit(termAtt.charAt(termAtt.length() - 1)) // A1
132+
) {
101133
if (!form.isEmpty()) form.append(" ");
102134
if (!orthAtt.isEmpty()) form.append(orthAtt);
103135
else form.append(termAtt);
104-
words++;
136+
137+
if (STOP.contains(form.array(), form.offset(), form.length())) {
138+
form.setLength(0);
139+
words = 0;
140+
}
141+
else {
142+
words++;
143+
}
105144
continue;
106145
}
107146
// breaks
108147
if (
109148
PUN.isPun(flags)
149+
|| Char.isMath(termAtt.charAt(0)) // < >
110150
|| Char.isDigit(termAtt.charAt(0))
111151
|| !lemAtt.isEmpty() // token known from dictionary as a word
112152
) {
113-
if (form.isEmpty()) continue;
153+
if (
154+
form.isEmpty()
155+
|| form.length() == 1 // variable
156+
|| form.last() == '\'' // A'
157+
|| form.last() == '.' // A.
158+
|| glob.glob(form) // W q
159+
) {
160+
form.setLength(0);
161+
words = 0;
162+
continue;
163+
}
114164
IntMutable count = forms.get(form);
115165
if (count == null) {
116166
count = new IntMutable(0);
@@ -119,17 +169,10 @@ private void analyze(final TokenStream tokenStream) throws IOException
119169
count.inc();
120170
form.setLength(0);
121171
words = 0;
122-
}
123-
// ?
124-
if (form.isEmpty()) {
125-
System.out.println(termAtt);
126172
continue;
127173
}
128-
// Arion subfuscus ? (maybe foreign words like Piaget said)
129-
form.append(" ");
130-
if (!orthAtt.isEmpty()) form.append(orthAtt);
131-
else form.append(termAtt);
132-
words++;
174+
// not yet a name, append nothing
175+
if (form.isEmpty()) continue;
133176
continue;
134177
}
135178
tokenStream.close();

common/src/java/com/github/oeuvres/alix/fr/TagFr.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,9 @@ public enum TagFr implements Tag {
9494
/** Fiction character name */
9595
NAMEfict(0x49, "Personnage", "Rodogune, Chicot… (nom de personnage fictif, dictionnaire)."),
9696
/** title name */
97-
NAMEtitle(0x4A, "Titre", " Titre d’œuvre (dictionnaire)") { },
97+
NAMEtitle(0x4A, "Titre", " Titre d’œuvre (dictionnaire)"),
98+
/** Species name */
99+
NAMEspec(0x4B, "Espèce", " Xerophila candidula"),
98100
/** People name */
99101
NAMEpeople(0x4E, "Peuple", " (nom de peuple, dictionnaire)."),
100102
/** God name */

fr/src/resources/com/github/oeuvres/alix/fr/brevidot.csv

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ Cf.,,,cf.
1818
ch.,,,chapitre
1919
chap.,,,chapitre
2020
Chap.,,,chapitre
21-
cit.,SUB,,
22-
coll.,SUB,,
21+
cit.,REF,,
22+
coll.,REF,,
2323
D.,NAME
2424
Dec.,,,décembre
2525
Déc.,,,décembre
@@ -32,21 +32,27 @@ ed.,,,édition
3232
eq.,,,équation
3333
éq.,,,équation
3434
etc.,ADV
35-
exp.,SUB,,
35+
exp.,REF,,
3636
Exp.,,,exp.
37+
Expér.,,,exp.
38+
EXP.,,,exp.
39+
EXP,,,exp.
3740
F.,NAME
38-
Fam.,,,Fam.
39-
fam.,SUB,,
40-
fasc.,,,fascicule
41+
fam.,,,Fam.
42+
Fam.,REF,,
43+
fasc.,REF,,
44+
Fasc.,,,fasc.
4145
fev.,,,février
4246
Fev.,,,février
4347
fevr.,,,février
4448
Fevr.,,,février
45-
fig.,NAME,,
46-
fol.,NAME,,
49+
fig.,REF,,
50+
fol.,REF,,
4751
fr.,,,francs
4852
Fr.,,,francs
4953
G.,NAME
54+
gen.,,,Gen.
55+
Gen.,REF,,
5056
H.,NAME
5157
h.,,,heure
5258
Ib.,,,ibid
@@ -69,8 +75,8 @@ J.-C.,NAME
6975
K.,NAME
7076
L.,NAME
7177
liv.,,,livre
72-
loc. cit.,SUB
73-
loc.,SUB
78+
loc. cit.,REF
79+
loc.,REF
7480
M.,,,monsieur
7581
Me.,,,maître
7682
Melle.,,,mademoiselle
@@ -94,55 +100,56 @@ obs.,,,observation
94100
op.,,,opus
95101
oct.,,,octobre
96102
P.,NAME
103+
PUF,NAME
97104
P.U.F.,,,PUF
98105
P.u.F.,,,PUF
99106
p.u.f.,,,PUF
100-
p.,,,page
107+
p.,REF,,
101108
pag.,,,page
102109
phr.,,,phrase
103-
pp.,,,pages
104-
P.P.,,,pages
110+
pp.,REF,,
111+
P.P.,,,pp.
105112
Pr.,,,Professeur
106-
probl.,,,problème
107-
Probl.,,,problème
113+
probl.,REF,,
114+
Probl.,,,probl.
108115
prop.,,,proposition
109116
Prs.,,,Professeur
110117
P.S.,,,post-scriptum
111118
PS.,,,post-scriptum
112119
Psychol.,,,Psychologie
113120
pt.,,,point
114121
Q.,NAME
115-
quest.,,,question
116-
Quest.,,,question
122+
quest.,REF,,
123+
Quest.,,,quest.
117124
R.,NAME
118125
S.,NAME
119-
sc.,,,scène
120-
Sect.,,,section
121-
sect.,,,section
126+
sc.,REF,,
127+
Sect.,,,sect.
128+
sect.,REF,,
122129
sept.,,,septembre
123-
séq.,,,suivante
124-
sq.,,,suivante
125-
sqq.,,,suivantes
126-
suppl.,,,supplément
130+
séq.,,,sq.
131+
sq.,REF,,
132+
sqq.,REF,,
133+
suppl.,REF,,
127134
T.,NAME
128-
tabl.,,,table
135+
tabl.,REF,,
129136
techn.,,,technique
130137
tél.,,,tél
131138
tel.,,,tél
132139
télép.,,,tél
133140
téléph.,,,tél
134141
téleph.,,,tél
135142
teleph.,,,tél
136-
trad.,,,traduction
137-
Trad.,,,traduction
143+
trad.,SUB,,
144+
Trad.,,,trad.
138145
U.,NAME
139146
U.S.A.,,,USA
140147
V.,NAME
141148
v.,,,voir
142149
Var.,,,var.
143-
var.,SUB,,
150+
var.,REF,,
144151
vol.,,,volume
145-
voy.,VERB,,
152+
voy.,REF,,
146153
Voy.,,,voy.
147154
W.,NAME
148155
X.,NAME

0 commit comments

Comments
 (0)