Skip to content

Commit b8b2a1f

Browse files
author
Dominika Tkaczyk
committed
better author-email matching
1 parent 237cc75 commit b8b2a1f

1 file changed

Lines changed: 22 additions & 9 deletions

File tree

  • cermine-impl/src/main/java/pl/edu/icm/cermine/metadata/extraction/enhancers

cermine-impl/src/main/java/pl/edu/icm/cermine/metadata/extraction/enhancers/EmailEnhancer.java

Lines changed: 22 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
import java.util.Set;
2323
import java.util.regex.Matcher;
2424
import java.util.regex.Pattern;
25+
import org.apache.commons.lang.StringUtils;
2526
import pl.edu.icm.cermine.metadata.model.DocumentAuthor;
2627
import pl.edu.icm.cermine.metadata.model.DocumentMetadata;
2728
import pl.edu.icm.cermine.structure.model.BxZone;
@@ -55,7 +56,9 @@ protected boolean enhanceMetadata(BxZone zone, DocumentMetadata metadata) {
5556
String domain = matcher.group(2);
5657
String[] names = emails.split("[\\|, ]+");
5758
for (String name : names) {
58-
addEmail(metadata, name+"@"+domain);
59+
if (!name.isEmpty()) {
60+
addEmail(metadata, name+"@"+domain);
61+
}
5962
}
6063
}
6164
matcher = PATTERN.matcher(zone.toText());
@@ -74,17 +77,27 @@ private void addEmail(DocumentMetadata metadata, String email) {
7477

7578
for (DocumentAuthor a : metadata.getAuthors()) {
7679
String[] names = a.getName().split(" ");
77-
for (String namePart : names) {
78-
if (namePart.length() > 2 && email.toLowerCase().contains(namePart.toLowerCase())) {
79-
if (author == null) {
80-
author = a;
81-
break;
82-
} else {
83-
one = false;
80+
String fname = StringUtils.join(names, "");
81+
if (fname.toLowerCase().contains(email.toLowerCase().replaceFirst("@.*", ""))) {
82+
if (author == null) {
83+
author = a;
84+
break;
85+
} else {
86+
one = false;
87+
}
88+
} else {
89+
for (String namePart : names) {
90+
if (namePart.length() > 2 && email.toLowerCase().contains(namePart.toLowerCase())) {
91+
if (author == null) {
92+
author = a;
93+
break;
94+
} else {
95+
one = false;
96+
}
8497
}
98+
}
8599
}
86100
}
87-
}
88101

89102
if (author != null && one) {
90103
author.addEmail(email);

0 commit comments

Comments
 (0)