Skip to content

Commit 8a19991

Browse files
committed
Clean up and precision improvement
1 parent f7cef83 commit 8a19991

15 files changed

+596
-76
lines changed

pom.xml

+6
Original file line numberDiff line numberDiff line change
@@ -147,5 +147,11 @@
147147
<artifactId>nlp-pipelines-conceptmapper</artifactId>
148148
<version>0.5.4</version>
149149
</dependency>
150+
<!-- MTI ML extension used by the characterization annotators. -->
<dependency>
  <groupId>au.com.nicta.csp</groupId>
  <artifactId>MTIMLExtension</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <!-- A dependency declares its artifact kind with <type>; <packaging> is only valid at project level. -->
  <type>jar</type>
</dependency>
150156
</dependencies>
151157
</project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package readbiomed.annotators.characterization;
2+
3+
import java.util.ArrayList;
4+
5+
import org.apache.uima.UimaContext;
6+
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
7+
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
8+
import org.apache.uima.fit.factory.AnalysisEngineFactory;
9+
import org.apache.uima.fit.util.JCasUtil;
10+
import org.apache.uima.jcas.JCas;
11+
import org.apache.uima.resource.ResourceInitializationException;
12+
import org.cleartk.ne.type.NamedEntityMention;
13+
import org.cleartk.util.ViewUriUtil;
14+
15+
import gov.nih.nlm.nls.mti.documents.Document;
16+
import gov.nih.nlm.nls.mti.instances.Instance;
17+
import readbiomed.annotators.ml.mtiml.MTIMLAnnotator;
18+
19+
public class BMIPDocumentNotRelevantAnnotator extends MTIMLAnnotator {
20+
21+
public void initialize(UimaContext context) throws ResourceInitializationException {
22+
super.initialize(context);
23+
}
24+
25+
@Override
26+
public void process(JCas jCas) throws AnalysisEngineProcessException {
27+
Document d = new Document();
28+
d.addField("TEXT", jCas.getDocumentText());
29+
30+
Instance i = getFeatureExtractor().prepareInstance(d);
31+
32+
String pmid = ViewUriUtil.getURI(jCas).toString();
33+
System.out.println(pmid);
34+
35+
// Remove all pathogen mentions if document classified as not relevant
36+
System.out.println("Predicted " + getClassifier().predict(i));
37+
38+
if (((readbiomed.mme.classifiers.SGD)getClassifier()).predictProbability(i).getConfidence() < 0.4) {
39+
new ArrayList<NamedEntityMention>(JCasUtil.select(jCas, NamedEntityMention.class)).stream()
40+
// Remove only NCBI annotations
41+
.filter(e -> e.getMentionType().contentEquals("pathogen") && e.getMentionId().startsWith("ncbi-"))
42+
.forEach(e -> e.removeFromIndexes());
43+
}
44+
}
45+
46+
public static AnalysisEngineDescription getDescription(String trieFileName, String classifiersFileName,
47+
String featureExtractorClassName, String featureExtractorParameters)
48+
throws ResourceInitializationException {
49+
return AnalysisEngineFactory.createEngineDescription(BMIPDocumentNotRelevantAnnotator.class,
50+
PARAM_TRIE_FILE_NAME, trieFileName, PARAM_CLASSIFIERS_FILE_NAME, classifiersFileName,
51+
PARAM_FEATURE_EXTRACTOR_CLASS_NAME, featureExtractorClassName, PARAM_FEATURE_EXTRACTOR_PARAMETERS,
52+
featureExtractorParameters);
53+
}
54+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
package readbiomed.annotators.characterization;
2+
3+
import java.util.ArrayList;
4+
import java.util.Collections;
5+
import java.util.HashSet;
6+
import java.util.List;
7+
import java.util.Set;
8+
9+
import org.apache.uima.UimaContext;
10+
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
11+
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
12+
import org.apache.uima.fit.factory.AnalysisEngineFactory;
13+
import org.apache.uima.fit.util.JCasUtil;
14+
import org.apache.uima.jcas.JCas;
15+
import org.apache.uima.resource.ResourceInitializationException;
16+
import org.cleartk.ne.type.NamedEntityMention;
17+
import org.cleartk.util.ViewUriUtil;
18+
19+
import gov.nih.nlm.nls.mti.documents.Document;
20+
import gov.nih.nlm.nls.mti.instances.Instance;
21+
import readbiomed.annotators.ml.mtiml.MTIMLAnnotator;
22+
23+
public class BMIPPathogenNotRelevantAnnotator extends MTIMLAnnotator {
24+
25+
public void initialize(UimaContext context) throws ResourceInitializationException {
26+
super.initialize(context);
27+
}
28+
29+
@Override
30+
public void process(JCas jCas) throws AnalysisEngineProcessException {
31+
List<NamedEntityMention> list = new ArrayList<>();
32+
Set<String> ids = new HashSet<>();
33+
34+
for (NamedEntityMention ne : JCasUtil.select(jCas, NamedEntityMention.class)) {
35+
ids.add(ne.getMentionId());
36+
list.add(ne);
37+
}
38+
39+
String pmid = ViewUriUtil.getURI(jCas).toString();
40+
System.out.println(pmid);
41+
42+
Set<NamedEntityMention> removal = new HashSet<>();
43+
44+
// Remove potential overlapping mentions of the same pathogen
45+
for (NamedEntityMention ne : list) {
46+
for (NamedEntityMention neIn : list) {
47+
if (ne != neIn) {
48+
if (ne.getMentionId().equals(neIn.getMentionId())) {
49+
if (ne.getBegin() == neIn.getBegin() || ne.getEnd() == neIn.getEnd()) {
50+
if (!(removal.contains(ne) || removal.contains(neIn))) {
51+
removal.add(ne);
52+
}
53+
}
54+
}
55+
}
56+
57+
}
58+
}
59+
60+
for (NamedEntityMention ne : removal) {
61+
list.remove(ne);
62+
}
63+
64+
Collections.sort(list, new RelevantPathogenSet().new SortNamedEntityMentions());
65+
66+
for (String id : ids) {
67+
68+
String text = jCas.getDocumentText();
69+
70+
for (NamedEntityMention ne : list) {
71+
if (ne.getMentionId().contentEquals(id)) {
72+
if (ne.getMentionId().equals(id)) {
73+
text = text.substring(0, ne.getBegin()) + "@PATHOGEN$" + text.substring(ne.getEnd());
74+
}
75+
}
76+
}
77+
78+
Document d = new Document();
79+
d.addField("TEXT", text);
80+
81+
Instance i = getFeatureExtractor().prepareInstance(d);
82+
83+
System.out.println("Predicted " + ((readbiomed.mme.classifiers.AdaBoostM1)getClassifier()).predictProbability(i).getConfidence() + " for " + id);
84+
// Remove all pathogen mentions if document classified as not relevant and it is
85+
// an NCBI pathogen
86+
if (((readbiomed.mme.classifiers.AdaBoostM1)getClassifier()).predictProbability(i).getConfidence() < 0.1) {
87+
new ArrayList<NamedEntityMention>(JCasUtil.select(jCas, NamedEntityMention.class)).stream()
88+
.filter(e -> e.getMentionId().equals(id) && e.getMentionId().startsWith("ncbi"))
89+
.forEach(ne -> ne.removeFromIndexes());
90+
;
91+
}
92+
}
93+
}
94+
95+
public static AnalysisEngineDescription getDescription(String trieFileName, String classifiersFileName,
96+
String featureExtractorClassName, String featureExtractorParameters)
97+
throws ResourceInitializationException {
98+
return AnalysisEngineFactory.createEngineDescription(BMIPPathogenNotRelevantAnnotator.class,
99+
PARAM_TRIE_FILE_NAME, trieFileName, PARAM_CLASSIFIERS_FILE_NAME, classifiersFileName,
100+
PARAM_FEATURE_EXTRACTOR_CLASS_NAME, featureExtractorClassName, PARAM_FEATURE_EXTRACTOR_PARAMETERS,
101+
featureExtractorParameters);
102+
}
103+
}

src/main/java/readbiomed/annotators/characterization/PathogenAnnotator.java

-5
This file was deleted.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package readbiomed.annotators.characterization;
2+
3+
import java.io.IOException;
4+
5+
import org.apache.uima.fit.factory.AggregateBuilder;
6+
import org.apache.uima.resource.ResourceInitializationException;
7+
import org.apache.uima.util.InvalidXMLException;
8+
import org.xml.sax.SAXException;
9+
10+
import readbiomed.annotators.dictionary.pathogens.PathogenDictionaryAnnotator;
11+
12+
public class PathogenCharacterizationAnnotator {
13+
public static AggregateBuilder getPipeline(String dictFileName)
14+
throws InvalidXMLException, ResourceInitializationException, IOException, SAXException {
15+
AggregateBuilder builder = new AggregateBuilder();
16+
builder.add(PathogenDictionaryAnnotator.getPipeline(dictFileName).createAggregateDescription());
17+
18+
/* builder.add(BMIPPathogenNotRelevantAnnotator.getDescription("/home/antonio/Downloads/mti-ml/MTI_ML/trie.gz",
19+
"/home/antonio/Downloads/mti-ml/MTI_ML/classifiers.gz",
20+
"gov.nih.nlm.nls.mti.featuresextractors.BinaryFeatureExtractor", "-l -n -c"));
21+
*/
22+
builder.add(
23+
BMIPDocumentNotRelevantAnnotator.getDescription("/home/antonio/Downloads/mti-ml/MTI_ML/trie.excel.gz",
24+
"/home/antonio/Downloads/mti-ml/MTI_ML/classifiers.excel.gz",
25+
"gov.nih.nlm.nls.mti.featuresextractors.BinaryFeatureExtractor", "-l -n -c"));
26+
27+
return builder;
28+
}
29+
}

src/main/java/readbiomed/annotators/characterization/PathogenExperimenter.java

+75-32
Original file line numberDiff line numberDiff line change
@@ -2,63 +2,106 @@
22

33
import java.io.File;
44
import java.io.IOException;
5+
import java.net.URI;
6+
import java.net.URISyntaxException;
7+
import java.nio.file.Files;
8+
import java.util.HashMap;
9+
import java.util.HashSet;
10+
import java.util.Map;
11+
import java.util.Set;
12+
import java.util.stream.Collectors;
513

14+
import org.apache.commons.io.FileUtils;
615
import org.apache.uima.UIMAException;
716
import org.apache.uima.analysis_engine.AnalysisEngine;
8-
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
917
import org.apache.uima.fit.factory.AggregateBuilder;
1018
import org.apache.uima.fit.factory.AnalysisEngineFactory;
1119
import org.apache.uima.fit.factory.JCasFactory;
1220
import org.apache.uima.fit.util.JCasUtil;
1321
import org.apache.uima.jcas.JCas;
1422
import org.cleartk.ne.type.NamedEntityMention;
15-
import org.cleartk.opennlp.tools.SentenceAnnotator;
1623
import org.cleartk.util.ViewUriUtil;
1724
import org.xml.sax.SAXException;
1825

19-
import readbiomed.annotators.dictionary.pathogens.PathogenAnnotator;
20-
import readbiomed.annotators.discourse.sdt.SDTAnnotator;
21-
import readbiomed.document.SDTSentence;
22-
import readbiomed.document.Section;
23-
import readbiomed.readers.medline.MedlineReader;
26+
import readbiomed.annotators.dictionary.utils.CharacterizationEvaluation;
27+
import readbiomed.annotators.dictionary.utils.TextFileFilter;
2428

2529
public class PathogenExperimenter {
26-
public static void main(String[] argc) throws IOException, SAXException, UIMAException {
30+
31+
private static void evaluate(Map<String, Set<String>> gt, Map<String, Set<String>> predictions) {
32+
double tps = 0.0;
33+
double fns = 0.0;
34+
double fps = 0.0;
35+
36+
// Compare GT
37+
for (Map.Entry<String, Set<String>> entry : gt.entrySet()) {
38+
long common = entry.getValue().stream()
39+
.filter(predictions.computeIfAbsent(entry.getKey(), o -> new HashSet<>())::contains).count();
40+
41+
Set<String> fp = predictions.computeIfAbsent(entry.getKey(), o -> new HashSet<>()).stream()
42+
.filter(e -> !entry.getValue().contains(e)).collect(Collectors.toSet());
43+
44+
Set<String> fn = entry.getValue().stream()
45+
.filter(e -> !predictions.computeIfAbsent(entry.getKey(), o -> new HashSet<>()).contains(e))
46+
.collect(Collectors.toSet());
47+
48+
System.out.println(entry.getKey() + "|" + common + "|" + entry.getValue().size() + "|"
49+
+ predictions.get(entry.getKey()).size());
50+
51+
double recall = common / (double) (common + fn.size());
52+
double precision = common / (double) (common + fp.size());
53+
double f1 = (2 * precision * recall) / (precision + recall);
54+
55+
tps += common;
56+
fns += fn.size();
57+
fps += fp.size();
58+
59+
System.out.println(entry.getKey() + "|" + precision + "|" + recall + "|" + f1);
60+
System.out.println("FP:" + fp);
61+
System.out.println("FN:" + fn);
62+
}
63+
64+
double recalls = tps / (tps + fns);
65+
double precisions = tps / (tps + fps);
66+
double f1s = (2 * precisions * recalls) / (precisions + recalls);
67+
68+
System.out.println("Overall recall: " + recalls);
69+
System.out.println("Overall precision: " + precisions);
70+
System.out.println("Overall f1: " + f1s);
71+
}
72+
73+
public static void main(String[] argc) throws IOException, SAXException, UIMAException, URISyntaxException {
2774
String inputFolderName = argc[0];
2875
String dictionaryFileName = argc[1];
2976
String SDTPredictionFolderName = argc[2];
3077

31-
AggregateBuilder pa = PathogenAnnotator.getPipeline(dictionaryFileName);
32-
pa.add(SentenceAnnotator.getDescription());
33-
pa.add(SDTAnnotator.getDescription(SDTPredictionFolderName));
78+
Map<String, Set<String>> gt = CharacterizationEvaluation.getGT(
79+
"/home/antonio/Documents/git/readbiomed-bmip-datasets/manual-set/ground-truth/manual-annotation-gt.csv");
3480

35-
AnalysisEngine ae = AnalysisEngineFactory.createEngine(pa.createAggregateDescription());
36-
37-
JCas jCas = JCasFactory.createJCas();
81+
Map<String, Set<String>> predictions = new HashMap<>();
3882

39-
for (File file : new File(inputFolderName).listFiles()) {
40-
JCasCollectionReader_ImplBase cr = (JCasCollectionReader_ImplBase) org.apache.uima.fit.factory.CollectionReaderFactory
41-
.createReader(MedlineReader.getDescriptionFromFiles(file.getAbsolutePath()));
83+
AggregateBuilder pa = PathogenCharacterizationAnnotator.getPipeline(dictionaryFileName);
84+
// pa.add(SentenceAnnotator.getDescription());
85+
// pa.add(SDTAnnotator.getDescription(SDTPredictionFolderName));
4286

43-
while (cr.hasNext()) {
44-
cr.getNext(jCas);
45-
ae.process(jCas);
87+
AnalysisEngine ae = AnalysisEngineFactory.createEngine(pa.createAggregateDescription());
4688

47-
String pmid = ViewUriUtil.getURI(jCas).toString();
89+
for (File file : FileUtils.listFiles(new File(
90+
"/home/antonio/Downloads/bmip/readbiomed-bmip-8648708be55b/data/corpora/bmip-pubmed-corpus/articles-txt-format"),
91+
new TextFileFilter(), null)) {
92+
String fileName = file.getName().replaceAll(".txt$", "");
4893

49-
JCasUtil.select(jCas, Section.class).forEach(e -> {
50-
if (e.getSectionType().equalsIgnoreCase("title"))
51-
System.out.println(pmid + "|" + e.getCoveredText() + "|" + e.getSectionType());
52-
JCasUtil.selectCovered(jCas, NamedEntityMention.class, e).forEach(ne -> System.out.println(ne));
53-
});
94+
JCas jCas = JCasFactory.createText(Files.readString(file.toPath()));
95+
ViewUriUtil.setURI(jCas, new URI(file.getName()));
5496

55-
JCasUtil.select(jCas, SDTSentence.class).forEach(e -> {
56-
System.out.println(pmid + "|" + e.getCoveredText() + "|" + e.getSdtType());
57-
JCasUtil.selectCovered(jCas, NamedEntityMention.class, e).forEach(ne -> System.out.println(ne));
58-
});
97+
ae.process(jCas);
5998

60-
jCas.reset();
61-
}
99+
predictions.put(fileName,
100+
JCasUtil.select(jCas, NamedEntityMention.class).stream()
101+
.filter(e -> e.getMentionType().equals("pathogen")).map(e -> e.getMentionId())
102+
.collect(Collectors.toSet()));
62103
}
104+
105+
evaluate(gt, predictions);
63106
}
64107
}

0 commit comments

Comments
 (0)