2
2
3
3
import java .io .File ;
4
4
import java .io .IOException ;
5
+ import java .net .URI ;
6
+ import java .net .URISyntaxException ;
7
+ import java .nio .file .Files ;
8
+ import java .util .HashMap ;
9
+ import java .util .HashSet ;
10
+ import java .util .Map ;
11
+ import java .util .Set ;
12
+ import java .util .stream .Collectors ;
5
13
14
+ import org .apache .commons .io .FileUtils ;
6
15
import org .apache .uima .UIMAException ;
7
16
import org .apache .uima .analysis_engine .AnalysisEngine ;
8
- import org .apache .uima .fit .component .JCasCollectionReader_ImplBase ;
9
17
import org .apache .uima .fit .factory .AggregateBuilder ;
10
18
import org .apache .uima .fit .factory .AnalysisEngineFactory ;
11
19
import org .apache .uima .fit .factory .JCasFactory ;
12
20
import org .apache .uima .fit .util .JCasUtil ;
13
21
import org .apache .uima .jcas .JCas ;
14
22
import org .cleartk .ne .type .NamedEntityMention ;
15
- import org .cleartk .opennlp .tools .SentenceAnnotator ;
16
23
import org .cleartk .util .ViewUriUtil ;
17
24
import org .xml .sax .SAXException ;
18
25
19
- import readbiomed .annotators .dictionary .pathogens .PathogenAnnotator ;
20
- import readbiomed .annotators .discourse .sdt .SDTAnnotator ;
21
- import readbiomed .document .SDTSentence ;
22
- import readbiomed .document .Section ;
23
- import readbiomed .readers .medline .MedlineReader ;
26
+ import readbiomed .annotators .dictionary .utils .CharacterizationEvaluation ;
27
+ import readbiomed .annotators .dictionary .utils .TextFileFilter ;
24
28
25
29
public class PathogenExperimenter {
26
- public static void main (String [] argc ) throws IOException , SAXException , UIMAException {
30
+
31
+ private static void evaluate (Map <String , Set <String >> gt , Map <String , Set <String >> predictions ) {
32
+ double tps = 0.0 ;
33
+ double fns = 0.0 ;
34
+ double fps = 0.0 ;
35
+
36
+ // Compare GT
37
+ for (Map .Entry <String , Set <String >> entry : gt .entrySet ()) {
38
+ long common = entry .getValue ().stream ()
39
+ .filter (predictions .computeIfAbsent (entry .getKey (), o -> new HashSet <>())::contains ).count ();
40
+
41
+ Set <String > fp = predictions .computeIfAbsent (entry .getKey (), o -> new HashSet <>()).stream ()
42
+ .filter (e -> !entry .getValue ().contains (e )).collect (Collectors .toSet ());
43
+
44
+ Set <String > fn = entry .getValue ().stream ()
45
+ .filter (e -> !predictions .computeIfAbsent (entry .getKey (), o -> new HashSet <>()).contains (e ))
46
+ .collect (Collectors .toSet ());
47
+
48
+ System .out .println (entry .getKey () + "|" + common + "|" + entry .getValue ().size () + "|"
49
+ + predictions .get (entry .getKey ()).size ());
50
+
51
+ double recall = common / (double ) (common + fn .size ());
52
+ double precision = common / (double ) (common + fp .size ());
53
+ double f1 = (2 * precision * recall ) / (precision + recall );
54
+
55
+ tps += common ;
56
+ fns += fn .size ();
57
+ fps += fp .size ();
58
+
59
+ System .out .println (entry .getKey () + "|" + precision + "|" + recall + "|" + f1 );
60
+ System .out .println ("FP:" + fp );
61
+ System .out .println ("FN:" + fn );
62
+ }
63
+
64
+ double recalls = tps / (tps + fns );
65
+ double precisions = tps / (tps + fps );
66
+ double f1s = (2 * precisions * recalls ) / (precisions + recalls );
67
+
68
+ System .out .println ("Overall recall: " + recalls );
69
+ System .out .println ("Overall precision: " + precisions );
70
+ System .out .println ("Overall f1: " + f1s );
71
+ }
72
+
73
+ public static void main (String [] argc ) throws IOException , SAXException , UIMAException , URISyntaxException {
27
74
String inputFolderName = argc [0 ];
28
75
String dictionaryFileName = argc [1 ];
29
76
String SDTPredictionFolderName = argc [2 ];
30
77
31
- AggregateBuilder pa = PathogenAnnotator .getPipeline (dictionaryFileName );
32
- pa .add (SentenceAnnotator .getDescription ());
33
- pa .add (SDTAnnotator .getDescription (SDTPredictionFolderName ));
78
+ Map <String , Set <String >> gt = CharacterizationEvaluation .getGT (
79
+ "/home/antonio/Documents/git/readbiomed-bmip-datasets/manual-set/ground-truth/manual-annotation-gt.csv" );
34
80
35
- AnalysisEngine ae = AnalysisEngineFactory .createEngine (pa .createAggregateDescription ());
36
-
37
- JCas jCas = JCasFactory .createJCas ();
81
+ Map <String , Set <String >> predictions = new HashMap <>();
38
82
39
- for ( File file : new File ( inputFolderName ). listFiles ()) {
40
- JCasCollectionReader_ImplBase cr = ( JCasCollectionReader_ImplBase ) org . apache . uima . fit . factory . CollectionReaderFactory
41
- . createReader ( MedlineReader . getDescriptionFromFiles ( file . getAbsolutePath () ));
83
+ AggregateBuilder pa = PathogenCharacterizationAnnotator . getPipeline ( dictionaryFileName );
84
+ // pa.add(SentenceAnnotator.getDescription());
85
+ // pa.add(SDTAnnotator.getDescription(SDTPredictionFolderName ));
42
86
43
- while (cr .hasNext ()) {
44
- cr .getNext (jCas );
45
- ae .process (jCas );
87
+ AnalysisEngine ae = AnalysisEngineFactory .createEngine (pa .createAggregateDescription ());
46
88
47
- String pmid = ViewUriUtil .getURI (jCas ).toString ();
89
+ for (File file : FileUtils .listFiles (new File (
90
+ "/home/antonio/Downloads/bmip/readbiomed-bmip-8648708be55b/data/corpora/bmip-pubmed-corpus/articles-txt-format" ),
91
+ new TextFileFilter (), null )) {
92
+ String fileName = file .getName ().replaceAll (".txt$" , "" );
48
93
49
- JCasUtil .select (jCas , Section .class ).forEach (e -> {
50
- if (e .getSectionType ().equalsIgnoreCase ("title" ))
51
- System .out .println (pmid + "|" + e .getCoveredText () + "|" + e .getSectionType ());
52
- JCasUtil .selectCovered (jCas , NamedEntityMention .class , e ).forEach (ne -> System .out .println (ne ));
53
- });
94
+ JCas jCas = JCasFactory .createText (Files .readString (file .toPath ()));
95
+ ViewUriUtil .setURI (jCas , new URI (file .getName ()));
54
96
55
- JCasUtil .select (jCas , SDTSentence .class ).forEach (e -> {
56
- System .out .println (pmid + "|" + e .getCoveredText () + "|" + e .getSdtType ());
57
- JCasUtil .selectCovered (jCas , NamedEntityMention .class , e ).forEach (ne -> System .out .println (ne ));
58
- });
97
+ ae .process (jCas );
59
98
60
- jCas .reset ();
61
- }
99
+ predictions .put (fileName ,
100
+ JCasUtil .select (jCas , NamedEntityMention .class ).stream ()
101
+ .filter (e -> e .getMentionType ().equals ("pathogen" )).map (e -> e .getMentionId ())
102
+ .collect (Collectors .toSet ()));
62
103
}
104
+
105
+ evaluate (gt , predictions );
63
106
}
64
107
}
0 commit comments