Skip to content

Commit 7d59a37

Browse files
committed
Merge pull request #11 from chrismattmann/master
Merge NLTKvsCoreNLP viz
2 parents fb88585 + 199a568 commit 7d59a37

File tree

10 files changed

+406
-3
lines changed

10 files changed

+406
-3
lines changed

nltkrest-examples/.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ target
22
.classpath
33
.settings
44
.project
5-
5+
.idea
6+
*.iml

nltkrest-examples/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,18 @@ Connecting to NLTKRest at: [http://localhost:8881/nltk]
3838
"result": "success"
3939
}
4040
```
41+
Generating JSON for comparing NLTK and CoreNLP
42+
==============================================
43+
NLTKandCoreNLP compares the named entities recognized by Stanford CoreNLP against those extracted by Tika trunk, which includes the ability to use the NLTKNERecogniser parser.
4144

45+
1. `java -cp target/nltkrest-examples-1.0-SNAPSHOT.jar edu.usc.ir.visualization.NLTKandCoreNLP "url/to/solr/dev" "username" "password" "path/to/destination/folder"`
46+
47+
Which should return:
48+
49+
```
50+
Json ready for Visualization: path/to/destination/folder/nltk_vs_corenlp.json
51+
```
52+
You can then visualize the generated JSON by following the instructions in [Tika-NLTKvsCoreNLP](https://github.com/manalishah/Tika-NLTKvsCoreNLP.git)
4253

4354
Questions, comments?
4455
===================

nltkrest-examples/pom.xml

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,16 @@ the License.
5252
<build>
5353
<sourceDirectory>${basedir}/src/main/java</sourceDirectory>
5454
<plugins>
55+
<plugin>
56+
<groupId>org.apache.maven.plugins</groupId>
57+
<artifactId>maven-assembly-plugin</artifactId>
58+
<version>2.3</version>
59+
<configuration>
60+
<descriptors>
61+
<descriptor>src/main/assembly/assembly.xml</descriptor>
62+
</descriptors>
63+
</configuration>
64+
</plugin>
5565
<plugin>
5666
<groupId>org.codehaus.mojo</groupId>
5767
<artifactId>animal-sniffer-maven-plugin</artifactId>
@@ -100,7 +110,7 @@ the License.
100110
</filters>
101111
<transformers>
102112
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
103-
<mainClass>edu.usc.cs.ir.nltkrest.CXFClient</mainClass>
113+
<mainClass>edu.usc.ir.nltkrest.CXFClient</mainClass>
104114
</transformer>
105115
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
106116
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
@@ -238,5 +248,17 @@ the License.
238248
<version>3.8.1</version>
239249
<scope>test</scope>
240250
</dependency>
251+
<dependency>
252+
<groupId>com.fasterxml.jackson.core</groupId>
253+
<artifactId>jackson-databind</artifactId>
254+
<version>2.4.4</version>
255+
</dependency>
256+
<dependency>
257+
<groupId>org.apache.tika</groupId>
258+
<artifactId>tika-app</artifactId>
259+
<version>1.13-SNAPSHOT</version>
260+
<!-- <scope>system</scope>
261+
<systemPath>/Users/manali/cs599_dr/tika/tika-app/target/tika-app-1.13-SNAPSHOT.jar</systemPath> -->
262+
</dependency>
241263
</dependencies>
242264
</project>
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2"
2+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3+
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2 http://maven.apache.org/xsd/assembly-1.1.2.xsd">
4+
<!-- TODO: a jarjar format would be better -->
<!--
  Assembly descriptor producing a single "jar-with-dependencies" jar:
  the project artifact and its runtime-scope dependencies are unpacked
  and placed at the root of the archive, with no base directory.
-->
5+
<id>jar-with-dependencies</id>
6+
<formats>
7+
<format>jar</format>
8+
</formats>
9+
<!-- Do not wrap the contents in a top-level directory inside the jar. -->
<includeBaseDirectory>false</includeBaseDirectory>
10+
<dependencySets>
11+
<dependencySet>
12+
<outputDirectory>/</outputDirectory>
13+
<!-- Include this project's own artifact alongside its dependencies. -->
<useProjectArtifact>true</useProjectArtifact>
14+
<!-- Unpack each dependency so its contents are merged into the jar. -->
<unpack>true</unpack>
15+
<scope>runtime</scope>
16+
</dependencySet>
17+
</dependencySets>
18+
</assembly>

nltkrest-examples/src/main/java/edu/usc/ir/nltkrest/CXFClient.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
*/
1717
package edu.usc.ir.nltkrest;
1818

19-
import javax.ws.rs.core.Form;
2019
import javax.ws.rs.core.Response;
2120
import org.apache.cxf.jaxrs.client.WebClient;
2221

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package edu.usc.ir.visualization;
18+
19+
import java.util.ArrayList;
20+
import edu.usc.ir.visualization.Series;
21+
22+
public class Labels {
23+
ArrayList<String> labels;
24+
Series[] series;
25+
public Labels(ArrayList<String> labels, Series[] series) {
26+
this.labels = labels;
27+
this.series = series;
28+
}
29+
public ArrayList<String> getLabels() {
30+
return labels;
31+
}
32+
public void setLabels(ArrayList<String> labels) {
33+
this.labels = labels;
34+
}
35+
public Series[] getSeries() {
36+
return series;
37+
}
38+
public void setSeries(Series[] series) {
39+
this.series = series;
40+
}
41+
}
Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package edu.usc.ir.visualization;
18+
19+
import java.io.ByteArrayInputStream;
20+
import java.io.File;
21+
import java.io.IOException;
22+
import java.io.InputStream;
23+
import java.nio.charset.StandardCharsets;
24+
import java.util.ArrayList;
25+
import java.util.Arrays;
26+
import java.util.Collections;
27+
import java.util.Comparator;
28+
import java.util.HashMap;
29+
import java.util.HashSet;
30+
import java.util.Iterator;
31+
32+
import javax.ws.rs.core.MediaType;
33+
import org.apache.cxf.jaxrs.client.WebClient;
34+
import org.apache.tika.Tika;
35+
import org.apache.tika.config.TikaConfig;
36+
import org.apache.tika.exception.TikaException;
37+
import org.apache.tika.metadata.Metadata;
38+
import org.apache.tika.parser.ner.NamedEntityParser;
39+
import org.apache.tika.parser.ner.nltk.NLTKNERecogniser;
40+
import org.xml.sax.SAXException;
41+
42+
import com.fasterxml.jackson.core.JsonGenerationException;
43+
import com.fasterxml.jackson.core.JsonParseException;
44+
import com.fasterxml.jackson.core.JsonProcessingException;
45+
import com.fasterxml.jackson.databind.JsonMappingException;
46+
import com.fasterxml.jackson.databind.JsonNode;
47+
import com.fasterxml.jackson.databind.ObjectMapper;
48+
49+
public class NLTKandCoreNLP {
50+
51+
private HashSet<String> freq;
52+
private HashMap<String, Integer> nltk;
53+
private HashMap<String, Integer> nlp;
54+
private JsonNode datasetElement;
55+
private Tika tika;
56+
private Metadata md;
57+
private ObjectMapper mapper;
58+
59+
public NLTKandCoreNLP() {
60+
61+
freq = new HashSet<String>();
62+
nltk = new HashMap<String,Integer>();
63+
nlp = new HashMap<String,Integer>();
64+
tika = null;
65+
datasetElement=null;
66+
mapper = new ObjectMapper();
67+
System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
68+
try {
69+
tika = new Tika(new TikaConfig(NLTKandCoreNLP.class.getResourceAsStream("tika-config.xml")));
70+
} catch (TikaException | IOException | SAXException e) {
71+
System.out.println("Could not load Tika");
72+
e.printStackTrace();
73+
}
74+
}
75+
76+
public static void main(String m[]) throws JsonParseException, JsonMappingException, IOException {
77+
78+
String memexUrl = m[0];
79+
String username = m[1];
80+
String password = m[2];
81+
File destination =new File(m[3]);
82+
NLTKandCoreNLP obj = new NLTKandCoreNLP();
83+
obj.countNER(memexUrl, username, password);
84+
obj.createJSON(destination);
85+
86+
}
87+
88+
private void countNER(String memexUrl, String username, String password) throws JsonParseException, JsonMappingException, IOException {
89+
90+
JsonNode node;
91+
JsonNode dataset=null;
92+
String url;
93+
String response;
94+
95+
for (int c=0; c<101; c+=100) {
96+
97+
url = memexUrl + "/select?q=gunsamerica&start="+c+"&rows=100&fl=content%2Corganizations%2Cpersons%2Cdates%2Clocations&wt=json&indent=true";
98+
response = WebClient
99+
.create(url, username, password, null)
100+
.accept(MediaType.APPLICATION_JSON)
101+
.get()
102+
.readEntity(String.class);
103+
104+
try {
105+
node = mapper.readTree(response);
106+
dataset= node.get("response").get("docs");
107+
} catch (JsonProcessingException e) {
108+
e.printStackTrace();
109+
} catch (IOException e) {
110+
e.printStackTrace();
111+
}
112+
113+
Iterator<JsonNode> datasetElements = dataset.iterator();
114+
115+
while (datasetElements.hasNext()) {
116+
datasetElement = datasetElements.next();
117+
String content = datasetElement.get("content").asText();
118+
md = new Metadata();
119+
try (InputStream stream = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8))) {
120+
tika.parse(stream, md);
121+
} catch (IOException e) {
122+
e.printStackTrace();
123+
}
124+
125+
if(datasetElement.has("locations")) {
126+
extract("locations");
127+
}
128+
129+
if(datasetElement.has("dates")) {
130+
extract("organizations");
131+
}
132+
133+
if(datasetElement.has("organizations")) {
134+
extract("organizations");
135+
}
136+
137+
if(datasetElement.has("persons")) {
138+
extract("persons");
139+
}
140+
141+
if(md.getValues("NER_NAMES").length > 0) {
142+
for(String ner_name: Arrays.asList(md.getValues("NER_NAMES"))) {
143+
if(!freq.contains(ner_name)) {
144+
freq.add(ner_name);
145+
}
146+
if(nltk.containsKey(ner_name)) {
147+
nltk.put(ner_name, nltk.get(ner_name) + 1);
148+
}
149+
else {
150+
nltk.put(ner_name, 1);
151+
}
152+
}
153+
}
154+
}
155+
}
156+
}
157+
158+
private void extract(String ner) throws JsonParseException, JsonMappingException, IOException {
159+
String names[]=null;
160+
names= mapper.readValue(datasetElement.get(ner).toString(),String[].class);
161+
for(int i=0; i<names.length; i++) {
162+
if(!freq.contains(names[i])) {
163+
freq.add(names[i]);
164+
}
165+
if(nlp.containsKey(names[i])) {
166+
nlp.put(names[i], nlp.get(names[i]) + 1);
167+
} else {
168+
nlp.put(names[i], 1);
169+
}
170+
}
171+
}
172+
173+
private void createJSON(File destination) throws JsonGenerationException, JsonMappingException, IOException {
174+
ArrayList<Names> frequencies = new ArrayList<Names>();
175+
for (String value:freq) {
176+
int x = nltk.containsKey(value)?nltk.get(value):0;
177+
int y = nlp.containsKey(value)?nlp.get(value):0;
178+
int z = x+y-Math.abs(x-y);
179+
if (z==0) {
180+
z = x>y?0:-y;
181+
}
182+
frequencies.add(new Names(value, z ));
183+
}
184+
185+
Collections.sort(frequencies, maximumOverlap);
186+
ArrayList<String> final_labels = new ArrayList<String>();
187+
ArrayList<Integer> nltk_value = new ArrayList<Integer>();
188+
ArrayList<Integer> nlp_value = new ArrayList<Integer>();
189+
for (int i=0; i<frequencies.size(); i++) {
190+
String value = frequencies.get(i).name;
191+
final_labels.add(value);
192+
if (nltk.containsKey(value)) {
193+
nltk_value.add(nltk.get(value));
194+
} else {
195+
nltk_value.add(0);
196+
}
197+
if (nlp.containsKey(value)) {
198+
nlp_value.add(nlp.get(value));
199+
} else {
200+
nlp_value.add(0);
201+
}
202+
}
203+
204+
Series []s = {new Series("nltk", nltk_value),new Series("nlp", nlp_value)};
205+
Labels labels = new Labels(final_labels, s);
206+
ObjectMapper mapper = new ObjectMapper();
207+
destination = new File(destination.getAbsolutePath() + "/nltk_vs_corenlp.json");
208+
mapper.writerWithDefaultPrettyPrinter().writeValue(destination, labels);
209+
System.out.println("Json ready for Visualization: " + destination.getAbsolutePath());
210+
}
211+
212+
public static Comparator<Names> maximumOverlap = new Comparator<Names>() {
213+
public int compare(Names one, Names two) {
214+
return (int)two.strength - (int)one.strength;
215+
}
216+
};
217+
218+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package edu.usc.ir.visualization;
18+
19+
/**
 * Immutable pair of a name and the strength it is sorted by.
 */
public class Names {

    /** The name. */
    final String name;

    /** Sort weight of the name. */
    final int strength;

    /**
     * Creates a pair.
     *
     * @param x the name
     * @param y the strength assigned to the name
     */
    Names(String x, int y) {
        name = x;
        strength = y;
    }
}

0 commit comments

Comments
 (0)