Skip to content

Commit 7369e2d

Browse files
author
Heleno Campos
committed
Allows the parsing of whole folders
1 parent 4cf317d commit 7369e2d

File tree

3 files changed

+49
-27
lines changed

3 files changed

+49
-27
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ full: outputs the bibtex references for a pdf file
1818
short: outputs only the title, year, and publication venue of each reference in a PDF file. The fields are separated by semicolons, to make it easier to create CSV files.
1919

2020

21+
2122
How to use:
2223
java -jar refExtractor.jar full pdffile.pdf
2324

@@ -26,6 +27,8 @@ or
2627
java -jar refExtractor.jar short pdffile.pdf
2728

2829

30+
If you pass a folder instead of a file as the argument, it will attempt to parse all files in that folder and will print the results in the selected mode.
31+
2932

3033
Alternatively, if you don't want the output to be printed in the terminal, you can redirect it to a file using "> output.txt"
3134

nbactions.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
<goal>org.codehaus.mojo:exec-maven-plugin:1.5.0:exec</goal>
1111
</goals>
1212
<properties>
13-
<exec.args>-classpath %classpath info.heleno.references_extractor.App short C:\Users\Heleno.DESKTOP-89HH2F6\Downloads\840.pdf</exec.args>
13+
<exec.args>-classpath %classpath info.heleno.references_extractor.App short C:\Users\Heleno.DESKTOP-89HH2F6\Downloads\papers</exec.args>
1414
<exec.executable>java</exec.executable>
1515
</properties>
1616
</action>
@@ -24,7 +24,7 @@
2424
<goal>org.codehaus.mojo:exec-maven-plugin:1.5.0:exec</goal>
2525
</goals>
2626
<properties>
27-
<exec.args>-agentlib:jdwp=transport=dt_socket,server=n,address=${jpda.address} -classpath %classpath info.heleno.references_extractor.App short C:\Users\Heleno.DESKTOP-89HH2F6\Downloads\840.pdf</exec.args>
27+
<exec.args>-agentlib:jdwp=transport=dt_socket,server=n,address=${jpda.address} -classpath %classpath info.heleno.references_extractor.App short C:\Users\Heleno.DESKTOP-89HH2F6\Downloads\papers</exec.args>
2828
<exec.executable>java</exec.executable>
2929
<jpda.listen>true</jpda.listen>
3030
</properties>
@@ -39,7 +39,7 @@
3939
<goal>org.codehaus.mojo:exec-maven-plugin:1.5.0:exec</goal>
4040
</goals>
4141
<properties>
42-
<exec.args>-classpath %classpath info.heleno.references_extractor.App short C:\Users\Heleno.DESKTOP-89HH2F6\Downloads\840.pdf</exec.args>
42+
<exec.args>-classpath %classpath info.heleno.references_extractor.App short C:\Users\Heleno.DESKTOP-89HH2F6\Downloads\papers</exec.args>
4343
<exec.executable>java</exec.executable>
4444
</properties>
4545
</action>

src/main/java/info/heleno/references_extractor/App.java

Lines changed: 43 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
*/
66
package info.heleno.references_extractor;
77

8+
import java.io.File;
89
import java.io.FileInputStream;
9-
import java.io.FileNotFoundException;
1010
import java.io.IOException;
1111
import java.util.List;
1212
import java.util.logging.Level;
@@ -30,34 +30,53 @@ public static String getField(List<String> values) {
3030
return field;
3131
}
3232

33-
public static void main(String[] args) {
33+
public static void printFullReferences(List<BibEntry> references) {
34+
for (BibEntry entry : references) {
35+
System.out.println(entry.toBibTeX());
36+
}
37+
}
38+
39+
public static void printShortReferences(List<BibEntry> references, String source) {
40+
System.out.println("Source;Title;Year;Publication");
41+
source = source.replaceFirst("[.][^.]+$", ""); //removes file name extension
42+
for (BibEntry entry : references) {
43+
String title = getField(entry.getAllFieldValues(BibEntryFieldType.TITLE));
44+
String year = entry.getFirstFieldValue(BibEntryFieldType.YEAR);
45+
String publication = getField(entry.getAllFieldValues(BibEntryFieldType.JOURNAL));
46+
System.out.println("\"" + source + "\";"+"\"" + title + "\"" + ";" + "\"" + year + "\"" + ";" + "\"" + publication + "\"");
47+
}
48+
}
49+
50+
public static void extractFile(File file, String mode) {
3451
try {
35-
if (args.length >= 2) {
36-
37-
ContentExtractor extractor = new ContentExtractor();
38-
FileInputStream inputStream = new FileInputStream(args[1]);
39-
extractor.setPDF(inputStream);
40-
List<BibEntry> references = extractor.getReferences();
41-
if (args[0].equals("short")) {
42-
for (BibEntry entry : references) {
43-
String title = getField(entry.getAllFieldValues(BibEntryFieldType.TITLE));
44-
String year = entry.getFirstFieldValue(BibEntryFieldType.YEAR);
45-
String publication = getField(entry.getAllFieldValues(BibEntryFieldType.JOURNAL));
46-
System.out.println("\""+title +"\""+ ";" +"\""+ year +"\""+ ";" +"\""+ publication+"\"");
47-
}
48-
} else if (args[0].equals("full")) {
49-
for (BibEntry entry : references) {
50-
System.out.println(entry.toBibTeX());
52+
ContentExtractor extractor = new ContentExtractor();
53+
FileInputStream inputStream = new FileInputStream(file);
54+
extractor.setPDF(inputStream);
55+
List<BibEntry> references = extractor.getReferences();
56+
if (mode.equals("short")) {
57+
printShortReferences(references, file.getName());
58+
} else if (mode.equals("full")) {
59+
printFullReferences(references);
60+
}
61+
62+
} catch (AnalysisException | IOException ex) {
63+
Logger.getLogger(App.class.getName()).log(Level.SEVERE, null, ex);
64+
}
65+
}
66+
67+
public static void main(String[] args) {
68+
if (args.length >= 2) {
69+
File file = new File(args[1]);
70+
if (file.exists()) {
71+
if (file.isDirectory()) {
72+
for(File subfile: file.listFiles()){
73+
extractFile(subfile, args[0]);
5174
}
75+
} else {
76+
extractFile(new File(args[1]), args[0]);
5277
}
5378
}
5479

55-
} catch (AnalysisException ex) {
56-
Logger.getLogger(App.class.getName()).log(Level.SEVERE, null, ex);
57-
} catch (FileNotFoundException ex) {
58-
Logger.getLogger(App.class.getName()).log(Level.SEVERE, null, ex);
59-
} catch (IOException ex) {
60-
Logger.getLogger(App.class.getName()).log(Level.SEVERE, null, ex);
6180
}
6281
}
6382
}

0 commit comments

Comments
 (0)