Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d9b872a
use batik
kermitt2 Jul 4, 2021
ddddcf6
batik integration
kermitt2 Jul 4, 2021
d20fe7c
test svg element merging
kermitt2 Jul 4, 2021
f7e301c
cleaning
kermitt2 Jul 4, 2021
06e5fbc
start FigureSegmenterParser
kermitt2 Aug 4, 2021
3dcb38e
some progress on new models
kermitt2 Aug 9, 2021
c7897cf
review direction
kermitt2 Aug 11, 2021
3c35047
createTraining for figure segmenter
kermitt2 Aug 26, 2021
528358c
update figure-segmenter features
kermitt2 Aug 28, 2021
44fe317
complete create training for figures
kermitt2 Aug 28, 2021
cb0cf1a
various fixes
kermitt2 Aug 29, 2021
b6dfe75
cleaning
kermitt2 Aug 30, 2021
4302d02
update fulltext model with updated vector graphic processing
kermitt2 Sep 12, 2021
50f50ea
fix merge
kermitt2 Dec 6, 2022
9a541c6
working version
kermitt2 Sep 24, 2024
b81c639
adapt segmentation
kermitt2 Sep 29, 2024
a04e30c
catch-up review
kermitt2 Jan 11, 2025
bc9cd3d
correct merging and rebase
lfoppiano May 27, 2025
4d4a493
update libraries
lfoppiano May 27, 2025
7c7857a
extract svg boxes
lfoppiano May 27, 2025
128fe1a
refine extraction from svg figures
lfoppiano May 30, 2025
35f562b
collect image boundaries from groups and merge them
lfoppiano May 30, 2025
d23c92f
update dropwizard, which seems not building anymore
lfoppiano May 30, 2025
848ec0d
ignore too small figures
lfoppiano Jun 27, 2025
6bc150c
add entry for extracting figures and tables only
lfoppiano Jun 27, 2025
e0bb61c
more filtering and started merging boxes avoiding L-shape merge
lfoppiano Jun 28, 2025
172e595
refine the way boxes are aggregated
lfoppiano Jun 30, 2025
a972e02
Merge branch 'master' into new-figure-table-models2
lfoppiano Nov 7, 2025
bd1b3d3
typos
lfoppiano Nov 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 20 additions & 10 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ subprojects {
publications {
mavenJava(MavenPublication) {
from components.java
//artifact jar
//artifact jar
}
}
repositories {
Expand Down Expand Up @@ -185,6 +185,11 @@ subprojects {
implementation "com.fasterxml.jackson.core:jackson-databind:2.14.3"
implementation "com.fasterxml.jackson.module:jackson-module-afterburner:2.14.3"
implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.14.3"

implementation 'org.apache.xmlgraphics:batik-anim:1.19'
implementation 'org.apache.xmlgraphics:batik-bridge:1.19'
implementation 'org.apache.xmlgraphics:batik-svg-dom:1.19'
implementation 'org.apache.xmlgraphics:batik-dom:1.19'
}

tasks.register('sourceJar', Jar) {
Expand Down Expand Up @@ -259,6 +264,11 @@ project("grobid-core") {
implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
implementation group: 'org.jruby', name: 'jruby-complete', version: '9.4.12.1'

implementation 'org.apache.xmlgraphics:batik-anim:1.14'
implementation 'org.apache.xmlgraphics:batik-bridge:1.14'
implementation 'org.apache.xmlgraphics:batik-svg-dom:1.14'
implementation 'org.apache.xmlgraphics:batik-dom:1.14'

shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1"
}

Expand Down Expand Up @@ -376,17 +386,17 @@ project(":grobid-service") {
//Dropwizard
implementation 'ru.vyarus:dropwizard-guicey:7.0.2'

implementation 'io.dropwizard:dropwizard-bom:4.0.13'
implementation 'io.dropwizard:dropwizard-core:4.0.13'
implementation 'io.dropwizard:dropwizard-assets:4.0.13'
implementation 'io.dropwizard:dropwizard-testing:4.0.13'
implementation 'io.dropwizard.modules:dropwizard-testing-junit4:4.0.13'
implementation 'io.dropwizard:dropwizard-forms:4.0.13'
implementation 'io.dropwizard:dropwizard-client:4.0.13'
implementation 'io.dropwizard:dropwizard-auth:4.0.13'
implementation platform('io.dropwizard:dropwizard-bom:4.0.14')
implementation 'io.dropwizard:dropwizard-core:4.0.14'
implementation 'io.dropwizard:dropwizard-assets:4.0.14'
implementation 'io.dropwizard:dropwizard-testing:4.0.14'
implementation 'io.dropwizard.modules:dropwizard-testing-junit4:4.0.14'
implementation 'io.dropwizard:dropwizard-forms:4.0.14'
implementation 'io.dropwizard:dropwizard-client:4.0.14'
implementation 'io.dropwizard:dropwizard-auth:4.0.14'
implementation 'io.dropwizard.metrics:metrics-core:4.2.30'
implementation 'io.dropwizard.metrics:metrics-servlets:4.2.30'
implementation 'io.dropwizard:dropwizard-json-logging:4.0.13'
implementation 'io.dropwizard:dropwizard-json-logging:4.0.14'

implementation "org.apache.pdfbox:pdfbox:2.0.33"
implementation "javax.activation:activation:1.1.1"
Expand Down
5 changes: 4 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/GrobidModels.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ public enum GrobidModels implements GrobidModel {
INFRASTRUCTURE("infrastructure"),
DUMMY("none"),
LICENSE("license"),
COPYRIGHT("copyright");
COPYRIGHT("copyright"),
FIGURE_SEGMENTER("figure-segmenter"),
FIGURE_SEGMENTER_UP("figure-segmenter-up"),
FIGURE_SEGMENTER_DOWN("figure-segmenter-down");

//I cannot declare it before
public static final String DUMMY_FOLDER_LABEL = "none";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
import org.grobid.core.layout.Block;
import org.grobid.core.layout.Cluster;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.LayoutTokenization;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.layout.GraphicObject;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -58,7 +61,7 @@ public class BasicStructureBuilder {
* @param b integer
* @param doc a document
*/
private static void addBlockToCluster(Integer b, Document doc) {
/*private static void addBlockToCluster(Integer b, Document doc) {
// get block features
Block block = doc.getBlocks().get(b);
String font = block.getFont();
Expand Down Expand Up @@ -98,8 +101,7 @@ private static void addBlockToCluster(Integer b, Document doc) {
cluster.addBlock2(b);
doc.getClusters().add(cluster);
}

}
}*/

static public Document generalResultSegmentation(Document doc, String labeledResult, List<LayoutToken> documentTokens) {
List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult);
Expand Down Expand Up @@ -287,250 +289,13 @@ static public Document generalResultSegmentation(Document doc, String labeledRes
return doc;
}

/**
* Set the main segments of the document based on the full text parsing results
*
* @param doc a document
* @param labeledResult string
* @param tokenizations tokens
* @return a document
*/
static public Document resultSegmentation(Document doc, String labeledResult, List<String> tokenizations) {
if (doc == null) {
throw new NullPointerException("Document is null");
}
if (doc.getBlocks() == null) {
throw new NullPointerException("Blocks of the documents are null");
}
//System.out.println(tokenizations.toString());
// int i = 0;
// boolean first = true;
List<Integer> blockHeaders = new ArrayList<Integer>();
List<Integer> blockFooters = new ArrayList<Integer>();
List<Integer> blockDocumentHeaders = new ArrayList<Integer>();
List<Integer> blockSectionTitles = new ArrayList<Integer>();

SortedSet<DocumentPiece> blockReferences = new TreeSet<DocumentPiece>();

doc.setBibDataSets(new ArrayList<BibDataSet>());

// StringTokenizer st = new StringTokenizer(labeledResult, "\n");

String[] lines = labeledResult.split("\n");

String currentTag = null;
String s2 = null;
String lastTag = null;
String lastPlainTag = null;

int p = 0; // index in the results' tokenization (st)
int blockIndex = 0;

BibDataSet bib = null;

DocumentPointer pointerA = null;
// DocumentPointer pointerB = null;
DocumentPointer currentPointer;
DocumentPointer lastPointer = null;


for (String line : lines) {
// while (st.hasMoreTokens()) {

for (; blockIndex < doc.getBlocks().size() - 1; blockIndex++) {
// int startTok = doc.getBlocks().get(blockIndex).getStartToken();
int endTok = doc.getBlocks().get(blockIndex).getEndToken();

if (endTok >= p) {
break;
}
}

ArrayList<String> localFeatures = new ArrayList<String>();
boolean addSpace = false;

// String tok = st.nextToken().trim();
line = line.trim();

StringTokenizer stt = new StringTokenizer(line, "\t");
int j = 0;

boolean newLine = false;
int ll = stt.countTokens();
while (stt.hasMoreTokens()) {
String s = stt.nextToken().trim();
if (j == 0) {
s2 = s;
boolean strop = false;
while ((!strop) && (p < tokenizations.size())) {
String tokOriginal = tokenizations.get(p);
if (tokOriginal.equals(" ")
| tokOriginal.equals("\n")
| tokOriginal.equals("\r")
| tokOriginal.equals("\t")) {
addSpace = true;
p++;
} else if (tokOriginal.equals("")) {
p++;
} else //if (tokOriginal.equals(s))
{
strop = true;
}

}
} else if (j == ll - 1) {
currentTag = s; // current tag
} else {
if (s.equals("LINESTART")) {
newLine = true;
}
localFeatures.add(s);
}
j++;
}

if (lastTag != null) {
if (lastTag.startsWith("I-")) {
lastPlainTag = lastTag.substring(2, lastTag.length());
} else {
lastPlainTag = lastTag;
}
}


String currentPlainTag = null;
if (currentTag != null) {
if (currentTag.startsWith("I-")) {
currentPlainTag = currentTag.substring(2, currentTag.length());
} else {
currentPlainTag = currentTag;
}
}


currentPointer = new DocumentPointer(doc, blockIndex, p);


if (lastPlainTag != null && !currentPlainTag.equals(lastPlainTag) && lastPlainTag.equals("<references>")) {
blockReferences.add(new DocumentPiece(pointerA, lastPointer));
pointerA = currentPointer;
}

if (currentPlainTag.equals("<header>")) {
if (!blockDocumentHeaders.contains(blockIndex)) {
blockDocumentHeaders.add(blockIndex);
//System.out.println("add block header: " + blockIndexInteger.intValue());
}

} else if (currentPlainTag.equals("<references>")) {// if (!blockReferences.contains(blockIndex)) {
// blockReferences.add(blockIndex);
// //System.out.println("add block reference: " + blockIndexInteger.intValue());
// }

if (currentTag.equals("I-<references>")) {
pointerA = new DocumentPointer(doc, blockIndex, p);
if (bib != null) {
if (bib.getRawBib() != null) {
doc.getBibDataSets().add(bib);
bib = new BibDataSet();
}
} else {
bib = new BibDataSet();
}
bib.setRawBib(s2);
} else {
if (addSpace) {
if (bib == null) {
bib = new BibDataSet();
bib.setRawBib(" " + s2);
} else {
bib.setRawBib(bib.getRawBib() + " " + s2);
}
} else {
if (bib == null) {
bib = new BibDataSet();
bib.setRawBib(s2);
} else {
bib.setRawBib(bib.getRawBib() + s2);
}
}
}

// case "<reference_marker>":
// if (!blockReferences.contains(blockIndex)) {
// blockReferences.add(blockIndex);
// //System.out.println("add block reference: " + blockIndexInteger.intValue());
// }
//
// if (currentTag.equals("I-<reference_marker>")) {
// if (bib != null) {
// if (bib.getRefSymbol() != null) {
// doc.getBibDataSets().add(bib);
// bib = new BibDataSet();
// }
// } else {
// bib = new BibDataSet();
// }
// bib.setRefSymbol(s2);
// } else {
// if (addSpace) {
// if (bib == null) {
// bib = new BibDataSet();
// bib.setRefSymbol(s2);
// } else {
// bib.setRefSymbol(bib.getRefSymbol() + " " + s2);
// }
// } else {
// if (bib == null) {
// bib = new BibDataSet();
// bib.setRefSymbol(s2);
// } else {
// bib.setRefSymbol(bib.getRefSymbol() + s2);
// }
// }
// }
// break;
} else if (currentPlainTag.equals("<page_footnote>")) {
if (!blockFooters.contains(blockIndex)) {
blockFooters.add(blockIndex);
//System.out.println("add block foot note: " + blockIndexInteger.intValue());
}

} else if (currentPlainTag.equals("<page_header>")) {
if (!blockHeaders.contains(blockIndex)) {
blockHeaders.add(blockIndex);
//System.out.println("add block page header: " + blockIndexInteger.intValue());
}

} else if (currentPlainTag.equals("<section>")) {
if (!blockSectionTitles.contains(blockIndex)) {
blockSectionTitles.add(blockIndex);
//System.out.println("add block page header: " + blockIndexInteger.intValue());
}

}

lastTag = currentTag;
p++;
lastPointer = currentPointer;
}

if (bib != null) {
doc.getBibDataSets().add(bib);
}


if (!lastPointer.equals(pointerA)) {
if (lastPlainTag.equals("<references>")) {
blockReferences.add(new DocumentPiece(pointerA, lastPointer));
}
}
static public Document figureResultSegmentation(Document doc,
List<GraphicObject> figureAnchors,
String labelledResultsUp,
List<LayoutTokenization> theTokenizationsUp,
String labelledResultsDown,
List<LayoutTokenization> theTokenizationsDown) {

/*doc.setBlockHeaders(blockHeaders);
doc.setBlockFooters(blockFooters);
doc.setBlockDocumentHeaders(blockDocumentHeaders);
doc.setBlockReferences(blockReferences);
doc.setBlockSectionTitles(blockSectionTitles);*/

return doc;
}
Expand Down
Loading
Loading