Skip to content

New figure/table segmentation approach and models #963

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 35 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
2721f64
use batik
kermitt2 Jul 4, 2021
1cbb815
batik integration
kermitt2 Jul 4, 2021
aa654fd
Merge branch 'master' into fix-vector-graphics
kermitt2 Jul 4, 2021
5e9e53a
test svg element merging
kermitt2 Jul 4, 2021
7b8917d
Merge branch 'master' into fix-vector-graphics
kermitt2 Jul 4, 2021
3f1058a
cleaning
kermitt2 Jul 4, 2021
b18bb88
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 3, 2021
9eecec7
start FigureSegmenterParser
kermitt2 Aug 4, 2021
7a10012
some progress on new models
kermitt2 Aug 9, 2021
d3f0df3
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 9, 2021
e964a41
review direction
kermitt2 Aug 11, 2021
0255b80
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 18, 2021
6d19e34
createTraining for figure segmenter
kermitt2 Aug 26, 2021
a92f692
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 26, 2021
01cf993
fix crop box for reference over 2 pages
kermitt2 Aug 28, 2021
0d4981d
update figure-segmenter features
kermitt2 Aug 28, 2021
6d52e2e
complete create training for figures
kermitt2 Aug 28, 2021
432d40b
various fixes
kermitt2 Aug 29, 2021
ea542e2
cleaning
kermitt2 Aug 30, 2021
711e6c9
update fulltext model with updated vector graphic processing
kermitt2 Sep 12, 2021
31bfc34
Merge branch 'master' into fix-vector-graphics
kermitt2 Sep 24, 2022
f868fe0
fix conflict
kermitt2 Nov 7, 2022
121a1cb
add stacktrace in circleci build
lfoppiano Dec 6, 2022
2b69b37
try github actions
lfoppiano Dec 6, 2022
0e722f4
fix merge conflict
kermitt2 Dec 6, 2022
ab0a514
fix merge
kermitt2 Dec 6, 2022
fb33e5d
fix conflict with latest master
kermitt2 Sep 14, 2023
e2bf621
minor doc update
kermitt2 Sep 27, 2023
d189cb5
Merge branch 'master' into new-figure-table-models
lfoppiano Dec 17, 2023
d649e22
Merge branch 'release-0.8.1' into new-figure-table-models
kermitt2 Aug 10, 2024
44d1801
Merge branch 'master' into new-figure-table-models
kermitt2 Sep 23, 2024
c664d63
working version
kermitt2 Sep 24, 2024
8e09ba4
adapt segmentation
kermitt2 Sep 29, 2024
e2409b8
catch-up review
kermitt2 Jan 11, 2025
4c3893d
solve conflicts
kermitt2 Feb 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/ci-build-unstable.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ concurrency:
jobs:
build:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
- name: Set up JDK 11
Expand Down
8 changes: 8 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ subprojects {
implementation "com.fasterxml.jackson.core:jackson-databind:2.14.3"
implementation "com.fasterxml.jackson.module:jackson-module-afterburner:2.14.3"
implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.14.3"
implementation 'org.apache.xmlgraphics:batik-anim:1.14'
implementation 'org.apache.xmlgraphics:batik-bridge:1.14'
implementation 'org.apache.xmlgraphics:batik-svg-dom:1.14'
implementation 'org.apache.xmlgraphics:batik-dom:1.14'
}

task sourceJar(type: Jar) {
Expand Down Expand Up @@ -253,6 +257,10 @@ project("grobid-core") {
implementation 'black.ninia:jep:4.0.2'
implementation 'org.apache.opennlp:opennlp-tools:1.9.1'
implementation group: 'org.jruby', name: 'jruby-complete', version: '9.2.13.0'
implementation 'org.apache.xmlgraphics:batik-anim:1.14'
implementation 'org.apache.xmlgraphics:batik-bridge:1.14'
implementation 'org.apache.xmlgraphics:batik-svg-dom:1.14'
implementation 'org.apache.xmlgraphics:batik-dom:1.14'

shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1"
}
Expand Down
2 changes: 2 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/GrobidModels.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ public enum GrobidModels implements GrobidModel {
ASTRO("astro"),
SOFTWARE("software"),
DATASEER("dataseer"),
FIGURE_SEGMENTER_UP("figure-segmenter-up"),
FIGURE_SEGMENTER_DOWN("figure-segmenter-down"),
//ACKNOWLEDGEMENT("acknowledgement"),
FUNDING_ACKNOWLEDGEMENT("funding-acknowledgement"),
INFRASTRUCTURE("infrastructure"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
import org.grobid.core.layout.Block;
import org.grobid.core.layout.Cluster;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.LayoutTokenization;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.layout.GraphicObject;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -57,7 +60,7 @@ public class BasicStructureBuilder {
* @param b integer
* @param doc a document
*/
private static void addBlockToCluster(Integer b, Document doc) {
/*private static void addBlockToCluster(Integer b, Document doc) {
// get block features
Block block = doc.getBlocks().get(b);
String font = block.getFont();
Expand Down Expand Up @@ -97,8 +100,7 @@ private static void addBlockToCluster(Integer b, Document doc) {
cluster.addBlock2(b);
doc.getClusters().add(cluster);
}

}
}*/

static public Document generalResultSegmentation(Document doc, String labeledResult, List<LayoutToken> documentTokens) {
List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult);
Expand Down Expand Up @@ -286,250 +288,13 @@ static public Document generalResultSegmentation(Document doc, String labeledRes
return doc;
}

/**
* Set the main segments of the document based on the full text parsing results
*
* @param doc a document
* @param labeledResult string
* @param tokenizations tokens
* @return a document
*/
static public Document resultSegmentation(Document doc, String labeledResult, List<String> tokenizations) {
if (doc == null) {
throw new NullPointerException("Document is null");
}
if (doc.getBlocks() == null) {
throw new NullPointerException("Blocks of the documents are null");
}
//System.out.println(tokenizations.toString());
// int i = 0;
// boolean first = true;
List<Integer> blockHeaders = new ArrayList<Integer>();
List<Integer> blockFooters = new ArrayList<Integer>();
List<Integer> blockDocumentHeaders = new ArrayList<Integer>();
List<Integer> blockSectionTitles = new ArrayList<Integer>();

SortedSet<DocumentPiece> blockReferences = new TreeSet<DocumentPiece>();

doc.setBibDataSets(new ArrayList<BibDataSet>());

// StringTokenizer st = new StringTokenizer(labeledResult, "\n");

String[] lines = labeledResult.split("\n");

String currentTag = null;
String s2 = null;
String lastTag = null;
String lastPlainTag = null;

int p = 0; // index in the results' tokenization (st)
int blockIndex = 0;

BibDataSet bib = null;

DocumentPointer pointerA = null;
// DocumentPointer pointerB = null;
DocumentPointer currentPointer;
DocumentPointer lastPointer = null;


for (String line : lines) {
// while (st.hasMoreTokens()) {

for (; blockIndex < doc.getBlocks().size() - 1; blockIndex++) {
// int startTok = doc.getBlocks().get(blockIndex).getStartToken();
int endTok = doc.getBlocks().get(blockIndex).getEndToken();

if (endTok >= p) {
break;
}
}

ArrayList<String> localFeatures = new ArrayList<String>();
boolean addSpace = false;

// String tok = st.nextToken().trim();
line = line.trim();

StringTokenizer stt = new StringTokenizer(line, "\t");
int j = 0;

boolean newLine = false;
int ll = stt.countTokens();
while (stt.hasMoreTokens()) {
String s = stt.nextToken().trim();
if (j == 0) {
s2 = s;
boolean strop = false;
while ((!strop) && (p < tokenizations.size())) {
String tokOriginal = tokenizations.get(p);
if (tokOriginal.equals(" ")
| tokOriginal.equals("\n")
| tokOriginal.equals("\r")
| tokOriginal.equals("\t")) {
addSpace = true;
p++;
} else if (tokOriginal.equals("")) {
p++;
} else //if (tokOriginal.equals(s))
{
strop = true;
}

}
} else if (j == ll - 1) {
currentTag = s; // current tag
} else {
if (s.equals("LINESTART")) {
newLine = true;
}
localFeatures.add(s);
}
j++;
}

if (lastTag != null) {
if (lastTag.startsWith("I-")) {
lastPlainTag = lastTag.substring(2, lastTag.length());
} else {
lastPlainTag = lastTag;
}
}


String currentPlainTag = null;
if (currentTag != null) {
if (currentTag.startsWith("I-")) {
currentPlainTag = currentTag.substring(2, currentTag.length());
} else {
currentPlainTag = currentTag;
}
}


currentPointer = new DocumentPointer(doc, blockIndex, p);


if (lastPlainTag != null && !currentPlainTag.equals(lastPlainTag) && lastPlainTag.equals("<references>")) {
blockReferences.add(new DocumentPiece(pointerA, lastPointer));
pointerA = currentPointer;
}

if (currentPlainTag.equals("<header>")) {
if (!blockDocumentHeaders.contains(blockIndex)) {
blockDocumentHeaders.add(blockIndex);
//System.out.println("add block header: " + blockIndexInteger.intValue());
}

} else if (currentPlainTag.equals("<references>")) {// if (!blockReferences.contains(blockIndex)) {
// blockReferences.add(blockIndex);
// //System.out.println("add block reference: " + blockIndexInteger.intValue());
// }

if (currentTag.equals("I-<references>")) {
pointerA = new DocumentPointer(doc, blockIndex, p);
if (bib != null) {
if (bib.getRawBib() != null) {
doc.getBibDataSets().add(bib);
bib = new BibDataSet();
}
} else {
bib = new BibDataSet();
}
bib.setRawBib(s2);
} else {
if (addSpace) {
if (bib == null) {
bib = new BibDataSet();
bib.setRawBib(" " + s2);
} else {
bib.setRawBib(bib.getRawBib() + " " + s2);
}
} else {
if (bib == null) {
bib = new BibDataSet();
bib.setRawBib(s2);
} else {
bib.setRawBib(bib.getRawBib() + s2);
}
}
}

// case "<reference_marker>":
// if (!blockReferences.contains(blockIndex)) {
// blockReferences.add(blockIndex);
// //System.out.println("add block reference: " + blockIndexInteger.intValue());
// }
//
// if (currentTag.equals("I-<reference_marker>")) {
// if (bib != null) {
// if (bib.getRefSymbol() != null) {
// doc.getBibDataSets().add(bib);
// bib = new BibDataSet();
// }
// } else {
// bib = new BibDataSet();
// }
// bib.setRefSymbol(s2);
// } else {
// if (addSpace) {
// if (bib == null) {
// bib = new BibDataSet();
// bib.setRefSymbol(s2);
// } else {
// bib.setRefSymbol(bib.getRefSymbol() + " " + s2);
// }
// } else {
// if (bib == null) {
// bib = new BibDataSet();
// bib.setRefSymbol(s2);
// } else {
// bib.setRefSymbol(bib.getRefSymbol() + s2);
// }
// }
// }
// break;
} else if (currentPlainTag.equals("<page_footnote>")) {
if (!blockFooters.contains(blockIndex)) {
blockFooters.add(blockIndex);
//System.out.println("add block foot note: " + blockIndexInteger.intValue());
}

} else if (currentPlainTag.equals("<page_header>")) {
if (!blockHeaders.contains(blockIndex)) {
blockHeaders.add(blockIndex);
//System.out.println("add block page header: " + blockIndexInteger.intValue());
}

} else if (currentPlainTag.equals("<section>")) {
if (!blockSectionTitles.contains(blockIndex)) {
blockSectionTitles.add(blockIndex);
//System.out.println("add block page header: " + blockIndexInteger.intValue());
}

}

lastTag = currentTag;
p++;
lastPointer = currentPointer;
}

if (bib != null) {
doc.getBibDataSets().add(bib);
}


if (!lastPointer.equals(pointerA)) {
if (lastPlainTag.equals("<references>")) {
blockReferences.add(new DocumentPiece(pointerA, lastPointer));
}
}

/*doc.setBlockHeaders(blockHeaders);
doc.setBlockFooters(blockFooters);
doc.setBlockDocumentHeaders(blockDocumentHeaders);
doc.setBlockReferences(blockReferences);
doc.setBlockSectionTitles(blockSectionTitles);*/
/**
 * Placeholder for applying figure segmentation results to a document.
 * <p>
 * In its current form the method performs no processing: it hands back the
 * very same {@code doc} reference it was given, and none of the other
 * arguments are read.
 *
 * @param doc                   the document to update; returned as-is
 * @param figureAnchors         graphic objects used as figure anchors (currently unused)
 * @param labelledResultsUp     labelled output for the "up" direction (currently unused);
 *                              presumably produced by the figure-segmenter-up model — to be confirmed
 * @param theTokenizationsUp    tokenizations matching {@code labelledResultsUp} (currently unused)
 * @param labelledResultsDown   labelled output for the "down" direction (currently unused);
 *                              presumably produced by the figure-segmenter-down model — to be confirmed
 * @param theTokenizationsDown  tokenizations matching {@code labelledResultsDown} (currently unused)
 * @return the {@code doc} argument, unchanged
 */
public static Document figureResultSegmentation(
        Document doc,
        List<GraphicObject> figureAnchors,
        String labelledResultsUp,
        List<LayoutTokenization> theTokenizationsUp,
        String labelledResultsDown,
        List<LayoutTokenization> theTokenizationsDown) {
    // TODO: segmentation logic not implemented yet; the document is passed through untouched.
    return doc;
}
Expand Down
Loading
Loading