Skip to content

Commit a94a7cc

Browse files
committed
Adding some warning if tables/figures are dropped. Cleanup.
1 parent ab11f2d commit a94a7cc

File tree

2 files changed: 22 additions (+22) and 18 deletions (-18)

grobid-core/src/main/java/org/grobid/core/data/Figure.java

Lines changed: 13 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -329,6 +329,7 @@ public boolean isCompleteForTEI() {
329329

330330
public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
331331
if (!isCompleteForTEI()) {
332+
LOGGER.warn("Found a figure that is badly formatted but it should have been spotted before. We ignore it now.");
332333
return null;
333334
}
334335
Element figureElement = XmlBuilderUtils.teiElement("figure");
@@ -339,18 +340,18 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
339340
if (config.isGenerateTeiCoordinates("figure")) {
340341
List<BoundingBox> theBoxes = null;
341342
// non graphic elements
342-
if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
343+
if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
343344
theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
344345
}
345346

346347
// if (getBitmapGraphicObjects() != null && !getBitmapGraphicObjects().isEmpty()) {
347348
// -> note: this was restricted to the bitmap objects only... the bounding box calculation
348349
// with vector graphics might need some double check
349350

350-
// here we bound all figure graphics in one single box (given that we can have hundred graphics
351+
// here we bound all figure graphics in one single box (given that we can have a hundred graphics
351352
// in a single figure)
352353
BoundingBox theGraphicsBox = null;
353-
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
354+
if (CollectionUtils.isNotEmpty(graphicObjects)) {
354355
for (GraphicObject graphicObject : graphicObjects) {
355356
if (theGraphicsBox == null) {
356357
theGraphicsBox = graphicObject.getBoundingBox();
@@ -366,24 +367,24 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
366367
theBoxes.add(theGraphicsBox);
367368
}
368369

369-
if (theBoxes != null && theBoxes.size() > 0) {
370+
if (CollectionUtils.isNotEmpty(theBoxes)) {
370371
String coords = Joiner.on(";").join(theBoxes);
371372
XmlBuilderUtils.addCoords(figureElement, coords);
372373
}
373374
}
374-
if (header != null) {
375+
376+
if (StringUtils.isNotBlank(header)) {
375377
Element head = XmlBuilderUtils.teiElement("head",
376378
LayoutTokensUtil.normalizeText(header.toString()));
377379
figureElement.appendChild(head);
378-
379380
}
380-
if (label != null) {
381+
382+
if (StringUtils.isNotBlank(label)) {
381383
Element labelEl = XmlBuilderUtils.teiElement("label",
382384
LayoutTokensUtil.normalizeText(label.toString()));
383385
figureElement.appendChild(labelEl);
384386
}
385-
if (caption != null) {
386-
387+
if (StringUtils.isNotBlank(caption)) {
387388
Element desc = XmlBuilderUtils.teiElement("figDesc");
388389
if (config.isGenerateTeiIds()) {
389390
String divID = KeyGen.getKey().substring(0, 7);
@@ -392,12 +393,12 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
392393

393394
// if the segment has been parsed with the full text model we further extract the clusters
394395
// to get the bibliographical references
395-
if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
396+
if (StringUtils.isNotBlank(labeledCaption)) {
396397
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
397398
List<TaggingTokenCluster> clusters = clusteror.cluster();
398399

399400
MarkerType citationMarkerType = null;
400-
if (markerTypes != null && markerTypes.size()>0) {
401+
if (CollectionUtils.isNotEmpty(markerTypes)) {
401402
citationMarkerType = markerTypes.get(0);
402403
}
403404

@@ -453,7 +454,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
453454

454455
figureElement.appendChild(desc);
455456
}
456-
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
457+
if (CollectionUtils.isNotEmpty(graphicObjects)) {
457458
for (GraphicObject graphicObject : graphicObjects) {
458459
Element go = XmlBuilderUtils.teiElement("graphic");
459460
String uri = graphicObject.getURI();

grobid-core/src/main/java/org/grobid/core/data/Table.java

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
package org.grobid.core.data;
22

3+
import org.apache.commons.collections4.CollectionUtils;
34
import org.grobid.core.GrobidModels;
45
import org.apache.commons.lang3.StringUtils;
56
import org.grobid.core.data.table.Cell;
@@ -69,6 +70,7 @@ public boolean isCompleteForTEI() {
6970
@Override
7071
public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
7172
if (!isCompleteForTEI()) {
73+
LOGGER.warn("Found a table that is badly formatted but it should have been spotted before. We ignore it now.");
7274
return null;
7375
}
7476

@@ -98,7 +100,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
98100
}*/
99101

100102
Element desc = null;
101-
if (caption != null) {
103+
if (StringUtils.isNotBlank(caption)) {
102104
// if the segment has been parsed with the full text model we further extract the clusters
103105
// to get the bibliographical references
104106

@@ -111,16 +113,17 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
111113
if (StringUtils.isNotBlank(labeledCaption)) {
112114
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
113115
List<TaggingTokenCluster> clusters = clusteror.cluster();
116+
117+
MarkerType citationMarkerType = null;
118+
if (CollectionUtils.isNotEmpty(markerTypes)) {
119+
citationMarkerType = markerTypes.get(0);
120+
}
121+
114122
for (TaggingTokenCluster cluster : clusters) {
115123
if (cluster == null) {
116124
continue;
117125
}
118126

119-
MarkerType citationMarkerType = null;
120-
if (markerTypes != null && markerTypes.size()>0) {
121-
citationMarkerType = markerTypes.get(0);
122-
}
123-
124127
TaggingLabel clusterLabel = cluster.getTaggingLabel();
125128
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
126129
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());

0 commit comments

Comments
 (0)