Skip to content

Commit f8ede4b

Browse files
author
Dominika Tkaczyk
committed
main extractor and utils refactored
1 parent 81ef7d8 commit f8ede4b

13 files changed

Lines changed: 562 additions & 641 deletions

cermine-impl/src/main/java/pl/edu/icm/cermine/ContentExtractor.java

Lines changed: 148 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import pl.edu.icm.cermine.bibref.model.BibEntry;
3737
import pl.edu.icm.cermine.configuration.ContentExtractorConfigLoader;
3838
import pl.edu.icm.cermine.configuration.ContentExtractorConfig;
39+
import pl.edu.icm.cermine.content.model.ContentStructure;
3940
import pl.edu.icm.cermine.exception.AnalysisException;
4041
import pl.edu.icm.cermine.exception.TransformationException;
4142
import pl.edu.icm.cermine.metadata.model.DocumentMetadata;
@@ -169,15 +170,6 @@ public void setBxDocument(BxDocument bxDocument) throws IOException {
169170
this.extractor.setBxDocument(bxDocument);
170171
}
171172

172-
/**
173-
* Stores the document's references.
174-
*
175-
* @param references the document's references
176-
*/
177-
public void setReferences(List<BibEntry> references) {
178-
this.extractor.setReferences(references);
179-
}
180-
181173
/**
182174
* Resets the extraction results.
183175
*
@@ -232,7 +224,83 @@ public BxDocument getBxDocument(long timeoutSeconds)
232224
throws AnalysisException, TimeoutException {
233225
return getBxDocument(combineWithMainTimeout(timeoutSeconds));
234226
}
227+
228+
private BxDocument getBxDocumentWithGeneralLabels(Timeout timeout)
229+
throws AnalysisException, TimeoutException {
230+
try {
231+
TimeoutRegister.set(timeout);
232+
TimeoutRegister.get().check();
233+
return extractor.getBxDocumentWithGeneralLabels();
234+
} finally {
235+
TimeoutRegister.remove();
236+
}
237+
}
238+
239+
/**
240+
* Extracts geometric structure with general labels.
241+
*
242+
* @return geometric structure
243+
* @throws AnalysisException
244+
* @throws TimeoutException thrown when timeout deadline has passed. See
245+
* {@link #setTimeout(long)} for additional information about the timeout.
246+
*/
247+
public BxDocument getBxDocumentWithGeneralLabels()
248+
throws AnalysisException, TimeoutException {
249+
return getBxDocumentWithGeneralLabels(mainTimeout);
250+
}
251+
252+
/**
253+
* The same as {@link #getBxDocumentWithGeneralLabels()} but with a timeout.
254+
*
255+
* @param timeoutSeconds approximate timeout in seconds
256+
* @return geometric structure
257+
* @throws AnalysisException
258+
* @throws TimeoutException thrown when timeout deadline has passed. See
259+
* {@link #setTimeout(long)} for additional information about the timeout.
260+
*/
261+
public BxDocument getBxDocumentWithGeneralLabels(long timeoutSeconds)
262+
throws AnalysisException, TimeoutException {
263+
return getBxDocumentWithGeneralLabels(combineWithMainTimeout(timeoutSeconds));
264+
}
265+
266+
private BxDocument getBxDocumentWithSpecificLabels(Timeout timeout)
267+
throws AnalysisException, TimeoutException {
268+
try {
269+
TimeoutRegister.set(timeout);
270+
TimeoutRegister.get().check();
271+
return extractor.getBxDocumentWithSpecificLabels();
272+
} finally {
273+
TimeoutRegister.remove();
274+
}
275+
}
276+
277+
/**
278+
* Extracts geometric structure with specific labels.
279+
*
280+
* @return geometric structure
281+
* @throws AnalysisException
282+
* @throws TimeoutException thrown when timeout deadline has passed. See
283+
* {@link #setTimeout(long)} for additional information about the timeout.
284+
*/
285+
public BxDocument getBxDocumentWithSpecificLabels()
286+
throws AnalysisException, TimeoutException {
287+
return getBxDocumentWithSpecificLabels(mainTimeout);
288+
}
235289

290+
/**
291+
* The same as {@link #getBxDocumentWithSpecificLabels()} but with a timeout.
292+
*
293+
* @param timeoutSeconds approximate timeout in seconds
294+
* @return geometric structure
295+
* @throws AnalysisException
296+
* @throws TimeoutException thrown when timeout deadline has passed. See
297+
* {@link #setTimeout(long)} for additional information about the timeout.
298+
*/
299+
public BxDocument getBxDocumentWithSpecificLabels(long timeoutSeconds)
300+
throws AnalysisException, TimeoutException {
301+
return getBxDocumentWithSpecificLabels(combineWithMainTimeout(timeoutSeconds));
302+
}
303+
236304
private DocumentMetadata getMetadata(Timeout timeout)
237305
throws AnalysisException, TimeoutException {
238306
try {
@@ -271,12 +339,12 @@ public DocumentMetadata getMetadata(long timeoutSeconds)
271339
return getMetadata(combineWithMainTimeout(timeoutSeconds));
272340
}
273341

274-
private Element getNLMMetadata(Timeout timeout)
342+
private Element getMetadataAsNLM(Timeout timeout)
275343
throws AnalysisException, TimeoutException {
276344
try {
277345
TimeoutRegister.set(timeout);
278346
TimeoutRegister.get().check();
279-
return extractor.getNLMMetadata();
347+
return extractor.getMetadataAsNLM();
280348
} finally {
281349
TimeoutRegister.remove();
282350
}
@@ -290,9 +358,9 @@ private Element getNLMMetadata(Timeout timeout)
290358
* @throws TimeoutException thrown when timeout deadline has passed. See
291359
* {@link #setTimeout(long)} for additional information about the timeout.
292360
*/
293-
public Element getNLMMetadata()
361+
public Element getMetadataAsNLM()
294362
throws AnalysisException, TimeoutException {
295-
return getNLMMetadata(mainTimeout);
363+
return getMetadataAsNLM(mainTimeout);
296364
}
297365

298366
/**
@@ -304,9 +372,9 @@ public Element getNLMMetadata()
304372
* @throws TimeoutException thrown when timeout deadline has passed. See
305373
* {@link #setTimeout(long)} for additional information about the timeout.
306374
*/
307-
public Element getNLMMetadata(long timeoutSeconds)
375+
public Element getMetadataAsNLM(long timeoutSeconds)
308376
throws AnalysisException, TimeoutException {
309-
return getNLMMetadata(combineWithMainTimeout(timeoutSeconds));
377+
return getMetadataAsNLM(combineWithMainTimeout(timeoutSeconds));
310378
}
311379

312380
private List<BibEntry> getReferences(Timeout timeout)
@@ -347,12 +415,12 @@ public List<BibEntry> getReferences(long timeoutSeconds)
347415
return getReferences(combineWithMainTimeout(timeoutSeconds));
348416
}
349417

350-
private List<Element> getNLMReferences(Timeout timeout)
418+
private List<Element> getReferencesAsNLM(Timeout timeout)
351419
throws AnalysisException, TimeoutException {
352420
try {
353421
TimeoutRegister.set(timeout);
354422
TimeoutRegister.get().check();
355-
return extractor.getNLMReferences();
423+
return extractor.getReferencesAsNLM();
356424
} finally {
357425
TimeoutRegister.remove();
358426
}
@@ -366,9 +434,9 @@ private List<Element> getNLMReferences(Timeout timeout)
366434
* @throws TimeoutException thrown when timeout deadline has passed. See
367435
* {@link #setTimeout(long)} for additional information about the timeout.
368436
*/
369-
public List<Element> getNLMReferences()
437+
public List<Element> getReferencesAsNLM()
370438
throws AnalysisException, TimeoutException {
371-
return getNLMReferences(mainTimeout);
439+
return getReferencesAsNLM(mainTimeout);
372440
}
373441

374442
/**
@@ -380,9 +448,9 @@ public List<Element> getNLMReferences()
380448
* @throws TimeoutException thrown when timeout deadline has passed. See
381449
* {@link #setTimeout(long)} for additional information about the timeout.
382450
*/
383-
public List<Element> getNLMReferences(long timeoutSeconds)
451+
public List<Element> getReferencesAsNLM(long timeoutSeconds)
384452
throws AnalysisException, TimeoutException {
385-
return getNLMReferences(combineWithMainTimeout(timeoutSeconds));
453+
return getReferencesAsNLM(combineWithMainTimeout(timeoutSeconds));
386454
}
387455

388456
private String getRawFullText(Timeout timeout)
@@ -423,17 +491,16 @@ public String getRawFullText(long timeoutSeconds)
423491
return getRawFullText(combineWithMainTimeout(timeoutSeconds));
424492
}
425493

426-
private Element getLabelledRawFullText(Timeout timeout)
494+
private Element getLabelledFullText(Timeout timeout)
427495
throws AnalysisException, TimeoutException {
428496
try {
429497
TimeoutRegister.set(timeout);
430498
TimeoutRegister.get().check();
431-
return extractor.getLabelledRawFullText();
499+
return extractor.getLabelledFullText();
432500
} finally {
433501
TimeoutRegister.remove();
434502
}
435503
}
436-
437504
/**
438505
* Extracts labeled raw text.
439506
*
@@ -442,9 +509,9 @@ private Element getLabelledRawFullText(Timeout timeout)
442509
* @throws TimeoutException thrown when timeout deadline has passed. See
443510
* {@link #setTimeout(long)} for additional information about the timeout.
444511
*/
445-
public Element getLabelledRawFullText()
512+
public Element getLabelledFullText()
446513
throws AnalysisException, TimeoutException {
447-
return getLabelledRawFullText(mainTimeout);
514+
return getLabelledFullText(mainTimeout);
448515
}
449516

450517
/**
@@ -456,17 +523,55 @@ public Element getLabelledRawFullText()
456523
* @throws TimeoutException thrown when timeout deadline has passed. See
457524
* {@link #setTimeout(long)} for additional information about the timeout.
458525
*/
459-
public Element getLabelledRawFullText(long timeoutSeconds)
526+
public Element getLabelledFullText(long timeoutSeconds)
460527
throws AnalysisException, TimeoutException {
461-
return getLabelledRawFullText(combineWithMainTimeout(timeoutSeconds));
528+
return getLabelledFullText(combineWithMainTimeout(timeoutSeconds));
529+
}
530+
531+
private ContentStructure getBody(Timeout timeout)
532+
throws AnalysisException, TimeoutException {
533+
try {
534+
TimeoutRegister.set(timeout);
535+
TimeoutRegister.get().check();
536+
return extractor.getBody();
537+
} finally {
538+
TimeoutRegister.remove();
539+
}
540+
}
541+
542+
/**
543+
* Extracts structured full text.
544+
*
545+
* @return full text
546+
* @throws AnalysisException
547+
* @throws TimeoutException thrown when timeout deadline has passed. See
548+
* {@link #setTimeout(long)} for additional information about the timeout.
549+
*/
550+
public ContentStructure getBody()
551+
throws AnalysisException, TimeoutException {
552+
return getBody(mainTimeout);
462553
}
463554

464-
private Element getNLMText(Timeout timeout)
555+
/**
556+
* The same as {@link #getBody()} but with a timeout.
557+
*
558+
* @param timeoutSeconds approximate timeout in seconds
559+
* @return full text
560+
* @throws AnalysisException
561+
* @throws TimeoutException thrown when timeout deadline has passed. See
562+
* {@link #setTimeout(long)} for additional information about the timeout.
563+
*/
564+
public ContentStructure getBody(long timeoutSeconds)
565+
throws AnalysisException, TimeoutException {
566+
return getBody(combineWithMainTimeout(timeoutSeconds));
567+
}
568+
569+
private Element getBodyAsNLM(Timeout timeout)
465570
throws AnalysisException, TimeoutException {
466571
try {
467572
TimeoutRegister.set(timeout);
468573
TimeoutRegister.get().check();
469-
return extractor.getNLMText();
574+
return extractor.getBodyAsNLM();
470575
} finally {
471576
TimeoutRegister.remove();
472577
}
@@ -480,9 +585,9 @@ private Element getNLMText(Timeout timeout)
480585
* @throws TimeoutException thrown when timeout deadline has passed. See
481586
* {@link #setTimeout(long)} for additional information about the timeout.
482587
*/
483-
public Element getNLMText()
588+
public Element getBodyAsNLM()
484589
throws AnalysisException, TimeoutException {
485-
return getNLMText(mainTimeout);
590+
return getBodyAsNLM(mainTimeout);
486591
}
487592

488593
/**
@@ -494,17 +599,17 @@ public Element getNLMText()
494599
* @throws TimeoutException thrown when timeout deadline has passed. See
495600
* {@link #setTimeout(long)} for additional information about the timeout.
496601
*/
497-
public Element getNLMText(long timeoutSeconds)
602+
public Element getBodyAsNLM(long timeoutSeconds)
498603
throws AnalysisException, TimeoutException {
499-
return getNLMText(combineWithMainTimeout(timeoutSeconds));
604+
return getBodyAsNLM(combineWithMainTimeout(timeoutSeconds));
500605
}
501606

502-
private Element getNLMContent(Timeout timeout)
607+
private Element getContentAsNLM(Timeout timeout)
503608
throws AnalysisException, TimeoutException {
504609
try {
505610
TimeoutRegister.set(timeout);
506611
TimeoutRegister.get().check();
507-
return extractor.getNLMContent();
612+
return extractor.getContentAsNLM();
508613
} finally {
509614
TimeoutRegister.remove();
510615
}
@@ -518,9 +623,9 @@ private Element getNLMContent(Timeout timeout)
518623
* @throws TimeoutException thrown when timeout deadline has passed. See
519624
* {@link #setTimeout(long)} for additional information about the timeout.
520625
*/
521-
public Element getNLMContent()
626+
public Element getContentAsNLM()
522627
throws AnalysisException, TimeoutException {
523-
return getNLMContent(mainTimeout);
628+
return getContentAsNLM(mainTimeout);
524629
}
525630

526631
/**
@@ -532,9 +637,9 @@ public Element getNLMContent()
532637
* @throws TimeoutException thrown when timeout deadline has passed. See
533638
* {@link #setTimeout(long)} for additional information about the timeout.
534639
*/
535-
public Element getNLMContent(long timeoutSeconds)
640+
public Element getContentAsNLM(long timeoutSeconds)
536641
throws AnalysisException, TimeoutException {
537-
return getNLMContent(combineWithMainTimeout(timeoutSeconds));
642+
return getContentAsNLM(combineWithMainTimeout(timeoutSeconds));
538643
}
539644

540645
private Timeout combineWithMainTimeout(long timeoutSeconds) {
@@ -616,20 +721,19 @@ public static void main(String[] args) throws ParseException, AnalysisException,
616721
extractor.setPDF(in);
617722

618723
if (outputs.containsKey("jats")) {
619-
Element jats = extractor.getNLMContent();
724+
Element jats = extractor.getContentAsNLM();
620725
XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
621726
FileUtils.writeStringToFile(outputs.get("jats"), outputter.outputString(jats), "UTF-8");
622727
}
623728

624729
if (outputs.containsKey("trueviz")) {
625-
extractor.getLabelledRawFullText();
626-
BxDocument doc = extractor.getBxDocument();
730+
BxDocument doc = extractor.getBxDocumentWithSpecificLabels();
627731
BxDocumentToTrueVizWriter writer = new BxDocumentToTrueVizWriter();
628732
writer.write(new FileWriter(outputs.get("trueviz")), Lists.newArrayList(doc));
629733
}
630734

631735
if (outputs.containsKey("zones")) {
632-
Element text = extractor.getLabelledRawFullText();
736+
Element text = extractor.getLabelledFullText();
633737
XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
634738
FileUtils.writeStringToFile(outputs.get("zones"), outputter.outputString(text), "UTF-8");
635739
}

0 commit comments

Comments
 (0)