3636import pl .edu .icm .cermine .bibref .model .BibEntry ;
3737import pl .edu .icm .cermine .configuration .ContentExtractorConfigLoader ;
3838import pl .edu .icm .cermine .configuration .ContentExtractorConfig ;
39+ import pl .edu .icm .cermine .content .model .ContentStructure ;
3940import pl .edu .icm .cermine .exception .AnalysisException ;
4041import pl .edu .icm .cermine .exception .TransformationException ;
4142import pl .edu .icm .cermine .metadata .model .DocumentMetadata ;
@@ -169,15 +170,6 @@ public void setBxDocument(BxDocument bxDocument) throws IOException {
169170 this .extractor .setBxDocument (bxDocument );
170171 }
171172
172- /**
173- * Stores the document's references.
174- *
175- * @param references the document's references
176- */
177- public void setReferences (List <BibEntry > references ) {
178- this .extractor .setReferences (references );
179- }
180-
181173 /**
182174 * Resets the extraction results.
183175 *
@@ -232,7 +224,83 @@ public BxDocument getBxDocument(long timeoutSeconds)
232224 throws AnalysisException , TimeoutException {
233225 return getBxDocument (combineWithMainTimeout (timeoutSeconds ));
234226 }
227+
228+ private BxDocument getBxDocumentWithGeneralLabels (Timeout timeout )
229+ throws AnalysisException , TimeoutException {
230+ try {
231+ TimeoutRegister .set (timeout );
232+ TimeoutRegister .get ().check ();
233+ return extractor .getBxDocumentWithGeneralLabels ();
234+ } finally {
235+ TimeoutRegister .remove ();
236+ }
237+ }
238+
239+ /**
240+ * Extracts geometric structure with general labels.
241+ *
242+ * @return geometric structure
243+ * @throws AnalysisException
244+ * @throws TimeoutException thrown when timeout deadline has passed. See
245+ * {@link #setTimeout(long)} for additional information about the timeout.
246+ */
247+ public BxDocument getBxDocumentWithGeneralLabels ()
248+ throws AnalysisException , TimeoutException {
249+ return getBxDocumentWithGeneralLabels (mainTimeout );
250+ }
251+
252+ /**
253+ * The same as {@link #getBxDocumentWithGeneralLabels()} but with a timeout.
254+ *
255+ * @param timeoutSeconds approximate timeout in seconds
256+ * @return geometric structure
257+ * @throws AnalysisException
258+ * @throws TimeoutException thrown when timeout deadline has passed. See
259+ * {@link #setTimeout(long)} for additional information about the timeout.
260+ */
261+ public BxDocument getBxDocumentWithGeneralLabels (long timeoutSeconds )
262+ throws AnalysisException , TimeoutException {
263+ return getBxDocumentWithGeneralLabels (combineWithMainTimeout (timeoutSeconds ));
264+ }
265+
266+ private BxDocument getBxDocumentWithSpecificLabels (Timeout timeout )
267+ throws AnalysisException , TimeoutException {
268+ try {
269+ TimeoutRegister .set (timeout );
270+ TimeoutRegister .get ().check ();
271+ return extractor .getBxDocumentWithSpecificLabels ();
272+ } finally {
273+ TimeoutRegister .remove ();
274+ }
275+ }
276+
277+ /**
278+ * Extracts geometric structure with specific labels.
279+ *
280+ * @return geometric structure
281+ * @throws AnalysisException
282+ * @throws TimeoutException thrown when timeout deadline has passed. See
283+ * {@link #setTimeout(long)} for additional information about the timeout.
284+ */
285+ public BxDocument getBxDocumentWithSpecificLabels ()
286+ throws AnalysisException , TimeoutException {
287+ return getBxDocumentWithSpecificLabels (mainTimeout );
288+ }
235289
290+ /**
291+ * The same as {@link #getBxDocumentWithSpecificLabels()} but with a timeout.
292+ *
293+ * @param timeoutSeconds approximate timeout in seconds
294+ * @return geometric structure
295+ * @throws AnalysisException
296+ * @throws TimeoutException thrown when timeout deadline has passed. See
297+ * {@link #setTimeout(long)} for additional information about the timeout.
298+ */
299+ public BxDocument getBxDocumentWithSpecificLabels (long timeoutSeconds )
300+ throws AnalysisException , TimeoutException {
301+ return getBxDocumentWithSpecificLabels (combineWithMainTimeout (timeoutSeconds ));
302+ }
303+
236304 private DocumentMetadata getMetadata (Timeout timeout )
237305 throws AnalysisException , TimeoutException {
238306 try {
@@ -271,12 +339,12 @@ public DocumentMetadata getMetadata(long timeoutSeconds)
271339 return getMetadata (combineWithMainTimeout (timeoutSeconds ));
272340 }
273341
274- private Element getNLMMetadata (Timeout timeout )
342+ private Element getMetadataAsNLM (Timeout timeout )
275343 throws AnalysisException , TimeoutException {
276344 try {
277345 TimeoutRegister .set (timeout );
278346 TimeoutRegister .get ().check ();
279- return extractor .getNLMMetadata ();
347+ return extractor .getMetadataAsNLM ();
280348 } finally {
281349 TimeoutRegister .remove ();
282350 }
@@ -290,9 +358,9 @@ private Element getNLMMetadata(Timeout timeout)
290358 * @throws TimeoutException thrown when timeout deadline has passed. See
291359 * {@link #setTimeout(long)} for additional information about the timeout.
292360 */
293- public Element getNLMMetadata ()
361+ public Element getMetadataAsNLM ()
294362 throws AnalysisException , TimeoutException {
295- return getNLMMetadata (mainTimeout );
363+ return getMetadataAsNLM (mainTimeout );
296364 }
297365
298366 /**
@@ -304,9 +372,9 @@ public Element getNLMMetadata()
304372 * @throws TimeoutException thrown when timeout deadline has passed. See
305373 * {@link #setTimeout(long)} for additional information about the timeout.
306374 */
307- public Element getNLMMetadata (long timeoutSeconds )
375+ public Element getMetadataAsNLM (long timeoutSeconds )
308376 throws AnalysisException , TimeoutException {
309- return getNLMMetadata (combineWithMainTimeout (timeoutSeconds ));
377+ return getMetadataAsNLM (combineWithMainTimeout (timeoutSeconds ));
310378 }
311379
312380 private List <BibEntry > getReferences (Timeout timeout )
@@ -347,12 +415,12 @@ public List<BibEntry> getReferences(long timeoutSeconds)
347415 return getReferences (combineWithMainTimeout (timeoutSeconds ));
348416 }
349417
350- private List <Element > getNLMReferences (Timeout timeout )
418+ private List <Element > getReferencesAsNLM (Timeout timeout )
351419 throws AnalysisException , TimeoutException {
352420 try {
353421 TimeoutRegister .set (timeout );
354422 TimeoutRegister .get ().check ();
355- return extractor .getNLMReferences ();
423+ return extractor .getReferencesAsNLM ();
356424 } finally {
357425 TimeoutRegister .remove ();
358426 }
@@ -366,9 +434,9 @@ private List<Element> getNLMReferences(Timeout timeout)
366434 * @throws TimeoutException thrown when timeout deadline has passed. See
367435 * {@link #setTimeout(long)} for additional information about the timeout.
368436 */
369- public List <Element > getNLMReferences ()
437+ public List <Element > getReferencesAsNLM ()
370438 throws AnalysisException , TimeoutException {
371- return getNLMReferences (mainTimeout );
439+ return getReferencesAsNLM (mainTimeout );
372440 }
373441
374442 /**
@@ -380,9 +448,9 @@ public List<Element> getNLMReferences()
380448 * @throws TimeoutException thrown when timeout deadline has passed. See
381449 * {@link #setTimeout(long)} for additional information about the timeout.
382450 */
383- public List <Element > getNLMReferences (long timeoutSeconds )
451+ public List <Element > getReferencesAsNLM (long timeoutSeconds )
384452 throws AnalysisException , TimeoutException {
385- return getNLMReferences (combineWithMainTimeout (timeoutSeconds ));
453+ return getReferencesAsNLM (combineWithMainTimeout (timeoutSeconds ));
386454 }
387455
388456 private String getRawFullText (Timeout timeout )
@@ -423,17 +491,16 @@ public String getRawFullText(long timeoutSeconds)
423491 return getRawFullText (combineWithMainTimeout (timeoutSeconds ));
424492 }
425493
426- private Element getLabelledRawFullText (Timeout timeout )
494+ private Element getLabelledFullText (Timeout timeout )
427495 throws AnalysisException , TimeoutException {
428496 try {
429497 TimeoutRegister .set (timeout );
430498 TimeoutRegister .get ().check ();
431- return extractor .getLabelledRawFullText ();
499+ return extractor .getLabelledFullText ();
432500 } finally {
433501 TimeoutRegister .remove ();
434502 }
435503 }
436-
437504 /**
438505 * Extracts labeled raw text.
439506 *
@@ -442,9 +509,9 @@ private Element getLabelledRawFullText(Timeout timeout)
442509 * @throws TimeoutException thrown when timeout deadline has passed. See
443510 * {@link #setTimeout(long)} for additional information about the timeout.
444511 */
445- public Element getLabelledRawFullText ()
512+ public Element getLabelledFullText ()
446513 throws AnalysisException , TimeoutException {
447- return getLabelledRawFullText (mainTimeout );
514+ return getLabelledFullText (mainTimeout );
448515 }
449516
450517 /**
@@ -456,17 +523,55 @@ public Element getLabelledRawFullText()
456523 * @throws TimeoutException thrown when timeout deadline has passed. See
457524 * {@link #setTimeout(long)} for additional information about the timeout.
458525 */
459- public Element getLabelledRawFullText (long timeoutSeconds )
526+ public Element getLabelledFullText (long timeoutSeconds )
460527 throws AnalysisException , TimeoutException {
461- return getLabelledRawFullText (combineWithMainTimeout (timeoutSeconds ));
528+ return getLabelledFullText (combineWithMainTimeout (timeoutSeconds ));
529+ }
530+
531+ private ContentStructure getBody (Timeout timeout )
532+ throws AnalysisException , TimeoutException {
533+ try {
534+ TimeoutRegister .set (timeout );
535+ TimeoutRegister .get ().check ();
536+ return extractor .getBody ();
537+ } finally {
538+ TimeoutRegister .remove ();
539+ }
540+ }
541+
542+ /**
543+ * Extracts structured full text.
544+ *
545+ * @return full text
546+ * @throws AnalysisException
547+ * @throws TimeoutException thrown when timeout deadline has passed. See
548+ * {@link #setTimeout(long)} for additional information about the timeout.
549+ */
550+ public ContentStructure getBody ()
551+ throws AnalysisException , TimeoutException {
552+ return getBody (mainTimeout );
462553 }
463554
464- private Element getNLMText (Timeout timeout )
555+ /**
556+ * The same as {@link #getBody()} but with a timeout.
557+ *
558+ * @param timeoutSeconds approximate timeout in seconds
559+ * @return full text
560+ * @throws AnalysisException
561+ * @throws TimeoutException thrown when timeout deadline has passed. See
562+ * {@link #setTimeout(long)} for additional information about the timeout.
563+ */
564+ public ContentStructure getBody (long timeoutSeconds )
565+ throws AnalysisException , TimeoutException {
566+ return getBody (combineWithMainTimeout (timeoutSeconds ));
567+ }
568+
569+ private Element getBodyAsNLM (Timeout timeout )
465570 throws AnalysisException , TimeoutException {
466571 try {
467572 TimeoutRegister .set (timeout );
468573 TimeoutRegister .get ().check ();
469- return extractor .getNLMText ();
574+ return extractor .getBodyAsNLM ();
470575 } finally {
471576 TimeoutRegister .remove ();
472577 }
@@ -480,9 +585,9 @@ private Element getNLMText(Timeout timeout)
480585 * @throws TimeoutException thrown when timeout deadline has passed. See
481586 * {@link #setTimeout(long)} for additional information about the timeout.
482587 */
483- public Element getNLMText ()
588+ public Element getBodyAsNLM ()
484589 throws AnalysisException , TimeoutException {
485- return getNLMText (mainTimeout );
590+ return getBodyAsNLM (mainTimeout );
486591 }
487592
488593 /**
@@ -494,17 +599,17 @@ public Element getNLMText()
494599 * @throws TimeoutException thrown when timeout deadline has passed. See
495600 * {@link #setTimeout(long)} for additional information about the timeout.
496601 */
497- public Element getNLMText (long timeoutSeconds )
602+ public Element getBodyAsNLM (long timeoutSeconds )
498603 throws AnalysisException , TimeoutException {
499- return getNLMText (combineWithMainTimeout (timeoutSeconds ));
604+ return getBodyAsNLM (combineWithMainTimeout (timeoutSeconds ));
500605 }
501606
502- private Element getNLMContent (Timeout timeout )
607+ private Element getContentAsNLM (Timeout timeout )
503608 throws AnalysisException , TimeoutException {
504609 try {
505610 TimeoutRegister .set (timeout );
506611 TimeoutRegister .get ().check ();
507- return extractor .getNLMContent ();
612+ return extractor .getContentAsNLM ();
508613 } finally {
509614 TimeoutRegister .remove ();
510615 }
@@ -518,9 +623,9 @@ private Element getNLMContent(Timeout timeout)
518623 * @throws TimeoutException thrown when timeout deadline has passed. See
519624 * {@link #setTimeout(long)} for additional information about the timeout.
520625 */
521- public Element getNLMContent ()
626+ public Element getContentAsNLM ()
522627 throws AnalysisException , TimeoutException {
523- return getNLMContent (mainTimeout );
628+ return getContentAsNLM (mainTimeout );
524629 }
525630
526631 /**
@@ -532,9 +637,9 @@ public Element getNLMContent()
532637 * @throws TimeoutException thrown when timeout deadline has passed. See
533638 * {@link #setTimeout(long)} for additional information about the timeout.
534639 */
535- public Element getNLMContent (long timeoutSeconds )
640+ public Element getContentAsNLM (long timeoutSeconds )
536641 throws AnalysisException , TimeoutException {
537- return getNLMContent (combineWithMainTimeout (timeoutSeconds ));
642+ return getContentAsNLM (combineWithMainTimeout (timeoutSeconds ));
538643 }
539644
540645 private Timeout combineWithMainTimeout (long timeoutSeconds ) {
@@ -616,20 +721,19 @@ public static void main(String[] args) throws ParseException, AnalysisException,
616721 extractor .setPDF (in );
617722
618723 if (outputs .containsKey ("jats" )) {
619- Element jats = extractor .getNLMContent ();
724+ Element jats = extractor .getContentAsNLM ();
620725 XMLOutputter outputter = new XMLOutputter (Format .getPrettyFormat ());
621726 FileUtils .writeStringToFile (outputs .get ("jats" ), outputter .outputString (jats ), "UTF-8" );
622727 }
623728
624729 if (outputs .containsKey ("trueviz" )) {
625- extractor .getLabelledRawFullText ();
626- BxDocument doc = extractor .getBxDocument ();
730+ BxDocument doc = extractor .getBxDocumentWithSpecificLabels ();
627731 BxDocumentToTrueVizWriter writer = new BxDocumentToTrueVizWriter ();
628732 writer .write (new FileWriter (outputs .get ("trueviz" )), Lists .newArrayList (doc ));
629733 }
630734
631735 if (outputs .containsKey ("zones" )) {
632- Element text = extractor .getLabelledRawFullText ();
736+ Element text = extractor .getLabelledFullText ();
633737 XMLOutputter outputter = new XMLOutputter (Format .getPrettyFormat ());
634738 FileUtils .writeStringToFile (outputs .get ("zones" ), outputter .outputString (text ), "UTF-8" );
635739 }
0 commit comments