diff --git a/R/RcppExports.R b/R/RcppExports.R index 16c4583..2e3e6da 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -17,8 +17,8 @@ poppler_pdf_data <- function(x, get_font_info, opw, upw) { .Call('_pdftools_poppler_pdf_data', PACKAGE = 'pdftools', x, get_font_info, opw, upw) } -poppler_pdf_text <- function(x, opw, upw) { - .Call('_pdftools_poppler_pdf_text', PACKAGE = 'pdftools', x, opw, upw) +poppler_pdf_text <- function(x, opw, upw, raw = FALSE) { + .Call('_pdftools_poppler_pdf_text', PACKAGE = 'pdftools', x, opw, upw, raw) } poppler_pdf_pagesize <- function(x, opw, upw) { diff --git a/R/tools.R b/R/tools.R index 785163e..6674992 100644 --- a/R/tools.R +++ b/R/tools.R @@ -41,9 +41,10 @@ pdf_info <- function(pdf, opw = "", upw = "") { } #' @rdname pdftools +#' @param raw if TRUE text is kept in content stream order. Default: FALSE. #' @export -pdf_text <- function(pdf, opw = "", upw = "") { - poppler_pdf_text(loadfile(pdf), opw, upw) +pdf_text <- function(pdf, opw = "", upw = "", raw = FALSE) { + poppler_pdf_text(loadfile(pdf), opw, upw, raw) } #' @rdname pdftools diff --git a/man/pdftools.Rd b/man/pdftools.Rd index 92bc426..08a5a73 100644 --- a/man/pdftools.Rd +++ b/man/pdftools.Rd @@ -13,7 +13,7 @@ \usage{ pdf_info(pdf, opw = "", upw = "") -pdf_text(pdf, opw = "", upw = "") +pdf_text(pdf, opw = "", upw = "", raw = FALSE) pdf_data(pdf, font_info = FALSE, opw = "", upw = "") @@ -32,6 +32,8 @@ pdf_pagesize(pdf, opw = "", upw = "") \item{upw}{string with user password to open pdf} +\item{raw}{if TRUE text is kept in content stream order. Default: FALSE.} + \item{font_info}{if TRUE, extract font-data for each box. 
Be careful, this requires a very recent version of poppler and will error otherwise.} } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 5b1d683..6116bab 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -59,15 +59,16 @@ BEGIN_RCPP END_RCPP } // poppler_pdf_text -CharacterVector poppler_pdf_text(RawVector x, std::string opw, std::string upw); -RcppExport SEXP _pdftools_poppler_pdf_text(SEXP xSEXP, SEXP opwSEXP, SEXP upwSEXP) { +CharacterVector poppler_pdf_text(RawVector x, std::string opw, std::string upw, bool raw); +RcppExport SEXP _pdftools_poppler_pdf_text(SEXP xSEXP, SEXP opwSEXP, SEXP upwSEXP, SEXP rawSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< RawVector >::type x(xSEXP); Rcpp::traits::input_parameter< std::string >::type opw(opwSEXP); Rcpp::traits::input_parameter< std::string >::type upw(upwSEXP); - rcpp_result_gen = Rcpp::wrap(poppler_pdf_text(x, opw, upw)); + Rcpp::traits::input_parameter< bool >::type raw(rawSEXP); + rcpp_result_gen = Rcpp::wrap(poppler_pdf_text(x, opw, upw, raw)); return rcpp_result_gen; END_RCPP } @@ -175,7 +176,7 @@ static const R_CallMethodDef CallEntries[] = { {"_pdftools_get_poppler_config", (DL_FUNC) &_pdftools_get_poppler_config, 0}, {"_pdftools_poppler_pdf_info", (DL_FUNC) &_pdftools_poppler_pdf_info, 3}, {"_pdftools_poppler_pdf_data", (DL_FUNC) &_pdftools_poppler_pdf_data, 4}, - {"_pdftools_poppler_pdf_text", (DL_FUNC) &_pdftools_poppler_pdf_text, 3}, + {"_pdftools_poppler_pdf_text", (DL_FUNC) &_pdftools_poppler_pdf_text, 4}, {"_pdftools_poppler_pdf_pagesize", (DL_FUNC) &_pdftools_poppler_pdf_pagesize, 3}, {"_pdftools_poppler_pdf_fonts", (DL_FUNC) &_pdftools_poppler_pdf_fonts, 3}, {"_pdftools_poppler_pdf_files", (DL_FUNC) &_pdftools_poppler_pdf_files, 3}, diff --git a/src/bindings.cpp b/src/bindings.cpp index a71acc1..9e61ad4 100644 --- a/src/bindings.cpp +++ b/src/bindings.cpp @@ -260,13 +260,13 @@ List poppler_pdf_data (RawVector x, 
bool get_font_info, std::string opw, std::st } // [[Rcpp::export]] -CharacterVector poppler_pdf_text (RawVector x, std::string opw, std::string upw) { +CharacterVector poppler_pdf_text (RawVector x, std::string opw, std::string upw, bool raw = false) { std::unique_ptr<poppler::document> doc(read_raw_pdf(x, opw, upw)); CharacterVector out(doc->pages()); for(int i = 0; i < doc->pages(); i++){ std::unique_ptr<poppler::page> p(doc->create_page(i)); if(!p) continue; //missing page - page::text_layout_enum show_text_layout = page::physical_layout; + page::text_layout_enum show_text_layout = raw ? page::raw_order_layout : page::physical_layout; /* media_box includes text in margins: https://github.com/ropensci/pdftools/issues/67 */ rectf target(p->page_rect(media_box)); diff --git a/tests/testthat/test-reading.R b/tests/testthat/test-reading.R index 57368e1..027ab1a 100644 --- a/tests/testthat/test-reading.R +++ b/tests/testthat/test-reading.R @@ -12,6 +12,7 @@ test_that("reading password protected pdf", { # Get text with password expect_equal(4, length(pdf_text("pdf-example-password.original.pdf", upw = "test"))) + expect_equal(4, length(pdf_text("pdf-example-password.original.pdf", upw = "test", raw = TRUE))) expect_false(pdf_info("pdf-example-password.original.pdf", upw = "test")$locked) # Reading 'encrypted' file