Skip to content

Commit cf4067e

Browse files
Merge pull request #29 from StabRise/protected_pdf
Added support for password-protected PDF files
2 parents 23e716a + 619ace3 commit cf4067e

11 files changed

Lines changed: 59 additions & 19 deletions

File tree

README.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,10 @@ Spark 4.0.0 is supported in the version `0.1.11` and later (need Java 17 and Sca
8484
Binary package is available in the Maven Central Repository.
8585

8686

87-
- **Spark 3.5.***: com.stabrise:spark-pdf-spark35_2.12:0.1.16
87+
- **Spark 3.5.***: com.stabrise:spark-pdf-spark35_2.12:0.1.17
8888
- **Spark 3.4.***: com.stabrise:spark-pdf-spark34_2.12:0.1.11 (issue with publishing fresh version)
89-
- **Spark 3.3.***: com.stabrise:spark-pdf-spark33_2.12:0.1.16
90-
- **Spark 4.0.***: com.stabrise:spark-pdf-spark40_2.13:0.1.16
89+
- **Spark 3.3.***: com.stabrise:spark-pdf-spark33_2.12:0.1.17
90+
- **Spark 4.0.***: com.stabrise:spark-pdf-spark40_2.13:0.1.17
9191

9292
## Options for the data source:
9393

@@ -96,6 +96,7 @@ Binary package is available in the Maven Central Repository.
9696
- `pagePerPartition`: Number of pages per partition in the Spark DataFrame. Default: "5".
9797
- `reader`: Supported values: `pdfBox` - based on the PDFBox Java library, `gs` - based on Ghostscript (requires Ghostscript to be installed on the system)
9898
- `ocrConfig`: Tesseract OCR configuration. Default: "psm=3". For more information see [Tesseract OCR Params](TesseractParams.md)
99+
- `password`: Password for opening protected PDF files. Default: "" (empty string).
99100

100101
## Output Columns in the DataFrame:
101102

@@ -158,6 +159,7 @@ val df = spark.read.format("pdf")
158159
.option("pagePerPartition", "2")
159160
.option("reader", "pdfBox")
160161
.option("ocrConfig", "psm=11")
162+
.option("password", "pdf_password")
161163
.load("path to the pdf file(s)")
162164

163165
df.select("path", "document").show()
@@ -180,6 +182,7 @@ df = spark.read.format("pdf") \
180182
.option("pagePerPartition", "2") \
181183
.option("reader", "pdfBox") \
182184
.option("ocrConfig", "psm=11") \
185+
.option("password", "pdf_password") \
183186
.load("path to the pdf file(s)")
184187

185188
df.select("path", "document").show()

build.sbt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import xerial.sbt.Sonatype.sonatypeCentralHost
22
import xerial.sbt.Sonatype.GitHubHosting
33

4-
ThisBuild / version := "0.1.16"
4+
ThisBuild / version := "0.1.17"
55

66
ThisBuild / scalaVersion := scala.util.Properties.envOrElse("SCALA_VERSION", "2.12.15") // "2.13.14", "2.12.15"
77
ThisBuild / organization := "com.stabrise"

examples/test_encrypted.pdf

15.3 KB
Binary file not shown.

spark33/src/main/scala/datasources/PdfPartitionedFileUtil.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,14 @@ object PdfPartitionedFileUtil {
2121
filePath: Path,
2222
isSplitable: Boolean,
2323
maxSplitBytes: Long,
24-
partitionValues: InternalRow): Seq[PartitionedFile] = {
24+
partitionValues: InternalRow,
25+
options: Map[String,String]
26+
): Seq[PartitionedFile] = {
2527
val path = filePath
2628
val fs = path.getFileSystem(sparkSession.sessionState.newHadoopConf())
2729

2830
// Load the PDF document
29-
val document = PDDocument.load(fs.open(file.getPath))
31+
val document = PDDocument.load(fs.open(file.getPath), options.getOrElse("password", ""))
3032
val page_num = document.getNumberOfPages
3133
document.close()
3234

spark34/src/main/scala/datasources/PdfPartitionedFileUtil.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,14 @@ object PdfPartitionedFileUtil {
2020
filePath: Path,
2121
isSplitable: Boolean,
2222
maxSplitBytes: Long,
23-
partitionValues: InternalRow): Seq[PartitionedFile] = {
23+
partitionValues: InternalRow,
24+
options: Map[String,String]
25+
): Seq[PartitionedFile] = {
2426
val path = filePath
2527
val fs = path.getFileSystem(sparkSession.sessionState.newHadoopConf())
2628

2729
// Load the PDF document
28-
val document = PDDocument.load(fs.open(file.getPath))
30+
val document = PDDocument.load(fs.open(file.getPath), options.getOrElse("password", ""))
2931
val page_num = document.getNumberOfPages
3032
document.close()
3133

spark35/src/main/scala/datasources/PdfPartitionedFileUtil.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,13 @@ object PdfPartitionedFileUtil {
2121
filePath: Path,
2222
isSplitable: Boolean,
2323
maxSplitBytes: Long,
24-
partitionValues: InternalRow): Seq[PartitionedFile] = {
24+
partitionValues: InternalRow,
25+
options: Map[String,String]): Seq[PartitionedFile] = {
2526
val path = filePath
2627
val fs = path.getFileSystem(sparkSession.sessionState.newHadoopConf())
2728

2829
// Load the PDF document
29-
val document = PDDocument.load(fs.open(file.getPath))
30+
val document = PDDocument.load(fs.open(file.getPath), options.getOrElse("password", ""))
3031
val page_num = document.getNumberOfPages
3132
document.close()
3233

spark40/src/main/scala/datasources/PdfPartitionedFileUtil.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,14 @@ object PdfPartitionedFileUtil {
2121
filePath: Path,
2222
isSplitable: Boolean,
2323
maxSplitBytes: Long,
24-
partitionValues: InternalRow): Seq[PartitionedFile] = {
24+
partitionValues: InternalRow,
25+
options: Map[String,String]
26+
): Seq[PartitionedFile] = {
2527
val path = filePath
2628
val fs = path.getFileSystem(sparkSession.sessionState.newHadoopConf())
2729

2830
// Load the PDF document
29-
val document = PDDocument.load(fs.open(file.getPath))
31+
val document = PDDocument.load(fs.open(file.getPath), options.getOrElse("password", ""))
3032
val page_num = document.getNumberOfPages
3133
document.close()
3234

src/main/scala/datasources/PdfPartitionReaderPDFBox.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class PdfPartitionReaderPDFBox(inputPartition: FilePartition,
3636
val hdfsPath = PdfPartitionedFileUtil.getHdfsPath(file)
3737
val fs = hdfsPath.getFileSystem(broadcastedConf.value.value)
3838
val status = fs.getFileStatus(hdfsPath)
39-
document = PDDocument.load(fs.open(status.getPath))
39+
document = PDDocument.load(fs.open(status.getPath), options.getOrElse("password", ""))
4040
stripper = new PDFTextStripper()
4141
pdfRenderer = new PDFRenderer(document)
4242
}

src/main/scala/datasources/PdfScan.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@ case class PdfScan(
8080
filePath = filePath,
8181
isSplitable = true,
8282
maxSplitBytes = maxSplitBytes,
83-
partitionValues = partitionValues
83+
partitionValues = partitionValues,
84+
options = options.asScala.toMap,
8485
)
8586
}.toArray.sortBy(_.length)(implicitly[Ordering[Long]].reverse)
8687
}
15.3 KB
Binary file not shown.

0 commit comments

Comments
 (0)