Skip to content

Commit 1f430ae

Browse files
committed
#3 support recognition of watermarks wrapped in marked content
1 parent fcb0732 commit 1f430ae

File tree

2 files changed

+57
-6
lines changed

2 files changed

+57
-6
lines changed

src/com/amastigote/unstamper/core/Processor.java

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
AUTH | hwding
3-
DATE | Dec 25 2018
3+
DATE | Feb 19 2019
44
DESC | textual watermark remover for PDF files
55
66
GITH | github.com/hwding
@@ -17,21 +17,29 @@
1717
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
1818
import org.apache.pdfbox.pdmodel.PDDocument;
1919
import org.apache.pdfbox.pdmodel.common.PDStream;
20+
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
2021
import org.apache.pdfbox.pdmodel.font.PDFont;
2122
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
23+
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
24+
import org.apache.pdfbox.text.TextPosition;
2225

2326
import java.io.File;
2427
import java.io.IOException;
2528
import java.io.OutputStream;
2629
import java.util.*;
30+
import java.util.stream.Collectors;
2731

2832
public class Processor {
2933

3034
private static final byte[] empBytes = new byte[0];
35+
private static final Object empObj = new Object();
36+
private static final Collection<Object> empObjCol = Collections.singleton(empObj);
3137
private static final List<PDAnnotation> empList = Collections.emptyList();
3238

3339
private static final String SHOW_TEXT = "Tj";
3440
private static final String SHOW_TEXT_ADJUSTED = "TJ";
41+
private static final String BEGIN_MARKED_CONTENT_WITH_PROPERTY = "BDC";
42+
private static final String END_MARKDED_CONTENT = "EMC";
3543

3644
public static void process(
3745
@NotNull File file,
@@ -47,11 +55,13 @@ public static void process(
4755
}
4856

4957
PDDocument pdDocument = PDDocument.load(file);
58+
PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
59+
5060
pdDocument.getPages().forEach(pdPage -> {
5161
try {
52-
boolean needRewrite = false;
62+
boolean needRewrite;
5363

54-
/* START: loading font resources from current page */
64+
/* >> loading font resources from current page */
5565
PDFStreamParser pdfStreamParser = new PDFStreamParser(pdPage);
5666
pdfStreamParser.parse();
5767

@@ -66,14 +76,52 @@ public static void process(
6676
} catch (Exception ignored) {
6777
}
6878
});
69-
/* END */
79+
/* << loading font resources from current page */
80+
81+
/* >> pre-check marked contents and remember orders */
82+
extractor.processPage(pdPage);
83+
84+
List<PDMarkedContent> markedContents = extractor.getMarkedContents();
85+
List<Boolean> markedContentMatchRecords = markedContents
86+
.stream()
87+
.map(PDMarkedContent::getContents)
88+
.map(c -> c.stream()
89+
.filter(e -> e instanceof TextPosition)
90+
.map(e -> ((TextPosition) e).getUnicode())
91+
.collect(Collectors.joining()))
92+
.map(s -> TextStampRecognizer.recognizePlain(strings, s.getBytes(), useStrict))
93+
.collect(Collectors.toList());
94+
95+
needRewrite = markedContentMatchRecords.contains(true);
96+
/* << pre-check marked contents and remember orders */
97+
98+
int mcCount = -1;
99+
boolean mcRemovingFlag = false;
70100

71101
/* handle both string array and string */
72102
List<Object> objects = pdfStreamParser.getTokens();
73103
Object object, prevObject;
74104
for (int i = 0; i < objects.size(); i++) {
75105
object = objects.get(i);
76106

107+
/* >> mark marked content */
108+
if (object instanceof Operator) {
109+
Operator op = (Operator) object;
110+
if (op.getName().equals(BEGIN_MARKED_CONTENT_WITH_PROPERTY)) {
111+
++mcCount;
112+
mcRemovingFlag = markedContentMatchRecords.get(mcCount);
113+
} else if (op.getName().equals(END_MARKDED_CONTENT)) {
114+
mcRemovingFlag = false;
115+
objects.set(i, empObj);
116+
}
117+
}
118+
119+
if (mcRemovingFlag) {
120+
objects.set(i, empObj);
121+
continue;
122+
}
123+
/* << mark marked content */
124+
77125
if (object instanceof Operator) {
78126
Operator op = (Operator) object;
79127
String testStr;
@@ -131,6 +179,9 @@ public static void process(
131179
OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE);
132180
ContentStreamWriter writer = new ContentStreamWriter(out);
133181

182+
/* drop every placeholder token left where marked content was flagged for removal */
183+
objects.removeAll(empObjCol);
184+
134185
writer.writeTokens(objects);
135186
out.close();
136187
/* << write modified tokens back to the stream */

src/com/amastigote/unstamper/core/TextStampRecognizer.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
AUTH | hwding
3-
DATE | Dec 25 2018
3+
DATE | Feb 19 2019
44
DESC | textual watermark remover for PDF files
55
66
GITH | github.com/hwding
@@ -42,7 +42,7 @@ private static boolean recognizeWithFont(
4242
return false;
4343
}
4444

45-
private static boolean recognizePlain(
45+
static boolean recognizePlain(
4646
@NotNull String[] keywords,
4747
@NotNull byte[] inputText,
4848
@NotNull boolean useStrict

0 commit comments

Comments
 (0)