11/*
22 AUTH | hwding
3- DATE | Dec 25 2018
3+ DATE | Feb 19 2019
44 DESC | textual watermark remover for PDF files
5566 GITH | github.com/hwding
1717import org .apache .pdfbox .pdfwriter .ContentStreamWriter ;
1818import org .apache .pdfbox .pdmodel .PDDocument ;
1919import org .apache .pdfbox .pdmodel .common .PDStream ;
20+ import org .apache .pdfbox .pdmodel .documentinterchange .markedcontent .PDMarkedContent ;
2021import org .apache .pdfbox .pdmodel .font .PDFont ;
2122import org .apache .pdfbox .pdmodel .interactive .annotation .PDAnnotation ;
23+ import org .apache .pdfbox .text .PDFMarkedContentExtractor ;
24+ import org .apache .pdfbox .text .TextPosition ;
2225
2326import java .io .File ;
2427import java .io .IOException ;
2528import java .io .OutputStream ;
2629import java .util .*;
30+ import java .util .stream .Collectors ;
2731
2832public class Processor {
2933
3034 private static final byte [] empBytes = new byte [0 ];
35+ private static final Object empObj = new Object ();
36+ private static final Collection <Object > empObjCol = Collections .singleton (empObj );
3137 private static final List <PDAnnotation > empList = Collections .emptyList ();
3238
3339 private static final String SHOW_TEXT = "Tj" ;
3440 private static final String SHOW_TEXT_ADJUSTED = "TJ" ;
41+ private static final String BEGIN_MARKED_CONTENT_WITH_PROPERTY = "BDC" ;
42+ private static final String END_MARKDED_CONTENT = "EMC" ;
3543
3644 public static void process (
3745 @ NotNull File file ,
@@ -47,11 +55,13 @@ public static void process(
4755 }
4856
4957 PDDocument pdDocument = PDDocument .load (file );
58+ PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor ();
59+
5060 pdDocument .getPages ().forEach (pdPage -> {
5161 try {
52- boolean needRewrite = false ;
62+ boolean needRewrite ;
5363
54- /* START: loading font resources from current page */
64+ /* >> loading font resources from current page */
5565 PDFStreamParser pdfStreamParser = new PDFStreamParser (pdPage );
5666 pdfStreamParser .parse ();
5767
@@ -66,14 +76,52 @@ public static void process(
6676 } catch (Exception ignored ) {
6777 }
6878 });
69- /* END */
79+ /* << loading font resources from current page */
80+
81+ /* >> pre-check marked contents and remember orders */
82+ extractor .processPage (pdPage );
83+
84+ List <PDMarkedContent > markedContents = extractor .getMarkedContents ();
85+ List <Boolean > markedContentMatchRecords = markedContents
86+ .stream ()
87+ .map (PDMarkedContent ::getContents )
88+ .map (c -> c .stream ()
89+ .filter (e -> e instanceof TextPosition )
90+ .map (e -> ((TextPosition ) e ).getUnicode ())
91+ .collect (Collectors .joining ()))
92+ .map (s -> TextStampRecognizer .recognizePlain (strings , s .getBytes (), useStrict ))
93+ .collect (Collectors .toList ());
94+
95+ needRewrite = markedContentMatchRecords .contains (true );
96+ /* << pre-check marked contents and remember orders */
97+
98+ int mcCount = -1 ;
99+ boolean mcRemovingFlag = false ;
70100
71101 /* handle both string array and string */
72102 List <Object > objects = pdfStreamParser .getTokens ();
73103 Object object , prevObject ;
74104 for (int i = 0 ; i < objects .size (); i ++) {
75105 object = objects .get (i );
76106
107+ /* >> mark marked content */
108+ if (object instanceof Operator ) {
109+ Operator op = (Operator ) object ;
110+ if (op .getName ().equals (BEGIN_MARKED_CONTENT_WITH_PROPERTY )) {
111+ ++mcCount ;
112+ mcRemovingFlag = markedContentMatchRecords .get (mcCount );
113+ } else if (op .getName ().equals (END_MARKDED_CONTENT )) {
114+ mcRemovingFlag = false ;
115+ objects .set (i , empObj );
116+ }
117+ }
118+
119+ if (mcRemovingFlag ) {
120+ objects .set (i , empObj );
121+ continue ;
122+ }
123+ /* << mark marked content */
124+
77125 if (object instanceof Operator ) {
78126 Operator op = (Operator ) object ;
79127 String testStr ;
@@ -131,6 +179,9 @@ public static void process(
131179 OutputStream out = newContents .createOutputStream (COSName .FLATE_DECODE );
132180 ContentStreamWriter writer = new ContentStreamWriter (out );
133181
182+ /* drop the empObj placeholders that replaced tokens of marked contents flagged for removal */
183+ objects .removeAll (empObjCol );
184+
134185 writer .writeTokens (objects );
135186 out .close ();
136187 /* << write modified tokens back to the stream */
0 commit comments