-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfmerger.py
301 lines (250 loc) · 9.95 KB
/
pdfmerger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
from PyPDF2 import PdfFileReader, PdfFileMerger, PdfFileWriter
from PIL import Image
import imghdr
import time
"""
pdfmerger.py ver. 20.9.26 by mariafshan
https://github.com/mariafshan/
This program convert image files to PDF then merge the PDF files.
HOW TO USE:
1. Input 2 or more image or PDF files
2. All image files will be converted to PDF
3. All PDF files will be merged into 1 file
PyPDF2 documentation = https://pythonhosted.org/PyPDF2/ https://pythonhosted.org/PyPDF2/PageObject.html
PyPDF documentation = http://pybrary.net/pyPdf/pythondoc-pyPdf.pdf.html
img checker = https://stackoverflow.com/questions/889333/how-to-check-if-a-file-is-a-valid-image-file
pdf merger = https://medium.com/@codeforests/split-or-merge-pdf-files-with-5-lines-of-python-code-c1fa710b4902
img2pdf = https://datatofish.com/images-to-pdf-python/
imgchecker = https://www.blog.pythonlibrary.org/2020/02/09/how-to-check-if-a-file-is-a-valid-image-with-python/
Automate the Boring Stuff with Python = https://automatetheboringstuff.com/
PDF compression = https://stackoverflow.com/questions/22776388/pypdf2-compression
Extracting page size = https://stackoverflow.com/questions/6230752/extracting-page-sizes-from-pdf-in-python
Resizing PDF pages = https://stackoverflow.com/questions/6536552/resize-pdf-pages-in-python
"""
def main():
file_name = None
file_name = input_filename(file_name)
print("Files to be merged:")
for i in range(len(file_name)):
print(str(i + 1) + ".", file_name[i])
time.sleep(0.5)
print("")
new_file = pdf_merger(file_name)
time.sleep(0.5)
print("pdfmerger.py has completed checking, merging, scaling, and compressing. Files saved as", new_file + ".")
return
# The actual main function of this program - merging files! Also shorter than than the compressor function haha!
def pdf_merger(file_name):
# list of files to be merged
output = PdfFileMerger()
for i in range(len(file_name)):
# convert image files to pdf
if img_check(file_name[i]):
file = convert_image(file_name[i])
file_name[i] = file
time.sleep(0.5)
# append file name to output list
output.append(file_name[i])
# save merged file
mergedpdf_filename = file_name[0][:file_name[0].find(".p")] + "_merged.pdf"
try:
output.write(mergedpdf_filename)
except:
print("Files failed to merge.")
return
print(len(file_name), "files merged successfully. Merged file is saved as",
mergedpdf_filename)
print("")
compress_pdf(mergedpdf_filename)
return mergedpdf_filename
# Prompt user to input file name and then check the file validity
def input_filename(file_name):
if file_name == None:
# list of file names to be merged
file_name = list()
# Prompting user to input file name
while True:
print("Enter file name or press Enter to continue.")
filename = input("file_name.ext: ")
time.sleep(0.5)
# Exit the program
if filename == "":
if len(file_name) < 2:
print("Please enter 2 or more files")
input_filename(file_name)
print("")
break
# Check file
elif not file_check(filename):
continue
# Append file name
file_name.append(filename)
return file_name
# execute compress PDF, I should merge this helper function with the other helper function I think...
def compress_pdf(filename):
# Execute PDF writer
writer = PdfFileWriter()
if not pdf_check(filename):
print("Unable to compress", filename, ", merged PDF is saved uncompressed and unscaled.")
# Open compressed PDF file
reader = PdfFileReader(filename)
# scale and compress PDF file
smallest_page_width, smallest_page_height, smallest_page_area = find_smallest_page_area(reader)
scale_page(reader, writer, filename, smallest_page_width, smallest_page_height, smallest_page_area)
return
# resize the pages to make the PDF more uniform and easier to read, also compress the file
# I should decompose this helper function but atm I'm too lazy. Also I'm sure this helper function is still buggy.
def scale_page(reader, writer, filename, min_page_width, min_page_height, min_page_size):
# Rezising pages
smallestpage_orientation = orientation_check(min_page_width, min_page_height)
for page_num in range(reader.numPages):
page_size = reader.getPage(page_num).mediaBox
page_width = page_size[2]
page_height = page_size[3]
page_area = page_width * page_height
mintocurrwidth = float(min_page_width / page_width)
mintocurrheight = float(min_page_height / page_height)
# Scaling PDF file
page = reader.getPage(page_num)
print("smallest page width: ", str(min_page_width))
print("smallest page height: ", str(min_page_height))
print("smallest page area:", str(min_page_size))
print("Current page size:", str(page_width), "x", str(page_height))
print("")
time.sleep(0.5)
curr_page_orientation = orientation_check(page_width, page_height)
print("This page is a", curr_page_orientation)
print("")
time.sleep(0.5)
# if page has the same orientation
if curr_page_orientation == smallestpage_orientation:
if smallestpage_orientation == "square":
resize_by = mintocurrwidth
elif smallestpage_orientation == "portrait":
if min_page_width >= page_width:
resize_by = mintocurrheight
elif min_page_height <= page_height:
resize_by = mintocurrwidth
else:
resize_by = mintocurrheight
elif smallestpage_orientation == "landscape":
if min_page_width >= page_width:
resize_by = mintocurrheight
elif min_page_height <= page_height:
resize_by = mintocurrwidth
else:
resize_by = mintocurrwidth
# if page does not have the same orientation
if curr_page_orientation != smallestpage_orientation:
if smallestpage_orientation == "square":
if curr_page_orientation == "landscape":
resize_by = mintocurrheight
elif curr_page_orientation == "portrait":
resize_by = mintocurrwidth
elif smallestpage_orientation == "portrait":
resize_by = mintocurrheight
elif smallestpage_orientation == "landscape":
resize_by = mintocurrwidth
print("Resizing page", str(page_num + 1), "by", str(resize_by * 100) + "%...")
# scaling page
try:
page.scaleBy(resize_by)
print("Page", str(page_num + 1), "is successfully scaled")
except:
print("Unable to scale page", str(page_num + 1))
time.sleep(0.5)
# compressing page
print("Compressing page", str(page_num + 1) + "...")
try:
page.compressContentStreams()
print("Page", str(page_num + 1), "is successfully compressed.")
print("")
except:
print("Unable to compress page", str(page_num + 1))
writer.addPage(page)
time.sleep(1)
# writing scaled PDF file
with open(filename, "wb+") as file:
writer.write(file)
file.close()
return
# finding the smallest page
def find_smallest_page_area(reader):
page_sizes = dict()
smallest_page_width = None
smallest_page_height = None
smallest_page_area = None
# Measuring the page sizes
for page in range(reader.numPages):
page_size = reader.getPage(page).mediaBox
page_width = page_size[2]
page_height = page_size[3]
page_area = page_width * page_height
if smallest_page_area == None or page_area < smallest_page_area:
smallest_page_area = page_area
smallest_page_height = page_height
smallest_page_width = page_width
# This dictionary might be useful someday....
page_sizes[page] = page_sizes.setdefault(page, [page_width, page_height])
print("Page sizes:", page_sizes)
time.sleep(0.5)
print("Smallest page area:", smallest_page_area)
print("Smallest page width:", smallest_page_width)
print("Smallest page heigt:", smallest_page_height)
print("")
time.sleep(0.5)
return smallest_page_width, smallest_page_height, smallest_page_area
# Check if page is landscape/ portrait/ square
def orientation_check(width, height):
if width > height:
return "landscape"
if height > width:
return "portrait"
if height == width:
return "square"
return
# Convert image file to PDF
def convert_image(img_file):
img = Image.open(img_file)
new_pdf = img_file[:img_file.find(".")] + "_convert.pdf"
print("Converting", img_file, "to", new_pdf + "...")
imgtopdf = img.convert("RGB")
try:
imgtopdf.save(new_pdf)
print("Conversion successful.")
except:
print("Failed to convert", img_file)
print("")
return new_pdf
# Check if user input file is valid
def file_check(filename):
if pdf_check(filename):
print("File is a valid PDF file")
print("")
return True
elif img_check(filename):
imgtype = imghdr.what(filename)
imgtype = imgtype.upper()
print("File is a valid", imgtype, "file")
print("")
return True
else:
print("Please enter valid PDF or image file")
print("")
return False
# check if PDF file can be opened
def pdf_check(filename):
try:
filename = PdfFileReader(filename)
except:
return False
return True
# check if image file can be opened
def img_check(filename):
try:
filename = Image.open(filename)
except:
return False
return True
if __name__ == "__main__":
main()