Open
Description
Test run on Python 3.8, Windows 7:
- I took 4 arbitrary page numbers (pages 4,6,8,9).
- For each of the PDF files listed in the benchmark, I extracted those pages from it (if available).
- Then I created a new PDF from the extracted pages, repeating them between 1 and 5 times (to check how well pdfrw / pypdf optimize the size of created PDFs containing repetitive information), so the output PDFs have up to 4x5 = 20 pages.
- I measured the elapsed time and the output file sizes.
I recall my initial code also deleted the original bookmarks/annotations from the PDFs, but I removed that part for simplicity and left comments pointing to where I had read about it.
Code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
def fsize(filepath):
    """Return the size of the file at ``filepath``.

    Args:
        filepath: path of an existing file.

    Returns:
        A two-item list ``[size_in_bytes, size_in_kb]`` where
        ``size_in_bytes`` is an int and ``size_in_kb`` is that size divided
        by 1024, formatted as a string with two decimals.

    Raises:
        OSError: if ``filepath`` cannot be stat'ed (e.g. it does not exist).
    """
    import os

    # Use a distinct local name: the original rebound ``fsize`` inside the
    # function, shadowing the function's own name.
    size_bytes = os.stat(filepath).st_size
    size_kb = "{:.2f}".format(size_bytes / 1024)
    return [size_bytes, size_kb]
#@profile
def createpdf_from_sourcepdf_pages_pdfrw(sourcepdf=None, pageslist=None, destpdf=None, debug=False):
    """Write selected pages of ``sourcepdf`` to ``destpdf`` using pdfrw.

    Based on <https://github.com/pmaupin/pdfrw/blob/master/examples/subset.py>.

    Args:
        sourcepdf: path of the PDF to read pages from.
        pageslist: iterable of 1-based page numbers, in output order; a number
            may appear several times. Numbers outside ``1..page_count`` are
            skipped.
        destpdf: path of the PDF to create.
        debug: when true, print each page number as it is added.
    """
    from pdfrw import PdfWriter, PdfReader
    #import pdfrw_bookmarks # code from https://github.com/pmaupin/pdfrw/issues/52#issuecomment-271190546
    pages = PdfReader(sourcepdf).pages
    totalpages = len(pages)
    outdata = PdfWriter(destpdf)
    for p in pageslist:
        # Page numbers are 1-based (page p lives at pages[p-1]), so the valid
        # range is 1..totalpages inclusive. The previous ``p < totalpages``
        # check dropped the last page and let p=0 pick pages[-1].
        if 1 <= p <= totalpages:
            if debug:
                print("pdfrw ", p)
            #pdfrw_pageannots(pages[p-1])
            outdata.addpage(pages[p - 1])
    outdata.write()
#@profile
def createpdf_from_sourcepdf_pages_pypdf(sourcepdf=None, pageslist=None, destpdf=None, debug=False, compress=False):
    """Generate ``destpdf`` with a list of certain pages taken from ``sourcepdf`` using pypdf.

    - <https://pypdf2.readthedocs.io/en/stable/user/merging-pdfs.html>
    - SO [Extract specific pages of PDF and save it with Python](https://stackoverflow.com/a/51885963/710788)

    Args:
        sourcepdf: path of the PDF to read pages from.
        pageslist: iterable of 1-based page numbers, in output order; a number
            may appear several times. Numbers outside ``1..page_count`` are
            skipped.
        destpdf: path of the PDF to create.
        debug: when true, print each page number as it is added.
        compress: when true, compress the content streams of every output
            page before writing (CPU intensive).
    """
    from pypdf import PdfWriter, PdfReader

    merger = PdfWriter()
    try:
        # Context managers guarantee both file handles are released even if
        # pypdf raises; the original never closed the source handle.
        with open(sourcepdf, "rb") as fsource:
            totalpages = len(PdfReader(fsource).pages)
            for p in pageslist:
                # Page numbers are 1-based, so the last valid one equals
                # totalpages; ``p < totalpages`` skipped the final page.
                if 1 <= p <= totalpages:
                    if debug:
                        print("pypdf ", p)
                    # pages=(p-1, p) is a 0-based half-open range selecting
                    # exactly page number p.
                    # NOTE(review): the raw stream is handed to append() for
                    # every page, as in the original benchmark; passing an
                    # already-parsed PdfReader may avoid repeated parsing, but
                    # that would change what is being measured - confirm
                    # against the pypdf docs before changing.
                    merger.append(fileobj=fsource, pages=(p - 1, p))
            if compress:  # Compress the data
                for page in merger.pages:
                    page.compress_content_streams()  # This is CPU intensive!
            # Write to an output PDF document while the source is still open.
            with open(destpdf, "wb") as output:
                merger.write(output)
    finally:
        merger.close()
#from memory_profiler import profile
#@profile
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` rounded to 2 decimals, or ``"n/a"`` when the denominator is zero."""
    if not denominator:
        return "n/a"
    return round(numerator / denominator, 2)


def pypdf_vs_pdfrw():
    """ [performance comparative](https://github.com/pmaupin/pdfrw/issues/232#issuecomment-1436153435) between two packages:
    - pdfrw
    - pypdf

    For every benchmark PDF (downloaded into the current directory if it is
    not already there), pages 4, 6, 8 and 9 are extracted 1 to 5 times over
    with each package. Elapsed time and output size are printed per run, and
    accumulated totals per package are printed at the end.
    """
    import os
    import time
    from datetime import datetime

    import requests
    import pdfrw
    import pypdf

    # ``startTime`` is set by the script entry point; fall back to "now" so
    # the function does not raise NameError when called from an import.
    script_start = globals().get("startTime", datetime.now())
    print(datetime.now() - script_start, " before comparing")
    pdfurls = [
        "https://arxiv.org/pdf/2201.00151.pdf",
        "https://arxiv.org/pdf/1707.09725.pdf",
        "https://arxiv.org/pdf/2201.00021.pdf",
        "https://arxiv.org/pdf/2201.00037.pdf",
        "https://arxiv.org/pdf/2201.00069.pdf",
        "https://arxiv.org/pdf/2201.00178.pdf",
        "https://arxiv.org/pdf/2201.00201.pdf",
        "https://arxiv.org/pdf/1602.06541.pdf",
        "https://arxiv.org/pdf/2201.00200.pdf",
        "https://arxiv.org/pdf/2201.00022.pdf",
        "https://arxiv.org/pdf/2201.00029.pdf",
        "https://arxiv.org/pdf/1601.03642.pdf",
    ]
    pdfrw_Tsize = 0
    pdfrw_Ttime = 0
    pypdf_Tsize = 0
    pypdf_Ttime = 0
    for pdfurl in pdfurls:
        sourcepdf = pdfurl.split("/")[-1]
        if not os.path.exists(sourcepdf):
            try:
                # A timeout keeps one stalled download from hanging the run.
                response = requests.get(pdfurl, timeout=60)
            except requests.RequestException as err:
                print(err)
                print("COULDN'T DOWNLOAD '{}' FILE:\n".format(pdfurl))
            else:
                if response.status_code == 200:
                    with open(sourcepdf, 'wb') as f:
                        f.write(response.content)
                else:
                    print(response.status_code)
                    print("COULDN'T DOWNLOAD '{}' FILE:\n".format(pdfurl))
        if not os.path.exists(sourcepdf):
            print("\n", "-_" * 40, "\n\nSKIPPING '{}' FILE:\n".format(sourcepdf))
            continue
        print("\n", "-_" * 40, "\n\nTESTING WITH '{}' FILE:\n".format(sourcepdf))
        for i in range(1, 6):
            pageslist = [4, 6, 8, 9] * i  #*5 eats all my memory when using pypdf with large pdf files
            print("-" * 50, "\npageslist:", pageslist)
            pages_tag = ".".join(str(p) for p in pageslist)

            # perf_counter() is a monotonic high-resolution timer, better
            # suited to short intervals than wall-clock datetime.now().
            start = time.perf_counter()
            destpdf = sourcepdf + "_pdfrw-test_{}.pdf".format(pages_tag)
            createpdf_from_sourcepdf_pages_pdfrw(sourcepdf=sourcepdf, pageslist=pageslist, destpdf=destpdf)
            pdfrw_t = round(time.perf_counter() - start, 3)
            pdfrw_s = fsize(destpdf)
            pdfrw_Ttime += pdfrw_t
            pdfrw_Tsize += pdfrw_s[0]
            print("pdfrw: {} KB output size, took {} seconds".format(pdfrw_s[1], pdfrw_t))

            start = time.perf_counter()
            destpdf = sourcepdf + "_pypdf-test_{}.pdf".format(pages_tag)
            createpdf_from_sourcepdf_pages_pypdf(sourcepdf=sourcepdf, pageslist=pageslist, destpdf=destpdf)
            pypdf_t = round(time.perf_counter() - start, 3)
            pypdf_s = fsize(destpdf)
            pypdf_Ttime += pypdf_t
            pypdf_Tsize += pypdf_s[0]
            print("pypdf: {} KB output size, took {} seconds".format(pypdf_s[1], pypdf_t))

            # A rounded time (or a size) of 0 must not abort the benchmark
            # with ZeroDivisionError.
            print("pypdf_time / pdfrw_time = {} ratio".format(_safe_ratio(pypdf_t, pdfrw_t)))
            print("pypdf_size / pdfrw_size = {} ratio".format(_safe_ratio(pypdf_s[0], pdfrw_s[0])))
    print("-_" * 40)
    print("\n pdfrw.__version__ {}\nAccumulated output file size: {:.2f} MB\nTotal time: {:.2f} seconds".format(
        pdfrw.__version__, pdfrw_Tsize / 1024 / 1024, pdfrw_Ttime))
    print("\n pypdf.__version__ {}\nAccumulated output file size: {:.2f} MB\nTotal time: {:.2f} seconds".format(
        pypdf.__version__, pypdf_Tsize / 1024 / 1024, pypdf_Ttime))
if __name__ == "__main__":
    # Script entry point: run the comparison once and report wall-clock
    # start, end and total duration.
    import sys
    from datetime import datetime

    # Module-level on purpose: pypdf_vs_pdfrw() reads ``startTime`` (and
    # ``datetime``) from the module namespace.
    startTime = datetime.now()
    print("START: ", startTime)

    pypdf_vs_pdfrw()

    endTime = datetime.now()
    elapsed = endTime - startTime
    print("\nEND: ", endTime)
    print("\nTOTAL TIME: ", elapsed)
OUTPUT:
START: 2023-07-01 22:06:17.718288
0:00:00 before comparing
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
TESTING WITH '2201.00151.pdf' FILE:
--------------------------------------------------
pageslist: [4, 6, 8, 9]
pdfrw: 591.29 KB output size, took 0.109 seconds
pypdf: 660.78 KB output size, took 0.499 seconds
pypdf_time / pdfrw_time = 4.58 ratio
pypdf_size / pdfrw_size = 1.12 ratio
--------------------------------------------------
(... LINES DELETED TO AVOID TOO LONG OUTPUT ...)
--------------------------------------------------
pageslist: [4, 6, 8, 9, 4, 6, 8, 9, 4, 6, 8, 9, 4, 6, 8, 9, 4, 6, 8, 9]
pdfrw: 130.20 KB output size, took 0.047 seconds
pypdf: 836.60 KB output size, took 1.031 seconds
pypdf_time / pdfrw_time = 21.94 ratio
pypdf_size / pdfrw_size = 6.43 ratio
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
pdfrw.__version__ 0.5.0
Accumulated output file size: 50.73 MB
Total time: 4.47 seconds
pypdf.__version__ 3.2.0
Accumulated output file size: 193.77 MB
Total time: 108.14 seconds
END: 2023-07-01 22:08:11.767827
TOTAL TIME: 0:01:54.049539