Skip to content

pdfrw vs pypdf page extraction & merge  #7

Open
@abubelinha

Description

@abubelinha

Test run on Python 3.8, Windows 7:

  • I took 4 arbitrary page numbers (pages 4,6,8,9).
  • For each of the PDF files listed in the benchmark, I extracted those pages (if available).
  • Then I created a new PDF from the extracted pages, repeating them between 1 and 5 times (to check how well pdfrw / pypdf optimize the size of output PDFs containing repeated content), so the output PDFs have up to 4x5 = 20 pages.
  • I measured the time taken and the output sizes.

As I recall, my initial code also deleted the original bookmarks/annotations from the PDFs, but I removed that part for simplicity and left comments pointing to where I had read about it.

Code:

#!/usr/bin/python
# -*- coding: utf-8 -*-

def fsize(filepath):
	""" Report the size of the file at `filepath`.

		Returns a two-item list:
		- the size in bytes, as reported by os.stat
		- the same size in KB (bytes / 1024) as a string with two decimals
		"""
	import os
	nbytes = os.stat(filepath).st_size
	kb_text = "{:.2f}".format(nbytes / 1024)
	return [nbytes, kb_text]
	
#@profile
def createpdf_from_sourcepdf_pages_pdfrw(sourcepdf=None, pageslist=None, destpdf=None, debug=False):
	""" Generate destpdf with a list of certain pages taken from sourcepdf, using pdfrw.
		- Based on <https://github.com/pmaupin/pdfrw/blob/master/examples/subset.py>

		Parameters:
		- sourcepdf: path of the PDF to read pages from.
		- pageslist: iterable of 1-based page numbers. Numbers may repeat; pages
		  are added to the output in the order given. Numbers outside
		  1..(number of pages in sourcepdf) are silently skipped.
		- destpdf: path of the PDF file to write.
		- debug: when true, print each page number as it is added.
		"""
	from pdfrw import PdfWriter,PdfReader
	#import pdfrw_bookmarks # code from https://github.com/pmaupin/pdfrw/issues/52#issuecomment-271190546
	pages = PdfReader(sourcepdf).pages
	totalpages = len(pages)
	outdata = PdfWriter(destpdf)
	for p in pageslist:
		# p is 1-based (it is used as pages[p-1]), so the last valid number is
		# totalpages itself; the lower bound stops p=0 from wrapping to pages[-1].
		if 1 <= p <= totalpages:
			if debug: print("pdfrw ",p)
			#pdfrw_pageannots(pages[p-1])
			outdata.addpage(pages[p-1])
	outdata.write()

#@profile
def createpdf_from_sourcepdf_pages_pypdf(sourcepdf=None, pageslist=None, destpdf=None, debug=False, compress=False):
	""" Generate destpdf with list of certain pages taken from sourcepdf, using pypdf.
		- <https://pypdf2.readthedocs.io/en/stable/user/merging-pdfs.html>
		- SO [Extract specific pages of PDF and save it with Python](https://stackoverflow.com/a/51885963/710788)

		Parameters:
		- sourcepdf: path of the PDF to read pages from.
		- pageslist: iterable of 1-based page numbers. Numbers may repeat; pages
		  are appended to the output in the order given. Numbers outside
		  1..(number of pages in sourcepdf) are silently skipped.
		- destpdf: path of the PDF file to write.
		- debug: when true, print each page number as it is appended.
		- compress: when true, call compress_content_streams() on every output page.
		"""
	from pypdf import PdfWriter,PdfReader
	# The source stream must stay open until the output has been written, so
	# all of the work happens inside this `with` block; the block also makes
	# sure the file is closed again even if pypdf raises.
	with open(sourcepdf, "rb") as fsource:
		merger = PdfWriter()
		totalpages = len(PdfReader(fsource).pages)
		for p in pageslist:
			# p is 1-based, so the last valid number is totalpages itself; the
			# lower bound rejects p=0, which would produce a negative start index.
			if 1 <= p <= totalpages:
				if debug: print("pypdf ",p)
				# append page number p, i.e. the half-open 0-based range (p-1, p)
				# NOTE(review): every call hands the raw file object to append()
				# again; passing one shared PdfReader may avoid repeated parsing
				# -- confirm against the pypdf documentation.
				merger.append(fileobj=fsource, pages=(p-1,p))
		if compress: # Compress the data
			for page in merger.pages:
				page.compress_content_streams()  # This is CPU intensive!
		# Write to an output PDF document
		with open(destpdf, "wb") as output:
			merger.write(output)
		merger.close()

#from memory_profiler import profile
#@profile
def pypdf_vs_pdfrw():
	""" [performance comparative](https://github.com/pmaupin/pdfrw/issues/232#issuecomment-1436153435) between two packages:
		- pdfrw
		- pypdf

		For every URL in the built-in list the PDF is downloaded into the current
		directory (unless a file with that name already exists). Pages 4,6,8,9 are
		then extracted with each package, with the page list repeated 1 to 5 times.
		Elapsed time and output size are printed per run, followed by the
		accumulated totals and the version of each package.

		Relies on the module-level names `datetime` and `startTime`, which the
		`__main__` block binds before calling this function.
		Returns None; results are only printed.
		"""
	print(datetime.now() - startTime, " before comparing")
	pdfurls = [
		"https://arxiv.org/pdf/2201.00151.pdf",
		"https://arxiv.org/pdf/1707.09725.pdf",
		"https://arxiv.org/pdf/2201.00021.pdf",
		"https://arxiv.org/pdf/2201.00037.pdf",
		"https://arxiv.org/pdf/2201.00069.pdf",
		"https://arxiv.org/pdf/2201.00178.pdf",
		"https://arxiv.org/pdf/2201.00201.pdf",
		"https://arxiv.org/pdf/1602.06541.pdf",
		"https://arxiv.org/pdf/2201.00200.pdf",
		"https://arxiv.org/pdf/2201.00022.pdf",
		"https://arxiv.org/pdf/2201.00029.pdf",
		"https://arxiv.org/pdf/1601.03642.pdf",
		]
	import time
	import requests,os

	def ratio(numerator, denominator):
		# A rounded timing can be exactly 0.0; report "n/a" instead of
		# raising ZeroDivisionError in the middle of the benchmark.
		if not denominator:
			return "n/a"
		return round(numerator/denominator, 2)

	pdfrw_Tsize = 0
	pdfrw_Ttime = 0
	pypdf_Tsize = 0
	pypdf_Ttime = 0
	for pdfurl in pdfurls:
		sourcepdf = pdfurl.split("/")[-1]
		if not os.path.exists(sourcepdf):
			try:
				# timeout: without it a stalled connection blocks the run forever
				response = requests.get(pdfurl, headers=None, params=None, timeout=60)
			except requests.RequestException as err:
				# A network failure skips this file instead of aborting the benchmark.
				print(err)
				print("COULDN'T DOWNLOAD  '{}' FILE:\n".format(pdfurl))
			else:
				if response.status_code == 200:
					with open(sourcepdf, 'wb') as f:
						f.write(response.content)
				else:
					print(response.status_code)
					print("COULDN'T DOWNLOAD  '{}' FILE:\n".format(pdfurl))
		if not os.path.exists(sourcepdf):
			print("\n","-_"*40,"\n\nSKIPPING '{}' FILE:\n".format(sourcepdf))
		else:
			print("\n","-_"*40,"\n\nTESTING WITH '{}' FILE:\n".format(sourcepdf))
			for i in range(1,6):
				pageslist=[4,6,8,9]*i #*5 eats all my memory when using pypdf with large pdf files
				print("-"*50,"\npageslist:",pageslist)
				pages_tag = ".".join([str(p) for p in pageslist])
				# perf_counter is the clock meant for measuring short intervals;
				# datetime.now() is wall-clock time and can have coarse resolution.
				start=time.perf_counter()
				destpdf=sourcepdf+"_pdfrw-test_{}.pdf".format(pages_tag)
				createpdf_from_sourcepdf_pages_pdfrw(sourcepdf=sourcepdf, pageslist=pageslist, destpdf=destpdf)
				pdfrw_t = round(time.perf_counter() - start,3)
				pdfrw_s = fsize(destpdf)
				pdfrw_Ttime += pdfrw_t
				pdfrw_Tsize += pdfrw_s[0]
				print("pdfrw: {} KB output size, took {} seconds".format(pdfrw_s[1],pdfrw_t))
				start=time.perf_counter()
				destpdf=sourcepdf+"_pypdf-test_{}.pdf".format(pages_tag)
				createpdf_from_sourcepdf_pages_pypdf(sourcepdf=sourcepdf, pageslist=pageslist, destpdf=destpdf)
				pypdf_t = round(time.perf_counter() - start,3)
				pypdf_s = fsize(destpdf)
				pypdf_Ttime += pypdf_t
				pypdf_Tsize += pypdf_s[0]
				print("pypdf: {} KB output size, took {} seconds".format(pypdf_s[1],pypdf_t))
				print("pypdf_time / pdfrw_time = {} ratio".format(ratio(pypdf_t, pdfrw_t)))
				print("pypdf_size / pdfrw_size = {} ratio".format(ratio(pypdf_s[0], pdfrw_s[0])))

	import pdfrw,pypdf
	print("-_"*40)
	print("\n pdfrw.__version__ {}\nAccumulated output file size: {:.2f} MB\nTotal time: {:.2f} seconds".format(
		pdfrw.__version__, pdfrw_Tsize/1024/1024, pdfrw_Ttime))
	print("\n pypdf.__version__ {}\nAccumulated output file size: {:.2f} MB\nTotal time: {:.2f} seconds".format(
		pypdf.__version__, pypdf_Tsize/1024/1024, pypdf_Ttime))

if __name__ == "__main__":
	# Script entry point: run the whole benchmark once and print the start
	# time, the end time and the total elapsed time.
	import sys
	# `datetime` and `startTime` are bound at module level on purpose:
	# pypdf_vs_pdfrw() reads both as globals, so they must exist before the call.
	from datetime import datetime
	startTime = datetime.now()
	print("START: ",startTime)
	pypdf_vs_pdfrw()
	endTime = datetime.now()
	print("\nEND: ",endTime)
	print("\nTOTAL TIME: ",endTime-startTime)

OUTPUT:

START:  2023-07-01 22:06:17.718288
0:00:00  before comparing

 -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

TESTING WITH '2201.00151.pdf' FILE:

--------------------------------------------------
pageslist: [4, 6, 8, 9]
pdfrw: 591.29 KB output size, took 0.109 seconds
pypdf: 660.78 KB output size, took 0.499 seconds
pypdf_time / pdfrw_time = 4.58 ratio
pypdf_size / pdfrw_size = 1.12 ratio
--------------------------------------------------
(... LINES DELETED TO AVOID TOO LONG OUTPUT ...)
--------------------------------------------------
pageslist: [4, 6, 8, 9, 4, 6, 8, 9, 4, 6, 8, 9, 4, 6, 8, 9, 4, 6, 8, 9]
pdfrw: 130.20 KB output size, took 0.047 seconds
pypdf: 836.60 KB output size, took 1.031 seconds
pypdf_time / pdfrw_time = 21.94 ratio
pypdf_size / pdfrw_size = 6.43 ratio
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

 pdfrw.__version__ 0.5.0
Accumulated output file size: 50.73 MB
Total time: 4.47 seconds

 pypdf.__version__ 3.2.0
Accumulated output file size: 193.77 MB
Total time: 108.14 seconds

END:  2023-07-01 22:08:11.767827

TOTAL TIME:  0:01:54.049539

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancementNew feature or request

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions