Skip to content

Fix nested directory xrefs in combined PDFs #64

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
# MkDocs PDF Export Plugin [![Build Status][travis-status]][travis-link]
# MkDocs PDF Export Plugin (refactored)

This project is a fork of mkdocs-pdf-export-plugin by @zhaoterryy.

This version correctly resolves inter-page cross-reference links in generated PDF documents, regardless of where the pages sit in a nested directory structure.

**IMPORTANT:** I have not registered this package with PyPI, so following the instructions below will install the *original* package, not this one. For now, the only way to use this version is to install it locally, or to install the original package with pip and then overwrite the plugin files with the ones from this repository.

---

*An MkDocs plugin to export content pages as PDF files*

Expand Down
10 changes: 6 additions & 4 deletions mkdocs_pdf_export_plugin/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ class PdfExportPlugin(BasePlugin):
('enabled_if_env', config_options.Type(utils.string_types)),
('combined', config_options.Type(bool, default=False)),
('combined_output_path', config_options.Type(utils.string_types, default="pdf/combined.pdf")),
('theme_handler_path', config_options.Type(utils.string_types))
('theme_handler_path', config_options.Type(utils.string_types)),
# Declaring the site output directory manually here (default 'site'); this should come from the global config object's 'site_dir' instead!
('output_dir', config_options.Type(utils.string_types, default='site'))
)

def __init__(self):
Expand All @@ -41,8 +43,7 @@ def on_config(self, config):
print('Combined PDF export is enabled')

from .renderer import Renderer
self.renderer = Renderer(self.combined, config['theme'].name, self.config['theme_handler_path'])

self.renderer = Renderer(self.combined, config['theme'].name, self.config['theme_handler_path'], self.config['output_dir'])
from weasyprint.logger import LOGGER
import logging

Expand Down Expand Up @@ -88,6 +89,7 @@ def on_post_page(self, output_content, page, config):

from weasyprint import urls
base_url = urls.path2url(os.path.join(path, filename))

pdf_file = filename + '.pdf'

try:
Expand Down Expand Up @@ -116,7 +118,7 @@ def on_post_build(self, config):

abs_pdf_path = os.path.join(config['site_dir'], self.config['combined_output_path'])
os.makedirs(os.path.dirname(abs_pdf_path), exist_ok=True)
self.renderer.write_combined_pdf(abs_pdf_path)
self.renderer.write_combined_pdf(abs_pdf_path, self.config['output_dir'])

end = timer()
self.total_time += (end - start)
Expand Down
2 changes: 1 addition & 1 deletion mkdocs_pdf_export_plugin/preprocessor/links/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .transform import transform_href, transform_id
from .util import get_body_id, replace_asset_hrefs, rel_pdf_href
from .util import get_body_id, replace_asset_hrefs, rel_pdf_href, get_xref_href
54 changes: 25 additions & 29 deletions mkdocs_pdf_export_plugin/preprocessor/links/transform.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,32 @@
import os

from .util import is_doc, normalize_href
from .util import is_doc, normalize_href, abs_asset_href, get_xref_href

# normalize href to #foo/bar/section:id
def transform_href(href: str, rel_url: str):
head, tail = os.path.split(href)

num_hashtags = tail.count('#')

if tail.startswith('#'):
head, section = os.path.split(rel_url)
section = os.path.splitext(section)[0]
id = tail[1:]
elif num_hashtags is 1:
section, ext = tuple(os.path.splitext(tail))
id = str.split(ext, '#')[1]

if head == '..':
href = normalize_href(href, rel_url)
return '#{}:{}'.format(href, id)

elif num_hashtags is 0:
if not is_doc(href):
return href

href = normalize_href(href, rel_url)
return '#{}:'.format(href)

if head != '':
head += '/'
def transform_href(href: str, rel_url: str, base_url: str, output_dir: str):
    """Rewrite a relative link as an in-document anchor for the combined PDF.

    The result has the form ``#path/to/page:anchor``. The part after the
    colon is empty when ``href`` does not contain exactly one ``#``.

    Args:
        href: The link target as written in the page.
        rel_url: Path of the current page; used (with surrounding slashes
            stripped) as the target page for same-page links such as
            ``#anchor``.
        base_url: URL of the current page, forwarded to ``get_xref_href``
            to resolve ``href``.
        output_dir: Build output directory, forwarded to ``get_xref_href``.

    Returns:
        The rewritten href, always starting with ``#``.
    """
    # NOTE(review): every href is rewritten to an internal anchor here,
    # including links to non-document files -- confirm that is intended.

    # Compare by value: ``is`` against an int/str literal only works by
    # accident of interning and raises SyntaxWarning on Python 3.8+.
    if href.count('#') == 1:
        path, anchor = href.split('#')

        if path:
            xref = get_xref_href(path.strip('/'), base_url, output_dir)
        else:
            # Same-page link: the target page is the current page itself.
            xref = rel_url.strip('/')

        return '#{}:{}'.format(xref, anchor)

    # No anchor (or an unexpected number of '#'): link to the top of the page.
    return '#{}:'.format(get_xref_href(href, base_url, output_dir))

return '#{}{}:{}'.format(head, section, id)

# normalize id to foo/bar/section:id
def transform_id(id: str, rel_url: str):
Expand All @@ -40,4 +36,4 @@ def transform_id(id: str, rel_url: str):
if len(head) > 0:
head += '/'

return '{}{}:{}'.format(head, section, id)
return '{}{}:{}'.format(head, section, id)
14 changes: 12 additions & 2 deletions mkdocs_pdf_export_plugin/preprocessor/links/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,22 @@ def reduce_rel(x):
return x

rel_dir = os.path.dirname(rel_url)
href = str.split(os.path.join(rel_dir, href), '/')
href = str.split(os.path.join(rel_dir, href), os.sep)
href = reduce_rel(href)
href[-1], _ = os.path.splitext(href[-1])

return os.path.join(*href)

def get_body_id(url: str):
section, _ = os.path.splitext(url)
return '{}:'.format(section)
return '{}:'.format(section).strip('/')

# Prepare inter-page xrefs; these are recalculated relative to the output directory
def get_xref_href(href: str, base_url: str, output_dir: str):
    """Resolve ``href`` against ``base_url`` and express it relative to ``output_dir``.

    Args:
        href: Link target to resolve (may be relative to ``base_url``).
        base_url: URL of the page that contains the link.
        output_dir: Name (or path) of the build output directory; everything
            up to and including its last occurrence as a whole path segment
            is removed from the resolved URL.

    Returns:
        The resolved target without its file extension and without leading
        or trailing slashes. If ``output_dir`` does not occur in the
        resolved URL, the whole URL (minus extension) is returned.
    """
    full_url = urls.iri_to_uri(urls.urljoin(base_url, href))

    root = output_dir.strip('/')
    if not root:
        # Nothing to strip; str.split('') would raise ValueError.
        rel_url = full_url
    elif full_url.endswith('/' + root):
        # The link points at the output directory itself.
        rel_url = ''
    else:
        # Match the directory as a complete path segment so that names which
        # merely contain it (e.g. "site-map.html" or "website/") are not cut.
        # rpartition yields the whole URL as the tail when there is no match.
        rel_url = full_url.rpartition('/{}/'.format(root))[2]

    out, _ = os.path.splitext(rel_url)

    return out.strip('/')


40 changes: 32 additions & 8 deletions mkdocs_pdf_export_plugin/preprocessor/prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,40 @@
from weasyprint import urls
from bs4 import BeautifulSoup

def get_combined(soup: BeautifulSoup, base_url: str, rel_url: str):
for id in soup.find_all(id=True):
id['id'] = transform_id(id['id'], rel_url)
def get_combined(soup: BeautifulSoup, base_url: str, rel_url: str, output_dir: str):

for a in soup.find_all('a', href=True):
if urls.url_is_absolute(a['href']) or os.path.isabs(a['href']):
continue
# the relative URL base is the incoming rel_url with the extension and trailing slash stripped off.
if rel_url.count('.') is 1:
rel_url_base, _ = rel_url.split('.')
else:
rel_url_base = rel_url
rel_url_base = rel_url_base.strip('/')

# Only process links and headings that are inside the article tag
for article in soup.find_all('article'):

# If the permalink plugin is active, headings will contain extra useless links. Delete these.
for headerlink in article.find_all('a', {'class':'headerlink'}):
headerlink.decompose()

# Process H1 elements. There *should* be only one of these per page, but process any that are found.
# Format for H1 IDs is "path/to/page:" with a trailing colon.
for title in article.find_all('h1'):
title['id'] = '{}:'.format(rel_url_base)
print('[pdf export] Processing page: "{}"'.format(title.string))

# process H2-6 elements.
# Format for H2-6 IDs is "path/to/page:subhead-id".
for heading in article.find_all(['h2','h3','h4','h5','h6']):
heading['id'] = '{}:{}'.format(rel_url_base, heading.get('id'))

# process body (article) link hrefs.
for a in article.find_all('a', href=True):
if urls.url_is_absolute(a['href']) or os.path.isabs(a['href']):
continue

a['href'] = transform_href(a['href'], rel_url_base, base_url, output_dir)

a['href'] = transform_href(a['href'], rel_url)

soup.body['id'] = get_body_id(rel_url)
soup = replace_asset_hrefs(soup, base_url)
return soup
Expand Down
7 changes: 4 additions & 3 deletions mkdocs_pdf_export_plugin/renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@
from .preprocessor import get_separate as prep_separate, get_combined as prep_combined

class Renderer(object):
def __init__(self, combined: bool, theme: str, theme_handler_path: str=None):
def __init__(self, combined: bool, theme: str, theme_handler_path: str=None, output_dir: str='site'):
self.theme = self._load_theme_handler(theme, theme_handler_path)
self.combined = combined
self.page_order = []
self.pgnum = 0
self.pages = []
self.output_dir = output_dir

def write_pdf(self, content: str, base_url: str, filename: str):
self.render_doc(content, base_url).write_pdf(filename)
Expand All @@ -34,7 +35,7 @@ def render_doc(self, content: str, base_url: str, rel_url: str = None):


if self.combined:
soup = prep_combined(soup, base_url, rel_url)
soup = prep_combined(soup, base_url, rel_url, self.output_dir)
else:
soup = prep_separate(soup, base_url)

Expand All @@ -45,7 +46,7 @@ def add_doc(self, content: str, base_url: str, rel_url: str):
pos = self.page_order.index(rel_url)
self.pages[pos] = (content, base_url, rel_url)

def write_combined_pdf(self, output_path: str):
def write_combined_pdf(self, output_path: str, output_dir:str):
rendered_pages = []
for p in self.pages:
if p is None:
Expand Down