diff --git a/README.md b/README.md index b0bcef9..1503b9c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,12 @@ -# MkDocs PDF Export Plugin [![Build Status][travis-status]][travis-link] +# MkDocs PDF Export Plugin (refactored) + +This project is a fork of mkdocs-pdf-export-plugin by @zhaoterryy. + +This version correctly resolves inter-page cross-reference links in generated PDF documents, no matter where they are in a nested directory structure. + +**IMPORTANT** I have not registered this package with PyPI, so following the instructions below will install the *original* package, not this one. For now, the only way to use this is to install it locally, or use pip and then overwrite the plugin with these files. + +--- *An MkDocs plugin to export content pages as PDF files* diff --git a/mkdocs_pdf_export_plugin/plugin.py b/mkdocs_pdf_export_plugin/plugin.py index 271213e..669aca4 100644 --- a/mkdocs_pdf_export_plugin/plugin.py +++ b/mkdocs_pdf_export_plugin/plugin.py @@ -16,7 +16,9 @@ class PdfExportPlugin(BasePlugin): ('enabled_if_env', config_options.Type(utils.string_types)), ('combined', config_options.Type(bool, default=False)), ('combined_output_path', config_options.Type(utils.string_types, default="pdf/combined.pdf")), - ('theme_handler_path', config_options.Type(utils.string_types)) + ('theme_handler_path', config_options.Type(utils.string_types)), + # Declaring the output ('site') directory manually here; this should come in with the global config object though! 
+ ('output_dir', config_options.Type(utils.string_types, default='site')) ) def __init__(self): @@ -41,8 +43,7 @@ def on_config(self, config): print('Combined PDF export is enabled') from .renderer import Renderer - self.renderer = Renderer(self.combined, config['theme'].name, self.config['theme_handler_path']) - + self.renderer = Renderer(self.combined, config['theme'].name, self.config['theme_handler_path'], self.config['output_dir']) from weasyprint.logger import LOGGER import logging @@ -88,6 +89,7 @@ def on_post_page(self, output_content, page, config): from weasyprint import urls base_url = urls.path2url(os.path.join(path, filename)) + pdf_file = filename + '.pdf' try: @@ -116,7 +118,7 @@ def on_post_build(self, config): abs_pdf_path = os.path.join(config['site_dir'], self.config['combined_output_path']) os.makedirs(os.path.dirname(abs_pdf_path), exist_ok=True) - self.renderer.write_combined_pdf(abs_pdf_path) + self.renderer.write_combined_pdf(abs_pdf_path, self.config['output_dir']) end = timer() self.total_time += (end - start) diff --git a/mkdocs_pdf_export_plugin/preprocessor/links/__init__.py b/mkdocs_pdf_export_plugin/preprocessor/links/__init__.py index fae95c2..9694017 100644 --- a/mkdocs_pdf_export_plugin/preprocessor/links/__init__.py +++ b/mkdocs_pdf_export_plugin/preprocessor/links/__init__.py @@ -1,2 +1,2 @@ from .transform import transform_href, transform_id -from .util import get_body_id, replace_asset_hrefs, rel_pdf_href \ No newline at end of file +from .util import get_body_id, replace_asset_hrefs, rel_pdf_href, get_xref_href \ No newline at end of file diff --git a/mkdocs_pdf_export_plugin/preprocessor/links/transform.py b/mkdocs_pdf_export_plugin/preprocessor/links/transform.py index fcc85c8..2a8dfa2 100644 --- a/mkdocs_pdf_export_plugin/preprocessor/links/transform.py +++ b/mkdocs_pdf_export_plugin/preprocessor/links/transform.py @@ -1,36 +1,32 @@ import os -from .util import is_doc, normalize_href +from .util import is_doc, 
# normalize href to #foo/bar/section:id
def transform_href(href: str, rel_url: str, base_url: str, output_dir: str):
    """Rewrite a relative link so it targets an anchor inside the combined PDF.

    The result has the form ``#<page path>:<anchor>``; the anchor part is
    empty when the link points at a page rather than at a heading.

    Args:
        href: The relative link as written in the page.
        rel_url: Path of the current page; used (with surrounding slashes
            removed) as the page path for same-page ``#anchor`` links.
        base_url: URL of the current page, forwarded to ``get_xref_href`` to
            resolve links that point at other pages.
        output_dir: Name of the build output directory, forwarded to
            ``get_xref_href``.

    Returns:
        The rewritten in-document href.
    """
    # Compare by value: `is 1` / `is ''` test object identity, which only
    # happens to work through CPython's caching of small ints and the empty
    # string, and triggers a SyntaxWarning on Python 3.8+.
    if href.count('#') == 1:
        path, anchor = href.split('#')

        if not path:
            # Bare "#anchor": the target lives on the current page.
            xref = rel_url.strip('/')
        else:
            xref = get_xref_href(path.strip('/'), base_url, output_dir)

        return '#{}:{}'.format(xref, anchor)

    # Zero (or several) '#' characters: treat the whole href as a page link.
    xref = get_xref_href(href, base_url, output_dir)
    return '#{}:'.format(xref)
def get_combined(soup: BeautifulSoup, base_url: str, rel_url: str, output_dir: str):
    """Prepare one rendered page for inclusion in the combined PDF.

    Heading ids and relative links inside every ``<article>`` are rewritten to
    the ``path/to/page:anchor`` scheme so that cross-page links keep working
    once all pages live in a single document.

    Args:
        soup: Parsed HTML of the page; modified in place.
        base_url: URL of the page, used to resolve relative links and assets.
        rel_url: Path of the page relative to the site root.
        output_dir: Name of the build output directory, forwarded to
            ``transform_href``.

    Returns:
        The soup returned by ``replace_asset_hrefs``.
    """
    # The relative URL base is rel_url without its extension and without
    # surrounding slashes. os.path.splitext only looks at the last path
    # component, so a dot in a directory name ("v1.2/page/") is left alone;
    # the previous `split('.')` truncated such paths to "v1", and its
    # `count('.') is 1` guard compared an int by identity.
    if rel_url == '.':
        # A bare "." (presumably the site root page -- confirm against the
        # caller) historically mapped to an empty base; keep that.
        rel_url_base = ''
    else:
        rel_url_base = os.path.splitext(rel_url)[0].strip('/')

    # Only process links and headings that are inside the article tag
    for article in soup.find_all('article'):

        # If the permalink plugin is active, headings will contain extra
        # useless links. Delete these.
        for headerlink in article.find_all('a', {'class': 'headerlink'}):
            headerlink.decompose()

        # Process H1 elements. There *should* be only one of these per page,
        # but process any that are found.
        # Format for H1 IDs is "path/to/page:" with a trailing colon.
        for title in article.find_all('h1'):
            title['id'] = '{}:'.format(rel_url_base)
            print('[pdf export] Processing page: "{}"'.format(title.string))

        # Process H2-6 elements.
        # Format for H2-6 IDs is "path/to/page:subhead-id".
        for heading in article.find_all(['h2', 'h3', 'h4', 'h5', 'h6']):
            heading_id = heading.get('id')
            if heading_id is None:
                # Nothing can link to a heading without an id; do not invent
                # a bogus "path/to/page:None" target for it.
                continue
            heading['id'] = '{}:{}'.format(rel_url_base, heading_id)

        # Process body (article) link hrefs; absolute URLs and absolute
        # filesystem paths are left untouched.
        for a in article.find_all('a', href=True):
            if urls.url_is_absolute(a['href']) or os.path.isabs(a['href']):
                continue

            a['href'] = transform_href(a['href'], rel_url_base, base_url, output_dir)

    soup = replace_asset_hrefs(soup, base_url)
    return soup
add_doc(self, content: str, base_url: str, rel_url: str): pos = self.page_order.index(rel_url) self.pages[pos] = (content, base_url, rel_url) - def write_combined_pdf(self, output_path: str): + def write_combined_pdf(self, output_path: str, output_dir:str): rendered_pages = [] for p in self.pages: if p is None: