zhaoterryy · erilot · Nov 18, 2019
diff --git a/README.md b/README.md
@@ -1,4 +1,12 @@
-# MkDocs PDF Export Plugin [![Build Status][travis-status]][travis-link]
+# MkDocs PDF Export Plugin (refactored)
+
+This project is a fork of mkdocs-pdf-export-plugin by @zhaoterryy.
+
+This version correctly resolves inter-page cross-reference links in generated PDF documents, no matter where they are in a nested directory structure.
+
+**IMPORTANT:** I have not registered this package with PyPI, so following the instructions below will install the *original* package, not this one. For now, the only way to use this fork is to install it locally, or to install the original with pip and then overwrite the plugin with these files.
+
+---
 
 *An MkDocs plugin to export content pages as PDF files*
 

diff --git a/mkdocs_pdf_export_plugin/plugin.py b/mkdocs_pdf_export_plugin/plugin.py
@@ -16,7 +16,9 @@ class PdfExportPlugin(BasePlugin):
         ('enabled_if_env', config_options.Type(utils.string_types)),
         ('combined', config_options.Type(bool, default=False)),
         ('combined_output_path', config_options.Type(utils.string_types, default="pdf/combined.pdf")),
-        ('theme_handler_path', config_options.Type(utils.string_types))
+        ('theme_handler_path', config_options.Type(utils.string_types)),
+        # Declaring the output ('site') directory manually here; this should come in with the global config object though!
+        ('output_dir', config_options.Type(utils.string_types, default='site'))
     )
 
     def __init__(self):
@@ -41,8 +43,7 @@ def on_config(self, config):
             print('Combined PDF export is enabled')
 
         from .renderer import Renderer
-        self.renderer = Renderer(self.combined, config['theme'].name, self.config['theme_handler_path'])
-
+        self.renderer = Renderer(self.combined, config['theme'].name, self.config['theme_handler_path'], self.config['output_dir'])
         from weasyprint.logger import LOGGER
         import logging
 
@@ -88,6 +89,7 @@ def on_post_page(self, output_content, page, config):
 
         from weasyprint import urls
         base_url = urls.path2url(os.path.join(path, filename))
+
         pdf_file = filename + '.pdf'
 
         try:
@@ -116,7 +118,7 @@ def on_post_build(self, config):
 
             abs_pdf_path = os.path.join(config['site_dir'], self.config['combined_output_path'])
             os.makedirs(os.path.dirname(abs_pdf_path), exist_ok=True)
-            self.renderer.write_combined_pdf(abs_pdf_path)
+            self.renderer.write_combined_pdf(abs_pdf_path, self.config['output_dir'])
 
             end = timer()
             self.total_time += (end - start)

diff --git a/mkdocs_pdf_export_plugin/preprocessor/links/__init__.py b/mkdocs_pdf_export_plugin/preprocessor/links/__init__.py
@@ -1,2 +1,2 @@
 from .transform import transform_href, transform_id
-from .util import get_body_id, replace_asset_hrefs, rel_pdf_href
+from .util import get_body_id, replace_asset_hrefs, rel_pdf_href, get_xref_href
diff --git a/mkdocs_pdf_export_plugin/preprocessor/links/transform.py b/mkdocs_pdf_export_plugin/preprocessor/links/transform.py
@@ -1,36 +1,32 @@
 import os
 
-from .util import is_doc, normalize_href
+from .util import is_doc, normalize_href, abs_asset_href, get_xref_href
 
 # normalize href to #foo/bar/section:id
-def transform_href(href: str, rel_url: str):
-    head, tail = os.path.split(href)
-
-    num_hashtags = tail.count('#')
-
-    if tail.startswith('#'):
-        head, section = os.path.split(rel_url)
-        section = os.path.splitext(section)[0]
-        id = tail[1:]
-    elif num_hashtags is 1:
-        section, ext = tuple(os.path.splitext(tail))
-        id = str.split(ext, '#')[1]
-
-        if head == '..':
-            href = normalize_href(href, rel_url)
-            return '#{}:{}'.format(href, id)
-
-    elif num_hashtags is 0:
-        if not is_doc(href):
-            return href
-
-        href = normalize_href(href, rel_url)
-        return '#{}:'.format(href)
-
-    if head != '':
-        head += '/'
+def transform_href(href: str, rel_url: str, base_url: str, output_dir: str):
+    """Rewrite a page link as an in-document anchor for the combined PDF.
+
+    href is the link target found in the page, rel_url is the current page's
+    path (used for same-page '#anchor' links), and base_url / output_dir are
+    forwarded to get_xref_href to resolve links to other pages.
+    Returns '#<page>:<anchor>', or '#<page>:' when href has no single anchor.
+    """
+    # Compare by value: 'is' tests object identity and only happens to work
+    # for small ints and '' in CPython (SyntaxWarning since Python 3.8).
+    if href.count('#') == 1:
+        path, anchor = href.split('#')
+
+        if path == '':
+            # Same-page link: the target page is the current one.
+            xref = rel_url.strip('/')
+        else:
+            xref = get_xref_href(path.strip('/'), base_url, output_dir)
+
+        return '#{}:{}'.format(xref, anchor)
+
+    xref = get_xref_href(href, base_url, output_dir)
+    return '#{}:'.format(xref)
 
-    return '#{}{}:{}'.format(head, section, id)
 
 # normalize id to foo/bar/section:id
 def transform_id(id: str, rel_url: str):
@@ -40,4 +36,4 @@ def transform_id(id: str, rel_url: str):
     if len(head) > 0:
         head += '/'
 
-    return '{}{}:{}'.format(head, section, id)
+    return '{}{}:{}'.format(head, section, id)
diff --git a/mkdocs_pdf_export_plugin/preprocessor/links/util.py b/mkdocs_pdf_export_plugin/preprocessor/links/util.py
@@ -59,12 +59,22 @@ def reduce_rel(x):
             return x
 
     rel_dir = os.path.dirname(rel_url)
-    href = str.split(os.path.join(rel_dir, href), '/')
+    href = str.split(os.path.join(rel_dir, href), os.sep)
     href = reduce_rel(href)
     href[-1], _ = os.path.splitext(href[-1])
 
     return os.path.join(*href)
 
 def get_body_id(url: str):
     section, _ = os.path.splitext(url)
-    return '{}:'.format(section)
+    return '{}:'.format(section).strip('/')
+
+# Prepare inter-page xrefs; these are recalculated relative to the output directory
+def get_xref_href(href: str, base_url: str, output_dir: str):
+    """Resolve href against base_url; return the extension-less path after output_dir."""
+    resolved = urls.iri_to_uri(urls.urljoin(base_url, href))
+    # Keep only the text after the final split on output_dir.
+    *_, tail = resolved.split(output_dir)
+    return os.path.splitext(tail)[0].strip('/')
+
+
diff --git a/mkdocs_pdf_export_plugin/preprocessor/prep.py b/mkdocs_pdf_export_plugin/preprocessor/prep.py
@@ -5,16 +5,40 @@
 from weasyprint import urls
 from bs4 import BeautifulSoup
 
-def get_combined(soup: BeautifulSoup, base_url: str, rel_url: str):
-    for id in soup.find_all(id=True):
-        id['id'] = transform_id(id['id'], rel_url)
+def get_combined(soup: BeautifulSoup, base_url: str, rel_url: str, output_dir: str):
+    """Rewrite heading ids and in-article links of one page for the combined PDF; returns the soup."""
 
-    for a in soup.find_all('a', href=True):
-        if urls.url_is_absolute(a['href']) or os.path.isabs(a['href']):
-            continue
+    # The relative URL base is rel_url minus its file extension and surrounding
+    # slashes. os.path.splitext only strips an extension from the last path
+    # component, so dots elsewhere in the path (e.g. "v1.2/page.html") are
+    # kept, matching how get_body_id derives its id.
+    rel_url_base = os.path.splitext(rel_url)[0].strip('/')
+
+    # Only process links and headings that are inside the article tag
+    for article in soup.find_all('article'):
+
+        # If the permalink plugin is active, headings will contain extra useless links. Delete these.
+        for headerlink in article.find_all('a', {'class':'headerlink'}):
+            headerlink.decompose()
+
+        # Process H1 elements. There *should* be only one of these per page, but process any that are found.
+        # Format for H1 IDs is "path/to/page:" with a trailing colon.
+        for title in article.find_all('h1'):
+            title['id'] = '{}:'.format(rel_url_base)
+            print('[pdf export] Processing page: "{}"'.format(title.string))
+
+        # Process H2-6 elements.
+        # Format for H2-6 IDs is "path/to/page:subhead-id".
+        for heading in article.find_all(['h2','h3','h4','h5','h6']):
+            heading['id'] = '{}:{}'.format(rel_url_base, heading.get('id'))
+
+        # Process body (article) link hrefs; absolute URLs and absolute paths are left untouched.
+        for a in article.find_all('a', href=True):
+            if urls.url_is_absolute(a['href']) or os.path.isabs(a['href']):
+                continue
+
+            a['href'] = transform_href(a['href'], rel_url_base, base_url, output_dir)
 
-        a['href'] = transform_href(a['href'], rel_url)
-
     soup.body['id'] = get_body_id(rel_url)
     soup = replace_asset_hrefs(soup, base_url)
     return soup

diff --git a/mkdocs_pdf_export_plugin/renderer.py b/mkdocs_pdf_export_plugin/renderer.py
@@ -10,12 +10,13 @@
 from .preprocessor import get_separate as prep_separate, get_combined as prep_combined
 
 class Renderer(object):
-    def __init__(self, combined: bool, theme: str, theme_handler_path: str=None):
+    def __init__(self, combined: bool, theme: str, theme_handler_path: str=None, output_dir: str='site'):
         self.theme = self._load_theme_handler(theme, theme_handler_path)
         self.combined = combined
         self.page_order = []
         self.pgnum = 0
         self.pages = []
+        self.output_dir = output_dir
 
     def write_pdf(self, content: str, base_url: str, filename: str):
         self.render_doc(content, base_url).write_pdf(filename)
@@ -34,7 +35,7 @@ def render_doc(self, content: str, base_url: str, rel_url: str = None):
 
 
         if self.combined:
-            soup = prep_combined(soup, base_url, rel_url)
+            soup = prep_combined(soup, base_url, rel_url, self.output_dir)
         else:
             soup = prep_separate(soup, base_url)
 
@@ -45,7 +46,7 @@ def add_doc(self, content: str, base_url: str, rel_url: str):
         pos = self.page_order.index(rel_url)
         self.pages[pos] = (content, base_url, rel_url)
 
-    def write_combined_pdf(self, output_path: str):
+    def write_combined_pdf(self, output_path: str, output_dir:str):
         rendered_pages = []
         for p in self.pages:
             if p is None: