diff --git a/Dockerfile b/Dockerfile
index e9787418..02a400a7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,7 +14,9 @@ RUN apt-get update && \
libgmp10 \
libgmpxx4ldbl \
openjdk-8-jdk \
+ pandoc \
python3-minimal \
+ python3-nh3 \
python3-pip \
python3-plastex \
python3-yaml \
diff --git a/README.md b/README.md
index 601de517..e490f67e 100644
--- a/README.md
+++ b/README.md
@@ -207,17 +207,17 @@ The dependencies needed to *build/install* problemtools can be installed with:
And the dependencies needed to *run* problemtools can be installed with:
- sudo apt install ghostscript libgmpxx4ldbl python3-minimal python-pkg-resources python3-plastex python3-yaml texlive-fonts-recommended texlive-lang-cyrillic texlive-latex-extra texlive-plain-generic tidy
+ sudo apt install ghostscript libgmpxx4ldbl pandoc python3-minimal python3-nh3 python-pkg-resources python3-plastex python3-yaml texlive-fonts-recommended texlive-lang-cyrillic texlive-latex-extra texlive-plain-generic tidy
### Fedora
On Fedora, these dependencies can be installed with:
- sudo dnf install boost-regex gcc gmp-devel gmp-c++ python3 python3-pyyaml texlive-latex texlive-collection-fontsrecommended texlive-fancyhdr texlive-subfigure texlive-wrapfig texlive-import texlive-ulem texlive-xifthen texlive-overpic texlive-pbox tidy ghostscript
+ sudo dnf install boost-regex gcc gmp-devel gmp-c++ pandoc python3 python3-pyyaml texlive-latex texlive-collection-fontsrecommended texlive-fancyhdr texlive-subfigure texlive-wrapfig texlive-import texlive-ulem texlive-xifthen texlive-overpic texlive-pbox tidy ghostscript
Followed by:
- pip3 install --user plastex
+ pip3 install --user plastex nh3
### Arch
Package is available on the AUR [kattis-problemtools-git](https://aur.archlinux.org/packages/kattis-problemtools-git). Use your favorite AUR helper or follow the installation instructions found [here](https://wiki.archlinux.org/title/Arch_User_Repository#Installing_and_upgrading_packages).
diff --git a/admin/docker/Dockerfile.build b/admin/docker/Dockerfile.build
index e2f7a3bf..e7041fb9 100644
--- a/admin/docker/Dockerfile.build
+++ b/admin/docker/Dockerfile.build
@@ -25,6 +25,7 @@ RUN apt update && \
libgmp-dev \
libgmp10 \
libgmpxx4ldbl \
+ pandoc \
python3 \
python3-pytest \
python3-setuptools \
diff --git a/admin/docker/Dockerfile.full b/admin/docker/Dockerfile.full
index 40580dd6..9fb5196a 100644
--- a/admin/docker/Dockerfile.full
+++ b/admin/docker/Dockerfile.full
@@ -23,6 +23,7 @@ RUN apt-get update && \
mono-complete \
nodejs \
ocaml-nox \
+ pandoc \
php-cli \
pypy \
rustc \
diff --git a/admin/docker/Dockerfile.minimal b/admin/docker/Dockerfile.minimal
index 534e661f..886d1a2d 100644
--- a/admin/docker/Dockerfile.minimal
+++ b/admin/docker/Dockerfile.minimal
@@ -20,6 +20,7 @@ RUN apt update && \
apt install -y \
ghostscript \
libgmpxx4ldbl \
+ pandoc \
python-pkg-resources \
python3-minimal \
python3-yaml \
diff --git a/debian/control b/debian/control
index 42797c8b..1d39a4a9 100644
--- a/debian/control
+++ b/debian/control
@@ -2,13 +2,13 @@ Source: kattis-problemtools
Section: devel
Priority: optional
Maintainer: Per Austrin
") + statement_html = statement_html[:pos] + "".join(remaining_samples) + statement_html[pos:] + + with open(destfile, "w", encoding="utf-8", errors="xmlcharrefreplace") as output_file: + output_file.write(statement_html) + + if options.css: + shutil.copyfile(os.path.join(templatepath, "problem.css"), "problem.css") + + return True + + +def sanitize_html(problem: str, statement_html: str): + # Allow footnote ids (the anchor points you jump to) + def is_fn_id(s): + pattern_id_top = r'^fn\d+$' + pattern_id_bottom = r'^fnref\d+$' + return bool(re.fullmatch(pattern_id_top, s)) or bool(re.fullmatch(pattern_id_bottom, s)) + + allowed_classes = ("sample", "problemheader", "problembody", + "sampleinteractionwrite", "sampleinteractionread", + "footnotes") + + def is_image_valid(problem_root: str, img_src: str) -> str | None: + # Check that the image exists and uses an allowed extension + extension = Path(img_src).suffix + # TODO: fix svg sanitization and allow svg + if extension not in statement_util.ALLOWED_IMAGE_EXTENSIONS: + return f"Unsupported image extension {extension} for image {img_src}" + + source_file = Path(problem_root) / "statement" / img_src + if not source_file.exists(): + return f"Resource file {img_src} not found in statement" + return None + + # Annoying: nh3 will ignore exceptions in attribute_filter + image_fail_reason: str | None = None + + def attribute_filter(tag, attribute, value): + if attribute == "class" and value in allowed_classes: + return value + if tag == "a" and attribute == "href": + return value + if tag in ("li", "a") and attribute == "id" and is_fn_id(value): + return value + if tag == "img" and attribute == "src": + fail = is_image_valid(problem, value) + if fail: + nonlocal image_fail_reason + image_fail_reason = fail + return None + copy_image(problem, value) + return value + return None + + statement_html = nh3.clean(statement_html, + link_rel="noopener nofollow noreferrer", + attribute_filter=attribute_filter, + 
tags=nh3.ALLOWED_TAGS | {"img", "a", "section"}, + attributes={"table": {"class"}, "div": {"class"}, "section": {"class"}, "img": {"src"}, + "a": {"href", "id"}, "li": {"id"}}, + ) + + if image_fail_reason: + assert isinstance(image_fail_reason, str) + if "Unsupported" in image_fail_reason: + raise ValueError(image_fail_reason) + raise FileNotFoundError(image_fail_reason) + + return statement_html + + +def copy_image(problem_root: str, img_src: str) -> None: + """Copy image to output directory + + Args: + problem_root: the root of the problem directory + img_src: the image source as in the Markdown statement + """ + + source_name = os.path.join(problem_root, "statement", img_src) + + if os.path.isfile(img_src): # already copied + return + shutil.copyfile(source_name, img_src) diff --git a/problemtools/problem2html.py b/problemtools/problem2html.py index 86137a59..1807c2ad 100644 --- a/problemtools/problem2html.py +++ b/problemtools/problem2html.py @@ -4,67 +4,21 @@ import os.path import string import argparse -import logging import subprocess -from . import template +from . import tex2html +from . import md2html +from . 
import statement_util def convert(options: argparse.Namespace) -> None: - # PlasTeX.Logging statically overwrites logging and formatting, so delay loading - import plasTeX.TeX - import plasTeX.Logging - from .ProblemPlasTeX import ProblemRenderer - from .ProblemPlasTeX import ProblemsetMacros - problem = os.path.realpath(options.problem) + if not os.path.isdir(problem): + raise Exception(f"Problem does not exist: {problem}") + problembase = os.path.splitext(os.path.basename(problem))[0] destdir = string.Template(options.destdir).safe_substitute(problem=problembase) destfile = string.Template(options.destfile).safe_substitute(problem=problembase) - imgbasedir = string.Template(options.imgbasedir).safe_substitute(problem=problembase) - - if options.quiet: - plasTeX.Logging.disableLogging() - else: - plasTeX.Logging.getLogger().setLevel(getattr(logging, options.loglevel.upper())) - plasTeX.Logging.getLogger('status').setLevel(getattr(logging, options.loglevel.upper())) - - texfile = problem - # Set up template if necessary - with template.Template(problem, language=options.language) as templ: - texfile = open(templ.get_file_name(), 'r') - - origcwd = os.getcwd() - - # Setup parser and renderer etc - - # plasTeX version 3 changed the name of this argument (and guarding against this - # by checking plasTeX.__version__ fails on plastex v3.0 which failed to update - # __version__) - try: - tex = plasTeX.TeX.TeX(myfile=texfile) - except Exception: - tex = plasTeX.TeX.TeX(file=texfile) - - ProblemsetMacros.init(tex) - - tex.ownerDocument.config['general']['copy-theme-extras'] = options.css - if not options.headers: - tex.ownerDocument.userdata['noheaders'] = True - tex.ownerDocument.config['files']['filename'] = destfile - tex.ownerDocument.config['images']['filenames'] = 'img-$num(4)' - tex.ownerDocument.config['images']['enabled'] = False - tex.ownerDocument.config['images']['imager'] = 'none' - tex.ownerDocument.config['images']['base-url'] = imgbasedir - # tell plasTeX 
where to search for problemtools' built-in packages - tex.ownerDocument.config['general']['packages-dirs'] = [os.path.join(os.path.dirname(__file__), 'ProblemPlasTeX')] - - renderer = ProblemRenderer() - - if not options.quiet: - print('Parsing TeX source...') - doc = tex.parse() - texfile.close() # Go to destdir if destdir: @@ -75,12 +29,13 @@ def convert(options: argparse.Namespace) -> None: try: if not options.quiet: print('Rendering!') - renderer.render(doc) - # Annoying: I have not figured out any way of stopping the plasTeX - # renderer from generating a .paux file - if os.path.isfile('.paux'): - os.remove('.paux') + origcwd = os.getcwd() + + if statement_util.find_statement_extension(problem, options.language) == "tex": + tex2html.convert(problem, options) + else: + md2html.convert(problem, options) if options.tidy: with open(os.devnull, 'w') as devnull: diff --git a/problemtools/problem2pdf.py b/problemtools/problem2pdf.py index ec2b3ad1..4e687fac 100644 --- a/problemtools/problem2pdf.py +++ b/problemtools/problem2pdf.py @@ -1,20 +1,111 @@ #! /usr/bin/env python3 # -*- coding: utf-8 -*- +import argparse import os.path +import re import shutil import string -import argparse import subprocess +import tempfile +from pathlib import Path + from . import template +from . import statement_util def convert(options: argparse.Namespace) -> bool: - problem = os.path.realpath(options.problem) - problembase = os.path.splitext(os.path.basename(problem))[0] + problem_root = os.path.realpath(options.problem) + + if statement_util.find_statement_extension(problem_root, language=options.language) == "md": + return md2pdf(options) + else: + return latex2pdf(options) + + +def md2pdf(options: argparse.Namespace) -> bool: + """Renders a Markdown document to pdf. 
Uses pandoc md -> tex, then + reuses the normal tex -> pdf pipeline + """ + problem_root = os.path.realpath(options.problem) + statement_path = statement_util.find_statement(problem_root, extension="md", language=options.language) + + if not statement_path or not os.path.isfile(statement_path): + raise FileNotFoundError(f"Error! {statement_path} does not exist") + + statement_util.assert_images_are_valid_md(statement_path) + + language = options.language + if not language: + language = "en" + temp_tex_file = Path(statement_path).parent / f"problem.{language}.tex" + command = ["pandoc", statement_path, "-o", str(temp_tex_file)] + try: + subprocess.run(command, capture_output=True, + text=True, shell=False, check=True + ) + except subprocess.CalledProcessError as e: + print(f"Error compiling Markdown to pdf: {e.stderr}") + return False + + try: + with open(temp_tex_file, "r", encoding="utf-8") as f: + tex = f.read() + + def format_latex_tables(latex_doc): + # Match table environments produced by pandoc + pattern = r''' + (\\begin\{longtable\}\[\]\{@\{\}) + ([a-z]) + ([a-z]*) + (@\{\}\}) + ''' + + def replacer(match): + prefix = match.group(1)[:-3] + first_col = match.group(2) + other_cols = match.group(3) + suffix = match.group(4)[3:] + + # Combine columns with | separators + cols = [first_col] + list(other_cols) + return f'{prefix}|{"|".join(cols)}|{suffix} \\hline' + + return re.sub(pattern, replacer, latex_doc, flags=re.VERBOSE) + + # Add solid outline to tables + tex = format_latex_tables(tex) + tex = tex.replace(r"\toprule", "") + tex = tex.replace(r"\midrule", "") + tex = tex.replace(r"\endhead", "") + tex = tex.replace(r"\bottomrule", "") + tex = tex.replace(r"\tabularnewline", r"\\ \hline") + + # Fix sample inclusions commands + # Currently does not work, as normal problemtools tex -> pdf does not support it + tex = tex.replace(r"\{\{nextsample\}\}", r"\nextsample") + tex = tex.replace(r"\{\{remainingsamples\}\}", r"\remainingsamples") + + problem_name = 
statement_util.get_yaml_problem_name(problem_root, options.language)
+        tex = r'\problemname{' + problem_name + '}\n' + tex
+        with open(temp_tex_file, "w", encoding="utf-8") as f:
+            f.write(tex)
+
+        status = latex2pdf(options)  # a bool (True on success), not an exit code
+        if not status:
+            return False
+    finally:
+        temp_tex_file.unlink()
+
+    return True
+
+
+def latex2pdf(options: argparse.Namespace) -> bool:
+    problem_root = os.path.realpath(options.problem)
+    problembase = os.path.splitext(os.path.basename(problem_root))[0]
     destfile = string.Template(options.destfile).safe_substitute(problem=problembase)

     # Set up template if necessary
-    with template.Template(problem, language=options.language) as templ:
+    with template.Template(problem_root, language=options.language) as templ:
         texfile = templ.get_file_name()

         origcwd = os.getcwd()
@@ -41,7 +132,32 @@ def convert(options: argparse.Namespace) -> bool:
         if status == 0 and not options.nopdf:
             shutil.move(os.path.splitext(texfile)[0] + '.pdf', destfile)

-    return status == 0
+    # status is compared against 0 above; any non-zero value is a failure
+    if status:
+        return False
+
+    # With nopdf set, no PDF was moved to destfile, so there is nothing to sanitize
+    if options.nopdf:
+        return True
+
+    # Rewrite the PDF through gs (pdfwrite device) into a temporary file and
+    # copy the result back over destfile
+    try:
+        with tempfile.NamedTemporaryFile(suffix='.pdf') as f:
+            command = ["gs", "-q", "-dBATCH", "-sDEVICE=pdfwrite", "-dNOPAUSE",
+                       "-dCompatibilityLevel=1.7", f"-sOutputFile={f.name}", destfile]
+            # check=True raises CalledProcessError on a non-zero exit, so
+            # reaching the copy below means gs succeeded
+            subprocess.run(command, capture_output=True,
+                text=True, shell=False, check=True
+            )
+            shutil.copy(f.name, destfile)
+    except subprocess.CalledProcessError as e:
+        print(f"Error sanitizing PDF: {e} {e.stderr}")
+        raise
+
+    return True
+

 def get_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
diff --git a/problemtools/statement_util.py b/problemtools/statement_util.py
new file mode 100644
index 00000000..3beab3d8
--- /dev/null
+++ b/problemtools/statement_util.py
@@ -0,0 +1,270 @@
+import os
+from typing import Optional, List, Tuple
+import html
+import json
+import re
+import subprocess
+import tempfile
+from pathlib import Path
+
+import
yaml + +from . import formatversion + +SUPPORTED_EXTENSIONS = ("tex", "md") +ALLOWED_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg") # ".svg" + + +def find_statement(problem_root: str, extension: str, language: Optional[str]) -> Optional[str]: + """Finds the "best" statement for given language and extension""" + statement_dir = Path(problem_root) / formatversion.get_format_data(problem_root).statement_directory + + candidates = [] + if language is None: + candidates = [ + statement_dir / f"problem.en.{extension}", + statement_dir / f"problem.{extension}", + ] + else: + candidates = [statement_dir / f"problem.{language}.{extension}"] + + for candidate in candidates: + if candidate.is_file(): + return str(candidate) + + return None + + +def find_statement_extension(problem_root: str, language: Optional[str]) -> str: + """Given a language, find whether the extension is tex or md + + Args: + problem_root: path to problem root + """ + extensions = [] + for ext in SUPPORTED_EXTENSIONS: + if find_statement(problem_root, ext, language) is not None: + extensions.append(ext) + # At most one extension per language to avoid arbitrary/hidden priorities + if len(extensions) > 1: + raise ValueError(f"""Found more than one type of statement ({' and '.join(extensions)}) + for language {language or 'en'}""") + if len(extensions) == 1: + return extensions[0] + raise FileNotFoundError(f"No statement found for language {language or 'en'}") + + +def get_yaml_problem_name(problem: str, language: Optional[str]) -> str: + """Finds the problem name from the problem.yaml file""" + # TODO: getting this should be done using verifyproblem + # Wait until new config parsing system is in place + config_file = Path(problem) / 'problem.yaml' + + if not config_file.is_file(): + raise FileNotFoundError("No problem.yaml found") + + try: + with open(config_file, "r", encoding="utf-8") as f: + config = yaml.safe_load(f) + if config is None: + config = {} + except Exception as e: + raise ValueError(f"Invalid 
problem.yaml: {e}") from e
+
+    if 'name' in config and not isinstance(config['name'], dict):
+        config['name'] = {'': config['name']}
+
+    names = config.get("name") or {}  # no name at all -> ValueError below, not TypeError from len(None)
+    # If there is only one language, per the spec that is the one we want
+    if len(names) == 1:
+        return next(iter(names.values()))
+
+    if language is None:
+        language = "en"
+    if language not in names:
+        raise ValueError(f"No problem name defined for language {language or 'en'}")
+    return names[language]
+
+
+def json_dfs(data, callback) -> None:
+    """Traverse all items in a JSON tree, find all images, and call callback for each one"""
+    if isinstance(data, dict):
+        for key, value in data.items():
+            # Markdown-style images
+            if key == 't' and value == 'Image':
+                callback(data['c'][2][0])
+            else:
+                json_dfs(value, callback)
+
+    elif isinstance(data, list):
+        for item in data:
+            json_dfs(item, callback)
+
+
+def foreach_image(statement_path, callback):
+    """ Find all images in the statement and call callback for each one """
+    command = ["pandoc", statement_path, "-t", "json"]
+    # Must create a working directory for pytest to work
+    with tempfile.TemporaryDirectory() as work_dir:
+        statement_json = subprocess.run(command, capture_output=True, text=True,
+                                        shell=False, check=True, cwd=work_dir).stdout
+
+    json_dfs(json.loads(statement_json), callback)
+
+
+def assert_image_is_valid(problem_root: str, img_src: str) -> None:
+    """ Check that the image exists and uses an allowed extension """
+    extension = Path(img_src).suffix
+    # TODO: fix svg sanitization and allow svg
+    if extension not in ALLOWED_IMAGE_EXTENSIONS:
+        raise ValueError(f"Unsupported image extension {extension} for image {img_src}")
+
+    source_file = Path(problem_root) / img_src
+    if not source_file.exists():
+        raise FileNotFoundError(f"Resource file {img_src} not found in statement")
+
+
+def assert_images_are_valid_md(statement_path: str) -> None:
+    """ Find all images in the statement and assert that they exist and
+    use valid image
extensions + + """ + problem_root = os.path.dirname(statement_path) + foreach_image(statement_path, + lambda img_name: assert_image_is_valid(problem_root, img_name)) + + +def inject_samples(statement_html: str, samples: List[str]) -> Tuple[str, List[str]]: + """Injects samples at occurences of {{nextsample}} and {{remainingsamples}} + Non-destructive + + Returns: + Statement with samples inject and left-over samples. + """ + + while True: + match = re.search(r'\{\{(nextsample|remainingsamples)\}\}', statement_html) + if not match: + break + matched_text = match.group(1) + if matched_text == "nextsample" and len(samples) == 0: + raise ValueError("Error: called {{nextsample}} without any samples left") + + num_inject = 1 if matched_text == "nextsample" else len(samples) + to_inject = "".join(samples[:num_inject]) + samples = samples[num_inject:] + + # Always inject, even if to_inject is empty + # This will remove all occurences of {{nextsample}} and {{remainingsamples}} + # (And also properly throw an error if {{nextsample}} is called with no samples left) + statement_html = statement_html[:match.start()] + to_inject + statement_html[match.end():] + + return statement_html, samples + + +def format_samples(problem_root: str) -> List[str]: + """Read all samples from the problem directory and convert them to pandoc-valid markdown + + Args: + problem_root: path to root of problem + + Returns: + List[str]: All samples, converted to a format appropriate to be pasted into + a markdown file. 
Ordered lexicographically by file names + """ + + sample_path = os.path.join(problem_root, "data", "sample") + if not os.path.isdir(sample_path): + return [] + samples = [] + casenum = 1 + for sample in sorted(os.listdir(sample_path)): + if sample.endswith(".interaction"): + samples.append(format_interactive_sample(sample_path, sample, casenum)) + casenum += 1 + continue + + if not sample.endswith(".in"): + continue + sample_name = sample[:-3] + outpath = os.path.join(sample_path, sample_name + ".ans") + if not os.path.isfile(outpath): + continue + + samples.append(format_normal_sample(sample_path, sample, casenum)) + casenum += 1 + + return samples + + +def format_normal_sample(sample_root: str, sample: str, casenum: int) -> str: + """ + + Args: + sample_root: root of the sample folder + sample: file name of the sample + casenum: which sample is this? (1, 2, 3...) + + Returns: + str: the sample, ready to be pasted into a markdown doc and fed to pandoc + """ + + with open(os.path.join(sample_root, sample), "r", encoding="utf-8") as infile: + sample_input = infile.read() + sample_name = sample[:-3] + outpath = os.path.join(sample_root, sample_name + ".ans") + with open(outpath, "r", encoding="utf-8") as outfile: + sample_output = outfile.read() + + return """ +
Sample Input %(case)d | +Sample Output %(case)d | +
---|---|
%(input)s |
+ %(output)s |
+
""" % ({"case": casenum, "input": html.escape(sample_input), + "output": html.escape(sample_output)}) + + +def format_interactive_sample(sample_root: str, sample: str, casenum: int) -> str: + """ + + Args: + sample_root: root of the sample folder + sample: file name of the sample + casenum: which sample is this? (1, 2, 3...) + + Returns: + str: the sample, ready to be pasted into a markdown doc and fed to pandoc + """ + + line = f""" +
Read | +Sample Interaction {casenum} | +Write | +
---|
""" + + with open(os.path.join(sample_root, sample), "r", encoding="utf-8") as infile: + sample_interaction = infile.readlines() + lines = [] + for interaction in sample_interaction: + data = html.escape(interaction[1:]) + line_type = "" + if interaction[0] == '>': + line_type = "sampleinteractionwrite" + elif interaction[0] == '<': + line_type = "sampleinteractionread" + else: + print(f"Warning: Interaction had unknown prefix {interaction[0]}") + lines.append(f"""
{html.escape(data)}
""") + + return line + ''.join(lines) diff --git a/problemtools/template.py b/problemtools/template.py index f0c7bc4b..b1e73466 100644 --- a/problemtools/template.py +++ b/problemtools/template.py @@ -16,18 +16,14 @@ def detect_version(problemdir, problemtex): class Template: - def __init__(self, problemdir, language=None, force_copy_cls=False, version="automatic"): + def __init__(self, problemdir, language=None, force_copy_cls=False): if not os.path.isdir(problemdir): raise Exception('%s is not a directory' % problemdir) if problemdir[-1] == '/': problemdir = problemdir[:-1] - if version == "automatic": - version_data = formatversion.get_format_data(problemdir) - - else: - version_data = formatversion.get_format_data_by_name(version) + version_data = formatversion.get_format_data(problemdir) stmtdir = os.path.join(problemdir, version_data.statement_directory) langs = [] diff --git a/problemtools/templates/latex/problemset.cls b/problemtools/templates/latex/problemset.cls index 1700901e..f747551c 100644 --- a/problemtools/templates/latex/problemset.cls +++ b/problemtools/templates/latex/problemset.cls @@ -50,6 +50,8 @@ \RequirePackage{url} % Urls \RequirePackage[normalem]{ulem} % \sout \RequirePackage[colorlinks=true,implicit=false]{hyperref} +\RequirePackage{longtable} % TODO: needed by Pandoc, but what do they do? +\RequirePackage{booktabs} % -||- \ifplastex\else \RequirePackage{xstring} \RequirePackage{pgffor} @@ -85,6 +87,9 @@ \addtolength{\textheight}{-\headheight} } +% Pandoc outputs these +\newcommand{\tightlist}{% + \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} % Typesetting sections in a problem diff --git a/problemtools/templates/markdown_html/default-layout.html b/problemtools/templates/markdown_html/default-layout.html new file mode 100644 index 00000000..93a84572 --- /dev/null +++ b/problemtools/templates/markdown_html/default-layout.html @@ -0,0 +1,35 @@ + + +
+ +
+ + + + + + +
+