diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a869c05..7d0e306 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,7 +35,7 @@ jobs: test: strategy: matrix: - html_parser: [floki, meeseeks] + html_parser: [Floki, Meeseeks, LazyHTML] version: - otp: 28.0 elixir: 1.19 diff --git a/CHANGELOG.md b/CHANGELOG.md index b23d4a0..4e1217a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Requires Elixir 1.14 or higher. * Fixed compiler warnings in `Premailex.HTMLParser.Meeseeks` * Fixed invalid spec in `Premailex.HTMLInlineStyles.process/3` +* Added support for `LazyHTML` ## v0.3.20 (2025-01-20) diff --git a/README.md b/README.md index 1c56139..e7a6b38 100644 --- a/README.md +++ b/README.md @@ -89,10 +89,32 @@ end ## HTML parser -By default, premailex uses [`Floki`](https://github.com/philss/floki) to parse HTML, but you can exchange it for any HTML parser you prefer. [`Meeseeks`](https://github.com/mischov/meeseeks) is supported with the [`Premailex.HTMLParser.Meeseeks`](/lib/premailex/html_parser/meeseeks.ex) module. To use it, add the following to `config.exs`: +Premailex supports multiple HTML parsers and will automatically select one based on what's available: + +1. [`Floki`](https://github.com/philss/floki) (default if available) +2. [`LazyHTML`](https://github.com/wojtekmach/lazy_html) +3. 
[`Meeseeks`](https://github.com/mischov/meeseeks) + +**At least one parser dependency must be added to your `mix.exs`:** + +```elixir +def deps do + [ + {:premailex, "~> 0.3.20"}, + # Add at least one of these: + {:floki, "~> 0.19"}, # Recommended default + # {:meeseeks, "~> 0.11"}, # Alternative option + # {:lazy_html, "~> 0.1.8"}, # Alternative option + ] +end +``` + +To explicitly configure which parser to use, add to your `config.exs`: ```elixir config :premailex, html_parser: Premailex.HTMLParser.Meeseeks +# or +config :premailex, html_parser: Premailex.HTMLParser.LazyHTML ``` ## LICENSE diff --git a/lib/premailex/html_parser.ex b/lib/premailex/html_parser.ex index 3391430..04a0fcd 100644 --- a/lib/premailex/html_parser.ex +++ b/lib/premailex/html_parser.ex @@ -1,9 +1,14 @@ defmodule Premailex.HTMLParser do @moduledoc """ Module that provide HTML parsing API using an underlying HTML parser library. - """ - @default_parser Premailex.HTMLParser.Floki + By default, premailex will try to use Floki, then LazyHTML, then Meeseeks + (in that order) based on what's available. + + You can explicitly configure which parser to use in your config: + + config :premailex, html_parser: Premailex.HTMLParser.LazyHTML + """ @type html_tree :: tuple() | list() @type selector :: binary() @@ -23,7 +28,39 @@ defmodule Premailex.HTMLParser do {"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Title"]}]}]} """ @spec parse(binary()) :: html_tree() - def parse(html), do: parser().parse(html) + def parse(html), do: html_parser().parse(html) + + defp html_parser do + case Application.get_env(:premailex, :html_parser) || default_html_parser!() do + mod when is_atom(mod) -> mod + other -> raise "Invalid html_parser, got: #{inspect(other)}" + end + end + + defp default_html_parser! 
do + cond do + Code.ensure_loaded?(Floki) -> + Premailex.HTMLParser.Floki + + Code.ensure_loaded?(LazyHTML) -> + Premailex.HTMLParser.LazyHTML + + Code.ensure_loaded?(Meeseeks) -> + Premailex.HTMLParser.Meeseeks + + true -> + raise """ + No HTML parser is available. Please add at least one of the following dependencies to your mix.exs: + + - {:floki, "~> 0.19"} + - {:meeseeks, "~> 0.11"} + - {:lazy_html, "~> 0.1.8"} + + Or explicitly configure a parser: + config :premailex, html_parser: Premailex.HTMLParser.Floki + """ + end + end @doc """ Searches an HTML tree for the selector. @@ -34,7 +71,7 @@ defmodule Premailex.HTMLParser do [{"h1", [], ["Title"]}] """ @spec all(html_tree(), selector()) :: [html_tree()] - def all(tree, selector), do: parser().all(tree, selector) + def all(tree, selector), do: html_parser().all(tree, selector) @doc """ Filters elements matching the selector from the HTML tree. @@ -45,7 +82,7 @@ defmodule Premailex.HTMLParser do [{"html", [], [{"head", [], []}, {"body", [], []}]}] """ @spec filter(html_tree(), selector()) :: [html_tree()] - def filter(tree, selector), do: parser().filter(tree, selector) + def filter(tree, selector), do: html_parser().filter(tree, selector) @doc """ Turns an HTML tree into a string. @@ -56,7 +93,7 @@ defmodule Premailex.HTMLParser do "

<html><head></head><body><h1>Title</h1></body></html>

" """ @spec to_string(html_tree()) :: binary() - def to_string(tree), do: parser().to_string(tree) + def to_string(tree), do: html_parser().to_string(tree) @doc """ Extracts text elements from the HTML tree. @@ -67,9 +104,5 @@ defmodule Premailex.HTMLParser do "Title" """ @spec text(html_tree()) :: binary() - def text(tree), do: parser().text(tree) - - defp parser do - Application.get_env(:premailex, :html_parser, @default_parser) - end + def text(tree), do: html_parser().text(tree) end diff --git a/lib/premailex/html_parser/floki.ex b/lib/premailex/html_parser/floki.ex index 4df1103..848be30 100644 --- a/lib/premailex/html_parser/floki.ex +++ b/lib/premailex/html_parser/floki.ex @@ -1,59 +1,61 @@ -defmodule Premailex.HTMLParser.Floki do - @moduledoc false - alias Premailex.HTMLParser - - @behaviour HTMLParser - - @impl true - @doc false - def parse(html) do - html = retain_inline_whitespace(html) - args = [html] - - "< 0.24.0" - |> floki_version_match?() - |> case do - true -> apply(Floki, :parse, args) - false -> apply(Floki, :parse_document, args) +if Code.ensure_loaded?(Floki) do + defmodule Premailex.HTMLParser.Floki do + @moduledoc false + alias Premailex.HTMLParser + + @behaviour HTMLParser + + @impl true + @doc false + def parse(html) do + html = retain_inline_whitespace(html) + args = [html] + + "< 0.24.0" + |> floki_version_match?() + |> case do + true -> apply(Floki, :parse, args) + false -> apply(Floki, :parse_document, args) + end + |> case do + {:ok, [html]} -> html + {:ok, document} -> document + any -> any + end end - |> case do - {:ok, [html]} -> html - {:ok, document} -> document - any -> any - end - end - defp floki_version_match?(req) do - case :application.get_key(:floki, :vsn) do - {:ok, actual} -> - actual - |> List.to_string() - |> Version.match?(req) + defp floki_version_match?(req) do + case :application.get_key(:floki, :vsn) do + {:ok, actual} -> + actual + |> List.to_string() + |> Version.match?(req) - _any -> - false + _any -> + false + 
end end - end - @impl true - @doc false - def all(tree, selector), do: Floki.find(tree, selector) + @impl true + @doc false + def all(tree, selector), do: Floki.find(tree, selector) - @impl true - @doc false - def filter(tree, selector), do: Floki.filter_out(tree, selector) + @impl true + @doc false + def filter(tree, selector), do: Floki.filter_out(tree, selector) - @impl true - @doc false - def to_string(tree), do: Floki.raw_html(tree) + @impl true + @doc false + def to_string(tree), do: Floki.raw_html(tree) - @impl true - @doc false - def text(tree), do: Floki.text(tree) + @impl true + @doc false + def text(tree), do: Floki.text(tree) - # """ - # This is a tempory fix until mochweb (or floki) has been updated - # to correctly handle whitespace text nodes: https://github.com/mochi/mochiweb/issues/166 - # """ - defp retain_inline_whitespace(html), do: String.replace(html, ~r/\>[ ]+\ <") + # """ + # This is a tempory fix until mochweb (or floki) has been updated + # to correctly handle whitespace text nodes: https://github.com/mochi/mochiweb/issues/166 + # """ + defp retain_inline_whitespace(html), do: String.replace(html, ~r/\>[ ]+\ <") + end end diff --git a/lib/premailex/html_parser/lazy_html.ex b/lib/premailex/html_parser/lazy_html.ex new file mode 100644 index 0000000..fe8e6c3 --- /dev/null +++ b/lib/premailex/html_parser/lazy_html.ex @@ -0,0 +1,110 @@ +if Code.ensure_loaded?(LazyHTML) do + defmodule Premailex.HTMLParser.LazyHTML do + @moduledoc false + + @behaviour Premailex.HTMLParser + + @impl true + @doc false + def parse(html) do + is_document = Regex.match?(~r/ LazyHTML.from_document() + |> LazyHTML.to_tree(skip_whitespace_nodes: true) + else + html + |> LazyHTML.from_fragment() + |> LazyHTML.to_tree(skip_whitespace_nodes: true) + end + |> Enum.reject(&empty_text_node?/1) + + case result do + [html] -> html + html when is_list(html) -> html + html -> html + end + end + + @impl true + @doc false + def all(tree, selector) do + tree + |> to_lazy_html() + |> 
LazyHTML.query(selector) + |> LazyHTML.to_tree(skip_whitespace_nodes: true) + end + + @impl true + @doc false + def filter(tree, selector) do + tree_list = normalize_tree(tree) + + filter_tree(tree_list, selector) + end + + @impl true + @doc false + def to_string(tree) do + tree + |> to_lazy_html() + |> LazyHTML.to_html(skip_whitespace_nodes: true) + end + + @impl true + @doc false + def text(tree) do + tree + |> to_lazy_html() + |> LazyHTML.text() + end + + defp to_lazy_html(tree) when is_list(tree) do + LazyHTML.from_tree(tree) + end + + defp to_lazy_html(tree) when is_tuple(tree) do + LazyHTML.from_tree([tree]) + end + + defp normalize_tree(tree) when is_list(tree), do: tree + defp normalize_tree(tree) when is_tuple(tree), do: [tree] + + defp filter_tree(tree_list, selector) when is_list(tree_list) do + tree_list + |> Enum.map(fn node -> filter_node(node, selector) end) + |> Enum.reject(fn node -> is_nil(node) or empty_text_node?(node) end) + end + + defp empty_text_node?(""), do: true + defp empty_text_node?(text) when is_binary(text), do: String.trim(text) == "" + defp empty_text_node?(_), do: false + + # Element nodes are 3-tuples; matching the shape in the head lets 2-tuple + # `{:comment, _}` nodes reach their own clause instead of raising MatchError. + defp filter_node({tag, attrs, children}, selector) do + node_without_children = {tag, attrs, []} + lazy_html = LazyHTML.from_tree([node_without_children]) + matches = LazyHTML.query(lazy_html, selector) + + node_matches = LazyHTML.to_tree(matches) != [] + + if node_matches do + nil + else + filtered_children = filter_tree(children, selector) + {tag, attrs, filtered_children} + end + end + + defp filter_node(node, _selector) when is_binary(node) do + node + end + + defp filter_node({:comment, _text} = node, _selector) do + node + end + end +end diff --git a/mix.exs b/mix.exs index 3008e8c..83cbb8e 100644 --- a/mix.exs +++ b/mix.exs @@ -17,6 +17,7 @@ defmodule Premailex.Mixfile do Meeseeks, Meeseeks.Document, Meeseeks.Selector.CSS, + LazyHTML, :ssl_verify_hostname ] ], @@ -39,8 +40,9 @@ defmodule Premailex.Mixfile do defp deps 
do [ - {:floki, "~> 0.19"}, + {:floki, "~> 0.19", optional: true}, {:meeseeks, "~> 0.11", optional: true}, + {:lazy_html, "~> 0.1.8", optional: true}, {:certifi, ">= 0.0.0", optional: true}, {:ssl_verify_fun, ">= 0.0.0", optional: true}, diff --git a/mix.lock b/mix.lock index 5d7bc58..4e897cf 100644 --- a/mix.lock +++ b/mix.lock @@ -1,15 +1,19 @@ %{ "bunt": {:hex, :bunt, "1.0.0", "081c2c665f086849e6d57900292b3a161727ab40431219529f13c4ddcf3e7a44", [:mix], [], "hexpm", "dc5f86aa08a5f6fa6b8096f0735c4e76d54ae5c9fa2c143e5a1fc7c1cd9bb6b5"}, "castore": {:hex, :castore, "1.0.18", "5e43ef0ec7d31195dfa5a65a86e6131db999d074179d2ba5a8de11fe14570f55", [:mix], [], "hexpm", "f393e4fe6317829b158fb74d86eb681f737d2fe326aa61ccf6293c4104957e34"}, + "cc_precompiler": {:hex, :cc_precompiler, "0.1.11", "8c844d0b9fb98a3edea067f94f616b3f6b29b959b6b3bf25fee94ffe34364768", [:mix], [{:elixir_make, "~> 0.7", [hex: :elixir_make, repo: "hexpm", optional: false]}], "hexpm", "3427232caf0835f94680e5bcf082408a70b48ad68a5f5c0b02a3bea9f3a075b9"}, "certifi": {:hex, :certifi, "2.16.0", "a4edfc1d2da3424d478a3271133bf28e0ec5e6fd8c009aab5a4ae980cb165ce9", [:rebar3], [], "hexpm", "8a64f6669d85e9cc0e5086fcf29a5b13de57a13efa23d3582874b9a19303f184"}, "credo": {:hex, :credo, "1.7.17", "f92b6aa5b26301eaa5a35e4d48ebf5aa1e7094ac00ae38f87086c562caf8a22f", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "1eb5645c835f0b6c9b5410f94b5a185057bcf6d62a9c2b476da971cde8749645"}, "dialyxir": {:hex, :dialyxir, "1.4.7", "dda948fcee52962e4b6c5b4b16b2d8fa7d50d8645bbae8b8685c3f9ecb7f5f4d", [:mix], [{:erlex, ">= 0.2.8", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "b34527202e6eb8cee198efec110996c25c5898f43a4094df157f8d28f27d9efe"}, "earmark_parser": {:hex, :earmark_parser, "1.4.44", 
"f20830dd6b5c77afe2b063777ddbbff09f9759396500cdbe7523efd58d7a339c", [:mix], [], "hexpm", "4778ac752b4701a5599215f7030989c989ffdc4f6df457c5f36938cc2d2a2750"}, + "elixir_make": {:hex, :elixir_make, "0.9.0", "6484b3cd8c0cee58f09f05ecaf1a140a8c97670671a6a0e7ab4dc326c3109726", [:mix], [], "hexpm", "db23d4fd8b757462ad02f8aa73431a426fe6671c80b200d9710caf3d1dd0ffdb"}, "erlex": {:hex, :erlex, "0.2.8", "cd8116f20f3c0afe376d1e8d1f0ae2452337729f68be016ea544a72f767d9c12", [:mix], [], "hexpm", "9d66ff9fedf69e49dc3fd12831e12a8a37b76f8651dd21cd45fcf5561a8a7590"}, "ex_doc": {:hex, :ex_doc, "0.40.1", "67542e4b6dde74811cfd580e2c0149b78010fd13001fda7cfeb2b2c2ffb1344d", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "bcef0e2d360d93ac19f01a85d58f91752d930c0a30e2681145feea6bd3516e00"}, "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"}, + "fine": {:hex, :fine, "0.1.4", "b19a89c1476c7c57afb5f9314aed5960b5bc95d5277de4cb5ee8e1d1616ce379", [:mix], [], "hexpm", "be3324cc454a42d80951cf6023b9954e9ff27c6daa255483b3e8d608670303f5"}, "floki": {:hex, :floki, "0.38.1", "f002ccac94b3bcb21d40d9b34cc2cc9fd88a8311879120330075b5dde657ebee", [:mix], [], "hexpm", "e744bf0db7ee34b2c8b62767f04071107af0516a81144b9a2f73fe0494200e5b"}, "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", 
"c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, + "lazy_html": {:hex, :lazy_html, "0.1.10", "ffe42a0b4e70859cf21a33e12a251e0c76c1dff76391609bd56702a0ef5bc429", [:make, :mix], [{:cc_precompiler, "~> 0.1", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.9.0", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:fine, "~> 0.1.0", [hex: :fine, repo: "hexpm", optional: false]}], "hexpm", "50f67e5faa09d45a99c1ddf3fac004f051997877dc8974c5797bb5ccd8e27058"}, "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, "makeup_erlang": {:hex, :makeup_erlang, "1.0.3", "4252d5d4098da7415c390e847c814bad3764c94a814a0b4245176215615e1035", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "953297c02582a33411ac6208f2c6e55f0e870df7f80da724ed613f10e6706afd"}, diff --git a/test/premailex/html_parser/lazy_html_test.exs b/test/premailex/html_parser/lazy_html_test.exs new file mode 100644 index 0000000..7952fdb --- /dev/null +++ b/test/premailex/html_parser/lazy_html_test.exs @@ -0,0 +1,4 @@ +defmodule Premailex.HTMLParser.LazyHTMLTest do + use ExUnit.Case + doctest Premailex.HTMLParser.LazyHTML +end diff --git a/test/test_helper.exs b/test/test_helper.exs index 91e6ac7..e603e6e 100644 --- a/test/test_helper.exs +++ b/test/test_helper.exs @@ -1,4 +1,8 @@ ExUnit.start() -if System.get_env("HTML_PARSER") == "meeseeks", - do: 
Application.put_env(:premailex, :html_parser, Premailex.HTMLParser.Meeseeks) +html_parser = System.get_env("HTML_PARSER", "Floki") +html_parser = Module.concat(Premailex.HTMLParser, html_parser) + +Application.put_env(:premailex, :html_parser, html_parser) + +IO.puts("Testing with #{inspect(html_parser)}")