Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
test:
strategy:
matrix:
html_parser: [floki, meeseeks]
html_parser: [Floki, Meeseeks, LazyHTML]
version:
- otp: 28.0
elixir: 1.19
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Requires Elixir 1.14 or higher.

* Fixed compiler warnings in `Premailex.HTMLParser.Meeseeks`
* Fixed invalid spec in `Premailex.HTMLInlineStyles.process/3`
* Added support for `LazyHTML`

## v0.3.20 (2025-01-20)

Expand Down
24 changes: 23 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,32 @@ end

## HTML parser

By default, premailex uses [`Floki`](https://github.com/philss/floki) to parse HTML, but you can exchange it for any HTML parser you prefer. [`Meeseeks`](https://github.com/mischov/meeseeks) is supported with the [`Premailex.HTMLParser.Meeseeks`](/lib/premailex/html_parser/meeseeks.ex) module. To use it, add the following to `config.exs`:
Premailex supports multiple HTML parsers and will automatically select one based on what's available:

1. [`Floki`](https://github.com/philss/floki) (default if available)
2. [`LazyHTML`](https://github.com/wojtekmach/lazy_html)
3. [`Meeseeks`](https://github.com/mischov/meeseeks)

**At least one parser dependency must be added to your `mix.exs`:**

```elixir
def deps do
[
{:premailex, "~> 0.3.20"},
# Add at least one of these:
{:floki, "~> 0.19"}, # Recommended default
# {:meeseeks, "~> 0.11"}, # Alternative option
# {:lazy_html, "~> 0.1.8"}, # Alternative option
]
end
```

To explicitly configure which parser to use, add to your `config.exs`:

```elixir
config :premailex, html_parser: Premailex.HTMLParser.Meeseeks
# or
config :premailex, html_parser: Premailex.HTMLParser.LazyHTML
```

## LICENSE
Expand Down
55 changes: 44 additions & 11 deletions lib/premailex/html_parser.ex
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
defmodule Premailex.HTMLParser do
@moduledoc """
Module that provide HTML parsing API using an underlying HTML parser library.
"""

@default_parser Premailex.HTMLParser.Floki
By default, premailex will try to use Floki, then LazyHTML, then Meeseeks
(in that order) based on what's available.

You can explicitly configure which parser to use in your config:

config :premailex, html_parser: Premailex.HTMLParser.LazyHTML
"""

@type html_tree :: tuple() | list()
@type selector :: binary()
Expand All @@ -23,7 +28,39 @@ defmodule Premailex.HTMLParser do
{"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Title"]}]}]}
"""
@spec parse(binary()) :: html_tree()
def parse(html), do: parser().parse(html)
def parse(html), do: html_parser().parse(html)

# Resolves the module that performs the actual HTML parsing.
#
# An explicitly configured `:html_parser` under the `:premailex` application
# env takes precedence; when it is unset (or falsy) the parser is auto-detected.
# Raises a `RuntimeError` if the resolved value is not an atom.
defp html_parser do
  parser = Application.get_env(:premailex, :html_parser) || default_html_parser!()

  if is_atom(parser) do
    parser
  else
    raise "Invalid html_parser, got: #{inspect(parser)}"
  end
end

# Picks the adapter for the first parsing library that can be loaded.
#
# Raises a `RuntimeError` with installation instructions when none of the
# supported libraries is available.
defp default_html_parser! do
  # Listed in priority order: the first loadable library decides the adapter.
  candidates = [
    {Floki, Premailex.HTMLParser.Floki},
    {LazyHTML, Premailex.HTMLParser.LazyHTML},
    {Meeseeks, Premailex.HTMLParser.Meeseeks}
  ]

  case Enum.find(candidates, fn {library, _adapter} -> Code.ensure_loaded?(library) end) do
    {_library, adapter} ->
      adapter

    nil ->
      raise """
      No HTML parser is available. Please add at least one of the following dependencies to your mix.exs:

      - {:floki, "~> 0.19"}
      - {:meeseeks, "~> 0.11"}
      - {:lazy_html, "~> 0.1.8"}

      Or explicitly configure a parser:
      config :premailex, html_parser: Premailex.HTMLParser.Floki
      """
  end
end

@doc """
Searches an HTML tree for the selector.
Expand All @@ -34,7 +71,7 @@ defmodule Premailex.HTMLParser do
[{"h1", [], ["Title"]}]
"""
@spec all(html_tree(), selector()) :: [html_tree()]
def all(tree, selector), do: parser().all(tree, selector)
def all(tree, selector), do: html_parser().all(tree, selector)

@doc """
Filters elements matching the selector from the HTML tree.
Expand All @@ -45,7 +82,7 @@ defmodule Premailex.HTMLParser do
[{"html", [], [{"head", [], []}, {"body", [], []}]}]
"""
@spec filter(html_tree(), selector()) :: [html_tree()]
def filter(tree, selector), do: parser().filter(tree, selector)
def filter(tree, selector), do: html_parser().filter(tree, selector)

@doc """
Turns an HTML tree into a string.
Expand All @@ -56,7 +93,7 @@ defmodule Premailex.HTMLParser do
"<html><head></head><body><h1>Title</h1></body></html>"
"""
@spec to_string(html_tree()) :: binary()
def to_string(tree), do: parser().to_string(tree)
def to_string(tree), do: html_parser().to_string(tree)

@doc """
Extracts text elements from the HTML tree.
Expand All @@ -67,9 +104,5 @@ defmodule Premailex.HTMLParser do
"Title"
"""
@spec text(html_tree()) :: binary()
def text(tree), do: parser().text(tree)

defp parser do
Application.get_env(:premailex, :html_parser, @default_parser)
end
def text(tree), do: html_parser().text(tree)
end
100 changes: 51 additions & 49 deletions lib/premailex/html_parser/floki.ex
Original file line number Diff line number Diff line change
@@ -1,59 +1,61 @@
defmodule Premailex.HTMLParser.Floki do
@moduledoc false
alias Premailex.HTMLParser

@behaviour HTMLParser

@impl true
@doc false
def parse(html) do
html = retain_inline_whitespace(html)
args = [html]

"< 0.24.0"
|> floki_version_match?()
|> case do
true -> apply(Floki, :parse, args)
false -> apply(Floki, :parse_document, args)
if Code.ensure_loaded?(Floki) do
defmodule Premailex.HTMLParser.Floki do
@moduledoc false
alias Premailex.HTMLParser

@behaviour HTMLParser

@impl true
@doc false
def parse(html) do
html = retain_inline_whitespace(html)
args = [html]

"< 0.24.0"
|> floki_version_match?()
|> case do
true -> apply(Floki, :parse, args)
false -> apply(Floki, :parse_document, args)
end
|> case do
{:ok, [html]} -> html
{:ok, document} -> document
any -> any
end
end
|> case do
{:ok, [html]} -> html
{:ok, document} -> document
any -> any
end
end

defp floki_version_match?(req) do
case :application.get_key(:floki, :vsn) do
{:ok, actual} ->
actual
|> List.to_string()
|> Version.match?(req)
defp floki_version_match?(req) do
case :application.get_key(:floki, :vsn) do
{:ok, actual} ->
actual
|> List.to_string()
|> Version.match?(req)

_any ->
false
_any ->
false
end
end
end

@impl true
@doc false
def all(tree, selector), do: Floki.find(tree, selector)
@impl true
@doc false
def all(tree, selector), do: Floki.find(tree, selector)

@impl true
@doc false
def filter(tree, selector), do: Floki.filter_out(tree, selector)
@impl true
@doc false
def filter(tree, selector), do: Floki.filter_out(tree, selector)

@impl true
@doc false
def to_string(tree), do: Floki.raw_html(tree)
@impl true
@doc false
def to_string(tree), do: Floki.raw_html(tree)

@impl true
@doc false
def text(tree), do: Floki.text(tree)
@impl true
@doc false
def text(tree), do: Floki.text(tree)

# """
# This is a temporary fix until mochiweb (or floki) has been updated
# to correctly handle whitespace text nodes: https://github.com/mochi/mochiweb/issues/166
# """
defp retain_inline_whitespace(html), do: String.replace(html, ~r/\>[ ]+\</, ">&#32;<")
# """
# This is a temporary fix until mochiweb (or floki) has been updated
# to correctly handle whitespace text nodes: https://github.com/mochi/mochiweb/issues/166
# """
defp retain_inline_whitespace(html), do: String.replace(html, ~r/\>[ ]+\</, ">&#32;<")
end
end
110 changes: 110 additions & 0 deletions lib/premailex/html_parser/lazy_html.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
if Code.ensure_loaded?(LazyHTML) do
  defmodule Premailex.HTMLParser.LazyHTML do
    @moduledoc false

    # `Premailex.HTMLParser` implementation backed by the LazyHTML library.
    #
    # Trees are handled as lists of nodes, where a node is one of:
    #   * `{tag, attrs, children}` - an element
    #   * a binary                 - a text node
    #   * `{:comment, text}`       - a comment
    # A single node may also be passed wherever a tree is expected.

    @behaviour Premailex.HTMLParser

    @impl true
    @doc false
    # Parses `html` into a tree. Input that contains `<html` or `<!DOCTYPE`
    # (case-insensitive) is parsed as a full document, anything else as a
    # fragment. A result with exactly one top-level node is unwrapped to
    # that node; otherwise the list of nodes is returned.
    def parse(html) do
      lazy_html =
        if Regex.match?(~r/<html|<!DOCTYPE/i, html) do
          LazyHTML.from_document(html)
        else
          LazyHTML.from_fragment(html)
        end

      lazy_html
      |> LazyHTML.to_tree(skip_whitespace_nodes: true)
      |> Enum.reject(&empty_text_node?/1)
      |> case do
        [node] -> node
        nodes -> nodes
      end
    end

    @impl true
    @doc false
    # Returns the tree of every element matching `selector`.
    def all(tree, selector) do
      tree
      |> to_lazy_html()
      |> LazyHTML.query(selector)
      |> LazyHTML.to_tree(skip_whitespace_nodes: true)
    end

    @impl true
    @doc false
    # Returns the tree (always as a list) with every element matching
    # `selector` removed, together with its children.
    def filter(tree, selector) do
      tree
      |> normalize_tree()
      |> filter_tree(selector)
    end

    @impl true
    @doc false
    # Serializes the tree back into an HTML string.
    def to_string(tree) do
      tree
      |> to_lazy_html()
      |> LazyHTML.to_html(skip_whitespace_nodes: true)
    end

    @impl true
    @doc false
    # Extracts the text content of the tree.
    def text(tree) do
      tree
      |> to_lazy_html()
      |> LazyHTML.text()
    end

    # Builds a LazyHTML struct from either a list of nodes or a single node.
    defp to_lazy_html(tree) when is_list(tree), do: LazyHTML.from_tree(tree)
    defp to_lazy_html(tree) when is_tuple(tree), do: LazyHTML.from_tree([tree])

    # Ensures the tree is a list of nodes.
    defp normalize_tree(tree) when is_list(tree), do: tree
    defp normalize_tree(tree) when is_tuple(tree), do: [tree]

    # Filters every node in the list, dropping removed elements (`nil`)
    # and text nodes that contain only whitespace.
    defp filter_tree(nodes, selector) when is_list(nodes) do
      nodes
      |> Enum.map(&filter_node(&1, selector))
      |> Enum.reject(&(is_nil(&1) or empty_text_node?(&1)))
    end

    # True for text nodes that are empty or consist solely of whitespace.
    defp empty_text_node?(text) when is_binary(text), do: String.trim(text) == ""
    defp empty_text_node?(_node), do: false

    # Returns `nil` when the element itself matches `selector`, otherwise the
    # element with its children filtered recursively.
    #
    # The element shape is matched in the function head rather than with an
    # `is_tuple/1` guard: `{:comment, text}` nodes are tuples too, and a
    # guard-only clause would capture them and then fail to destructure them
    # as a 3-tuple.
    defp filter_node({tag, attrs, children}, selector) do
      # The element is tested without its children so that only the element
      # itself, not one of its descendants, decides whether it is removed.
      matched =
        [{tag, attrs, []}]
        |> LazyHTML.from_tree()
        |> LazyHTML.query(selector)
        |> LazyHTML.to_tree()

      if matched == [] do
        {tag, attrs, filter_tree(children, selector)}
      else
        nil
      end
    end

    # Comments are never matched by a selector and are kept as-is.
    defp filter_node({:comment, _text} = node, _selector), do: node

    # Text nodes are kept as-is (whitespace-only ones are dropped by the caller).
    defp filter_node(node, _selector) when is_binary(node), do: node
  end
end
4 changes: 3 additions & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ defmodule Premailex.Mixfile do
Meeseeks,
Meeseeks.Document,
Meeseeks.Selector.CSS,
LazyHTML,
:ssl_verify_hostname
]
],
Expand All @@ -39,8 +40,9 @@ defmodule Premailex.Mixfile do

defp deps do
[
{:floki, "~> 0.19"},
{:floki, "~> 0.19", optional: true},
{:meeseeks, "~> 0.11", optional: true},
{:lazy_html, "~> 0.1.8", optional: true},
{:certifi, ">= 0.0.0", optional: true},
{:ssl_verify_fun, ">= 0.0.0", optional: true},

Expand Down
Loading
Loading