Commit bdcdd617 authored by kaniini's avatar kaniini

Merge branch 'release-polishing' into 'master'

Release polishing

See merge request pleroma/fast_html!10
parents ec1e3cf1 f1632ca3
Pipeline #19682 passed with stage
in 47 seconds
......@@ -3,32 +3,23 @@
A C Node wrapping lexborisov's [myhtml](https://github.com/lexborisov/myhtml).
Primarily used with [FastSanitize](https://git.pleroma.social/pleroma/fast_sanitize).
* Available as a hex package: `{:fast_html, "~> 0.1.0"}`
* [Documentation](https://hexdocs.pm/fast_html/FastHTML.html)
* Available as a hex package: `{:fast_html, "~> 0.99"}`
* [Documentation](https://hexdocs.pm/fast_html/fast_html.html)
## Example
## Benchmarks
iex> :fast_html.decode("<h1>Hello world</h1>")
{"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]}
The following table shows the median time it takes each HTML parser usable from Elixir to decode a string into a tree. Benchmarks were conducted on a machine with an `Intel Core i7-3520M @ 2.90GHz` CPU and 16GB of RAM. You can run the benchmark yourself with the `mix fast_html.bench` task.
Benchmark results (removed Nif calling mode) on various file sizes on a 2,5Ghz Core i7:
Settings:
duration: 1.0 s
## FileSizesBench
[15:28:42] 1/3: github_trending_js.html 341k
[15:28:46] 2/3: w3c_html5.html 131k
[15:28:48] 3/3: wikipedia_hyperlink.html 97k
Finished in 7.52 seconds
## FileSizesBench
benchmark name iterations average time
wikipedia_hyperlink.html 97k 1000 1385.86 µs/op
w3c_html5.html 131k 1000 2179.30 µs/op
github_trending_js.html 341k 500 5686.21 µs/op
| File/Parser | fast_html (C-Node) | mochiweb_html (erlang) | html5ever (Rust NIF) | Myhtmlex (NIF)¹ |
|----------------------|--------------------|------------------------|----------------------|----------------|
| document-large.html | 178.13 ms | 3471.70 ms | 799.20 ms | 402.64 ms |
| document-medium.html | 2.85 ms | 26.58 ms | 9.06 ms | 3.72 ms |
| document-small.html | 1.08 ms | 5.45 ms | 2.10 ms | 1.24 ms |
| fragment-large.html | 1.50 ms | 10.91 ms | 6.03 ms | 1.91 ms |
| fragment-small.html² | 434.64 μs | 83.02 μs | 57.97 μs | 311.39 μs |
1. Myhtmlex has a C-Node mode as well, but it wasn't benchmarked here because it segfaults on `document-large.html`.
2. The slowdown on `fragment-small.html` is due to C-Node overhead. Unlike html5ever and Myhtmlex in NIF mode, `fast_html` has the parser process isolated and communicates with it over the network, so even if a fatal crash in the parser happens, it won't bring down the entire VM.
## Contribution / Bug Reports
* Please make sure you do `git submodule update` after a checkout/pull
......
defmodule :fast_html do
@moduledoc """
A module to decode html into a tree structure.
Based on [Alexander Borisov's myhtml](https://github.com/lexborisov/myhtml),
this binding gains the properties of being html-spec compliant and very fast.
## Example
iex> :fast_html.decode("<h1>Hello world</h1>")
{"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]}
Benchmark results (removed Nif calling mode) on various file sizes on a 2,5Ghz Core i7:
Settings:
duration: 1.0 s
## FileSizesBench
[15:28:42] 1/3: github_trending_js.html 341k
[15:28:46] 2/3: w3c_html5.html 131k
[15:28:48] 3/3: wikipedia_hyperlink.html 97k
Finished in 7.52 seconds
## FileSizesBench
benchmark name iterations average time
wikipedia_hyperlink.html 97k 1000 1385.86 µs/op
w3c_html5.html 131k 1000 2179.30 µs/op
github_trending_js.html 341k 500 5686.21 µs/op
"""
@type tag() :: String.t() | atom()
......@@ -44,31 +18,11 @@ defmodule :fast_html do
@doc """
Returns a tree representation from the given html string.
## Examples
iex> :fast_html.decode("<h1>Hello world</h1>")
{"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]}
iex> :fast_html.decode("<span class='hello'>Hi there</span>")
{"html", [],
[{"head", [], []},
{"body", [], [{"span", [{"class", "hello"}], ["Hi there"]}]}]}
`opts` is a keyword list of options. The following options are available:
* `:timeout` - Call timeout (defaults to `10000`)
* `:format` - Format flags for the tree
iex> :fast_html.decode("<body><!-- a comment --!></body>")
{"html", [], [{"head", [], []}, {"body", [], [comment: " a comment "]}]}
iex> :fast_html.decode("<br>")
{"html", [], [{"head", [], []}, {"body", [], [{"br", [], []}]}]}
"""
@spec decode(String.t()) :: tree()
def decode(bin) do
decode(bin, format: [])
end
@doc """
Returns a tree representation from the given html string.
This variant allows you to pass in one or more of the following format flags:
The following format flags are available:
* `:html_atoms` uses atoms for known html tags (faster), binaries for everything else.
* `:nil_self_closing` uses `nil` to designate self-closing tags and void elements.
......@@ -77,26 +31,44 @@ defmodule :fast_html do
* `:comment_tuple3` uses 3-tuple elements for comments, instead of the default 2-tuple element.
## Examples
iex> :fast_html.decode("<h1>Hello world</h1>")
{:ok, {"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]}}
iex> :fast_html.decode("Hello world", timeout: 0)
{:error, :timeout}
iex> :fast_html.decode("<span class='hello'>Hi there</span>")
{:ok, {"html", [],
[{"head", [], []},
{"body", [], [{"span", [{"class", "hello"}], ["Hi there"]}]}]}}
iex> :fast_html.decode("<body><!-- a comment --!></body>")
{:ok, {"html", [], [{"head", [], []}, {"body", [], [comment: " a comment "]}]}}
iex> :fast_html.decode("<br>")
{:ok, {"html", [], [{"head", [], []}, {"body", [], [{"br", [], []}]}]}}
iex> :fast_html.decode("<h1>Hello world</h1>", format: [:html_atoms])
{:html, [], [{:head, [], []}, {:body, [], [{:h1, [], ["Hello world"]}]}]}
{:ok, {:html, [], [{:head, [], []}, {:body, [], [{:h1, [], ["Hello world"]}]}]}}
iex> :fast_html.decode("<br>", format: [:nil_self_closing])
{"html", [], [{"head", [], []}, {"body", [], [{"br", [], nil}]}]}
{:ok, {"html", [], [{"head", [], []}, {"body", [], [{"br", [], nil}]}]}}
iex> :fast_html.decode("<body><!-- a comment --!></body>", format: [:comment_tuple3])
{"html", [], [{"head", [], []}, {"body", [], [{:comment, [], " a comment "}]}]}
{:ok, {"html", [], [{"head", [], []}, {"body", [], [{:comment, [], " a comment "}]}]}}
iex> html = "<body><!-- a comment --!><unknown /></body>"
iex> :fast_html.decode(html, format: [:html_atoms, :nil_self_closing, :comment_tuple3])
{:html, [],
{:ok, {:html, [],
[{:head, [], []},
{:body, [], [{:comment, [], " a comment "}, {"unknown", [], nil}]}]}
{:body, [], [{:comment, [], " a comment "}, {"unknown", [], nil}]}]}}
"""
@spec decode(String.t(), format: [format_flag()]) :: tree()
def decode(bin, format: flags) do
{:ok, res} = FastHtml.Cnode.call({:decode, bin, flags})
res
# Decodes `bin` by sending a `{:decode, bin, flags}` request through
# `FastHtml.Cnode.call/2` and returns whatever that call returns.
#
# `opts` is a keyword list; both keys are optional:
#   * `:format`  - list of format flags, defaults to `[]`
#   * `:timeout` - forwarded to `FastHtml.Cnode.call/2`, defaults to `10000`
#
# The spec lists `:timeout` as well as `:format`, since the body reads both
# keys and `decode(html, timeout: 0)` is a documented call.
@spec decode(String.t(), [{:format, [format_flag()]} | {:timeout, non_neg_integer()}]) ::
        {:ok, tree()} | {:error, String.t() | atom()}
def decode(bin, opts \\ []) do
  flags = Keyword.get(opts, :format, [])
  timeout = Keyword.get(opts, :timeout, 10000)
  FastHtml.Cnode.call({:decode, bin, flags}, timeout)
end
end
......@@ -11,7 +11,10 @@ defmodule FastHtml.Application do
_ -> :ok
end
Supervisor.start_link([FastHtml.Cnode], strategy: :one_for_one, name: FastHtml.Supervisor)
Supervisor.start_link([{FastHtml.Cnode, Application.get_env(:fast_html, :cnode, [])}],
strategy: :one_for_one,
name: FastHtml.Supervisor
)
end
defp maybe_setup_node() do
......
defmodule FastHtml.Cnode do
@moduledoc false
@moduledoc """
Manages myhtml c-node.
## Configuration
```elixir
config :fast_html, :cnode,
sname: "myhtml_worker", # Defaults to myhtml_<random bytes>
spawn_inactive_timeout: 5000 # Defaults to 10000
```
"""
@spawn_inactive_timeout 10000
......@@ -8,25 +17,20 @@ defmodule FastHtml.Cnode do
use GenServer
require Logger
@doc false
# Starts the C-Node manager as a GenServer registered under this module's name,
# passing `args` through unchanged to `init/1`.
def start_link(args) do
  registration = [name: __MODULE__]
  GenServer.start_link(__MODULE__, args, registration)
end
@doc false
def init(args) do
args =
if args == [] do
%{}
else
args
end
exec_path = Path.join(:code.priv_dir(unquote(application)), "myhtml_worker")
sname = Map.get_lazy(args, :sname, &default_sname/0)
hostname = Map.get_lazy(args, :hostname, &master_hostname/0)
sname = Keyword.get_lazy(args, :sname, &default_sname/0)
hostname = master_hostname()
addr = :"#{sname}@#{hostname}"
spawn_inactive_timeout = Map.get(args, :spawn_inactive_timeout, @spawn_inactive_timeout)
spawn_inactive_timeout = Keyword.get(args, :spawn_inactive_timeout, @spawn_inactive_timeout)
state = %{
exec_path: exec_path,
......@@ -106,24 +110,29 @@ defmodule FastHtml.Cnode do
end
end
@doc false
# A `:nodedown` message stops this server with reason `:nodedown`; the state is
# handed back unchanged.
def handle_info({:nodedown, _node_name}, state), do: {:stop, :nodedown, state}
@doc false
# Catch-all clause: any message not matched by an earlier clause is logged as a
# warning and otherwise ignored, leaving the state untouched.
def handle_info(unexpected, state) do
  Logger.warn("unhandled handle_info: #{inspect(unexpected)}")

  {:noreply, state}
end
@doc false
# Replies to an `:addr` call with the `:addr` value held in the server state.
def handle_call(:addr, _caller, state) do
  %{addr: cnode_addr} = state
  {:reply, cnode_addr, state}
end
@doc false
# Cleanup on shutdown. When the state carries a non-nil `:pid`, that process is
# force-killed with `kill -9` (presumably the OS pid of the spawned worker --
# confirm against where `:pid` is put into the state).
def terminate(_reason, %{pid: pid}) when pid != nil do
  System.cmd("kill", ["-9", to_string(pid)])
  :normal
end

# Fallback for a state with no `:pid` key or a nil `:pid`: nothing to kill.
# Without this clause the callback raised FunctionClauseError for such states,
# turning an otherwise clean shutdown into a crash report.
def terminate(_reason, _state), do: :ok
@doc "Call into myhtml cnode"
def call(msg, timeout \\ 10000) do
node = GenServer.call(__MODULE__, :addr)
send({nil, node}, msg)
......
......@@ -16,7 +16,10 @@ defmodule Mix.Tasks.FastHtml.Bench do
Benchee.run(
%{
"Decoding" => fn input -> :fast_html.decode(input) end
"fast_html" => fn input -> :fast_html.decode(input) end,
"myhtmlex nif" => fn input -> Myhtmlex.Nif.decode(input) end,
"html5ever nif" => fn input -> Html5ever.parse(input) end,
"mochiweb_html" => fn input -> :mochiweb_html.parse(input) end
},
inputs: inputs,
save: [path: "fast_html.bench"],
......
defmodule FastHTML.Mixfile do
defmodule FastHtml.Mixfile do
use Mix.Project
def project do
[
app: :fast_html,
version: "0.9.2",
version: "0.99.0",
elixir: "~> 1.5",
deps: deps(),
package: package(),
compilers: [:fast_html_cnode_make] ++ Mix.compilers(),
build_embedded: Mix.env() == :prod,
start_permanent: Mix.env() == :prod,
name: "FastHTML",
name: "FastHtml",
description: """
A module to decode HTML into a tree,
porting all properties of the underlying
......@@ -26,7 +26,7 @@ defmodule FastHTML.Mixfile do
def package do
[
maintainers: ["Ariadne Conill"],
maintainers: ["Ariadne Conill", "rinpatch"],
licenses: ["GNU LGPL"],
links: %{
"GitLab" => "https://git.pleroma.social/pleroma/fast_html",
......@@ -60,13 +60,17 @@ defmodule FastHTML.Mixfile do
# documentation helpers
{:ex_doc, "~> 0.19", only: :dev},
# benchmarking helpers
{:benchee, "~> 1.0", only: :dev}
{:benchee, "~> 1.0", only: :dev},
{:myhtmlex, "~> 0.2.0", only: :dev, runtime: false},
{:mochiweb, "~> 2.18", only: :dev},
{:html5ever, "~> 0.7.0", only: :dev}
]
end
defp docs do
[
main: "fast_html"
main: "readme",
extras: ["README.md"]
]
end
end
......@@ -127,7 +131,7 @@ defmodule Mix.Tasks.Compile.FastHtmlCnodeMake do
{:error,
[
%Mix.Task.Compiler.Diagnostic{
compiler_name: "FastHTML Cnode",
compiler_name: "FastHtml Cnode",
message: "Make exited with #{exit_code}",
severity: :error,
file: nil,
......
......@@ -3,9 +3,13 @@
"deep_merge": {:hex, :deep_merge, "1.0.0", "b4aa1a0d1acac393bdf38b2291af38cb1d4a52806cf7a4906f718e1feb5ee961", [:mix], [], "hexpm"},
"earmark": {:hex, :earmark, "1.4.2", "3aa0bd23bc4c61cf2f1e5d752d1bb470560a6f8539974f767a38923bb20e1d7f", [:mix], [], "hexpm"},
"ex_doc": {:hex, :ex_doc, "0.21.2", "caca5bc28ed7b3bdc0b662f8afe2bee1eedb5c3cf7b322feeeb7c6ebbde089d6", [:mix], [{:earmark, "~> 1.3.3 or ~> 1.4", [hex: :earmark, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm"},
"html5ever": {:hex, :html5ever, "0.7.0", "9f63ec1c783b2dc9f326840fcc993c01e926dbdef4e51ba1bbe5355993c258b4", [:mix], [{:rustler, "~> 0.18.0", [hex: :rustler, repo: "hexpm", optional: false]}], "hexpm"},
"makeup": {:hex, :makeup, "1.0.0", "671df94cf5a594b739ce03b0d0316aa64312cee2574b6a44becb83cd90fb05dc", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm"},
"makeup_elixir": {:hex, :makeup_elixir, "0.14.0", "cf8b7c66ad1cff4c14679698d532f0b5d45a3968ffbcbfd590339cb57742f1ae", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm"},
"mochiweb": {:hex, :mochiweb, "2.18.0", "eb55f1db3e6e960fac4e6db4e2db9ec3602cc9f30b86cd1481d56545c3145d2e", [:rebar3], [], "hexpm"},
"myhtml": {:git, "https://github.com/lexborisov/myhtml.git", "fe2cf577570666d058a2b7167c26d3384a758e19", [branch: "master"]},
"myhtmlex": {:hex, :myhtmlex, "0.2.1", "d6f3eb1826f7cdaa0225a996569da0930d1a334405510845c905ae59295ab226", [:make, :mix], [{:nodex, "~> 0.1.1", [hex: :nodex, repo: "hexpm", optional: false]}], "hexpm"},
"nimble_parsec": {:hex, :nimble_parsec, "0.5.1", "c90796ecee0289dbb5ad16d3ad06f957b0cd1199769641c961cfe0b97db190e0", [:mix], [], "hexpm"},
"nodex": {:git, "https://git.pleroma.social/pleroma/nodex", "cb6730f943cfc6aad674c92161be23a8411f15d1", [ref: "cb6730f943cfc6aad674c92161be23a8411f15d1"]},
"nodex": {:hex, :nodex, "0.1.1", "ed2f7bbe19ea62a43ad4b7ad332eb3f9ca12c64a35a5802a0eb545b93ebe32af", [:mix], [], "hexpm"},
"rustler": {:hex, :rustler, "0.18.0", "db4bd0c613d83a1badc31be90ddada6f9821de29e4afd15c53a5da61882e4f2d", [:mix], [], "hexpm"},
}
......@@ -3,138 +3,150 @@ defmodule :fast_html_test do
doctest :fast_html
test "doesn't segfault when <!----> is encountered" do
assert {"html", _attrs, _children} = :fast_html.decode("<div> <!----> </div>")
assert {:ok, {"html", _attrs, _children}} = :fast_html.decode("<div> <!----> </div>")
end
test "builds a tree, formatted like mochiweb by default" do
assert {"html", [],
[
{"head", [], []},
{"body", [],
[
{"br", [], []}
]}
]} = :fast_html.decode("<br>")
assert {:ok,
{"html", [],
[
{"head", [], []},
{"body", [],
[
{"br", [], []}
]}
]}} = :fast_html.decode("<br>")
end
test "builds a tree, html tags as atoms" do
assert {:html, [],
[
{:head, [], []},
{:body, [],
[
{:br, [], []}
]}
]} = :fast_html.decode("<br>", format: [:html_atoms])
assert {:ok,
{:html, [],
[
{:head, [], []},
{:body, [],
[
{:br, [], []}
]}
]}} = :fast_html.decode("<br>", format: [:html_atoms])
end
test "builds a tree, nil self closing" do
assert {"html", [],
[
{"head", [], []},
{"body", [],
[
{"br", [], nil},
{"esi:include", [], nil}
]}
]} = :fast_html.decode("<br><esi:include />", format: [:nil_self_closing])
assert {:ok,
{"html", [],
[
{"head", [], []},
{"body", [],
[
{"br", [], nil},
{"esi:include", [], nil}
]}
]}} = :fast_html.decode("<br><esi:include />", format: [:nil_self_closing])
end
test "builds a tree, multiple format options" do
assert {:html, [],
[
{:head, [], []},
{:body, [],
[
{:br, [], nil}
]}
]} = :fast_html.decode("<br>", format: [:html_atoms, :nil_self_closing])
assert {:ok,
{:html, [],
[
{:head, [], []},
{:body, [],
[
{:br, [], nil}
]}
]}} = :fast_html.decode("<br>", format: [:html_atoms, :nil_self_closing])
end
test "attributes" do
assert {:html, [],
[
{:head, [], []},
{:body, [],
[
{:span, [{"id", "test"}, {"class", "foo garble"}], []}
]}
]} =
assert {:ok,
{:html, [],
[
{:head, [], []},
{:body, [],
[
{:span, [{"id", "test"}, {"class", "foo garble"}], []}
]}
]}} =
:fast_html.decode(~s'<span id="test" class="foo garble"></span>',
format: [:html_atoms]
)
end
test "single attributes" do
assert {:html, [],
[
{:head, [], []},
{:body, [],
[
{:button, [{"disabled", "disabled"}, {"class", "foo garble"}], []}
]}
]} =
assert {:ok,
{:html, [],
[
{:head, [], []},
{:body, [],
[
{:button, [{"disabled", "disabled"}, {"class", "foo garble"}], []}
]}
]}} =
:fast_html.decode(~s'<button disabled class="foo garble"></span>',
format: [:html_atoms]
)
end
test "text nodes" do
assert {:html, [],
[
{:head, [], []},
{:body, [],
[
"text node"
]}
]} = :fast_html.decode(~s'<body>text node</body>', format: [:html_atoms])
assert {:ok,
{:html, [],
[
{:head, [], []},
{:body, [],
[
"text node"
]}
]}} = :fast_html.decode(~s'<body>text node</body>', format: [:html_atoms])
end
test "broken input" do
assert {:html, [],
[
{:head, [], []},
{:body, [],
[
{:a, [{"<", "<"}], [" asdf"]}
]}
]} = :fast_html.decode(~s'<a <> asdf', format: [:html_atoms])
assert {:ok,
{:html, [],
[
{:head, [], []},
{:body, [],
[
{:a, [{"<", "<"}], [" asdf"]}
]}
]}} = :fast_html.decode(~s'<a <> asdf', format: [:html_atoms])
end
test "namespaced tags" do
assert {:html, [],
[
{:head, [], []},
{:body, [],
[
{"svg:svg", [],
[
{"svg:path", [], []},
{"svg:a", [], []}
]}
]}
]} = :fast_html.decode(~s'<svg><path></path><a></a></svg>', format: [:html_atoms])
assert {:ok,
{:html, [],
[
{:head, [], []},
{:body, [],
[
{"svg:svg", [],
[
{"svg:path", [], []},
{"svg:a", [], []}
]}
]}
]}} = :fast_html.decode(~s'<svg><path></path><a></a></svg>', format: [:html_atoms])
end
test "custom namespaced tags" do
assert {:html, [],
[
{:head, [], []},
{:body, [],
[
{"esi:include", [], nil}
]}
]} = :fast_html.decode(~s'<esi:include />', format: [:html_atoms, :nil_self_closing])
assert {:ok,
{:html, [],
[
{:head, [], []},
{:body, [],
[
{"esi:include", [], nil}
]}
]}} =
:fast_html.decode(~s'<esi:include />', format: [:html_atoms, :nil_self_closing])
end
test "html comments" do
assert {:html, [],
[
{:head, [], []},
{:body, [],
[
comment: " a comment "
]}
]} = :fast_html.decode(~s'<body><!-- a comment --></body>', format: [:html_atoms])
assert {:ok,
{:html, [],
[
{:head, [], []},
{:body, [],
[
comment: " a comment "
]}
]}} = :fast_html.decode(~s'<body><!-- a comment --></body>', format: [:html_atoms])
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment