Commit f472d7c1 authored by rinpatch's avatar rinpatch

fast_html 2.0

parent 04d32126
Pipeline #29267 failed with stage
in 1 minute and 42 seconds
image: elixir:1.7
image: elixir:1.7-alpine
variables:
MIX_ENV: test
......@@ -15,6 +15,7 @@ stages:
- publish
before_script:
- apk add build-base cmake
- mix local.hex --force
- mix local.rebar --force
- mix deps.get --only test
......@@ -30,3 +31,8 @@ unit-testing:
coverage: '/(\d+\.\d+\%) \| Total/'
script:
- mix test --trace --preload-modules --cover
dialyzer:
stage: test
script:
- mix dialyzer
[submodule "c_src/myhtml"]
path = c_src/myhtml
url = https://github.com/lexborisov/myhtml.git
[submodule "c_src/lexbor"]
path = c_src/lexbor
url = https://github.com/lexbor/lexbor
......@@ -3,6 +3,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [2.0.0] - ???
### Changed
- **Breaking:** CMake is now required at compile-time due to it being lexbor's build system
- **Breaking:** namespaces are no longer automatically prepended to tag names, e.g. "<svg> </svg>" will be `{"svg", [], []}` instead of `{"svg:svg", [], []}`
- **Breaking:** when using `:nil_self_closing` flag, only valid [void elements](https://html.spec.whatwg.org/#void-elements) will have `nil` in children
- The now-deprecated myhtml was replaced with [lexbor](https://github.com/lexbor/lexbor)
- The worker process now communicates with the node via stdio, instead of TCP, which was known to cause issues
on BSD systems
### Added
- `FastHtml.Pool` for fast_html workers. There is a default pool of `System.schedulers_online/0` workers, but a custom pool can be started if desired, or it can be disabled altogether. See `FastHtml.Pool` module documentation for more info
## [1.0.3] - 2020-02-10
### Fixed
- C-Node not respawning after being killed.
......
MIX = mix
CMAKE = cmake
CNODE_CFLAGS = -g -O2 -std=c99 -pedantic -Wcomment -Wextra -Wno-old-style-declaration -Wall
# ignore unused parameter warnings
......@@ -11,9 +12,9 @@ CNODE_CFLAGS += -I$(ERLANG_PATH)/include
# expecting myhtml as a submodule in c_src/
# that way we can pin a version and package the whole thing in hex
# hex does not allow for non-app related dependencies.
MYHTML_PATH = c_src/myhtml
MYHTML_STATIC = $(MYHTML_PATH)/lib/libmyhtml_static.a
CNODE_CFLAGS += -I$(MYHTML_PATH)/include
LXB_PATH = c_src/lexbor
LXB_STATIC = $(LXB_PATH)/liblexbor_static.a
CNODE_CFLAGS += -I$(LXB_PATH)/source
# avoid undefined reference errors to pthread_mutex_trylock
CNODE_CFLAGS += -lpthread
......@@ -34,17 +35,19 @@ CNODE_LDFLAGS += -lei -pthread
.PHONY: all
all: priv/myhtml_worker
all: priv/fasthtml_worker
$(MYHTML_STATIC): $(MYHTML_PATH)
$(MAKE) -C $(MYHTML_PATH) library MyCORE_BUILD_WITHOUT_THREADS=YES
$(LXB_STATIC): $(LXB_PATH)
# Sadly, building components separately seems to fail sporadically
cd $(LXB_PATH); cmake -DLEXBOR_BUILD_SEPARATELY=OFF -DLEXBOR_BUILD_SHARED=OFF
$(MAKE) -C $(LXB_PATH)
priv/myhtml_worker: c_src/myhtml_worker.c $(MYHTML_STATIC)
$(CC) -o $@ $< $(MYHTML_STATIC) $(CNODE_CFLAGS) $(CNODE_LDFLAGS)
priv/fasthtml_worker: c_src/fasthtml_worker.c $(LXB_STATIC)
$(CC) -o $@ $< $(LXB_STATIC) $(CNODE_CFLAGS) $(CNODE_LDFLAGS)
clean: clean-myhtml
$(RM) -r priv/myhtmlex*
$(RM) priv/myhtml_worker
$(RM) priv/fasthtml_worker
$(RM) myhtmlex-*.tar
$(RM) -r package-test
......
......@@ -3,31 +3,25 @@
A C Node wrapping lexborisov's [myhtml](https://github.com/lexborisov/myhtml).
Primarily used with [FastSanitize](https://git.pleroma.social/pleroma/fast_sanitize).
* Available as a hex package: `{:fast_html, "~> 1.0"}`
* Available as a hex package: `{:fast_html, "~> 2.0"}`
* [Documentation](https://hexdocs.pm/fast_html/fast_html.html)
## Benchmarks
The following table provides median times it takes to decode a string to a tree for html parsers that can be used from Elixir. Benchmarks were conducted on a machine with `Intel Core i7-3520M @ 2.90GHz` CPU and 16GB of RAM. The `mix fast_html.bench` task can be used for running the benchmark by yourself.
The following table provides the median time it takes to decode a string to a tree for HTML parsers that can be used from Elixir. Benchmarks were conducted on a machine with an `AMD Ryzen 9 3950X (32) @ 3.500GHz` CPU and 32GB of RAM. The `mix fast_html.bench` task can be used to run the benchmark yourself.
| File/Parser | fast_html (C-Node) | mochiweb_html (erlang) | html5ever (Rust NIF) | Myhtmlex (NIF)¹ |
| File/Parser | fast_html (Port) | mochiweb_html (erlang) | html5ever (Rust NIF) | Myhtmlex (NIF)¹ |
|----------------------|--------------------|------------------------|----------------------|----------------|
| document-large.html | 178.13 ms | 3471.70 ms | 799.20 ms | 402.64 ms |
| document-medium.html | 2.85 ms | 26.58 ms | 9.06 ms | 3.72 ms |
| document-small.html | 1.08 ms | 5.45 ms | 2.10 ms | 1.24 ms |
| fragment-large.html | 1.50 ms | 10.91 ms | 6.03 ms | 1.91 ms |
| fragment-small.html² | 434.64 μs | 83.02 μs | 57.97 μs | 311.39 μs |
| document-large.html (6.9M) | 125.12 ms | 1778.34 ms | 395.21 ms | 327.17 ms |
| document-medium.html (85K) | 1.93 ms | 12.10 ms | 4.74 ms | 3.82 ms |
| document-small.html (25K)| 0.50 ms | 2.76 ms | 1.72 ms | 1.19 ms |
| fragment-large.html (33K)| 0.93 ms | 4.78 ms | 2.34 ms | 2.15 ms |
| fragment-small.html² (757B)| 44.60 μs | 42.13 μs | 43.58 μs | 289.71 μs |
1. Myhtmlex has a C-Node mode as well, but it wasn't benchmarked here because it segfaults on `document-large.html`
2. The slowdown on `fragment-small.html` is due to C-Node overhead. Unlike html5ever and Myhtmlex in NIF mode, `fast_html` has the parser process isolated and communicates with it over the network, so even if a fatal crash in the parser happens, it won't bring down the entire VM.
Full benchmark output can be seen in [this snippet](https://git.pleroma.social/pleroma/elixir-libraries/fast_html/snippets/3128)
## Note about running with [Swarm](https://github.com/bitwalker/swarm)
Since the myhtml worker runs as a separate node, Swarm will try to sync with it. Of course it will fail since it's not a real Erlang node. To prevent it from doing that, you can add the following to your configuration:
```elixir
config :swarm, node_blacklist: [~r/myhtml_.*$/]
```
1. Myhtmlex has a C-Node mode, but it wasn't benchmarked here because it segfaults on `document-large.html`
2. The slowdown on `fragment-small.html` is due to Port overhead. Unlike html5ever and Myhtmlex in NIF mode, `fast_html` has the parser process isolated and communicates with it over stdio, so even if a fatal crash in the parser happens, it won't bring down the entire VM.
## Contribution / Bug Reports
......
Subproject commit 7cfc4b48aa8ffba251c249eb343aad94d16f9a59
Subproject commit fe2cf577570666d058a2b7167c26d3384a758e19
......@@ -4,13 +4,13 @@
#define GROW_BY 30
typedef struct {
myhtml_tree_node_t **data;
lxb_dom_node_t **data;
size_t used;
size_t size;
} tstack;
void tstack_init(tstack *stack, size_t initial_size) {
stack->data = (myhtml_tree_node_t **) malloc(initial_size * sizeof(myhtml_tree_node_t *));
stack->data = (lxb_dom_node_t **) malloc(initial_size * sizeof(lxb_dom_node_t *));
stack->used = 0;
stack->size = initial_size;
}
......@@ -20,18 +20,18 @@ void tstack_free(tstack *stack) {
}
void tstack_resize(tstack *stack, size_t new_size) {
stack->data = (myhtml_tree_node_t **) realloc(stack->data, new_size * sizeof(myhtml_tree_node_t *));
stack->data = (lxb_dom_node_t **) realloc(stack->data, new_size * sizeof(lxb_dom_node_t *));
stack->size = new_size;
}
void tstack_push(tstack *stack, myhtml_tree_node_t * element) {
void tstack_push(tstack *stack, lxb_dom_node_t * element) {
if(stack->used == stack->size) {
tstack_resize(stack, stack->size + GROW_BY);
}
stack->data[stack->used++] = element;
}
myhtml_tree_node_t* tstack_pop(tstack *stack) {
lxb_dom_node_t * tstack_pop(tstack *stack) {
return stack->data[--(stack->used)];
}
......
......@@ -19,13 +19,14 @@ defmodule :fast_html do
Returns a tree representation from the given html string.
`opts` is a keyword list of options, the options available:
* `timeout` - Call timeout
* `format` - Format flags for the tree
* `timeout` - Call timeout. If pooling is used and the worker doesn't return
the result in time, the worker will be killed with a warning.
* `format` - Format flags for the tree.
The following format flags are available:
* `:html_atoms` uses atoms for known html tags (faster), binaries for everything else.
* `:nil_self_closing` uses `nil` to designate self-closing tags and void elements.
* `:nil_self_closing` uses `nil` to designate void elements.
For example `<br>` is then being represented like `{"br", [], nil}`.
See http://w3c.github.io/html-reference/syntax.html#void-elements for a full list of void elements.
* `:comment_tuple3` uses 3-tuple elements for comments, instead of the default 2-tuple element.
......@@ -61,7 +62,7 @@ defmodule :fast_html do
iex> :fast_html.decode(html, format: [:html_atoms, :nil_self_closing, :comment_tuple3])
{:ok, [{:html, [],
[{:head, [], []},
{:body, [], [{:comment, [], " a comment "}, {"unknown", [], nil}]}]}]}
{:body, [], [{:comment, [], " a comment "}, {"unknown", [], []}]}]}]}
"""
@spec decode(String.t(), format: [format_flag()]) ::
......@@ -69,7 +70,8 @@ defmodule :fast_html do
def decode(bin, opts \\ []) do
flags = Keyword.get(opts, :format, [])
timeout = Keyword.get(opts, :timeout, 10000)
FastHtml.Cnode.call({:decode, bin, flags}, timeout)
find_and_use_port({:decode, bin, flags}, timeout, opts)
end
@doc """
......@@ -77,22 +79,69 @@ defmodule :fast_html do
`opts` is a keyword list of options. The available options are the same as in `decode/2`, with the addition of:
* `context` - Name of the context element, defaults to `div`
* `format` - Format flags for the tree
Example:
iex> :fast_html.decode_fragment("rin is the <i>best</i> girl")
{:ok, [{"html", [], ["rin is the ", {"i", [], ["best"]}, " girl"]}]}
{:ok, ["rin is the ", {"i", [], ["best"]}, " girl"]}
iex> :fast_html.decode_fragment("rin is the <i>best</i> girl", context: "title")
{:ok, [{"html", [], ["rin is the <i>best</i> girl"]}]}
{:ok, ["rin is the <i>best</i> girl"]}
iex> :fast_html.decode_fragment("rin is the <i>best</i> girl", context: "objective_truth")
{:error, :unknown_context_tag}
iex> :fast_html.decode_fragment("rin is the <i>best</i> girl", format: [:html_atoms])
{:ok, [{:html, [], ["rin is the ", {:i, [], ["best"]}, " girl"]}]}
{:ok, ["rin is the ", {:i, [], ["best"]}, " girl"]}
"""
def decode_fragment(bin, opts \\ []) do
flags = Keyword.get(opts, :format, [])
timeout = Keyword.get(opts, :timeout, 10000)
context = Keyword.get(opts, :context, "div")
FastHtml.Cnode.call({:decode_fragment, bin, flags, context}, timeout)
find_and_use_port({:decode_fragment, bin, flags, context}, timeout, opts)
end
@default_pool FastHtml.Pool

# Serializes `term_command`, sends it to a worker port and decodes the reply.
#
# Port selection:
#   * the `:pool` entry of `opts`, when it is set to a truthy value;
#   * otherwise the default pool, unless the `:fast_html, :pool` application
#     environment has a falsy `:enabled` value;
#   * otherwise a one-off port that is opened for this call and closed afterwards.
#
# Returns the term sent back by the worker, or `{:error, :timeout}` when no
# reply arrives within `timeout` milliseconds.
defp find_and_use_port(term_command, timeout, opts) do
  payload = :erlang.term_to_binary(term_command)

  # `||` keeps the application environment lookup lazy: it only happens when
  # the caller did not pick a pool explicitly.
  pool =
    Keyword.get(opts, :pool) ||
      if(Application.get_env(:fast_html, :pool, enabled: true)[:enabled], do: @default_pool)

  # One request/response round trip over an already opened port.
  exchange = fn port ->
    send(port, {self(), {:command, payload}})

    receive do
      {^port, {:data, reply}} -> {:ok, reply}
    after
      timeout -> {:error, :timeout}
    end
  end

  outcome =
    case pool do
      nil ->
        one_off_port = open_port()
        reply = exchange.(one_off_port)
        Port.close(one_off_port)
        reply

      _ ->
        FastHtml.Pool.get_port(pool, exchange)
    end

  case outcome do
    {:error, _} = failure -> failure
    {:ok, encoded} -> :erlang.binary_to_term(encoded)
  end
end
@doc """
Opens a port to the `fasthtml_worker` executable located in the `priv`
directory of the `:fast_html` application.

The port is opened in binary mode, talks over stdio with 4-byte
length-prefixed packets and reports the exit status of the worker.
"""
def open_port do
  worker_path = Path.join([:code.priv_dir(:fast_html), "fasthtml_worker"])
  port_settings = [:binary, {:packet, 4}, :use_stdio, :exit_status]

  Port.open({:spawn_executable, worker_path}, port_settings)
end
end
......@@ -3,54 +3,13 @@ defmodule FastHtml.Application do
use Application
# Returns 4 cryptographically strong random bytes encoded as 8 lowercase hex characters.
def random_sname do
  4
  |> :crypto.strong_rand_bytes()
  |> Base.encode16(case: :lower)
end
def start(_type, _args) do
case maybe_setup_node() do
{:error, message} -> raise message
_ -> :ok
end
default_pool_config = Application.get_env(:fast_html, :pool, enabled: true)
children = if default_pool_config[:enabled], do: [FastHtml.Pool], else: []
Supervisor.start_link([{FastHtml.Cnode, Application.get_env(:fast_html, :cnode, [])}],
Supervisor.start_link(children,
strategy: :one_for_one,
name: FastHtml.Supervisor
)
end
# Makes sure the VM is a distributed node.
#
# Returns `:ok` when the node is already alive. Otherwise locates epmd,
# starts it as a daemon and starts the local node, returning the
# `{:ok, pid}` tuple from `start_node/0` or the first `{:error, _}` hit on the way.
defp maybe_setup_node() do
  if Node.alive?() do
    :ok
  else
    with {:ok, epmd} <- find_epmd(),
         :ok <- start_epmd(epmd),
         {:ok, _pid} = started <- start_node() do
      started
    else
      {:error, _} = failure -> failure
    end
  end
end
# Looks up the `epmd` executable on the PATH.
# Returns `{:ok, path}` or `{:error, message}` when it cannot be found.
defp find_epmd() do
  if epmd = System.find_executable("epmd") do
    {:ok, epmd}
  else
    {:error,
     "Could not find epmd executable. Please ensure the location it's in is present in your PATH or start epmd manually beforehand"}
  end
end
# Runs the epmd executable at `path` with `-daemon`.
# Returns `:ok` on exit code 0, otherwise `{:error, message}` carrying the exit code.
defp start_epmd(path) do
  {_output, exit_code} = System.cmd(path, ["-daemon"])

  case exit_code do
    0 -> :ok
    _ -> {:error, "Could not start epmd, exit code: #{exit_code}"}
  end
end
# Starts the local node as `master_<random hex>@127.0.0.1` and returns
# the result of `Node.start/1`.
defp start_node() do
  node_name = String.to_atom("master_" <> random_sname() <> "@127.0.0.1")
  Node.start(node_name)
end
end
defmodule FastHtml.Cnode do
  @moduledoc """
  Manages myhtml c-node.

  On start the server first tries to connect to an already running c-node
  with the configured name. When that fails it spawns the `myhtml_worker`
  executable, waits for it to report readiness and connects to it.

  ## Configuration

  ```elixir
  config :fast_html, :cnode,
    sname: "myhtml_worker", # Defaults to myhtml_<random bytes>
    spawn_inactive_timeout: 5000 # Defaults to 10000
  ```
  """

  @spawn_inactive_timeout 10000

  application = Mix.Project.config()[:app]

  use GenServer
  require Logger

  @doc false
  def start_link(args) do
    GenServer.start_link(__MODULE__, args, name: __MODULE__)
  end

  @doc false
  @impl true
  def init(args) do
    exec_path = Path.join(:code.priv_dir(unquote(application)), "myhtml_worker")
    sname = Keyword.get_lazy(args, :sname, &default_sname/0)
    hostname = master_hostname()
    addr = :"#{sname}@#{hostname}"
    spawn_inactive_timeout = Keyword.get(args, :spawn_inactive_timeout, @spawn_inactive_timeout)

    state = %{
      exec_path: exec_path,
      sname: sname,
      addr: addr,
      hostname: hostname,
      spawn_inactive_timeout: spawn_inactive_timeout
    }

    connect_or_spawn_cnode(state)
  end

  defp default_sname, do: "myhtml_#{FastHtml.Application.random_sname()}"
  defp master_sname, do: Node.self() |> to_string |> String.split("@") |> List.first()
  defp master_hostname, do: Node.self() |> to_string |> String.split("@") |> List.last()

  # Returns a valid `init/1` result in both branches. The already-running
  # branch must pass the `{:ok, state}` tuple through untouched: returning the
  # bare state map would make the GenServer fail to start with a bad return value.
  defp connect_or_spawn_cnode(state) do
    case connect_cnode(state) do
      {:stop, _} -> spawn_cnode(state)
      {:ok, _state} = connected -> connected
    end
  end

  # Connects to the c-node at `addr` and starts monitoring it.
  # Returns `{:ok, state}` or `{:stop, :cnode_connection_fail}`.
  defp connect_cnode(%{addr: addr} = state) do
    if Node.connect(addr) do
      Logger.debug("connected to #{addr}")
      Node.monitor(addr, true)
      {:ok, state}
    else
      Logger.debug("connecting to #{addr} failed")
      {:stop, :cnode_connection_fail}
    end
  end

  # Spawns the worker executable, records its OS pid in the state (used by
  # `terminate/2` to kill it) and waits for it to become ready.
  defp spawn_cnode(%{exec_path: exec_path, sname: sname, hostname: hostname} = state) do
    Logger.debug("Spawning #{sname}@#{hostname}")

    cookie = :erlang.get_cookie()

    port =
      Port.open({:spawn_executable, exec_path}, [
        :binary,
        :exit_status,
        :stderr_to_stdout,
        line: 4096,
        args: [sname, hostname, cookie, master_sname()]
      ])

    pid = Keyword.get(Port.info(port), :os_pid)
    state = Map.put(state, :pid, pid)
    await_cnode_ready(port, state)
  end

  # Reads the worker's output line by line until it prints "<addr> ready",
  # then connects to it. Gives up when the worker exits or when no message
  # arrives for `spawn_inactive_timeout` milliseconds.
  defp await_cnode_ready(
         port,
         %{spawn_inactive_timeout: timeout, addr: addr} = state
       ) do
    ready_line = to_string(addr) <> " ready"

    receive do
      {^port, {:data, {:eol, ^ready_line}}} ->
        connect_cnode(state)

      {^port, {:data, {:eol, line}}} ->
        Logger.debug("c-node is saying: #{line}")
        await_cnode_ready(port, state)

      {^port, {:exit_status, exit_status}} ->
        Logger.debug("unexpected c-node exit: #{exit_status}")
        {:stop, :cnode_unexpected_exit}

      # Any other message is logged and dropped, then waiting resumes.
      message ->
        Logger.warn("unhandled message while waiting for cnode to be ready:\n#{inspect(message)}")
        await_cnode_ready(port, state)
    after
      timeout ->
        {:stop, :spawn_inactive_timeout}
    end
  end

  @doc false
  @impl true
  def handle_info({:nodedown, _cnode}, state) do
    {:stop, :nodedown, state}
  end

  def handle_info(msg, state) do
    Logger.warn("unhandled handle_info: #{inspect(msg)}")
    {:noreply, state}
  end

  @doc false
  @impl true
  def handle_call(:addr, _from, %{addr: addr} = state) do
    {:reply, addr, state}
  end

  @doc false
  @impl true
  def terminate(_reason, %{pid: pid}) when pid != nil do
    System.cmd("kill", ["-9", to_string(pid)])
    :normal
  end

  # The state has no OS pid when the server connected to an already running
  # c-node instead of spawning one; there is nothing to kill in that case.
  def terminate(_reason, _state), do: :ok

  @doc """
  Call into myhtml cnode.

  Sends `msg` to the c-node and waits for a `{:myhtml_worker, result}` reply.
  Returns `result`, or `{:error, :timeout}` when no reply arrives within
  `timeout` milliseconds.
  """
  def call(msg, timeout \\ 10000) do
    node = GenServer.call(__MODULE__, :addr)
    send({nil, node}, msg)

    receive do
      {:myhtml_worker, res} -> res
    after
      timeout -> {:error, :timeout}
    end
  end
end
defmodule FastHtml.Pool do
  @moduledoc """
  A `NimblePool` of `fasthtml_worker` ports.

  ## Default pool

  Unless disabled, the `:fast_html` application starts a pool registered under
  the name `FastHtml.Pool` with `System.schedulers_online/0` ports. It can be
  turned off with:

  ```elixir
  config :fast_html, :pool, enabled: false
  ```

  ## Custom pools

  A pool can be started manually with `start_link/1` and selected per call by
  passing it in the `:pool` option of the decoding functions:

  ```elixir
  {:ok, _pid} = FastHtml.Pool.start_link(name: :my_pool, size: 4)
  :fast_html.decode("<p>hi</p>", pool: :my_pool)
  ```
  """

  @behaviour NimblePool

  require Logger

  @type option :: {:size, pos_integer()} | {:name, atom()}
  @type pool :: atom() | pid()
  @type result :: {:ok, term()} | {:error, atom()}

  @doc false
  def child_spec(opts) do
    %{
      id: __MODULE__,
      start: {__MODULE__, :start_link, [opts]},
      type: :worker,
      restart: :permanent
    }
  end

  @doc """
  Starts the port pool.

  ### Options
  - `:size` - Number of ports in the pool. Defaults to `System.schedulers_online/0` if not set.
  - `:name` - Registered name of the pool. Defaults to `#{inspect(__MODULE__)}` if not set, set to `false` to not register the process.
  """
  @spec start_link([option()]) :: term()
  def start_link(options) do
    {size, options} = Keyword.pop(options, :size, System.schedulers_online())
    NimblePool.start_link(worker: {__MODULE__, options}, pool_size: size)
  end

  @doc """
  Checks a port out of `pool`, runs `fun` with it and checks it back in.

  `fun` receives the port (already connected to the calling process) and must
  return `{:ok, term}` or `{:error, reason}`; that value is also the return
  value of this function. When `fun` returns `{:error, reason}`, `reason` is
  handed to the pool on check-in, and a `:timeout` reason makes the pool
  remove the port.
  """
  @spec get_port(pool(), (port() -> result())) :: result()
  def get_port(pool, fun) do
    NimblePool.checkout!(pool, :checkout, fn _from, port ->
      result = fun.(port)

      client_state =
        case result do
          {:ok, _} ->
            :ok

          {:error, reason} ->
            reason
        end

      # Hand ownership of the port back to the pool process before checking in.
      send(port, {self(), {:connect, GenServer.whereis(pool)}})

      client_state =
        receive do
          {^port, :connected} -> client_state
          {:EXIT, ^port, reason} -> {:EXIT, reason}
        end

      {result, client_state}
    end)
  end

  @impl NimblePool
  @doc false
  def init_pool(state) do
    {name, options} =
      case Keyword.pop(state, :name) do
        {nil, state} -> {__MODULE__, state}
        {name, state} when is_atom(name) -> {name, state}
        {_, state} -> {nil, state}
      end

    # `false` (and any non-atom name) leaves the pool unregistered.
    if name, do: Process.register(self(), name)
    {:ok, options}
  end

  @impl NimblePool
  @doc false
  def init_worker(pool_state) do
    port = :fast_html.open_port()
    {:ok, port, pool_state}
  end

  @impl NimblePool
  @doc false
  def terminate_worker({:EXIT, reason}, port, pool_state) do
    # Ports do not implement `String.Chars` and the exit reason can be any
    # term, so both have to go through `inspect/1` before interpolation.
    Logger.warn(fn ->
      "[#{__MODULE__}]: Port #{inspect(port)} unexpectedly exited with reason: #{inspect(reason)}"
    end)

    {:ok, pool_state}
  end

  def terminate_worker(_reason, port, pool_state) do
    Port.close(port)
    {:ok, pool_state}
  end

  @impl NimblePool
  @doc false
  def handle_checkout(:checkout, {client_pid, _}, port) do
    # Make the client the owner of the port so it receives the port's replies.
    send(port, {self(), {:connect, client_pid}})

    receive do
      {^port, :connected} -> {:ok, port, port}
      {:EXIT, ^port, reason} -> {:remove, {:EXIT, reason}}
    end
  end

  @impl NimblePool
  @doc false
  def handle_checkin(:timeout, _, _), do: {:remove, :timeout}
  def handle_checkin(_, _, port), do: {:ok, port}

  @impl NimblePool
  @doc false
  def handle_info({:EXIT, port, reason}, port), do: {:remove, {:EXIT, reason}}
  def handle_info({:EXIT, _, _}, port), do: {:ok, port}

  # Port sent data to the pool. This happens when the timeout was reached
  # and the port got disconnected from the client, but not yet killed by the pool.
  # Just discard the message.
  def handle_info({_sending_port, {:data, _}}, port), do: {:ok, port}
end
......@@ -4,11 +4,12 @@ defmodule FastHtml.Mixfile do
def project do
[
app: :fast_html,
version: "1.0.3",
version: "2.0.0",
elixir: "~> 1.5",
deps: deps(),
package: package(),
compilers: [:fast_html_cnode_make] ++ Mix.compilers(),
compilers: [:elixir_make] ++ Mix.compilers(),
make_env: make_env(),
build_embedded: Mix.env() == :prod,
start_permanent: Mix.env() == :prod,
name: "FastHtml",
......@@ -68,9 +69,13 @@ defmodule FastHtml.Mixfile do
{:ex_doc, "~> 0.19", only: :dev},
# benchmarking helpers
{:benchee, "~> 1.0", only: :bench, optional: true},
{:dialyxir, "~> 1.0", only: [:dev], runtime: false},
{:myhtmlex, "~> 0.2.0", only: :bench, runtime: false, optional: true},
{:mochiweb, "~> 2.18", only: :bench, optional: true},
{:html5ever, "~> 0.7.0", only: :bench, optional: true}
{:html5ever,
git: "https://github.com/rusterlium/html5ever_elixir.git", only: :bench, optional: true},
{:nimble_pool, "~> 0.1"},
{:elixir_make, "~> 0.4", runtime: false}
]
end
......@@ -80,24 +85,6 @@ defmodule FastHtml.Mixfile do
extras: ["README.md"]
]
end
end
defmodule Mix.Tasks.Compile.FastHtmlCnodeMake do
@artifacts [
"priv/myhtml_worker"
]
# Picks the make command: the `MAKE` environment variable when set,
# otherwise "gmake" on FreeBSD/OpenBSD/NetBSD/DragonFly and "make" elsewhere.
def find_make do
  case System.get_env("MAKE") do
    nil ->
      case :os.type() do
        {:unix, bsd} when bsd in [:freebsd, :openbsd, :netbsd, :dragonfly] -> "gmake"
        _ -> "make"
      end

    make_from_env ->
      make_from_env
  end
end
defp otp_version do
:erlang.system_info(:otp_release)
......@@ -109,49 +96,14 @@ defmodule Mix.Tasks.Compile.FastHtmlCnodeMake do
otp_version() >= 22
end
def run(_) do