Commit 806b0fd4 authored by Lukas Rieder's avatar Lukas Rieder

package everything within hex

parent 82260777
[submodule "c_src/myhtml"]
path = c_src/myhtml
url = https://github.com/lexborisov/myhtml.git
This diff is collapsed.
......@@ -18,7 +18,11 @@ ERLANG_PATH = $(shell erl -eval 'io:format("~s", [lists:concat([code:root_dir(),
MYHTMLEX_CFLAGS += -I$(ERLANG_PATH)
# expecting myhtml fetched as a mix dependency
MYHTML_PATH = deps/myhtml
ifeq ($(wildcard c_src/myhtml),)
MYHTML_PATH = deps/myhtml
else
MYHTML_PATH = c_src/myhtml
endif
MYHTML_STATIC = $(MYHTML_PATH)/lib/libmyhtml_static.a
MYHTMLEX_CFLAGS += -I$(MYHTML_PATH)/include
......@@ -43,7 +47,7 @@ myhtmlex: priv/myhtmlex.so
deps/myhtml:
$(MIX) deps.get
$(MYHTML_STATIC): deps/myhtml
$(MYHTML_STATIC): $(MYHTML_PATH)
$(MAKE) -C $(MYHTML_PATH) library
priv/myhtmlex.so: src/myhtmlex.c $(MYHTML_STATIC)
......
Subproject commit fe2cf577570666d058a2b7167c26d3384a758e19
defmodule Myhtmlex do
@moduledoc """
A module to decode html into a tree structure.
Based on [Alexander Borisov's myhtml](https://github.com/lexborisov/myhtml),
this binding gains the properties of being html-spec compliant and very fast.
## Example
iex> Myhtmlex.decode("<h1>Hello world</h1>")
{"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]}
Benchmark results on various file sizes on a 2,5Ghz Core i7:
Settings:
duration: 1.0 s
## FileSizesBench
[15:28:42] 1/3: github_trending_js.html 341k
[15:28:46] 2/3: w3c_html5.html 131k
[15:28:48] 3/3: wikipedia_hyperlink.html 97k
Finished in 7.52 seconds
## FileSizesBench
benchmark name iterations average time
wikipedia_hyperlink.html 97k 1000 1385.86 µs/op
w3c_html5.html 131k 1000 2179.30 µs/op
github_trending_js.html 341k 500 5686.21 µs/op
## Thoughts
I need a fast html-parsing library in Erlang/Elixir.
So falling back to c, and to myhtml especially, is a natural move.
But Erlang interoperability is a tricky mine-field.
This increase in parsing speed does not come for free.
The current implementation can be considered a proof-of-concept.
The myhtml code is called as a dirty-nif and executed **inside the Erlang-VM**.
Thus completely giving up the safety of the Erlang-VM. I am not saying that myhtml is unsafe, but
the slightest Segfault brings down the whole Erlang-VM.
So, I consider this mode of operation unsafe, and **not recommended for production use**.
The other option, that I have on my roadmap, is to call into a C-Node.
A separate OS-process that receives calls from erlang and returns to the calling process.
Another option is to call into a Port driver.
A separate OS-process that communicates via stdin/stdout.
So to recap, I want a **fast** and **safe** html-parsing library for Erlang/Elixir.
Not quite there, yet.
"""
@typedoc "A tag name: a binary by default, an atom for known html tags when `:html_atoms` is set."
@type tag() :: String.t() | atom()

@typedoc "A single attribute as a `{name, value}` pair, e.g. `{\"class\", \"hello\"}`."
@type attr() :: {String.t(), String.t()}

@typedoc "The (possibly empty) list of attributes of an element."
@type attr_list() :: [] | [attr()]

@typedoc "Default comment representation."
@type comment_node() :: {:comment, String.t()}

@typedoc "Comment representation when the `:comment_tuple3` flag is set."
@type comment_node3() :: {:comment, [], String.t()}

@typedoc """
A decoded node.

Elements carry a list of children, where each child is either another node or
a text binary. With `:nil_self_closing` the children slot of self-closing and
void elements is `nil` instead of a list.
"""
# The children slot is a list of nodes and text binaries, not a single node.
@type tree() ::
        {tag(), attr_list(), [tree() | String.t()]}
        | {tag(), attr_list(), nil}
        | comment_node()
        | comment_node3()

@typedoc "Flags accepted in the `:format` option of `decode/2` and `decode_tree/2`."
@type format_flag() :: :html_atoms | :nil_self_closing | :comment_tuple3
@doc """
Parses an html string and returns it as a tree of nested tuples.

Tags come back as binaries and comments as 2-tuples. Use `decode/2` to
select a different output format.

## Examples

    iex> Myhtmlex.decode("<h1>Hello world</h1>")
    {"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]}

    iex> Myhtmlex.decode("<span class='hello'>Hi there</span>")
    {"html", [],
     [{"head", [], []},
      {"body", [], [{"span", [{"class", "hello"}], ["Hi there"]}]}]}

    iex> Myhtmlex.decode("<body><!-- a comment --!></body>")
    {"html", [], [{"head", [], []}, {"body", [], [comment: " a comment "]}]}

    iex> Myhtmlex.decode("<br>")
    {"html", [], [{"head", [], []}, {"body", [], [{"br", [], []}]}]}

"""
@spec decode(String.t()) :: tree()
def decode(bin), do: Myhtmlex.Decoder.decode(bin)
@doc """
Returns a tree representation from the given html string.

This variant allows you to pass in one or more of the following format flags:

* `:html_atoms` uses atoms for known html tags (faster), binaries for everything else.
* `:nil_self_closing` uses `nil` to designate self-closing tags and void elements.
  For example `<br>` is then being represented like `{"br", [], nil}`.
  See http://w3c.github.io/html-reference/syntax.html#void-elements for a full list of void elements.
* `:comment_tuple3` uses 3-tuple elements for comments, instead of the default 2-tuple element.

The second argument must be exactly `format: flags`; any other shape matches
no function clause.

## Examples

    iex> Myhtmlex.decode("<h1>Hello world</h1>", format: [:html_atoms])
    {:html, [], [{:head, [], []}, {:body, [], [{:h1, [], ["Hello world"]}]}]}

    iex> Myhtmlex.decode("<br>", format: [:nil_self_closing])
    {"html", [], [{"head", [], []}, {"body", [], [{"br", [], nil}]}]}

    iex> Myhtmlex.decode("<body><!-- a comment --!></body>", format: [:comment_tuple3])
    {"html", [], [{"head", [], []}, {"body", [], [{:comment, [], " a comment "}]}]}

    iex> html = "<body><!-- a comment --!><unknown /></body>"
    iex> Myhtmlex.decode(html, format: [:html_atoms, :nil_self_closing, :comment_tuple3])
    {:html, [],
     [{:head, [], []},
      {:body, [], [{:comment, [], " a comment "}, {"unknown", [], nil}]}]}

"""
@spec decode(String.t(), format: [format_flag()]) :: tree()
def decode(bin, format: flags) do
  # The flag list is handed to the decoder unchanged.
  Myhtmlex.Decoder.decode(bin, flags)
end
@doc """
Parses the given html string and hands back a reference to the internal `myhtml_tree_t`.

The reference can be turned into a tree with `decode_tree/1` or `decode_tree/2`.
"""
@spec open(String.t()) :: reference()
def open(bin), do: Myhtmlex.Decoder.open(bin)
def decode_tree(tree) do
Myhtmlex.Decoder.decode_tree(tree)
@doc """
Builds the tree representation for a previously opened reference.

The output has the same shape as shown in the examples of `decode/1`.
"""
@spec decode_tree(reference()) :: tree()
def decode_tree(ref), do: Myhtmlex.Decoder.decode_tree(ref)
def decode_tree(tree, format: flags) do
Myhtmlex.Decoder.decode_tree(tree, flags)
@doc """
Builds the tree representation for a previously opened reference, using the given format flags.

The accepted flags and the resulting output are the ones described for `decode/2`.
"""
@spec decode_tree(reference(), format: [format_flag()]) :: tree()
def decode_tree(ref, format: flags), do: Myhtmlex.Decoder.decode_tree(ref, flags)
end
defmodule Myhtmlex.Decoder do
@moduledoc false
@on_load { :init, 0 }
app = Mix.Project.config[:app]
......
defmodule Myhtmlex.Doc do
  @moduledoc """
  Struct with a `ref` and a `source` field, both defaulting to `nil`.

  Its `Inspect` implementation prints a shortened preview of `source`.
  """
  # NOTE(review): `ref` presumably holds the reference returned by the decoder
  # and `source` the original html binary; confirm against the code that
  # builds this struct.
  defstruct ref: nil, source: nil
end

defimpl Inspect, for: Myhtmlex.Doc do
  import Inspect.Algebra

  # Number of leading graphemes of the source shown when inspecting.
  @preview_graphemes 61

  # Renders `#Myhtmlex.Doc<source: ...>` with at most the first 61 graphemes
  # of the source, followed by "..." when the source was cut.
  def inspect(%Myhtmlex.Doc{source: source}, opts) do
    concat(["#Myhtmlex.Doc<source: ", to_doc(preview(source), opts), ">"])
  end

  # Shortens a binary source to its leading graphemes.
  defp preview(source) when is_binary(source) do
    head = String.slice(source, 0, @preview_graphemes)

    # `head` is a prefix of `source`, so comparing byte sizes detects the cut
    # in O(1) instead of counting the graphemes of the whole document.
    if byte_size(head) < byte_size(source) do
      head <> "..."
    else
      head
    end
  end

  # A source that is not a binary (notably the default `nil`) is shown as is,
  # so inspecting such a struct no longer raises.
  defp preview(source), do: source
end
......@@ -4,7 +4,6 @@ defmodule Mix.Tasks.Compile.Myhtml do
IO.warn "Windows is not yet a target."
exit(1)
else
File.mkdir_p("priv")
{result, _error_code} = System.cmd("make", ["priv/myhtmlex.so"], stderr_to_stdout: true)
IO.binwrite result
end
......@@ -22,10 +21,35 @@ defmodule Myhtmlex.Mixfile do
elixir: "~> 1.5",
compilers: [:myhtml, :elixir, :app],
start_permanent: Mix.env == :prod,
description: "A module to decode HTML into a tree, porting all properties of the underlying library myhtml, being fast and correct in regards to the html spec.",
package: package(),
deps: deps()
]
end
# Hex package metadata: maintainers, license, project links and the list of
# files and directories shipped with the package.
def package do
  links = %{
    "Github" => "https://github.com/Overbryd/myhtmlex",
    "Issues" => "https://github.com/Overbryd/myhtmlex/issues",
    "MyHTML" => "https://github.com/lexborisov/myhtml"
  }

  files = ~w(lib src c_src Makefile Makefile.Darwin Makefile.Linux mix.exs README.md LICENSE)

  [maintainers: ["Lukas Rieder"], licenses: ["GNU LGPL"], links: links, files: files]
end
# Run "mix help compile.app" to learn about applications.
def application do
[
......@@ -36,9 +60,8 @@ defmodule Myhtmlex.Mixfile do
# Run "mix help deps" to learn about dependencies.
defp deps do
[
# myhtml c library
{:myhtml, github: "Overbryd/myhtml", branch: "feat/node-is-void-element", app: false},
# {:myhtml, github: "lexborisov/myhtml", tag: "v4.0.2", app: false},
# in dev environment, manage myhtml c library with mix
{:myhtml, github: "lexborisov/myhtml", branch: "master", app: false, only: :dev},
# documentation helpers
{:ex_doc, ">= 0.0.0", only: :dev},
# benchmarking helpers
......
%{"benchfella": {:hex, :benchfella, "0.3.5", "b2122c234117b3f91ed7b43b6e915e19e1ab216971154acd0a80ce0e9b8c05f5", [], [], "hexpm"},
"earmark": {:hex, :earmark, "1.2.3", "206eb2e2ac1a794aa5256f3982de7a76bf4579ff91cb28d0e17ea2c9491e46a4", [], [], "hexpm"},
"ex_doc": {:hex, :ex_doc, "0.16.3", "cd2a4cfe5d26e37502d3ec776702c72efa1adfa24ed9ce723bb565f4c30bd31a", [], [{:earmark, "~> 1.1", [hex: :earmark, repo: "hexpm", optional: false]}], "hexpm"},
"myhtml": {:git, "https://github.com/Overbryd/myhtml.git", "697d2aad1392ef555fd3d632674d7ad7717d1e6a", [branch: "feat/node-is-void-element"]}}
"myhtml": {:git, "https://github.com/lexborisov/myhtml.git", "fe2cf577570666d058a2b7167c26d3384a758e19", [branch: "master"]}}
......@@ -150,6 +150,10 @@ read_parse_flags(ErlNifEnv* env, const ERL_NIF_TERM* options)
{
parse_flags |= FLAG_NIL_SELF_CLOSING;
}
else if (enif_compare(flag, ATOM_COMMENT_TUPLE3) == 0)
{
parse_flags |= FLAG_COMMENT_TUPLE3;
}
}
return parse_flags;
......@@ -375,6 +379,7 @@ load(ErlNifEnv *env, void **priv, ERL_NIF_TERM info)
ATOM_COMMENT = make_atom(env, "comment");
ATOM_HTML_ATOMS = make_atom(env, "html_atoms");
ATOM_NIL_SELF_CLOSING = make_atom(env, "nil_self_closing");
ATOM_COMMENT_TUPLE3 = make_atom(env, "comment_tuple3");
EMPTY_LIST = enif_make_list(env, 0);
// myhtml basic init
......
......@@ -35,6 +35,7 @@ ERL_NIF_TERM ATOM_NIL;
ERL_NIF_TERM ATOM_COMMENT;
ERL_NIF_TERM ATOM_HTML_ATOMS;
ERL_NIF_TERM ATOM_NIL_SELF_CLOSING;
ERL_NIF_TERM ATOM_COMMENT_TUPLE3;
ERL_NIF_TERM EMPTY_LIST;
const unsigned char FLAG_HTML_ATOMS = 1 << 0;
const unsigned char FLAG_NIL_SELF_CLOSING = 1 << 1;
......
......@@ -112,10 +112,20 @@ defmodule MyhtmlexTest do
]} = Myhtmlex.decode(~s'<esi:include />', format: [:html_atoms, :nil_self_closing])
end
test "open this nasty github file (works fine in parse single, parse threaded hangs)" do
# With `:html_atoms` a comment inside the body is still decoded as a
# `{:comment, text}` 2-tuple.
test "html comments" do
  html = ~s'<body><!-- a comment --></body>'
  decoded = Myhtmlex.decode(html, format: [:html_atoms])

  assert {:html, [], [{:head, [], []}, {:body, [], [comment: " a comment "]}]} = decoded
end
# Smoke test for the two-step API: open the document, then decode the returned
# reference. The title carries the fixture size; the benchmark listing sizes
# github_trending_js.html at 341k (131k is w3c_html5.html).
test "parse a larger file (341K)" do
  html = File.read!("bench/github_trending_js.html")
  ref = Myhtmlex.open(html)
  assert is_reference(ref)
  assert is_tuple(Myhtmlex.decode_tree(ref))
end
end
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment