Commit 67b7193a authored by Lukas Rieder

test both implementations, Nif + Cnode with the same tests

* Set Myhtmlex.Safe as the default variant
* Rename cclient -> myhtml_worker
* Fix C-Node behaviour when passing additional parse flags, now supporting :html_atoms and :comment_tuple3
parent 15c2c034
......@@ -62,12 +62,12 @@ $(MYHTML_STATIC): $(MYHTML_PATH)
priv/myhtmlex.so: c_src/myhtmlex.c $(MYHTML_STATIC)
$(CC) $(MYHTMLEX_CFLAGS) $(MYHTMLEX_LDFLAGS) -o $@ $< $(MYHTML_STATIC)
priv/cclient: c_src/cclient.c $(MYHTML_STATIC)
priv/myhtml_worker: c_src/myhtml_worker.c $(MYHTML_STATIC)
$(CC) -o $@ $< $(MYHTML_STATIC) $(CNODE_CFLAGS)
clean: clean-myhtml
$(RM) -r priv/myhtmlex*
$(RM) priv/cclient
$(RM) priv/myhtml_worker
$(RM) myhtmlex-*.tar
$(RM) -r package-test
......
......@@ -3,7 +3,7 @@ defmodule CnodeFileSizesBench do
setup_all do
Nodex.Distributed.up
{:ok, _pid} = Nodex.Cnode.start_link(%{exec_path: "priv/cclient"}, name: Myhtmlex.Safe)
{:ok, _pid} = Nodex.Cnode.start_link(%{exec_path: "priv/myhtml_worker"}, name: Myhtmlex.Safe)
contents = {
File.read!("bench/github_trending_js.html"),
File.read!("bench/w3c_html5.html"),
......
......@@ -8,6 +8,7 @@
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <ctype.h>
#include "erl_interface.h"
#include "ei.h"
......@@ -42,6 +43,10 @@ ETERM*
build_node_attrs(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* node);
ETERM*
err_term(const char* error_atom);
unsigned char
read_parse_flags(ETERM* list);
char*
lowercase(char* c);
const unsigned char FLAG_HTML_ATOMS = 1 << 0;
const unsigned char FLAG_NIL_SELF_CLOSING = 1 << 1;
......@@ -219,17 +224,45 @@ decode(state_t* state, ErlMessage* emsg, ETERM* bin, ETERM* args)
return err_term("myhtml_parse_failed");
}
// read parse flags
parse_flags = read_parse_flags(args);
// build tree
myhtml_tree_node_t *root = myhtml_tree_get_document(state->tree);
return build_tree(&prefab, state->tree, myhtml_node_last_child(root), &parse_flags);
}
/*
 * Translate a list of format-flag atoms into a FLAG_* bitmask.
 *
 * list: Erlang list term. Elements equal to the atoms html_atoms,
 *       nil_self_closing or comment_tuple3 set the corresponding FLAG_* bit;
 *       every other element is ignored.
 *
 * Returns the OR of all recognised flags, or 0 when nothing matched
 * (including when `list` is NULL or not a list).
 */
unsigned char
read_parse_flags(ETERM* list)
{
  unsigned char parse_flags = 0;
  ETERM *flag;

  // Build the comparison patterns once and release them before returning.
  // Allocating a new pattern for every comparison without freeing it leaks
  // memory on each decode call of the worker.
  ETERM *html_atoms = erl_mk_atom("html_atoms");
  ETERM *nil_self_closing = erl_mk_atom("nil_self_closing");
  ETERM *comment_tuple3 = erl_mk_atom("comment_tuple3");

  // ERL_IS_CONS is false for the empty list as well as for non-list terms,
  // so the head/tail accessors are only ever applied to real list cells.
  for (; list != NULL && ERL_IS_CONS(list); list = ERL_CONS_TAIL(list)) {
    flag = ERL_CONS_HEAD(list);
    if (erl_match(html_atoms, flag))
    {
      parse_flags |= FLAG_HTML_ATOMS;
    }
    else if (erl_match(nil_self_closing, flag))
    {
      parse_flags |= FLAG_NIL_SELF_CLOSING;
    }
    else if (erl_match(comment_tuple3, flag))
    {
      parse_flags |= FLAG_COMMENT_TUPLE3;
    }
  }

  erl_free_term(html_atoms);
  erl_free_term(nil_self_closing);
  erl_free_term(comment_tuple3);

  return parse_flags;
}
ETERM*
build_tree(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* node, unsigned char* parse_flags)
{
ETERM* result;
myhtml_tag_id_t tag_id = myhtml_node_tag_id(node);
/* myhtml_namespace_t tag_ns = myhtml_node_namespace(node); */
myhtml_namespace_t tag_ns = myhtml_node_namespace(node);
if (tag_id == MyHTML_TAG__TEXT)
{
......@@ -266,14 +299,29 @@ build_tree(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* node, unsi
size_t tag_name_len;
const char *tag_name = myhtml_tag_name_by_id(tree, tag_id, &tag_name_len);
// get namespace of tag
/* size_t tag_ns_len; */
/* const char *tag_ns_name_ptr = myhtml_namespace_name_by_id(tag_ns, &tag_ns_len); */
/* char *tag_ns_buffer; */
/* char buffer [tag_ns_len + tag_name_len + 1]; */
/* char *tag_string = buffer; */
/* size_t tag_string_len; */
tag = erl_mk_binary(tag_name, tag_name_len);
size_t tag_ns_len;
const char *tag_ns_name_ptr = myhtml_namespace_name_by_id(tag_ns, &tag_ns_len);
// Worst case is "<ns>:<name>\0": namespace + colon + tag name + terminating NUL.
char buffer [tag_ns_len + tag_name_len + 2];
char *tag_string = buffer;
size_t tag_string_len;

if (tag_ns != MyHTML_NAMESPACE_HTML)
{
  // tag_ns_name_ptr is unmodifiable, so copy it straight into our own buffer
  // and lowercase it there (lowercasing is a nice-to-have). Writing into the
  // stack buffer avoids a separate heap copy that would need an extra byte
  // for the NUL and a matching free().
  memcpy(tag_string, tag_ns_name_ptr, tag_ns_len);
  tag_string[tag_ns_len] = '\0';
  lowercase(tag_string);
  // prepend namespace to tag name, e.g. "svg:path"
  tag_string[tag_ns_len] = ':';
  memcpy(tag_string + tag_ns_len + 1, tag_name, tag_name_len);
  tag_string_len = tag_ns_len + tag_name_len + 1; // +1 for colon
  tag_string[tag_string_len] = '\0';
}
else
{
  memcpy(tag_string, tag_name, tag_name_len);
  tag_string_len = tag_name_len;
  // keep the buffer NUL-terminated: it is later passed to erl_mk_atom()
  tag_string[tag_string_len] = '\0';
}
// attributes
attrs = build_node_attrs(prefab, tree, node);
......@@ -281,9 +329,22 @@ build_tree(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* node, unsi
// children
children = build_node_children(prefab, tree, node, parse_flags);
/* ETERM* tuple3[] = {tag, attrs, children}; */
/* result = erl_mk_tuple(tuple3, 3); */
result = erl_format("{~w, ~w, ~w}", tag, attrs, children);
if (!(*parse_flags & FLAG_HTML_ATOMS) || (tag_id == MyHTML_TAG__UNDEF || tag_id == MyHTML_TAG_LAST_ENTRY || tag_ns != MyHTML_NAMESPACE_HTML))
{
tag = erl_mk_binary(tag_string, tag_string_len);
/* ETERM* tuple3[] = {tag, attrs, children}; */
/* result = erl_mk_tuple(tuple3, 3); */
result = erl_format("{~w, ~w, ~w}", tag, attrs, children);
}
else
{
// tag = erl_mk_atom(tag_string);
tag = erl_mk_atom(tag_string);
/* ETERM* tuple3[] = {tag, attrs, children}; */
/* result = erl_mk_tuple(tuple3, 3); */
result = erl_format("{~w, ~w, ~w}", tag, attrs, children);
}
}
return result;
......@@ -368,3 +429,16 @@ build_node_attrs(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* node
return list;
}
/*
 * Convert a NUL-terminated string to lower case in place.
 *
 * c: writable, NUL-terminated buffer; every byte is passed through tolower().
 *
 * Returns the same pointer that was passed in, so calls can be chained.
 */
char*
lowercase(char* c)
{
  for (char* cursor = c; *cursor != '\0'; ++cursor)
  {
    // cast first: tolower() expects a value in the unsigned char range
    *cursor = tolower((unsigned char)*cursor);
  }
  return c;
}
......@@ -136,10 +136,8 @@ read_parse_flags(ErlNifEnv* env, const ERL_NIF_TERM* options)
unsigned char parse_flags = 0;
ERL_NIF_TERM flag;
// only look at 2 flags max (more are not implemented yet)
for (int i = 0; i < 2; i++)
while (enif_get_list_cell(env, *options, &flag, (ERL_NIF_TERM*)options))
{
if (!enif_get_list_cell(env, *options, &flag, (ERL_NIF_TERM*)options)) break;
if (!enif_is_atom(env, flag)) return enif_make_badarg(env);
// set parse flags
if (enif_compare(flag, ATOM_HTML_ATOMS) == 0)
......
......@@ -23,7 +23,7 @@ defmodule Myhtmlex do
Finished in 7.52 seconds
## FileSizesBench
benchmark name iterations average time
benchmark name iterations average time
wikipedia_hyperlink.html 97k 1000 1385.86 µs/op
w3c_html5.html 131k 1000 2179.30 µs/op
github_trending_js.html 341k 500 5686.21 µs/op
......@@ -118,7 +118,7 @@ defmodule Myhtmlex do
iex> Myhtmlex.decode(html, format: [:html_atoms, :nil_self_closing, :comment_tuple3])
{:html, [],
[{:head, [], []},
{:body, [], [{:comment, " a comment "}, {"unknown", [], nil}]}]}
{:body, [], [{:comment, [], " a comment "}, {"unknown", [], nil}]}]}
"""
@spec decode(String.t, format: [format_flag()]) :: tree()
......
......@@ -13,20 +13,20 @@ defmodule Myhtmlex.Safe do
unless Node.alive? do
Nodex.Distributed.up
end
cclient = :filename.join(:code.priv_dir(unquote(app)), 'cclient')
myhtml_worker = Path.join(:code.priv_dir(unquote(app)), "myhtml_worker")
children = [
worker(Nodex.Cnode, [%{exec_path: cclient}, [name: __MODULE__]])
worker(Nodex.Cnode, [%{exec_path: myhtml_worker}, [name: __MODULE__]])
]
Supervisor.start_link(children, strategy: :one_for_one, name: Myhtmlex.Safe.Supervisor)
end
@doc false
def decode(bin) do
decode(bin, format: [])
decode(bin, [])
end
@doc false
def decode(bin, format: flags) do
def decode(bin, flags) do
{:ok, res} = Nodex.Cnode.call(__MODULE__, {:decode, bin, flags})
res
end
......
......@@ -50,7 +50,7 @@ defmodule Myhtmlex.Mixfile do
mod: {Myhtmlex.Safe, []},
registered: [Myhtmlex.Safe],
env: [
mode: Myhtmlex.Nif
mode: Myhtmlex.Safe
]
]
end
......@@ -71,7 +71,7 @@ end
defmodule Mix.Tasks.Compile.MyhtmlexMake do
@artifacts [
"priv/myhtmlex.so",
"priv/cclient"
"priv/myhtml_worker"
]
def run(_) do
......
%{"benchfella": {:hex, :benchfella, "0.3.5", "b2122c234117b3f91ed7b43b6e915e19e1ab216971154acd0a80ce0e9b8c05f5", [], [], "hexpm"},
%{
"benchfella": {:hex, :benchfella, "0.3.5", "b2122c234117b3f91ed7b43b6e915e19e1ab216971154acd0a80ce0e9b8c05f5", [], [], "hexpm"},
"cnodex": {:git, "https://github.com/Overbryd/cnodex.git", "c1c4cde21295db07f87bb74006ab5f7222720db9", []},
"earmark": {:hex, :earmark, "1.2.3", "206eb2e2ac1a794aa5256f3982de7a76bf4579ff91cb28d0e17ea2c9491e46a4", [], [], "hexpm"},
"ex_doc": {:hex, :ex_doc, "0.16.3", "cd2a4cfe5d26e37502d3ec776702c72efa1adfa24ed9ce723bb565f4c30bd31a", [], [{:earmark, "~> 1.1", [hex: :earmark, repo: "hexpm", optional: false]}], "hexpm"},
"myhtml": {:git, "https://github.com/lexborisov/myhtml.git", "fe2cf577570666d058a2b7167c26d3384a758e19", [branch: "master"]},
"nodex": {:hex, :nodex, "0.1.1", "ed2f7bbe19ea62a43ad4b7ad332eb3f9ca12c64a35a5802a0eb545b93ebe32af", [], [], "hexpm"}}
"nodex": {:hex, :nodex, "0.1.1", "ed2f7bbe19ea62a43ad4b7ad332eb3f9ca12c64a35a5802a0eb545b93ebe32af", [:mix], [], "hexpm"},
}
defmodule Myhtmlex.NifTest do
  # Runs the shared decode tests with Myhtmlex.Nif as the active mode and adds
  # tests for the open/decode_tree API used below.
  use MyhtmlexSharedTests, module: Myhtmlex.Nif

  # The benchmark table in the Myhtmlex docs lists github_trending_js.html as
  # 341k (131k is w3c_html5.html), so the label names the size actually parsed.
  test "parse a larger file (341K)" do
    html = File.read!("bench/github_trending_js.html")
    ref = Myhtmlex.open(html)
    assert is_reference(ref)
    assert is_tuple(Myhtmlex.decode_tree(ref))
  end

  # NOTE(review): "<dif" looks like a typo for "<div"; left untouched because
  # this test only checks that open/1 returns a reference.
  test "open" do
    ref = Myhtmlex.open(~s'<dif class="a"></div><div class="b"></div>')
    assert is_reference(ref)
  end

  test "open and decode_tree" do
    ref = Myhtmlex.open(~s'text node')
    assert is_reference(ref)

    assert {:html, [], [
      {:head, [], []},
      {:body, [], [
        "text node"
      ]}
    ]} = Myhtmlex.decode_tree(ref, format: [:html_atoms])
  end
end
defmodule Myhtmlex.SafeTest do
  # Runs the shared decode tests with Myhtmlex.Safe (the C-node backed variant)
  # as the active mode.
  use(MyhtmlexSharedTests, [{:module, Myhtmlex.Safe}])
end
defmodule MyhtmlexSafeTest do
  use ExUnit.Case

  # Smoke test that calls Myhtmlex.Safe.decode/1 directly: a bare text input
  # must come back wrapped in the html/head/body tuples with binary tag names.
  test "it works" do
    assert {"html", [], [
      {"head", [], []},
      {"body", [], ["foo"]}
    ]} = Myhtmlex.Safe.decode("foo")
  end
end
defmodule MyhtmlexSharedTests do
  # Shared ExUnit suite injected via `use MyhtmlexSharedTests, module: Impl`.
  #
  # The `:module` option is required. Before the tests of the using module run,
  # it is stored as the `:mode` of the `:myhtmlex` application so that the same
  # `Myhtmlex.decode` assertions are exercised against that implementation.
  defmacro __using__(opts) do
    module = Keyword.fetch!(opts, :module)

    quote do
      use ExUnit.Case
      doctest Myhtmlex

      setup_all do
        # Remember the current mode so this suite does not leak its choice into
        # test modules that run afterwards.
        previous_mode = Application.fetch_env(:myhtmlex, :mode)
        Application.put_env(:myhtmlex, :mode, unquote(module))

        on_exit(fn ->
          case previous_mode do
            {:ok, mode} -> Application.put_env(:myhtmlex, :mode, mode)
            :error -> Application.delete_env(:myhtmlex, :mode)
          end
        end)

        :ok
      end

      test "builds a tree, formatted like mochiweb by default" do
        assert {"html", [], [
          {"head", [], []},
          {"body", [], [
            {"br", [], []}
          ]}
        ]} = Myhtmlex.decode("<br>")
      end

      test "builds a tree, html tags as atoms" do
        assert {:html, [], [
          {:head, [], []},
          {:body, [], [
            {:br, [], []}
          ]}
        ]} = Myhtmlex.decode("<br>", format: [:html_atoms])
      end

      test "builds a tree, nil self closing" do
        assert {"html", [], [
          {"head", [], []},
          {"body", [], [
            {"br", [], nil},
            {"esi:include", [], nil}
          ]}
        ]} = Myhtmlex.decode("<br><esi:include />", format: [:nil_self_closing])
      end

      test "builds a tree, multiple format options" do
        assert {:html, [], [
          {:head, [], []},
          {:body, [], [
            {:br, [], nil}
          ]}
        ]} = Myhtmlex.decode("<br>", format: [:html_atoms, :nil_self_closing])
      end

      test "attributes" do
        assert {:html, [], [
          {:head, [], []},
          {:body, [], [
            {:span, [{"id", "test"}, {"class", "foo garble"}], []}
          ]}
        ]} = Myhtmlex.decode(~s'<span id="test" class="foo garble"></span>', format: [:html_atoms])
      end

      test "single attributes" do
        assert {:html, [], [
          {:head, [], []},
          {:body, [], [
            {:button, [{"disabled", "disabled"}, {"class", "foo garble"}], []}
          ]}
        ]} = Myhtmlex.decode(~s'<button disabled class="foo garble"></span>', format: [:html_atoms])
      end

      test "text nodes" do
        assert {:html, [], [
          {:head, [], []},
          {:body, [], [
            "text node"
          ]}
        ]} = Myhtmlex.decode(~s'<body>text node</body>', format: [:html_atoms])
      end

      test "broken input" do
        assert {:html, [], [
          {:head, [], []},
          {:body, [], [
            {:a, [{"<", "<"}], [" asdf"]}
          ]}
        ]} = Myhtmlex.decode(~s'<a <> asdf', format: [:html_atoms])
      end

      # Tags outside the HTML namespace stay binaries, prefixed with their
      # namespace, even when :html_atoms is requested.
      test "namespaced tags" do
        assert {:html, [], [
          {:head, [], []},
          {:body, [], [
            {"svg:svg", [], [
              {"svg:path", [], []},
              {"svg:a", [], []}
            ]}
          ]}
        ]} = Myhtmlex.decode(~s'<svg><path></path><a></a></svg>', format: [:html_atoms])
      end

      test "custom namespaced tags" do
        assert {:html, [], [
          {:head, [], []},
          {:body, [], [
            {"esi:include", [], nil}
          ]}
        ]} = Myhtmlex.decode(~s'<esi:include />', format: [:html_atoms, :nil_self_closing])
      end

      test "html comments" do
        assert {:html, [], [
          {:head, [], []},
          {:body, [], [
            comment: " a comment "
          ]}
        ]} = Myhtmlex.decode(~s'<body><!-- a comment --></body>', format: [:html_atoms])
      end
    end # quote
  end # defmacro __using__
end
defmodule MyhtmlexTest do
  # Tests for the public Myhtmlex API: decode with the different format flags,
  # plus the open/decode_tree pair.
  use ExUnit.Case
  doctest Myhtmlex

  test "builds a tree, formatted like mochiweb by default" do
    assert {"html", [], [
      {"head", [], []},
      {"body", [], [
        {"br", [], []}
      ]}
    ]} = Myhtmlex.decode("<br>")
  end

  test "builds a tree, html tags as atoms" do
    assert {:html, [], [
      {:head, [], []},
      {:body, [], [
        {:br, [], []}
      ]}
    ]} = Myhtmlex.decode("<br>", format: [:html_atoms])
  end

  test "builds a tree, nil self closing" do
    assert {"html", [], [
      {"head", [], []},
      {"body", [], [
        {"br", [], nil},
        {"esi:include", [], nil}
      ]}
    ]} = Myhtmlex.decode("<br><esi:include />", format: [:nil_self_closing])
  end

  test "builds a tree, multiple format options" do
    assert {:html, [], [
      {:head, [], []},
      {:body, [], [
        {:br, [], nil}
      ]}
    ]} = Myhtmlex.decode("<br>", format: [:html_atoms, :nil_self_closing])
  end

  test "attributes" do
    assert {:html, [], [
      {:head, [], []},
      {:body, [], [
        {:span, [{"id", "test"}, {"class", "foo garble"}], []}
      ]}
    ]} = Myhtmlex.decode(~s'<span id="test" class="foo garble"></span>', format: [:html_atoms])
  end

  test "single attributes" do
    assert {:html, [], [
      {:head, [], []},
      {:body, [], [
        {:button, [{"disabled", "disabled"}, {"class", "foo garble"}], []}
      ]}
    ]} = Myhtmlex.decode(~s'<button disabled class="foo garble"></span>', format: [:html_atoms])
  end

  test "text nodes" do
    assert {:html, [], [
      {:head, [], []},
      {:body, [], [
        "text node"
      ]}
    ]} = Myhtmlex.decode(~s'<body>text node</body>', format: [:html_atoms])
  end

  test "broken input" do
    assert {:html, [], [
      {:head, [], []},
      {:body, [], [
        {:a, [{"<", "<"}], [" asdf"]}
      ]}
    ]} = Myhtmlex.decode(~s'<a <> asdf', format: [:html_atoms])
  end

  # NOTE(review): "<dif" looks like a typo for "<div"; left untouched because
  # this test only checks that open/1 returns a reference.
  test "open" do
    ref = Myhtmlex.open(~s'<dif class="a"></div><div class="b"></div>')
    assert is_reference(ref)
  end

  test "open and decode_tree" do
    ref = Myhtmlex.open(~s'text node')
    assert is_reference(ref)

    assert {:html, [], [
      {:head, [], []},
      {:body, [], [
        "text node"
      ]}
    ]} = Myhtmlex.decode_tree(ref, format: [:html_atoms])
  end

  # Tags outside the HTML namespace stay binaries, prefixed with their
  # namespace, even when :html_atoms is requested.
  test "namespaced tags" do
    assert {:html, [], [
      {:head, [], []},
      {:body, [], [
        {"svg:svg", [], [
          {"svg:path", [], []},
          {"svg:a", [], []}
        ]}
      ]}
    ]} = Myhtmlex.decode(~s'<svg><path></path><a></a></svg>', format: [:html_atoms])
  end

  test "custom namespaced tags" do
    assert {:html, [], [
      {:head, [], []},
      {:body, [], [
        {"esi:include", [], nil}
      ]}
    ]} = Myhtmlex.decode(~s'<esi:include />', format: [:html_atoms, :nil_self_closing])
  end

  test "html comments" do
    assert {:html, [], [
      {:head, [], []},
      {:body, [], [
        comment: " a comment "
      ]}
    ]} = Myhtmlex.decode(~s'<body><!-- a comment --></body>', format: [:html_atoms])
  end

  # The benchmark table in the Myhtmlex docs lists github_trending_js.html as
  # 341k (131k is w3c_html5.html), so the label names the size actually parsed.
  test "parse a larger file (341K)" do
    html = File.read!("bench/github_trending_js.html")
    ref = Myhtmlex.open(html)
    assert is_reference(ref)
    assert is_tuple(Myhtmlex.decode_tree(ref))
  end
end
Code.require_file("myhtmlex_shared_tests.ex", "test")
ExUnit.start()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment