Commit 106512ce authored by Lukas Rieder's avatar Lukas Rieder

decode namespaced tags and non-html tags into binaries

parent af271e03
......@@ -2,6 +2,8 @@ MIX = mix
CFLAGS = -g -O3 -std=c99 -pedantic -Wcomment -Wall -Wextra
# turn warnings into errors
CFLAGS += -Werror
# ignore unused variables
CFLAGS += -Wno-unused-variable
# ignore unused parameter warnings
CFLAGS += -Wno-unused-parameter
# ignore missing field initializers warning (because of `static ErlNifFunc funcs[]`)
......
......@@ -2,14 +2,23 @@ defmodule BasicHtmlBench do
use Benchfella
@html File.read!("bench/w3c_html5.html")
# @decoded Exoml.decode(@html)
setup_all do
ref = Myhtmlex.open(@html)
{:ok, ref}
end
bench "decode" do
Myhtmlex.decode(@html)
end
# bench "encode" do
# Mixoml.encode(@decoded)
# end
bench "decode with ref" do
ref = bench_context
Myhtmlex.decode_tree(ref)
end
# bench "encode" do
# Mixoml.encode(@decoded)
# end
end
......@@ -2,4 +2,12 @@ defmodule Myhtmlex do
def decode(bin) do
Myhtmlex.Decoder.decode(bin)
end
def open(bin) do
Myhtmlex.Decoder.open(bin)
end
def decode_tree(tree) do
Myhtmlex.Decoder.decode_tree(tree)
end
end
defmodule Myhtmlex.Decoder do
@on_load { :init, 0 }
@on_load { :init, 0 }
app = Mix.Project.config[:app]
......@@ -11,4 +11,10 @@ defmodule Myhtmlex.Decoder do
@spec decode(bin :: String.t) :: {atom(), list(), list()}
def decode(bin)
def decode(_), do: exit(:nif_library_not_loaded)
def open(bin)
def open(_), do: exit(:nif_library_not_loaded)
def decode_tree(tree)
def decode_tree(_), do: exit(:nif_library_not_loaded)
end
#include "erl_nif.h"
#include <myhtml/api.h>
#include "myhtmlex.h"
char*
lowercase(char* c)
{
char* p = c;
while(*p)
{
*p = tolower((unsigned char)*p);
p++;
}
return c;
}
ERL_NIF_TERM
build_node_attrs(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node);
make_atom(ErlNifEnv* env, const char* name)
{
ERL_NIF_TERM ret;
if(enif_make_existing_atom(env, name, &ret, ERL_NIF_LATIN1)) {
return ret;
}
return enif_make_atom(env, name);
}
ERL_NIF_TERM
build_tree(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node);
nif_open(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
ERL_NIF_TERM result;
myhtmlex_ref_t* ref;
// fetch nif state
myhtmlex_state_t* state = (myhtmlex_state_t*) enif_priv_data(env);
// placeholder for the html binary we want to read from erlang caller
ErlNifBinary html_bin;
// read binary into &html_bin from argv[0] (first argument)
if (!enif_inspect_iolist_as_binary(env, argv[0], &html_bin))
{
// blame the user if html_bin is not a binary
return enif_make_badarg(env);
}
ref = enif_alloc_resource(state->myhtml_tree_rt, sizeof(myhtmlex_ref_t));
ref->tree = myhtml_tree_create();
myhtml_tree_init(ref->tree, state->myhtml);
mystatus_t status = myhtml_parse(ref->tree, MyENCODING_UTF_8, (char*) html_bin.data, (size_t) html_bin.size);
if (status != MyHTML_STATUS_OK)
{
// TODO: what is the correct reaction for a not ok state?
return enif_make_badarg(env);
}
ref->root = myhtml_tree_get_document(ref->tree);
// garbage collect argument
enif_release_binary(&html_bin);
result = enif_make_resource(env, ref);
return result;
}
ERL_NIF_TERM
build_node_children(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node);
nif_decode_tree(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
ERL_NIF_TERM result;
myhtmlex_ref_t* ref;
// fetch nif state
myhtmlex_state_t* state = (myhtmlex_state_t*) enif_priv_data(env);
// fetch reference
if (!enif_get_resource(env, argv[0], state->myhtml_tree_rt, (void **) &ref))
{
return enif_make_badarg(env);
}
// build erlang tree
result = build_tree(env, ref->tree, myhtml_node_last_child(ref->root));
// return tree to erlang
return result;
}
ERL_NIF_TERM
decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
// TODO: move this to load?
// myhtml basic init
myhtml_t* myhtml = myhtml_create();
myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
nif_decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
ERL_NIF_TERM result;
// init myhtml tree
myhtml_tree_t* tree = myhtml_tree_create();
myhtml_tree_init(tree, myhtml);
// fetch nif state
myhtmlex_state_t* state = (myhtmlex_state_t*) enif_priv_data(env);
// placeholder for the html binary we want to read from erlang caller
ErlNifBinary html_bin;
......@@ -28,30 +94,24 @@ decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
return enif_make_badarg(env);
}
// clear myhtml tree resources before parsing
myhtml_tree_clean(state->tree);
// parse html into tree
mystatus_t status = myhtml_parse(tree, MyENCODING_UTF_8, (char*) html_bin.data, (size_t) html_bin.size);
// TODO: check if it returned MyHTML_STATUS_OK
mystatus_t status = myhtml_parse(state->tree, MyENCODING_UTF_8, (char*) html_bin.data, (size_t) html_bin.size);
if (status != MyHTML_STATUS_OK)
{
printf("NOT OK");
// TODO: what is the correct reaction for a not ok state?
return enif_make_badarg(env);
}
myhtml_tree_node_t *root = myhtml_tree_get_document(tree);
ERL_NIF_TERM result = build_tree(env, tree, myhtml_node_last_child(root));
// build erlang tree
myhtml_tree_node_t *root = myhtml_tree_get_document(state->tree);
result = build_tree(env, state->tree, myhtml_node_last_child(root));
// garbage collect argument
enif_release_binary(&html_bin);
// TODO: move this to unload?
// maybe clean these resource after use (reuse myhtml, tree, etc...)
// myhtml_clean(myhtml);
// mythml_tree_clean(tree);
// release myhtml resources
myhtml_node_free(root);
myhtml_tree_destroy(tree);
myhtml_destroy(myhtml);
// return tree to erlang
return result;
......@@ -131,6 +191,7 @@ build_tree(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node)
{
ERL_NIF_TERM result;
myhtml_tag_id_t tag_id = myhtml_node_tag_id(node);
myhtml_namespace_t tag_ns = myhtml_node_namespace(node);
if (tag_id == MyHTML_TAG__TEXT)
{
......@@ -151,50 +212,120 @@ build_tree(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node)
memcpy(comment.data, node_comment, comment_len);
return result = enif_make_tuple3(env,
enif_make_atom(env, "comment"),
make_atom(env, "comment"),
enif_make_list(env, 0),
enif_make_binary(env, &comment)
);
}
else
{
ErlNifBinary tag;
ERL_NIF_TERM tag_bin;
ERL_NIF_TERM tag;
ERL_NIF_TERM children;
ERL_NIF_TERM attrs;
// get name of tag
size_t tag_len;
const char *tag_name = myhtml_tag_name_by_id(tree, tag_id, &tag_len);
// and put it in a binary
enif_alloc_binary(tag_len, &tag);
memcpy(tag.data, tag_name, tag_len);
tag_bin = enif_make_binary(env, &tag);
size_t tag_name_len;
const char *tag_name = myhtml_tag_name_by_id(tree, tag_id, &tag_name_len);
size_t tag_string_len;
// get namespace of tag
size_t tag_ns_len;
const char *tag_ns_name_ptr = myhtml_namespace_name_by_id(tag_ns, &tag_ns_len);
char *tag_ns_buffer;
char buffer [tag_ns_len + 2];
char *tag_string = buffer;
if (tag_ns != MyHTML_NAMESPACE_HTML)
{
// tag_ns_name_ptr is unmodifyable
tag_ns_buffer = malloc(tag_ns_len);
strcpy(tag_ns_buffer, tag_ns_name_ptr);
tag_ns_buffer = lowercase(tag_ns_buffer);
tag_string_len = tag_ns_len + tag_name_len + 1; // +1 for colon
stpcpy(stpcpy(stpcpy(tag_string, tag_ns_buffer), ":"), tag_name);
}
else
{
stpcpy(tag_string, tag_name);
tag_string_len = tag_name_len;
}
// put non-html tags it in a binary
if (tag_id == MyHTML_TAG__UNDEF || tag_ns != MyHTML_NAMESPACE_HTML)
{
ErlNifBinary tag_b;
enif_alloc_binary(tag_string_len, &tag_b);
memcpy(tag_b.data, tag_string, tag_string_len);
tag = enif_make_binary(env, &tag_b);
}
else
{
tag = make_atom(env, tag_string);
}
// attributes
attrs = build_node_attrs(env, tree, node);
// add children
children = build_node_children(env, tree, node);
// add children or nil as a self-closing flag
if (myhtml_node_is_close_self(node))
{
children = ATOM_NIL;
}
else
{
children = build_node_children(env, tree, node);
}
// free allocated resources
if (tag_ns != MyHTML_NAMESPACE_HTML)
{
free(tag_ns_buffer);
}
return result = enif_make_tuple3(env,
tag_bin,
tag,
attrs,
children
);
}
}
// Erlang NIF
static ErlNifFunc funcs[] =
void
nif_cleanup_myhtmlex_ref(ErlNifEnv* env, void* obj)
{
{"decode", 1, decode}
};
myhtmlex_ref_t* ref = (myhtmlex_ref_t*) obj;
// release myhtml resources
myhtml_tree_destroy(ref->tree);
myhtml_node_free(ref->root);
}
// Erlang NIF
static int
load(ErlNifEnv *env, void **priv, ERL_NIF_TERM info)
{
myhtmlex_state_t* state = enif_alloc(sizeof(myhtmlex_state_t));
if (state == NULL)
{
return 1;
}
state->myhtml_tree_rt = enif_open_resource_type(
env,
NULL,
"myhtmlex_ref_t",
&nif_cleanup_myhtmlex_ref,
ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER,
NULL
);
ATOM_NIL = make_atom(env, "nil");
// myhtml basic init
state->myhtml = myhtml_create();
myhtml_init(state->myhtml, MyHTML_OPTIONS_DEFAULT, 4, 0);
state->tree = myhtml_tree_create();
myhtml_tree_init(state->tree, state->myhtml);
*priv = (void*) state;
return 0;
}
......@@ -213,8 +344,20 @@ upgrade(ErlNifEnv *env, void **priv, void **old_priv, ERL_NIF_TERM info)
static void
unload(ErlNifEnv *env, void *priv)
{
myhtmlex_state_t* state = (myhtmlex_state_t*) priv;
myhtml_tree_destroy(state->tree);
myhtml_destroy(state->myhtml);
enif_free(priv);
return;
}
static ErlNifFunc funcs[] =
{
{"decode", 1, nif_decode},
{"open", 1, nif_open},
{"decode_tree", 1, nif_decode_tree}
};
ERL_NIF_INIT(Elixir.Myhtmlex.Decoder, funcs, &load, &reload, &upgrade, &unload)
#ifndef MYHTMLEX_H
#define MYHTMLEX_H
#include <stdlib.h>
#include <ctype.h>
#include "erl_nif.h"
#include <myhtml/myhtml.h>
#include <myhtml/mynamespace.h>
char*
lowercase(char* c);
// myhtmlex.c
ERL_NIF_TERM
make_atom(ErlNifEnv* env, const char* name);
ERL_NIF_TERM
nif_decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
ERL_NIF_TERM
nif_decode_tree(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
ERL_NIF_TERM
nif_open(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
ERL_NIF_TERM
build_node_attrs(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node);
ERL_NIF_TERM
build_tree(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node);
ERL_NIF_TERM
build_node_children(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node);
void
nif_cleanup_myhtml_tree(ErlNifEnv* env, void* obj);
// consts
ERL_NIF_TERM ATOM_NIL;
typedef struct {
myhtml_t* myhtml;
myhtml_tree_t* tree;
ErlNifResourceType* myhtml_tree_rt;
} myhtmlex_state_t;
typedef struct {
myhtml_tree_t* tree;
myhtml_tree_node_t *root;
} myhtmlex_ref_t;
#endif // included myhtmlex.h
......@@ -3,50 +3,83 @@ defmodule MyhtmlexTest do
doctest Myhtmlex
test "builds a tree" do
assert {"html", [], [
{"head", [], []},
{"body", [], []}
assert {:html, [], [
{:head, [], []},
{:body, [], []}
]} = Myhtmlex.decode("<html></html>")
end
test "attributes" do
assert {"html", [], [
{"head", [], []},
{"body", [], [
{"span", [{"id", "test"}, {"class", "foo garble"}], []}
assert {:html, [], [
{:head, [], []},
{:body, [], [
{:span, [{"id", "test"}, {"class", "foo garble"}], []}
]}
]} = Myhtmlex.decode(~s'<span id="test" class="foo garble"></span>')
end
test "single attributes" do
assert {"html", [], [
{"head", [], []},
{"body", [], [
{"button", [{"disabled", "disabled"}, {"class", "foo garble"}], []}
assert {:html, [], [
{:head, [], []},
{:body, [], [
{:button, [{"disabled", "disabled"}, {"class", "foo garble"}], []}
]}
]} = Myhtmlex.decode(~s'<button disabled class="foo garble"></span>')
end
test "text nodes" do
assert {"html", [], [
{"head", [], []},
{"body", [], [
assert {:html, [], [
{:head, [], []},
{:body, [], [
"text node"
]}
]} = Myhtmlex.decode(~s'<body>text node</body>')
end
test "broken input" do
assert {"html", [], [
{"head", [], []},
{"body", [], [
{"a", [{"<", "<"}], [" asdf"]}
assert {:html, [], [
{:head, [], []},
{:body, [], [
{:a, [{"<", "<"}], [" asdf"]}
]}
]} = Myhtmlex.decode(~s'<a <> asdf')
end
# test "open" do
# assert %Myhtmlex.Doc{} = doc = Myhtmlex.open(~s'<dif class="a"></div><div class="b"></div>')
# end
test "open" do
ref = Myhtmlex.open(~s'<dif class="a"></div><div class="b"></div>')
assert is_reference(ref)
end
test "open and decode_tree" do
ref = Myhtmlex.open(~s'text node')
assert {:html, [], [
{:head, [], []},
{:body, [], [
"text node"
]}
]} = Myhtmlex.decode_tree(ref)
end
test "namespaced tags" do
assert {:html, [], [
{:head, [], []},
{:body, [], [
{"svg:svg", [], [
{"svg:path", [], []},
{"svg:a", [], []}
]}
]}
]} = Myhtmlex.decode(~s'<svg><path></path><a></a></svg>')
end
test "custom namespaced tags" do
assert {:html, [], [
{:head, [], []},
{:body, [], [
{"esi:include", [], nil}
]}
]} = Myhtmlex.decode(~s'<esi:include />')
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment