Commit 15404e28 authored by rinpatch

Fragment: rework to use iodata

Also add benchmarks
parent ab487fb7
defmodule FastSanitize.Fragment do
import Plug.HTML, only: [html_escape: 1]
import Plug.HTML, only: [html_escape: 1, html_escape_to_iodata: 1]
def to_tree(bin) do
with {:html, _, [{:head, _, _}, {:body, _, fragment}]} <-
......@@ -11,30 +11,32 @@ defmodule FastSanitize.Fragment do
end
end
defp build_attr_chunks([]) do
""
end
defp build_attr_chunks([]), do: ""
defp build_attr_chunks(attrs) do
" " <>
(Enum.map(attrs, fn {k, v} ->
"#{html_escape(k)}=\"#{html_escape(v)}\""
end)
|> Enum.join(" "))
List.foldr(attrs, [], fn {k, v}, iodata ->
[[" ", html_escape_to_iodata(k), "=\"", html_escape_to_iodata(v), "\""] | iodata]
end)
end
defp build_start_tag(tag, attrs, nil), do: "<#{tag}#{build_attr_chunks(attrs)}/>"
defp build_start_tag(tag, attrs, _children) when length(attrs) == 0, do: "<#{tag}>"
defp build_start_tag(tag, attrs, _children), do: "<#{tag}#{build_attr_chunks(attrs)}>"
# Builds the opening tag for `tag` as iodata.
# `nil` children yield a self-closing tag ("<tag .../>").
defp build_start_tag(tag, attrs, nil), do: ["<", to_string(tag), build_attr_chunks(attrs), "/>"]

# No attributes: match the empty list in the head rather than calling
# length/1 in a guard, which traverses the whole list just to test emptiness.
defp build_start_tag(tag, [], _children), do: ["<", to_string(tag), ">"]

# General case: tag name followed by the rendered attribute chunks.
defp build_start_tag(tag, attrs, _children),
  do: ["<", to_string(tag), build_attr_chunks(attrs), ">"]
# empty tuple - fragment was clobbered, return nothing
defp fragment_to_html(nil), do: ""
defp fragment_to_html({}), do: ""
# text node
defp fragment_to_html(text) when is_binary(text), do: html_escape(text)
defp fragment_to_html(text) when is_binary(text), do: html_escape_to_iodata(text)
# comment node
defp fragment_to_html({:comment, _, text}), do: "<!-- #{text} -->"
defp fragment_to_html({:comment, _, text}), do: ["<!-- ", text, " -->"]
# bare subtree
defp fragment_to_html(subtree) when is_list(subtree) do
......@@ -48,23 +50,22 @@ defmodule FastSanitize.Fragment do
# every other case, assume a subtree
defp fragment_to_html({tag, attrs, subtree}) do
with start_tag <- build_start_tag(tag, attrs, subtree),
end_tag <- "</#{tag}>",
{:ok, subtree} <- subtree_to_html(subtree) do
end_tag <- ["</", to_string(tag), ">"],
subtree <- subtree_to_iodata(subtree) do
[start_tag, subtree, end_tag]
|> Enum.join("")
end
end
defp subtree_to_html([]), do: {:ok, ""}
defp subtree_to_html(tree) do
rendered =
Enum.reject(tree, &is_nil/1)
|> Enum.map(&fragment_to_html/1)
|> Enum.join("")
iodata = subtree_to_iodata(tree)
rendered = :erlang.iolist_to_binary(iodata)
{:ok, rendered}
end
# Renders each node of `tree` with fragment_to_html/1 and collects the results,
# in their original order, into a list (iodata). Folding from the right lets
# every rendered node be prepended without a final reverse.
defp subtree_to_iodata(tree) do
  List.foldr(tree, [], fn child, rendered ->
    [fragment_to_html(child) | rendered]
  end)
end
@doc """
Renders a fragment `tree` to HTML.

Delegates to `subtree_to_html/1` and returns its `{:ok, html}` result.
"""
def to_html(tree) do
  subtree_to_html(tree)
end
end
defmodule Mix.Tasks.FastSanitize.Bench do
  @shortdoc "Benchmarks FastSanitize against HtmlSanitizeEx"

  @moduledoc """
  Benchmarks `FastSanitize` against `HtmlSanitizeEx` using Benchee.

  Every file found in `lib/mix/tasks/fast_sanitize/html` (resolved relative to
  the directory the task is run from) is read and passed to Benchee as a named
  input, keyed by its file name.
  """

  use Mix.Task

  @input_dir "lib/mix/tasks/fast_sanitize/html"

  @impl Mix.Task
  def run(_args) do
    # Keyed by file name so Benchee labels each input in its report.
    # The file contents are deliberately not printed: dumping whole HTML
    # documents to stdout only buries the benchmark output.
    inputs =
      @input_dir
      |> File.ls!()
      |> Map.new(fn input_name ->
        {input_name, File.read!(Path.join(@input_dir, input_name))}
      end)

    Benchee.run(
      %{
        "FastSanitize strip tags" => fn input -> FastSanitize.strip_tags(input) end,
        "HtmlSanitizeex strip tags" => fn input -> HtmlSanitizeEx.strip_tags(input) end,
        "FastSanitize basic html" => fn input -> FastSanitize.basic_html(input) end,
        "HtmlSanitizeex basic html" => fn input -> HtmlSanitizeEx.basic_html(input) end
      },
      inputs: inputs
    )
  end
end
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
<p>Sanitize is a whitelist-based HTML sanitizer. Given a list of acceptable
elements and attributes, Sanitize will remove all unacceptable HTML from a
string.</p>
<p>Using a simple configuration syntax, you can tell Sanitize to allow certain
elements, certain attributes within those elements, and even certain URL
protocols within attributes that contain URLs. Any HTML elements or attributes
that you don't explicitly allow will be removed.</p>
<p>Sanitize is based on <a href="https://github.com/google/gumbo-parser">Google's Gumbo HTML5 parser</a>, which parses HTML
exactly the same way modern browsers do. As long as your whitelist config only
allows safe markup, even the most malformed or malicious input will be
transformed into safe output.</p>
......@@ -27,6 +27,8 @@ defmodule FastSanitize.MixProject do
ref: "d973dfb1b252b1c6e6eddddc18c0895aa977091c",
submodules: true},
{:credo, "~> 1.0.0", only: [:dev, :test], runtime: false},
{:benchee, "~> 1.0", only: :dev},
{:html_sanitize_ex, "~> 1.3.0-rc3", only: :dev},
{:ex_doc, "~> 0.19", only: :dev, runtime: false},
{:dialyxir, "~> 1.0.0-rc.5", only: [:dev], runtime: false}
]
......
%{
"benchee": {:hex, :benchee, "1.0.1", "66b211f9bfd84bd97e6d1beaddf8fc2312aaabe192f776e8931cb0c16f53a521", [:mix], [{:deep_merge, "~> 1.0", [hex: :deep_merge, repo: "hexpm", optional: false]}], "hexpm"},
"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], [], "hexpm"},
"credo": {:hex, :credo, "1.0.5", "fdea745579f8845315fe6a3b43e2f9f8866839cfbc8562bb72778e9fdaa94214", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm"},
"deep_merge": {:hex, :deep_merge, "1.0.0", "b4aa1a0d1acac393bdf38b2291af38cb1d4a52806cf7a4906f718e1feb5ee961", [:mix], [], "hexpm"},
"dialyxir": {:hex, :dialyxir, "1.0.0-rc.6", "78e97d9c0ff1b5521dd68041193891aebebce52fc3b93463c0a6806874557d7d", [:mix], [{:erlex, "~> 0.2.1", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm"},
"earmark": {:hex, :earmark, "1.3.2", "b840562ea3d67795ffbb5bd88940b1bed0ed9fa32834915125ea7d02e35888a5", [:mix], [], "hexpm"},
"erlex": {:hex, :erlex, "0.2.1", "cee02918660807cbba9a7229cae9b42d1c6143b768c781fa6cee1eaf03ad860b", [:mix], [], "hexpm"},
"ex_doc": {:hex, :ex_doc, "0.20.2", "1bd0dfb0304bade58beb77f20f21ee3558cc3c753743ae0ddbb0fd7ba2912331", [:mix], [{:earmark, "~> 1.3", [hex: :earmark, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.10", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm"},
"html_sanitize_ex": {:hex, :html_sanitize_ex, "1.3.0", "f005ad692b717691203f940c686208aa3d8ffd9dd4bb3699240096a51fa9564e", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm"},
"jason": {:hex, :jason, "1.1.2", "b03dedea67a99223a2eaf9f1264ce37154564de899fd3d8b9a21b1a6fd64afe7", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm"},
"makeup": {:hex, :makeup, "0.8.0", "9cf32aea71c7fe0a4b2e9246c2c4978f9070257e5c9ce6d4a28ec450a839b55f", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm"},
"makeup_elixir": {:hex, :makeup_elixir, "0.13.0", "be7a477997dcac2e48a9d695ec730b2d22418292675c75aa2d34ba0909dcdeda", [:mix], [{:makeup, "~> 0.8", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm"},
"mime": {:hex, :mime, "1.3.1", "30ce04ab3175b6ad0bdce0035cba77bba68b813d523d1aac73d9781b4d193cf8", [:mix], [], "hexpm"},
"myhtmlex": {:git, "https://github.com/rinpatch/myhtmlex.git", "d973dfb1b252b1c6e6eddddc18c0895aa977091c", [ref: "d973dfb1b252b1c6e6eddddc18c0895aa977091c"]},
"mochiweb": {:hex, :mochiweb, "2.18.0", "eb55f1db3e6e960fac4e6db4e2db9ec3602cc9f30b86cd1481d56545c3145d2e", [:rebar3], [], "hexpm"},
"myhtmlex": {:git, "https://github.com/rinpatch/myhtmlex.git", "d973dfb1b252b1c6e6eddddc18c0895aa977091c", [ref: "d973dfb1b252b1c6e6eddddc18c0895aa977091c", submodules: true]},
"nimble_parsec": {:hex, :nimble_parsec, "0.5.0", "90e2eca3d0266e5c53f8fbe0079694740b9c91b6747f2b7e3c5d21966bba8300", [:mix], [], "hexpm"},
"nodex": {:git, "https://github.com/rinpatch/nodex", "12ca7a2c5b5791f1e847d73ed646cf006d4c8ca8", [ref: "12ca7a2c5b5791f1e847d73ed646cf006d4c8ca8"]},
"plug": {:hex, :plug, "1.8.0", "9d2685cb007fe5e28ed9ac27af2815bc262b7817a00929ac10f56f169f43b977", [:mix], [{:mime, "~> 1.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: true]}], "hexpm"},
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment