Commit f9f22965 authored by rinpatch

Sanitize and convert to html in one pass

parent 15404e28
Pipeline #19239 failed in 1 minute and 24 seconds
defmodule FastSanitize.Fragment do
import Plug.HTML, only: [html_escape: 1, html_escape_to_iodata: 1]
import Plug.HTML, only: [html_escape_to_iodata: 1]
def to_tree(bin) do
with {:html, _, [{:head, _, _}, {:body, _, fragment}]} <-
......@@ -28,44 +28,47 @@ defmodule FastSanitize.Fragment do
do: ["<", to_string(tag), build_attr_chunks(attrs), ">"]
# empty tuple - fragment was clobbered, return nothing
defp fragment_to_html(nil), do: ""
defp fragment_to_html(nil, _), do: ""
defp fragment_to_html({}), do: ""
defp fragment_to_html({}, _), do: ""
# text node
defp fragment_to_html(text) when is_binary(text), do: html_escape_to_iodata(text)
defp fragment_to_html(text, _) when is_binary(text), do: html_escape_to_iodata(text)
# comment node
defp fragment_to_html({:comment, _, text}), do: ["<!-- ", text, " -->"]
defp fragment_to_html({:comment, _, text}, _), do: ["<!-- ", text, " -->"]
# bare subtree
defp fragment_to_html(subtree) when is_list(subtree) do
{:ok, result} = subtree_to_html(subtree)
result
defp fragment_to_html(subtree, scrubber) when is_list(subtree) do
subtree_to_iodata(subtree, scrubber)
end
# a node which can never accept children will have nil instead of a subtree
defp fragment_to_html({tag, attrs, nil}), do: build_start_tag(tag, attrs, nil)
defp fragment_to_html({tag, attrs, nil}, _), do: build_start_tag(tag, attrs, nil)
# every other case, assume a subtree
defp fragment_to_html({tag, attrs, subtree}) do
defp fragment_to_html({tag, attrs, subtree}, scrubber) do
with start_tag <- build_start_tag(tag, attrs, subtree),
end_tag <- ["</", to_string(tag), ">"],
subtree <- subtree_to_iodata(subtree) do
subtree <- subtree_to_iodata(subtree, scrubber) do
[start_tag, subtree, end_tag]
end
end
defp subtree_to_html([]), do: {:ok, ""}
defp subtree_to_html([], _), do: {:ok, ""}
defp subtree_to_html(tree) do
iodata = subtree_to_iodata(tree)
defp subtree_to_html(tree, scrubber) do
iodata = subtree_to_iodata(tree, scrubber)
rendered = :erlang.iolist_to_binary(iodata)
{:ok, rendered}
end
defp subtree_to_iodata(tree),
do: List.foldr(tree, [], fn node, iodata -> [fragment_to_html(node) | iodata] end)
# Renders a list of nodes to iodata. Each node is handed to `scrubber.scrub/1`
# right before it is rendered, so scrubbing and rendering share one traversal.
defp subtree_to_iodata(tree, scrubber) do
  render = fn node -> fragment_to_html(scrubber.scrub(node), scrubber) end

  # Fold from the right so the output keeps document order while only prepending.
  List.foldr(tree, [], fn node, rendered -> [render.(node) | rendered] end)
end
def to_html(tree), do: subtree_to_html(tree)
def to_html(tree, scrubber \\ FastSanitize.Sanitizer.Dummy),
do: subtree_to_html(tree, scrubber)
end
......@@ -29,35 +29,10 @@ defmodule FastSanitize.Sanitizer do
def scrub(doc, scrubber) when is_binary(doc) do
with wrapped_doc <- "<body>" <> doc <> "</body>",
{:ok, subtree} <- Fragment.to_tree(wrapped_doc) do
scrub(subtree, scrubber)
|> Fragment.to_html()
Fragment.to_html(subtree, scrubber)
else
e ->
{:error, e}
end
end
# Scrubs a parsed subtree (a list of fragments). Each fragment goes through
# `scrubber.scrub/1`; the shape of what comes back decides whether to descend.
def scrub(subtree, scrubber) when is_list(subtree) do
  Logger.debug("Pre-process: #{inspect(subtree)}")

  Enum.map(subtree, fn fragment ->
    case scrubber.scrub(fragment) do
      # nil in the children slot: nothing to descend into
      {_tag, _attrs, nil} = closure ->
        Logger.debug("Post-process closure: #{inspect(closure)}")
        closure

      # any other 3-tuple: keep tag and attributes, scrub the children
      {tag, attrs, children} = node ->
        Logger.debug("Post-process tag: #{inspect(node)}")
        {tag, attrs, scrub(children, scrubber)}

      # the scrubber returned a list: scrub it as a subtree of its own
      nested when is_list(nested) ->
        Logger.debug("Post-process subtree: #{inspect(nested)}")
        scrub(nested, scrubber)

      # anything else is passed through untouched
      other ->
        Logger.debug("Post-process other: #{inspect(other)}")
        other
    end
  end)
end
end
defmodule FastSanitize.Sanitizer.Dummy do
  @moduledoc """
  A scrubber that performs no scrubbing: `scrub/1` returns its argument as-is.
  """

  @doc """
  Returns `fragment` unchanged.
  """
  def scrub(fragment) do
    fragment
  end
end
......@@ -178,35 +178,9 @@ defmodule FastSanitize.Sanitizer.Meta do
nil
end
@protocol_separator ":|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A"
@protocol_separator_regex Regex.compile!(@protocol_separator, "mi")
@http_like_scheme "(?<scheme>.+?)(#{@protocol_separator})//"
@other_schemes "(?<other_schemes>mailto)(#{@protocol_separator})"
@scheme_capture Regex.compile!(
"(#{@http_like_scheme})|(#{@other_schemes})",
"mi"
)
def scrub_attribute(unquote(tag_name), {unquote(attr_name), uri}) do
valid_schema =
if uri =~ @protocol_separator_regex do
case Regex.named_captures(@scheme_capture, uri) do
%{"scheme" => scheme, "other_schemes" => ""} ->
scheme in unquote(valid_schemes)
%{"other_schemes" => scheme, "scheme" => ""} ->
scheme in unquote(valid_schemes)
_ ->
false
end
else
true
end
if valid_schema, do: {unquote(attr_name), uri}
def scrub_attribute(unquote(tag_name), {unquote(attr_name), uri} = attr) do
uri = URI.parse(uri)
if uri.scheme == nil or uri.scheme in unquote(valid_schemes), do: attr
end
end
end
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment