From 5710ff391f516e81cc47539634aae49c8e256147 Mon Sep 17 00:00:00 2001 From: Mark Felder <feld@FreeBSD.org> Date: Fri, 31 Jul 2020 17:59:29 -0500 Subject: [PATCH 1/2] Update usage of Floki.find/2 ensuring we always pass data from Floki.parse_document/1 first --- lib/pleroma/web/rich_media/parser.ex | 3 --- .../rich_media/parsers/meta_tags_parser.ex | 4 ++-- .../web/rich_media/parsers/oembed_parser.ex | 2 +- .../rich_media/parsers/twitter_card_test.exs | 19 +++++-------------- 4 files changed, 8 insertions(+), 20 deletions(-) diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex index c8a7679353..ef2f78e3fe 100644 --- a/lib/pleroma/web/rich_media/parser.ex +++ b/lib/pleroma/web/rich_media/parser.ex @@ -92,7 +92,6 @@ defp parse_url(url) do Pleroma.HTTP.get(url, [{"user-agent", rich_media_agent}], adapter: opts) html - |> parse_html() |> maybe_parse() |> Map.put("url", url) |> clean_parsed_data() @@ -103,8 +102,6 @@ defp parse_url(url) do end end - defp parse_html(html), do: Floki.parse_document!(html) - defp maybe_parse(html) do Enum.reduce_while(parsers(), %{}, fn parser, acc -> case parser.parse(html, acc) do diff --git a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex index 3d577e2540..589d81f01f 100644 --- a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex +++ b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex @@ -15,7 +15,7 @@ def parse(data, html, prefix, key_name, value_name \\ "content") do end defp get_elements(html, key_name, prefix) do - html |> Floki.find("meta[#{key_name}^='#{prefix}:']") + Floki.parse_document!(html) |> Floki.find("meta[#{key_name}^='#{prefix}:']") end defp normalize_attributes(html_node, prefix, key_name, value_name) do @@ -41,6 +41,6 @@ defp maybe_put_title(meta, html) when meta != %{} do defp maybe_put_title(meta, _), do: meta defp get_page_title(html) do - Floki.find(html, "html head title") |> List.first() |> Floki.text() + Floki.parse_document!(html) |> Floki.find("html head title") |> List.first() |> Floki.text() end end diff --git a/lib/pleroma/web/rich_media/parsers/oembed_parser.ex b/lib/pleroma/web/rich_media/parsers/oembed_parser.ex index 6bdeac89c2..2f1428529b 100644 --- a/lib/pleroma/web/rich_media/parsers/oembed_parser.ex +++ b/lib/pleroma/web/rich_media/parsers/oembed_parser.ex @@ -14,7 +14,7 @@ def parse(html, _data) do end defp get_discovery_data(html) do - html |> Floki.find("link[type='application/json+oembed']") + Floki.parse_document!(html) |> Floki.find("link[type='application/json+oembed']") end defp get_oembed_url([{"link", attributes, _children} | _]) do diff --git a/test/web/rich_media/parsers/twitter_card_test.exs b/test/web/rich_media/parsers/twitter_card_test.exs index 219f005a2b..edce5078ea 100644 --- a/test/web/rich_media/parsers/twitter_card_test.exs +++ b/test/web/rich_media/parsers/twitter_card_test.exs @@ -11,9 +11,7 @@ test "returns error when html not contains twitter card" do end test "parses twitter card with only name attributes" do - html = - File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html") - |> Floki.parse_document!() + html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html") assert TwitterCard.parse(html, %{}) == %{ @@ -34,9 +32,7 @@ test "parses twitter card with only name attributes" do end test "parses twitter card with only property attributes" do - html = - File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html") - |> Floki.parse_document!() + html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html") assert TwitterCard.parse(html, %{}) == %{ @@ -55,9 +51,7 @@ test "parses twitter card with only property attributes" do end test "parses twitter card with name & property attributes" do - html = - File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html") - |> Floki.parse_document!() + html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html") assert TwitterCard.parse(html, %{}) == %{ @@ -85,8 +79,7 @@ test "respect only first title tag on the page" do "YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <> "yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg" - html = - File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!() + html = File.read!("test/fixtures/margaret-corbin-grave-west-point.html") assert TwitterCard.parse(html, %{}) == %{ @@ -103,9 +96,7 @@ test "respect only first title tag on the page" do end test "takes first founded title in html head if there is html markup error" do - html = - File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html") - |> Floki.parse_document!() + html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html") assert TwitterCard.parse(html, %{}) == %{ -- GitLab From bb0201124e788a9fe5812726862428c1bc04b9c2 Mon Sep 17 00:00:00 2001 From: Mark Felder <feld@FreeBSD.org> Date: Fri, 31 Jul 2020 18:04:54 -0500 Subject: [PATCH 2/2] Remove test that is irrelevant and never seemed to work anyway --- test/web/rich_media/parsers/twitter_card_test.exs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/web/rich_media/parsers/twitter_card_test.exs b/test/web/rich_media/parsers/twitter_card_test.exs index edce5078ea..1307f597e6 100644 --- a/test/web/rich_media/parsers/twitter_card_test.exs +++ b/test/web/rich_media/parsers/twitter_card_test.exs @@ -6,10 +6,6 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do use ExUnit.Case, async: true alias Pleroma.Web.RichMedia.Parsers.TwitterCard - test "returns error when html not contains twitter card" do - assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) == %{} - end - test "parses twitter card with only name attributes" do html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html") -- GitLab