From 5710ff391f516e81cc47539634aae49c8e256147 Mon Sep 17 00:00:00 2001
From: Mark Felder <feld@FreeBSD.org>
Date: Fri, 31 Jul 2020 17:59:29 -0500
Subject: [PATCH 1/2] Update usage of Floki.find/2 ensuring we always pass data
 from Floki.parse_document!/1 first

---
 lib/pleroma/web/rich_media/parser.ex          |  3 ---
 .../rich_media/parsers/meta_tags_parser.ex    |  4 ++--
 .../web/rich_media/parsers/oembed_parser.ex   |  2 +-
 .../rich_media/parsers/twitter_card_test.exs  | 19 +++++--------------
 4 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex
index c8a7679353..ef2f78e3fe 100644
--- a/lib/pleroma/web/rich_media/parser.ex
+++ b/lib/pleroma/web/rich_media/parser.ex
@@ -92,7 +92,6 @@ defp parse_url(url) do
         Pleroma.HTTP.get(url, [{"user-agent", rich_media_agent}], adapter: opts)
 
       html
-      |> parse_html()
       |> maybe_parse()
       |> Map.put("url", url)
       |> clean_parsed_data()
@@ -103,8 +102,6 @@ defp parse_url(url) do
     end
   end
 
-  defp parse_html(html), do: Floki.parse_document!(html)
-
   defp maybe_parse(html) do
     Enum.reduce_while(parsers(), %{}, fn parser, acc ->
       case parser.parse(html, acc) do
diff --git a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex
index 3d577e2540..589d81f01f 100644
--- a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex
+++ b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex
@@ -15,7 +15,7 @@ def parse(data, html, prefix, key_name, value_name \\ "content") do
   end
 
   defp get_elements(html, key_name, prefix) do
-    html |> Floki.find("meta[#{key_name}^='#{prefix}:']")
+    Floki.parse_document!(html) |> Floki.find("meta[#{key_name}^='#{prefix}:']")
   end
 
   defp normalize_attributes(html_node, prefix, key_name, value_name) do
@@ -41,6 +41,6 @@ defp maybe_put_title(meta, html) when meta != %{} do
   defp maybe_put_title(meta, _), do: meta
 
   defp get_page_title(html) do
-    Floki.find(html, "html head title") |> List.first() |> Floki.text()
+    Floki.parse_document!(html) |> Floki.find("html head title") |> List.first() |> Floki.text()
   end
 end
diff --git a/lib/pleroma/web/rich_media/parsers/oembed_parser.ex b/lib/pleroma/web/rich_media/parsers/oembed_parser.ex
index 6bdeac89c2..2f1428529b 100644
--- a/lib/pleroma/web/rich_media/parsers/oembed_parser.ex
+++ b/lib/pleroma/web/rich_media/parsers/oembed_parser.ex
@@ -14,7 +14,7 @@ def parse(html, _data) do
   end
 
   defp get_discovery_data(html) do
-    html |> Floki.find("link[type='application/json+oembed']")
+    Floki.parse_document!(html) |> Floki.find("link[type='application/json+oembed']")
   end
 
   defp get_oembed_url([{"link", attributes, _children} | _]) do
diff --git a/test/web/rich_media/parsers/twitter_card_test.exs b/test/web/rich_media/parsers/twitter_card_test.exs
index 219f005a2b..edce5078ea 100644
--- a/test/web/rich_media/parsers/twitter_card_test.exs
+++ b/test/web/rich_media/parsers/twitter_card_test.exs
@@ -11,9 +11,7 @@ test "returns error when html not contains twitter card" do
   end
 
   test "parses twitter card with only name attributes" do
-    html =
-      File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html")
-      |> Floki.parse_document!()
+    html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html")
 
     assert TwitterCard.parse(html, %{}) ==
              %{
@@ -34,9 +32,7 @@ test "parses twitter card with only name attributes" do
   end
 
   test "parses twitter card with only property attributes" do
-    html =
-      File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html")
-      |> Floki.parse_document!()
+    html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html")
 
     assert TwitterCard.parse(html, %{}) ==
              %{
@@ -55,9 +51,7 @@ test "parses twitter card with only property attributes" do
   end
 
   test "parses twitter card with name & property attributes" do
-    html =
-      File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html")
-      |> Floki.parse_document!()
+    html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html")
 
     assert TwitterCard.parse(html, %{}) ==
              %{
@@ -85,8 +79,7 @@ test "respect only first title tag on the page" do
         "YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <>
         "yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg"
 
-    html =
-      File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!()
+    html = File.read!("test/fixtures/margaret-corbin-grave-west-point.html")
 
     assert TwitterCard.parse(html, %{}) ==
              %{
@@ -103,9 +96,7 @@ test "respect only first title tag on the page" do
   end
 
   test "takes first founded title in html head if there is html markup error" do
-    html =
-      File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html")
-      |> Floki.parse_document!()
+    html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html")
 
     assert TwitterCard.parse(html, %{}) ==
              %{
-- 
GitLab


From bb0201124e788a9fe5812726862428c1bc04b9c2 Mon Sep 17 00:00:00 2001
From: Mark Felder <feld@FreeBSD.org>
Date: Fri, 31 Jul 2020 18:04:54 -0500
Subject: [PATCH 2/2] Remove test that is irrelevant and never seemed to work
 anyway

---
 test/web/rich_media/parsers/twitter_card_test.exs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test/web/rich_media/parsers/twitter_card_test.exs b/test/web/rich_media/parsers/twitter_card_test.exs
index edce5078ea..1307f597e6 100644
--- a/test/web/rich_media/parsers/twitter_card_test.exs
+++ b/test/web/rich_media/parsers/twitter_card_test.exs
@@ -6,10 +6,6 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
   use ExUnit.Case, async: true
   alias Pleroma.Web.RichMedia.Parsers.TwitterCard
 
-  test "returns error when html not contains twitter card" do
-    assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) == %{}
-  end
-
   test "parses twitter card with only name attributes" do
     html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html")
 
-- 
GitLab