From 627c944fecebe3d48a6bc35ed62273fc058814ff Mon Sep 17 00:00:00 2001
From: Mark Felder <feld@feld.me>
Date: Wed, 26 Jun 2024 09:20:46 -0400
Subject: [PATCH 1/5] Search Indexing: filter indexable activities before
 inserting Oban jobs

---
 lib/pleroma/search.ex | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/lib/pleroma/search.ex b/lib/pleroma/search.ex
index fd0218cb8e..b60ca3d05c 100644
--- a/lib/pleroma/search.ex
+++ b/lib/pleroma/search.ex
@@ -1,12 +1,26 @@
 defmodule Pleroma.Search do
+  alias Pleroma.Activity
+  alias Pleroma.Object
   alias Pleroma.Workers.SearchIndexingWorker
 
-  def add_to_index(%Pleroma.Activity{id: activity_id}) do
-    SearchIndexingWorker.enqueue("add_to_index", %{"activity" => activity_id})
+  @spec add_to_index(Activity.t()) :: :ok | :error
+  def add_to_index(%Pleroma.Activity{id: activity_id} = activity) do
+    with true <- indexable?(activity),
+         {:ok, %Oban.Job{}} <-
+           SearchIndexingWorker.enqueue("add_to_index", %{"activity" => activity_id}) do
+      :ok
+    else
+      false -> :ok
+      _ -> :error
+    end
   end
 
+  @spec remove_from_index(Object.t()) :: :ok | :error
   def remove_from_index(%Pleroma.Object{id: object_id}) do
-    SearchIndexingWorker.enqueue("remove_from_index", %{"object" => object_id})
+    case SearchIndexingWorker.enqueue("remove_from_index", %{"object" => object_id}) do
+      {:ok, %Oban.Job{}} -> :ok
+      _ -> :error
+    end
   end
 
   def search(query, options) do
@@ -18,4 +32,7 @@ def healthcheck_endpoints do
     search_module = Pleroma.Config.get([Pleroma.Search, :module])
     search_module.healthcheck_endpoints
   end
+
+  defp indexable?(%Activity{object: %Object{}, data: %{"type" => "Create"}}), do: true
+  defp indexable?(_), do: false
 end
-- 
GitLab


From a5c88eb39b8d01b7d831dd1ba95fc1c00f0eba52 Mon Sep 17 00:00:00 2001
From: Mark Felder <feld@feld.me>
Date: Wed, 26 Jun 2024 09:24:48 -0400
Subject: [PATCH 2/5] Remove redundant checks from backends' add_to_index/1

---
 lib/pleroma/search/meilisearch.ex   | 38 +++++++++++++----------------
 lib/pleroma/search/qdrant_search.ex | 28 +++++++++------------
 2 files changed, 29 insertions(+), 37 deletions(-)

diff --git a/lib/pleroma/search/meilisearch.ex b/lib/pleroma/search/meilisearch.ex
index 9bba5b30f8..fb8fdea1b6 100644
--- a/lib/pleroma/search/meilisearch.ex
+++ b/lib/pleroma/search/meilisearch.ex
@@ -4,6 +4,7 @@ defmodule Pleroma.Search.Meilisearch do
 
   alias Pleroma.Activity
   alias Pleroma.Config.Getting, as: Config
+  alias Pleroma.Object
 
   import Pleroma.Search.DatabaseSearch
   import Ecto.Query
@@ -155,28 +156,23 @@ def object_to_search_data(object) do
   end
 
   @impl true
-  def add_to_index(activity) do
-    maybe_search_data = object_to_search_data(activity.object)
-
-    if activity.data["type"] == "Create" and maybe_search_data do
-      result =
-        meili_put(
-          "/indexes/objects/documents",
-          [maybe_search_data]
-        )
-
-      with {:ok, %{"status" => "enqueued"}} <- result do
-        # Added successfully
-        :ok
-      else
-        _ ->
-          # There was an error, report it
-          Logger.error("Failed to add activity #{activity.id} to index: #{inspect(result)}")
-          {:error, result}
-      end
-    else
-      # The post isn't something we can search, that's ok
+  def add_to_index(%Activity{object: %Object{} = object} = activity) do
+    search_data = object_to_search_data(object)
+
+    result =
+      meili_put(
+        "/indexes/objects/documents",
+        [search_data]
+      )
+
+    with {:ok, %{"status" => "enqueued"}} <- result do
+      # Added successfully
       :ok
+    else
+      _ ->
+        # There was an error, report it
+        Logger.error("Failed to add activity #{activity.id} to index: #{inspect(result)}")
+        {:error, result}
     end
   end
 
diff --git a/lib/pleroma/search/qdrant_search.ex b/lib/pleroma/search/qdrant_search.ex
index b659bb682c..f004757998 100644
--- a/lib/pleroma/search/qdrant_search.ex
+++ b/lib/pleroma/search/qdrant_search.ex
@@ -4,6 +4,7 @@ defmodule Pleroma.Search.QdrantSearch do
 
   alias Pleroma.Activity
   alias Pleroma.Config.Getting, as: Config
+  alias Pleroma.Object
 
   alias __MODULE__.OpenAIClient
   alias __MODULE__.QdrantClient
@@ -82,23 +83,18 @@ defp build_search_payload(embedding, options) do
   end
 
   @impl true
-  def add_to_index(activity) do
-    # This will only index public or unlisted notes
-    maybe_search_data = object_to_search_data(activity.object)
-
-    if activity.data["type"] == "Create" and maybe_search_data do
-      with {:ok, embedding} <- get_embedding(maybe_search_data.content),
-           {:ok, %{status: 200}} <-
-             QdrantClient.put(
-               "/collections/posts/points",
-               build_index_payload(activity, embedding)
-             ) do
-        :ok
-      else
-        e -> {:error, e}
-      end
-    else
+  def add_to_index(%Activity{object: %Object{} = object} = activity) do
+    search_data = object_to_search_data(object)
+
+    with {:ok, embedding} <- get_embedding(search_data.content),
+         {:ok, %{status: 200}} <-
+           QdrantClient.put(
+             "/collections/posts/points",
+             build_index_payload(activity, embedding)
+           ) do
       :ok
+    else
+      e -> {:error, e}
     end
   end
 
-- 
GitLab


From 77436451adf736285f3aa3435cd79d0429eb797d Mon Sep 17 00:00:00 2001
From: Mark Felder <feld@feld.me>
Date: Wed, 26 Jun 2024 09:26:02 -0400
Subject: [PATCH 3/5] Indexable changelog

---
 changelog.d/search-indexing.change | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog.d/search-indexing.change

diff --git a/changelog.d/search-indexing.change b/changelog.d/search-indexing.change
new file mode 100644
index 0000000000..766934f3ff
--- /dev/null
+++ b/changelog.d/search-indexing.change
@@ -0,0 +1 @@
+Filter indexable activities before inserting indexing jobs into the queue.
-- 
GitLab


From 7c09150cdbdf8c270021bb7b1ccfcc1632354e23 Mon Sep 17 00:00:00 2001
From: Mark Felder <feld@feld.me>
Date: Wed, 26 Jun 2024 10:02:48 -0400
Subject: [PATCH 4/5] Validate the activity is public before indexing

Additional tests added
---
 lib/pleroma/search.ex                    |  9 ++--
 test/pleroma/search/meilisearch_test.exs | 23 --------
 test/pleroma/search_test.exs             | 69 ++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 26 deletions(-)
 create mode 100644 test/pleroma/search_test.exs

diff --git a/lib/pleroma/search.ex b/lib/pleroma/search.ex
index b60ca3d05c..2c7a60c480 100644
--- a/lib/pleroma/search.ex
+++ b/lib/pleroma/search.ex
@@ -1,16 +1,19 @@
 defmodule Pleroma.Search do
   alias Pleroma.Activity
   alias Pleroma.Object
+  alias Pleroma.Web.ActivityPub.Visibility
   alias Pleroma.Workers.SearchIndexingWorker
 
   @spec add_to_index(Activity.t()) :: :ok | :error
-  def add_to_index(%Pleroma.Activity{id: activity_id} = activity) do
-    with true <- indexable?(activity),
+  def add_to_index(%Pleroma.Activity{id: activity_id, object: %Object{} = object} = activity) do
+    with {_, true} <- {:indexable, indexable?(activity)},
+         {_, "public"} <- {:visibility, Visibility.get_visibility(object)},
          {:ok, %Oban.Job{}} <-
            SearchIndexingWorker.enqueue("add_to_index", %{"activity" => activity_id}) do
       :ok
     else
-      false -> :ok
+      {:indexable, false} -> :ok
+      {:visibility, _} -> :ok
       _ -> :error
     end
   end
diff --git a/test/pleroma/search/meilisearch_test.exs b/test/pleroma/search/meilisearch_test.exs
index eea4543232..ff32491c58 100644
--- a/test/pleroma/search/meilisearch_test.exs
+++ b/test/pleroma/search/meilisearch_test.exs
@@ -74,29 +74,6 @@ test "indexes a local post on creation" do
       assert_received("posted_to_meilisearch")
     end
 
-    test "doesn't index posts that are not public" do
-      user = insert(:user)
-
-      Enum.each(["private", "direct"], fn visibility ->
-        {:ok, activity} =
-          CommonAPI.post(user, %{
-            status: "guys i just don't wanna leave the swamp",
-            visibility: visibility
-          })
-
-        args = %{"op" => "add_to_index", "activity" => activity.id}
-
-        Config
-        |> expect(:get, fn
-          [Pleroma.Search, :module], nil ->
-            Meilisearch
-        end)
-
-        assert_enqueued(worker: SearchIndexingWorker, args: args)
-        assert :ok = perform_job(SearchIndexingWorker, args)
-      end)
-    end
-
     test "deletes posts from index when deleted locally" do
       user = insert(:user)
 
diff --git a/test/pleroma/search_test.exs b/test/pleroma/search_test.exs
new file mode 100644
index 0000000000..cdd2a72bde
--- /dev/null
+++ b/test/pleroma/search_test.exs
@@ -0,0 +1,69 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2021 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.SearchTest do
+  use Pleroma.DataCase, async: true
+  use Oban.Testing, repo: Pleroma.Repo
+
+  import Pleroma.Factory
+
+  alias Pleroma.Web.CommonAPI
+  alias Pleroma.Workers.SearchIndexingWorker
+
+  test "indexes posts that are public" do
+    user = insert(:user)
+
+    {:ok, activity} =
+      CommonAPI.post(user, %{
+        status: "Well this is a story all about how my life got flipped turned upside down",
+        visibility: "public"
+      })
+
+    args = %{"op" => "add_to_index", "activity" => activity.id}
+
+    assert_enqueued(worker: SearchIndexingWorker, args: args)
+  end
+
+  test "doesn't index posts that are not public" do
+    user = insert(:user)
+
+    Enum.each(["private", "direct"], fn visibility ->
+      {:ok, activity} =
+        CommonAPI.post(user, %{
+          status: "guys i just don't wanna leave the swamp",
+          visibility: visibility
+        })
+
+      args = %{"op" => "add_to_index", "activity" => activity.id}
+
+      refute_enqueued(worker: SearchIndexingWorker, args: args)
+    end)
+  end
+
+  test "Indexes appropriate activity types" do
+    user = insert(:user)
+
+    {:ok, activity} =
+      CommonAPI.post(user, %{
+        status: "I'm my own hype man",
+        visibility: "public"
+      })
+
+    args = %{"op" => "add_to_index", "activity" => activity.id}
+
+    assert_enqueued(worker: SearchIndexingWorker, args: args)
+
+    {:ok, fav_activity} = CommonAPI.favorite(user, activity.id)
+
+    args = %{"op" => "add_to_index", "activity" => fav_activity.id}
+
+    refute_enqueued(worker: SearchIndexingWorker, args: args)
+
+    {:ok, repeat_activity} = CommonAPI.repeat(activity.id, user)
+
+    args = %{"op" => "add_to_index", "activity" => repeat_activity.id}
+
+    refute_enqueued(worker: SearchIndexingWorker, args: args)
+  end
+end
-- 
GitLab


From 592955a895fff4003e62155b08209490625d156d Mon Sep 17 00:00:00 2001
From: Mark Felder <feld@feld.me>
Date: Wed, 26 Jun 2024 11:04:52 -0400
Subject: [PATCH 5/5] Improve add_to_index/1 when there is no Object

---
 lib/pleroma/search.ex | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/lib/pleroma/search.ex b/lib/pleroma/search.ex
index 2c7a60c480..7eb88672b1 100644
--- a/lib/pleroma/search.ex
+++ b/lib/pleroma/search.ex
@@ -5,7 +5,7 @@ defmodule Pleroma.Search do
   alias Pleroma.Workers.SearchIndexingWorker
 
   @spec add_to_index(Activity.t()) :: :ok | :error
-  def add_to_index(%Pleroma.Activity{id: activity_id, object: %Object{} = object} = activity) do
+  def add_to_index(%Activity{id: activity_id, object: %Object{} = object} = activity) do
     with {_, true} <- {:indexable, indexable?(activity)},
          {_, "public"} <- {:visibility, Visibility.get_visibility(object)},
          {:ok, %Oban.Job{}} <-
@@ -18,6 +18,13 @@ def add_to_index(%Pleroma.Activity{id: activity_id, object: %Object{} = object}
     end
   end
 
+  def add_to_index(%Activity{id: activity_id}) do
+    case Activity.get_by_id_with_object(activity_id) do
+      %Activity{} = preloaded -> add_to_index(preloaded)
+      _ -> :ok
+    end
+  end
+
   @spec remove_from_index(Object.t()) :: :ok | :error
   def remove_from_index(%Pleroma.Object{id: object_id}) do
     case SearchIndexingWorker.enqueue("remove_from_index", %{"object" => object_id}) do
@@ -36,6 +43,6 @@ def healthcheck_endpoints do
     search_module.healthcheck_endpoints
   end
 
-  defp indexable?(%Activity{object: %Object{}, data: %{"type" => "Create"}}), do: true
+  defp indexable?(%Activity{data: %{"type" => "Create"}}), do: true
   defp indexable?(_), do: false
 end
-- 
GitLab