From 627c944fecebe3d48a6bc35ed62273fc058814ff Mon Sep 17 00:00:00 2001 From: Mark Felder <feld@feld.me> Date: Wed, 26 Jun 2024 09:20:46 -0400 Subject: [PATCH 1/5] Search Indexing: filter indexable activities before inserting Oban jobs --- lib/pleroma/search.ex | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/lib/pleroma/search.ex b/lib/pleroma/search.ex index fd0218cb8e..b60ca3d05c 100644 --- a/lib/pleroma/search.ex +++ b/lib/pleroma/search.ex @@ -1,12 +1,26 @@ defmodule Pleroma.Search do + alias Pleroma.Activity + alias Pleroma.Object alias Pleroma.Workers.SearchIndexingWorker - def add_to_index(%Pleroma.Activity{id: activity_id}) do - SearchIndexingWorker.enqueue("add_to_index", %{"activity" => activity_id}) + @spec add_to_index(Activity.t()) :: :ok | :error + def add_to_index(%Pleroma.Activity{id: activity_id} = activity) do + with true <- indexable?(activity), + {:ok, %Oban.Job{}} <- + SearchIndexingWorker.enqueue("add_to_index", %{"activity" => activity_id}) do + :ok + else + false -> :ok + _ -> :error + end end + @spec remove_from_index(Object.t()) :: :ok | :error def remove_from_index(%Pleroma.Object{id: object_id}) do - SearchIndexingWorker.enqueue("remove_from_index", %{"object" => object_id}) + case SearchIndexingWorker.enqueue("remove_from_index", %{"object" => object_id}) do + {:ok, %Oban.Job{}} -> :ok + _ -> :error + end end def search(query, options) do @@ -18,4 +32,7 @@ def healthcheck_endpoints do search_module = Pleroma.Config.get([Pleroma.Search, :module]) search_module.healthcheck_endpoints end + + defp indexable?(%Activity{object: %Object{}, data: %{"type" => "Create"}}), do: true + defp indexable?(_), do: false end -- GitLab From a5c88eb39b8d01b7d831dd1ba95fc1c00f0eba52 Mon Sep 17 00:00:00 2001 From: Mark Felder <feld@feld.me> Date: Wed, 26 Jun 2024 09:24:48 -0400 Subject: [PATCH 2/5] Remove redundant checks from backends' add_to_index/1 --- lib/pleroma/search/meilisearch.ex | 38 +++++++++++++---------------- lib/pleroma/search/qdrant_search.ex | 28 +++++++++------------ 2 files changed, 29 insertions(+), 37 deletions(-) diff --git a/lib/pleroma/search/meilisearch.ex b/lib/pleroma/search/meilisearch.ex index 9bba5b30f8..fb8fdea1b6 100644 --- a/lib/pleroma/search/meilisearch.ex +++ b/lib/pleroma/search/meilisearch.ex @@ -4,6 +4,7 @@ defmodule Pleroma.Search.Meilisearch do alias Pleroma.Activity alias Pleroma.Config.Getting, as: Config + alias Pleroma.Object import Pleroma.Search.DatabaseSearch import Ecto.Query @@ -155,28 +156,23 @@ def object_to_search_data(object) do end @impl true - def add_to_index(activity) do - maybe_search_data = object_to_search_data(activity.object) - - if activity.data["type"] == "Create" and maybe_search_data do - result = - meili_put( - "/indexes/objects/documents", - [maybe_search_data] - ) - - with {:ok, %{"status" => "enqueued"}} <- result do - # Added successfully - :ok - else - _ -> - # There was an error, report it - Logger.error("Failed to add activity #{activity.id} to index: #{inspect(result)}") - {:error, result} - end - else - # The post isn't something we can search, that's ok + def add_to_index(%Activity{object: %Object{} = object} = activity) do + search_data = object_to_search_data(object) + + result = + meili_put( + "/indexes/objects/documents", + [search_data] + ) + + with {:ok, %{"status" => "enqueued"}} <- result do + # Added successfully :ok + else + _ -> + # There was an error, report it + Logger.error("Failed to add activity #{activity.id} to index: #{inspect(result)}") + {:error, result} end end diff --git a/lib/pleroma/search/qdrant_search.ex b/lib/pleroma/search/qdrant_search.ex index b659bb682c..f004757998 100644 --- a/lib/pleroma/search/qdrant_search.ex +++ b/lib/pleroma/search/qdrant_search.ex @@ -4,6 +4,7 @@ defmodule Pleroma.Search.QdrantSearch do alias Pleroma.Activity alias Pleroma.Config.Getting, as: Config + alias Pleroma.Object alias __MODULE__.OpenAIClient alias __MODULE__.QdrantClient @@ -82,23 +83,18 @@ defp build_search_payload(embedding, options) do end @impl true - def add_to_index(activity) do - # This will only index public or unlisted notes - maybe_search_data = object_to_search_data(activity.object) - - if activity.data["type"] == "Create" and maybe_search_data do - with {:ok, embedding} <- get_embedding(maybe_search_data.content), - {:ok, %{status: 200}} <- - QdrantClient.put( - "/collections/posts/points", - build_index_payload(activity, embedding) - ) do - :ok - else - e -> {:error, e} - end - else + def add_to_index(%Activity{object: %Object{} = object} = activity) do + search_data = object_to_search_data(object) + + with {:ok, embedding} <- get_embedding(search_data.content), + {:ok, %{status: 200}} <- + QdrantClient.put( + "/collections/posts/points", + build_index_payload(activity, embedding) + ) do :ok + else + e -> {:error, e} end end -- GitLab From 77436451adf736285f3aa3435cd79d0429eb797d Mon Sep 17 00:00:00 2001 From: Mark Felder <feld@feld.me> Date: Wed, 26 Jun 2024 09:26:02 -0400 Subject: [PATCH 3/5] Indexable changelog --- changelog.d/search-indexing.change | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/search-indexing.change diff --git a/changelog.d/search-indexing.change b/changelog.d/search-indexing.change new file mode 100644 index 0000000000..766934f3ff --- /dev/null +++ b/changelog.d/search-indexing.change @@ -0,0 +1 @@ +Filter indexable activities before inserting indexing jobs into the queue. -- GitLab From 7c09150cdbdf8c270021bb7b1ccfcc1632354e23 Mon Sep 17 00:00:00 2001 From: Mark Felder <feld@feld.me> Date: Wed, 26 Jun 2024 10:02:48 -0400 Subject: [PATCH 4/5] Validate the activity is public before indexing Additional tests added --- lib/pleroma/search.ex | 9 ++-- test/pleroma/search/meilisearch_test.exs | 23 -------- test/pleroma/search_test.exs | 69 ++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 26 deletions(-) create mode 100644 test/pleroma/search_test.exs diff --git a/lib/pleroma/search.ex b/lib/pleroma/search.ex index b60ca3d05c..2c7a60c480 100644 --- a/lib/pleroma/search.ex +++ b/lib/pleroma/search.ex @@ -1,16 +1,19 @@ defmodule Pleroma.Search do alias Pleroma.Activity alias Pleroma.Object + alias Pleroma.Web.ActivityPub.Visibility alias Pleroma.Workers.SearchIndexingWorker @spec add_to_index(Activity.t()) :: :ok | :error - def add_to_index(%Pleroma.Activity{id: activity_id} = activity) do - with true <- indexable?(activity), + def add_to_index(%Pleroma.Activity{id: activity_id, object: %Object{} = object} = activity) do + with {_, true} <- {:indexable, indexable?(activity)}, + {_, "public"} <- {:visibility, Visibility.get_visibility(object)}, {:ok, %Oban.Job{}} <- SearchIndexingWorker.enqueue("add_to_index", %{"activity" => activity_id}) do :ok else - false -> :ok + {:indexable, false} -> :ok + {:visibility, _} -> :ok _ -> :error end end diff --git a/test/pleroma/search/meilisearch_test.exs b/test/pleroma/search/meilisearch_test.exs index eea4543232..ff32491c58 100644 --- a/test/pleroma/search/meilisearch_test.exs +++ b/test/pleroma/search/meilisearch_test.exs @@ -74,29 +74,6 @@ test "indexes a local post on creation" do assert_received("posted_to_meilisearch") end - test "doesn't index posts that are not public" do - user = insert(:user) - - Enum.each(["private", "direct"], fn visibility -> - {:ok, activity} = - CommonAPI.post(user, %{ - status: "guys i just don't wanna leave the swamp", - visibility: visibility - }) - - args = %{"op" => "add_to_index", "activity" => activity.id} - - Config - |> expect(:get, fn - [Pleroma.Search, :module], nil -> - Meilisearch - end) - - assert_enqueued(worker: SearchIndexingWorker, args: args) - assert :ok = perform_job(SearchIndexingWorker, args) - end) - end - test "deletes posts from index when deleted locally" do user = insert(:user) diff --git a/test/pleroma/search_test.exs b/test/pleroma/search_test.exs new file mode 100644 index 0000000000..cdd2a72bde --- /dev/null +++ b/test/pleroma/search_test.exs @@ -0,0 +1,69 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2021 Pleroma Authors <https://pleroma.social/> +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.SearchTest do + use Pleroma.DataCase, async: true + use Oban.Testing, repo: Pleroma.Repo + + import Pleroma.Factory + + alias Pleroma.Web.CommonAPI + alias Pleroma.Workers.SearchIndexingWorker + + test "indexes posts that are public" do + user = insert(:user) + + {:ok, activity} = + CommonAPI.post(user, %{ + status: "Well this is a story all about how my life got flipped turned upside down", + visibility: "public" + }) + + args = %{"op" => "add_to_index", "activity" => activity.id} + + assert_enqueued(worker: SearchIndexingWorker, args: args) + end + + test "doesn't index posts that are not public" do + user = insert(:user) + + Enum.each(["private", "direct"], fn visibility -> + {:ok, activity} = + CommonAPI.post(user, %{ + status: "guys i just don't wanna leave the swamp", + visibility: visibility + }) + + args = %{"op" => "add_to_index", "activity" => activity.id} + + refute_enqueued(worker: SearchIndexingWorker, args: args) + end) + end + + test "Indexes appropriate activity types" do + user = insert(:user) + + {:ok, activity} = + CommonAPI.post(user, %{ + status: "I'm my own hype man", + visibility: "public" + }) + + args = %{"op" => "add_to_index", "activity" => activity.id} + + assert_enqueued(worker: SearchIndexingWorker, args: args) + + {:ok, fav_activity} = CommonAPI.favorite(user, activity.id) + + args = %{"op" => "add_to_index", "activity" => fav_activity.id} + + refute_enqueued(worker: SearchIndexingWorker, args: args) + + {:ok, repeat_activity} = CommonAPI.repeat(activity.id, user) + + args = %{"op" => "add_to_index", "activity" => repeat_activity.id} + + refute_enqueued(worker: SearchIndexingWorker, args: args) + end +end -- GitLab From 592955a895fff4003e62155b08209490625d156d Mon Sep 17 00:00:00 2001 From: Mark Felder <feld@feld.me> Date: Wed, 26 Jun 2024 11:04:52 -0400 Subject: [PATCH 5/5] Improve add_to_index/1 when there is no Object --- lib/pleroma/search.ex | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/pleroma/search.ex b/lib/pleroma/search.ex index 2c7a60c480..7eb88672b1 100644 --- a/lib/pleroma/search.ex +++ b/lib/pleroma/search.ex @@ -5,7 +5,7 @@ defmodule Pleroma.Search do alias Pleroma.Workers.SearchIndexingWorker @spec add_to_index(Activity.t()) :: :ok | :error - def add_to_index(%Pleroma.Activity{id: activity_id, object: %Object{} = object} = activity) do + def add_to_index(%Activity{id: activity_id, object: %Object{} = object} = activity) do with {_, true} <- {:indexable, indexable?(activity)}, {_, "public"} <- {:visibility, Visibility.get_visibility(object)}, {:ok, %Oban.Job{}} <- @@ -18,6 +18,13 @@ def add_to_index(%Pleroma.Activity{id: activity_id, object: %Object{} = object} end end + def add_to_index(%Activity{id: activity_id}) do + case Activity.get_by_id_with_object(activity_id) do + %Activity{} = preloaded -> add_to_index(preloaded) + _ -> :ok + end + end + @spec remove_from_index(Object.t()) :: :ok | :error def remove_from_index(%Pleroma.Object{id: object_id}) do case SearchIndexingWorker.enqueue("remove_from_index", %{"object" => object_id}) do @@ -36,6 +43,6 @@ def healthcheck_endpoints do search_module.healthcheck_endpoints end - defp indexable?(%Activity{object: %Object{}, data: %{"type" => "Create"}}), do: true + defp indexable?(%Activity{data: %{"type" => "Create"}}), do: true defp indexable?(_), do: false end -- GitLab