diff --git a/config/config.exs b/config/config.exs
index 75e60acf0..6a234db03 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -911,6 +911,15 @@
 
 config :pleroma, Pleroma.Uploaders.Uploader, timeout: 30_000
 
+config :pleroma, Pleroma.Search.QdrantSearch,
+  qdrant_url: "http://127.0.0.1:6333/",
+  qdrant_api_key: nil,
+  ollama_url: "http://127.0.0.1:11434",
+  ollama_model: "snowflake-arctic-embed:xs",
+  qdrant_index_configuration: %{
+    vectors: %{size: 384, distance: "Cosine"}
+  }
+
 # Import environment specific config. This must remain at the bottom
 # of this file so it overrides the configuration defined above.
 import_config "#{Mix.env()}.exs"
diff --git a/docs/configuration/search.md b/docs/configuration/search.md
index 0316c9bf4..682d1e52a 100644
--- a/docs/configuration/search.md
+++ b/docs/configuration/search.md
@@ -10,6 +10,12 @@ To use built-in search that has no external dependencies, set the search module
 
 While it has no external dependencies, it has problems with performance and relevancy.
 
+## QdrantSearch
+
+This uses the vector search engine [Qdrant](https://qdrant.tech) to search the posts in a vector space. This needs a way to generate embeddings; for now only the [Ollama](https://ollama.com) API is supported.
+
+The default settings will support a setup where both Ollama and Qdrant run on the same system as Pleroma. The embedding model used by Ollama will need to be pulled first (e.g. `ollama pull snowflake-arctic-embed:xs`) for the embedding to work.
+
 ## Meilisearch
 
 Note that it's quite a bit more memory hungry than PostgreSQL (around 4-5G for ~1.2 million
diff --git a/lib/mix/tasks/pleroma/search/indexer.ex b/lib/mix/tasks/pleroma/search/indexer.ex
new file mode 100644
index 000000000..326646b69
--- /dev/null
+++ b/lib/mix/tasks/pleroma/search/indexer.ex
@@ -0,0 +1,93 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2021 Pleroma Authors
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Mix.Tasks.Pleroma.Search.Indexer do
+  @moduledoc """
+  Mix task that manages the index of the configured search backend.
+
+      mix pleroma.search.indexer create_index
+      mix pleroma.search.indexer index [--limit N]
+  """
+
+  import Mix.Pleroma
+  import Ecto.Query
+
+  alias Pleroma.Workers.SearchIndexingWorker
+
+  # Number of activity ids fetched from the database per query.
+  @per_step 1000
+  # Number of indexing jobs handed to Oban per `insert_all` call.
+  @per_insert 100
+
+  @doc """
+  Runs the task.
+
+    * `["create_index"]` asks the module configured under
+      `[Pleroma.Search, :module]` to create its index and prints the outcome.
+    * `["index" | options]` enqueues a `SearchIndexingWorker` job for each of
+      the newest activities. `--limit` (default `100_000`) caps how many
+      activities are enqueued; nothing is enqueued when it is not positive.
+  """
+  def run(["create_index"]) do
+    # Boot through the same helper as the `index` clause below.
+    start_pleroma()
+
+    case Pleroma.Config.get([Pleroma.Search, :module]).create_index() do
+      :ok -> IO.puts("Index created")
+      error -> IO.puts("Could not create index: #{inspect(error)}")
+    end
+  end
+
+  def run(["index" | options]) do
+    {options, [], []} =
+      OptionParser.parse(
+        options,
+        strict: [
+          limit: :integer
+        ]
+      )
+
+    start_pleroma()
+
+    limit = Keyword.get(options, :limit, 100_000)
+
+    0
+    |> Stream.iterate(&(&1 + @per_step))
+    |> Stream.take_while(&(&1 < limit))
+    |> Enum.each(fn offset ->
+      # Shorten the last step so that at most `limit` activities are enqueued,
+      # even when `limit` is not a multiple of the step size.
+      step_limit = min(@per_step, limit - offset)
+
+      q =
+        from(a in Pleroma.Activity,
+          limit: ^step_limit,
+          offset: ^offset,
+          select: [:id],
+          order_by: [desc: :id]
+        )
+
+      {:ok, ids} =
+        Pleroma.Repo.transaction(fn ->
+          q
+          |> Pleroma.Repo.stream(timeout: :infinity)
+          |> Enum.map(& &1.id)
+        end)
+
+      IO.puts("Got #{length(ids)} activities, adding to indexer")
+
+      ids
+      |> Enum.chunk_every(@per_insert)
+      |> Enum.each(fn chunk ->
+        IO.puts("Adding #{length(chunk)} activities to indexing queue")
+
+        chunk
+        |> Enum.map(fn id ->
+          SearchIndexingWorker.new(%{"op" => "add_to_index", "activity" => id})
+        end)
+        |> Oban.insert_all()
+      end)
+    end)
+  end
+end
diff --git a/lib/pleroma/search/qdrant_search.ex b/lib/pleroma/search/qdrant_search.ex
new file mode 100644
index 000000000..315262cb3
--- /dev/null
+++ b/lib/pleroma/search/qdrant_search.ex
@@ -0,0 +1,154 @@
+defmodule Pleroma.Search.QdrantSearch do
+  @moduledoc """
+  Search backend that indexes post embeddings in Qdrant and searches them by
+  vector similarity. Embeddings are requested from Ollama.
+  """
+
+  @behaviour Pleroma.Search.SearchBackend
+  import Ecto.Query
+  alias Pleroma.Activity
+
+  alias __MODULE__.QdrantClient
+  alias __MODULE__.OllamaClient
+
+  import Pleroma.Search.Meilisearch, only: [object_to_search_data: 1]
+
+  @impl true
+  def create_index do
+    payload = Pleroma.Config.get([Pleroma.Search.QdrantSearch, :qdrant_index_configuration])
+
+    # Translate the HTTP result into the `:ok | {:error, any()}` contract of
+    # the `create_index/0` callback instead of leaking the raw response.
+    with {:ok, %{status: 200}} <- QdrantClient.put("/collections/posts", payload) do
+      :ok
+    else
+      e -> {:error, e}
+    end
+  end
+
+  @doc """
+  Sends a DELETE request for the `posts` collection to Qdrant and returns the
+  client's result.
+  """
+  def drop_index do
+    QdrantClient.delete("/collections/posts")
+  end
+
+  @doc """
+  Requests an embedding for `text` from Ollama using the configured model.
+
+  Returns `{:ok, embedding}`, or `{:error, "Failed to get embedding"}` when the
+  request fails or the response carries no `"embedding"` field.
+  """
+  def get_embedding(text) do
+    with {:ok, %{body: %{"embedding" => embedding}}} <-
+           OllamaClient.post("/api/embeddings", %{
+             prompt: text,
+             model: Pleroma.Config.get([Pleroma.Search.QdrantSearch, :ollama_model])
+           }) do
+      {:ok, embedding}
+    else
+      _ ->
+        {:error, "Failed to get embedding"}
+    end
+  end
+
+  # Builds the request body that stores a single point. The point id is the
+  # activity id passed through `FlakeId.from_string/1` and `Ecto.UUID.cast!/1`.
+  defp build_index_payload(activity, embedding) do
+    %{
+      points: [
+        %{
+          id: activity.id |> FlakeId.from_string() |> Ecto.UUID.cast!(),
+          vector: embedding
+        }
+      ]
+    }
+  end
+
+  # Builds the search request body; at most 20 points are requested.
+  defp build_search_payload(embedding) do
+    %{
+      vector: embedding,
+      limit: 20
+    }
+  end
+
+  @impl true
+  def add_to_index(activity) do
+    # This will only index public or unlisted notes
+    maybe_search_data = object_to_search_data(activity.object)
+
+    if activity.data["type"] == "Create" and maybe_search_data do
+      with {:ok, embedding} <- get_embedding(maybe_search_data.content),
+           {:ok, %{status: 200}} <-
+             QdrantClient.put(
+               "/collections/posts/points",
+               build_index_payload(activity, embedding)
+             ) do
+        :ok
+      else
+        e -> {:error, e}
+      end
+    else
+      :ok
+    end
+  end
+
+  @impl true
+  def search(_user, query, _options) do
+    query = "Represent this sentence for searching relevant passages: #{query}"
+
+    with {:ok, embedding} <- get_embedding(query),
+         {:ok, %{body: %{"result" => result}}} <-
+           QdrantClient.post("/collections/posts/points/search", build_search_payload(embedding)) do
+      ids =
+        Enum.map(result, fn %{"id" => id} ->
+          Ecto.UUID.dump!(id)
+        end)
+
+      # `array_position` keeps the activities in the order Qdrant returned them.
+      from(a in Activity, where: a.id in ^ids)
+      |> Activity.with_preloaded_object()
+      |> Activity.restrict_deactivated_users()
+      |> Ecto.Query.order_by([a], fragment("array_position(?, ?)", ^ids, a.id))
+      |> Pleroma.Repo.all()
+    else
+      _ ->
+        []
+    end
+  end
+
+  @impl true
+  def remove_from_index(_object) do
+    # NOTE(review): this is a no-op, so points stay in the Qdrant collection
+    # after their post is removed. Confirm that this is intended.
+    :ok
+  end
+end
+
+defmodule Pleroma.Search.QdrantSearch.OllamaClient do
+  @moduledoc false
+  use Tesla
+
+  plug(Tesla.Middleware.BaseUrl, Pleroma.Config.get([Pleroma.Search.QdrantSearch, :ollama_url]))
+  plug(Tesla.Middleware.JSON)
+end
+
+defmodule Pleroma.Search.QdrantSearch.QdrantClient do
+  @moduledoc false
+  use Tesla
+
+  plug(Tesla.Middleware.BaseUrl, Pleroma.Config.get([Pleroma.Search.QdrantSearch, :qdrant_url]))
+  plug(Tesla.Middleware.JSON)
+
+  # `qdrant_api_key` defaults to `nil`; only send the header when a key is set
+  # so that no header with a `nil` value is put on the request.
+  plug(
+    Tesla.Middleware.Headers,
+    case Pleroma.Config.get([Pleroma.Search.QdrantSearch, :qdrant_api_key]) do
+      nil -> []
+      api_key -> [{"api-key", api_key}]
+    end
+  )
+end
diff --git a/lib/pleroma/search/search_backend.ex b/lib/pleroma/search/search_backend.ex
index 68bc48cec..5be0169d0 100644
--- a/lib/pleroma/search/search_backend.ex
+++ b/lib/pleroma/search/search_backend.ex
@@ -21,4 +21,11 @@ defmodule Pleroma.Search.SearchBackend do
   from index.
   """
   @callback remove_from_index(object :: Pleroma.Object.t()) :: :ok | {:error, any()}
+
+  @doc """
+  Create the index used by the search backend.
+
+  Returns `:ok` on success or `{:error, reason}` on failure.
+  """
+  @callback create_index() :: :ok | {:error, any()}
 end