Fix Rich Media Previews for updated activities

Rich Media Previews were not regenerated when a post was updated, because the cached result was keyed by the object id and was never evicted on update. Previews are now cached under the activity id and registered with the activity's cache keys, so they are evicted together with the other activity cache objects in the :scrubber_cache.
2024-02-04 19:24:52 -05:00 · 2024-02-04 19:24:52 -05:00 · 04fc4eddaa
parent 0b9990a7e5
commit 04fc4eddaa
8 changed files with 96 additions and 23 deletions
--- a/changelog.d/rich_media.fix
+++ b/changelog.d/rich_media.fix
@ -0,0 +1 @@
 Rich Media Preview cache eviction when the activity is updated.
--- a/lib/pleroma/activity/html.ex
+++ b/lib/pleroma/activity/html.ex
@ -28,7 +28,7 @@ defp get_cache_keys_for(activity_id) do
    end
  end
-  defp add_cache_key_for(activity_id, additional_key) do
+  def add_cache_key_for(activity_id, additional_key) do
    current = get_cache_keys_for(activity_id)
    unless additional_key in current do
--- a/lib/pleroma/html.ex
+++ b/lib/pleroma/html.ex
@ -6,8 +6,6 @@ defmodule Pleroma.HTML do
  # Scrubbers are compiled on boot so they can be configured in OTP releases
  #  @on_load :compile_scrubbers
  @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
  def compile_scrubbers do
    dir = Path.join(:code.priv_dir(:pleroma), "scrubbers")
@ -67,27 +65,20 @@ def ensure_scrubbed_html(
    end
  end
-  def extract_first_external_url_from_object(%{data: %{"content" => content}} = object)
+  @spec extract_first_external_url_from_object(Pleroma.Object.t()) ::
          {:ok, String.t()} | {:error, :no_content}
  def extract_first_external_url_from_object(%{data: %{"content" => content}})
      when is_binary(content) do
-    unless object.data["fake"] do
+    url =
-      key = "URL|#{object.id}"
+      content
      |> Floki.parse_fragment!()
      |> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])")
      |> Enum.take(1)
      |> Floki.attribute("href")
      |> Enum.at(0)
-      @cachex.fetch!(:scrubber_cache, key, fn _key ->
+    {:ok, url}
        {:commit, {:ok, extract_first_external_url(content)}}
      end)
    else
      {:ok, extract_first_external_url(content)}
    end
  end
  # Catch-all clause: any argument that did not match an earlier clause is
  # reported as having no content to extract a URL from.
  def extract_first_external_url_from_object(_), do: {:error, :no_content}
  # Returns the href of the first link in `content` that is not a mention,
  # hashtag, attachment or tag link, or nil when nothing qualifies.
  #
  # `content` is parsed as an HTML fragment with Floki.
  def extract_first_external_url(content) do
    content
    |> Floki.parse_fragment!()
    # Select <a> elements, excluding those with the classes mention, hashtag or
    # attachment, and those whose rel attribute contains the word "tag".
    |> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])")
    # Only the first matching anchor is considered.
    |> Enum.take(1)
    |> Floki.attribute("href")
    # Enum.at/2 yields nil when the list of hrefs is empty.
    |> Enum.at(0)
  end
 end
--- a/lib/pleroma/web/rich_media/helpers.ex
+++ b/lib/pleroma/web/rich_media/helpers.ex
@ -8,6 +8,8 @@ defmodule Pleroma.Web.RichMedia.Helpers do
  alias Pleroma.Object
  alias Pleroma.Web.RichMedia.Parser
  @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
  @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
  @options [
@ -71,7 +73,24 @@ def fetch_data_for_object(object) do
  def fetch_data_for_activity(%Activity{data: %{"type" => "Create"}} = activity) do
    with true <- @config_impl.get([:rich_media, :enabled]),
         %Object{} = object <- Object.normalize(activity, fetch: false) do
-      fetch_data_for_object(object)
+      if object.data["fake"] do
        fetch_data_for_object(object)
      else
        key = "URL|#{activity.id}"
        @cachex.fetch!(:scrubber_cache, key, fn _ ->
          result = fetch_data_for_object(object)
          cond do
            match?(%{page_url: _, rich_media: _}, result) ->
              Activity.HTML.add_cache_key_for(activity.id, key)
              {:commit, result}
            true ->
              {:ignore, %{}}
          end
        end)
      end
    else
      _ -> %{}
    end
--- a/test/fixtures/rich_media/google.html
+++ b/test/fixtures/rich_media/google.html
@ -0,0 +1,12 @@
 <meta property="og:url" content="https://google.com">
 <meta property="og:type" content="website">
 <meta property="og:title" content="Google">
 <meta property="og:description" content="Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for.">
 <meta property="og:image" content="">
 <meta name="twitter:card" content="summary_large_image">
 <meta property="twitter:domain" content="google.com">
 <meta property="twitter:url" content="https://google.com">
 <meta name="twitter:title" content="Google">
 <meta name="twitter:description" content="Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for.">
 <meta name="twitter:image" content="">
--- a/test/fixtures/rich_media/yahoo.html
+++ b/test/fixtures/rich_media/yahoo.html
@ -0,0 +1,12 @@
 <meta property="og:url" content="https://yahoo.com">
 <meta property="og:type" content="website">
 <meta property="og:title" content="Yahoo | Mail, Weather, Search, Politics, News, Finance, Sports & Videos">
 <meta property="og:description" content="Latest news coverage, email, free stock quotes, live scores and video are just the beginning. Discover more every day at Yahoo!">
 <meta property="og:image" content="https://s.yimg.com/cv/apiv2/social/images/yahoo_default_logo.png">
 <meta name="twitter:card" content="summary_large_image">
 <meta property="twitter:domain" content="yahoo.com">
 <meta property="twitter:url" content="https://yahoo.com">
 <meta name="twitter:title" content="Yahoo | Mail, Weather, Search, Politics, News, Finance, Sports & Videos">
 <meta name="twitter:description" content="Latest news coverage, email, free stock quotes, live scores and video are just the beginning. Discover more every day at Yahoo!">
 <meta name="twitter:image" content="https://s.yimg.com/cv/apiv2/social/images/yahoo_default_logo.png">
--- a/test/pleroma/web/rich_media/helpers_test.exs
+++ b/test/pleroma/web/rich_media/helpers_test.exs
@ -83,6 +83,34 @@ test "crawls valid, complete URLs" do
             Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
  end
  # Regression test: after a status is edited to point at a different URL,
  # fetch_data_for_activity/1 must return data for the new URL rather than the
  # data obtained for the original one.
  test "recrawls URLs on updates" do
    original_url = "https://google.com/"
    updated_url = "https://yahoo.com/"
    # Enable rich media for this test; every other config path is delegated
    # to the static test config.
    Pleroma.StaticStubbedConfigMock
    |> stub(:get, fn
      [:rich_media, :enabled] -> true
      path -> Pleroma.Test.StaticConfig.get(path)
    end)
    user = insert(:user)
    {:ok, activity} = CommonAPI.post(user, %{status: "I like this site #{original_url}"})
    # First fetch: the preview must refer to the URL in the original status.
    assert match?(
             %{page_url: ^original_url, rich_media: _},
             Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
           )
    {:ok, _} = CommonAPI.update(user, activity, %{status: "I like this site #{updated_url}"})
    # Reload the activity by id so the second fetch works on the stored record.
    activity = Pleroma.Activity.get_by_id(activity.id)
    # Second fetch: the preview must now refer to the URL in the edited status.
    assert match?(
             %{page_url: ^updated_url, rich_media: _},
             Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
           )
  end
  # This does not seem to work: the URLs are still being fetched.
  @tag skip: true
  test "refuses to crawl URLs of private network from posts" do
--- a/test/support/http_request_mock.ex
+++ b/test/support/http_request_mock.ex
@ -1464,6 +1464,14 @@ def get("https://misskey.io/notes/8vs6wxufd0", _, _, _) do
     }}
  end
  # Mock for GET https://google.com/: replies with status 200 and the contents
  # of the rich_media google.html fixture as the body.
  def get("https://google.com/", _, _, _) do
    {:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/google.html")}}
  end
  # Mock for GET https://yahoo.com/: replies with status 200 and the contents
  # of the rich_media yahoo.html fixture as the body.
  def get("https://yahoo.com/", _, _, _) do
    {:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/yahoo.html")}}
  end
  def get(url, query, body, headers) do
    {:error,
     "Mock response not implemented for GET #{inspect(url)}, #{query}, #{inspect(body)}, #{inspect(headers)}"}
@ -1539,7 +1547,9 @@ def post(url, query, body, headers) do
  @rich_media_mocks [
    "https://example.com/ogp",
    "https://example.com/ogp-missing-data",
-    "https://example.com/twitter-card"
+    "https://example.com/twitter-card",
    "https://google.com/",
    "https://yahoo.com/"
  ]
  def head(url, _query, _body, _headers) when url in @rich_media_mocks do
    {:ok, %Tesla.Env{status: 404, body: ""}}
		`@ -0,0 +1 @@`
							`Rich Media Preview cache eviction when the activity is updated.`