Fix Rich Media Previews for updated activities

Due to a cache invalidation issue, Rich Media Previews were not regenerated when a post was updated. They are now cached by the activity ID, so they can be evicted together with the activity's other cached objects in the :scrubber_cache.
This commit is contained in:
Mark Felder 2024-02-04 19:24:52 -05:00
parent 0b9990a7e5
commit 04fc4eddaa
8 changed files with 96 additions and 23 deletions

View File

@ -0,0 +1 @@
Rich Media Preview cache eviction when the activity is updated.

View File

@ -28,7 +28,7 @@ defp get_cache_keys_for(activity_id) do
end end
end end
defp add_cache_key_for(activity_id, additional_key) do def add_cache_key_for(activity_id, additional_key) do
current = get_cache_keys_for(activity_id) current = get_cache_keys_for(activity_id)
unless additional_key in current do unless additional_key in current do

View File

@ -6,8 +6,6 @@ defmodule Pleroma.HTML do
# Scrubbers are compiled on boot so they can be configured in OTP releases # Scrubbers are compiled on boot so they can be configured in OTP releases
# @on_load :compile_scrubbers # @on_load :compile_scrubbers
@cachex Pleroma.Config.get([:cachex, :provider], Cachex)
def compile_scrubbers do def compile_scrubbers do
dir = Path.join(:code.priv_dir(:pleroma), "scrubbers") dir = Path.join(:code.priv_dir(:pleroma), "scrubbers")
@ -67,27 +65,20 @@ def ensure_scrubbed_html(
end end
end end
def extract_first_external_url_from_object(%{data: %{"content" => content}} = object) @spec extract_first_external_url_from_object(Pleroma.Object.t()) ::
{:ok, String.t()} | {:error, :no_content}
def extract_first_external_url_from_object(%{data: %{"content" => content}})
when is_binary(content) do when is_binary(content) do
unless object.data["fake"] do url =
key = "URL|#{object.id}"
@cachex.fetch!(:scrubber_cache, key, fn _key ->
{:commit, {:ok, extract_first_external_url(content)}}
end)
else
{:ok, extract_first_external_url(content)}
end
end
def extract_first_external_url_from_object(_), do: {:error, :no_content}
def extract_first_external_url(content) do
content content
|> Floki.parse_fragment!() |> Floki.parse_fragment!()
|> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])") |> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])")
|> Enum.take(1) |> Enum.take(1)
|> Floki.attribute("href") |> Floki.attribute("href")
|> Enum.at(0) |> Enum.at(0)
{:ok, url}
end end
def extract_first_external_url_from_object(_), do: {:error, :no_content}
end end

View File

@ -8,6 +8,8 @@ defmodule Pleroma.Web.RichMedia.Helpers do
alias Pleroma.Object alias Pleroma.Object
alias Pleroma.Web.RichMedia.Parser alias Pleroma.Web.RichMedia.Parser
@cachex Pleroma.Config.get([:cachex, :provider], Cachex)
@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config) @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
@options [ @options [
@ -71,7 +73,24 @@ def fetch_data_for_object(object) do
def fetch_data_for_activity(%Activity{data: %{"type" => "Create"}} = activity) do def fetch_data_for_activity(%Activity{data: %{"type" => "Create"}} = activity) do
with true <- @config_impl.get([:rich_media, :enabled]), with true <- @config_impl.get([:rich_media, :enabled]),
%Object{} = object <- Object.normalize(activity, fetch: false) do %Object{} = object <- Object.normalize(activity, fetch: false) do
if object.data["fake"] do
fetch_data_for_object(object) fetch_data_for_object(object)
else
key = "URL|#{activity.id}"
@cachex.fetch!(:scrubber_cache, key, fn _ ->
result = fetch_data_for_object(object)
cond do
match?(%{page_url: _, rich_media: _}, result) ->
Activity.HTML.add_cache_key_for(activity.id, key)
{:commit, result}
true ->
{:ignore, %{}}
end
end)
end
else else
_ -> %{} _ -> %{}
end end

12
test/fixtures/rich_media/google.html vendored Normal file
View File

@ -0,0 +1,12 @@
<meta property="og:url" content="https://google.com">
<meta property="og:type" content="website">
<meta property="og:title" content="Google">
<meta property="og:description" content="Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for.">
<meta property="og:image" content="">
<meta name="twitter:card" content="summary_large_image">
<meta property="twitter:domain" content="google.com">
<meta property="twitter:url" content="https://google.com">
<meta name="twitter:title" content="Google">
<meta name="twitter:description" content="Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for.">
<meta name="twitter:image" content="">

12
test/fixtures/rich_media/yahoo.html vendored Normal file
View File

@ -0,0 +1,12 @@
<meta property="og:url" content="https://yahoo.com">
<meta property="og:type" content="website">
<meta property="og:title" content="Yahoo | Mail, Weather, Search, Politics, News, Finance, Sports & Videos">
<meta property="og:description" content="Latest news coverage, email, free stock quotes, live scores and video are just the beginning. Discover more every day at Yahoo!">
<meta property="og:image" content="https://s.yimg.com/cv/apiv2/social/images/yahoo_default_logo.png">
<meta name="twitter:card" content="summary_large_image">
<meta property="twitter:domain" content="yahoo.com">
<meta property="twitter:url" content="https://yahoo.com">
<meta name="twitter:title" content="Yahoo | Mail, Weather, Search, Politics, News, Finance, Sports & Videos">
<meta name="twitter:description" content="Latest news coverage, email, free stock quotes, live scores and video are just the beginning. Discover more every day at Yahoo!">
<meta name="twitter:image" content="https://s.yimg.com/cv/apiv2/social/images/yahoo_default_logo.png">

View File

@ -83,6 +83,34 @@ test "crawls valid, complete URLs" do
Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
end end
test "recrawls URLs on updates" do
original_url = "https://google.com/"
updated_url = "https://yahoo.com/"
Pleroma.StaticStubbedConfigMock
|> stub(:get, fn
[:rich_media, :enabled] -> true
path -> Pleroma.Test.StaticConfig.get(path)
end)
user = insert(:user)
{:ok, activity} = CommonAPI.post(user, %{status: "I like this site #{original_url}"})
assert match?(
%{page_url: ^original_url, rich_media: _},
Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
)
{:ok, _} = CommonAPI.update(user, activity, %{status: "I like this site #{updated_url}"})
activity = Pleroma.Activity.get_by_id(activity.id)
assert match?(
%{page_url: ^updated_url, rich_media: _},
Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
)
end
# This does not seem to work. The urls are being fetched. # This does not seem to work. The urls are being fetched.
@tag skip: true @tag skip: true
test "refuses to crawl URLs of private network from posts" do test "refuses to crawl URLs of private network from posts" do

View File

@ -1464,6 +1464,14 @@ def get("https://misskey.io/notes/8vs6wxufd0", _, _, _) do
}} }}
end end
def get("https://google.com/", _, _, _) do
{:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/google.html")}}
end
def get("https://yahoo.com/", _, _, _) do
{:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/yahoo.html")}}
end
def get(url, query, body, headers) do def get(url, query, body, headers) do
{:error, {:error,
"Mock response not implemented for GET #{inspect(url)}, #{query}, #{inspect(body)}, #{inspect(headers)}"} "Mock response not implemented for GET #{inspect(url)}, #{query}, #{inspect(body)}, #{inspect(headers)}"}
@ -1539,7 +1547,9 @@ def post(url, query, body, headers) do
@rich_media_mocks [ @rich_media_mocks [
"https://example.com/ogp", "https://example.com/ogp",
"https://example.com/ogp-missing-data", "https://example.com/ogp-missing-data",
"https://example.com/twitter-card" "https://example.com/twitter-card",
"https://google.com/",
"https://yahoo.com/"
] ]
def head(url, _query, _body, _headers) when url in @rich_media_mocks do def head(url, _query, _body, _headers) when url in @rich_media_mocks do
{:ok, %Tesla.Env{status: 404, body: ""}} {:ok, %Tesla.Env{status: 404, body: ""}}