From 04fc4eddaa534185d9784351e70f59f30bc1476f Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Sun, 4 Feb 2024 19:24:52 -0500 Subject: [PATCH 1/3] Fix Rich Media Previews for updated activities The Rich Media Previews were not regenerated when a post was updated due to a cache invalidation issue. They are now cached by the activity id so they can be evicted with the other activity cache objects in the :scrubber_cache. --- changelog.d/rich_media.fix | 1 + lib/pleroma/activity/html.ex | 2 +- lib/pleroma/html.ex | 31 +++++++------------- lib/pleroma/web/rich_media/helpers.ex | 21 ++++++++++++- test/fixtures/rich_media/google.html | 12 ++++++++ test/fixtures/rich_media/yahoo.html | 12 ++++++++ test/pleroma/web/rich_media/helpers_test.exs | 28 ++++++++++++++++++ test/support/http_request_mock.ex | 12 +++++++- 8 files changed, 96 insertions(+), 23 deletions(-) create mode 100644 changelog.d/rich_media.fix create mode 100644 test/fixtures/rich_media/google.html create mode 100644 test/fixtures/rich_media/yahoo.html diff --git a/changelog.d/rich_media.fix b/changelog.d/rich_media.fix new file mode 100644 index 000000000..08f119550 --- /dev/null +++ b/changelog.d/rich_media.fix @@ -0,0 +1 @@ +Rich Media Preview cache eviction when the activity is updated. diff --git a/lib/pleroma/activity/html.ex b/lib/pleroma/activity/html.ex index 706b2d36c..ba284b4d5 100644 --- a/lib/pleroma/activity/html.ex +++ b/lib/pleroma/activity/html.ex @@ -28,7 +28,7 @@ defp get_cache_keys_for(activity_id) do end end - defp add_cache_key_for(activity_id, additional_key) do + def add_cache_key_for(activity_id, additional_key) do current = get_cache_keys_for(activity_id) unless additional_key in current do diff --git a/lib/pleroma/html.ex b/lib/pleroma/html.ex index 5bf735c4f..84ff2f129 100644 --- a/lib/pleroma/html.ex +++ b/lib/pleroma/html.ex @@ -6,8 +6,6 @@ defmodule Pleroma.HTML do # Scrubbers are compiled on boot so they can be configured in OTP releases # @on_load :compile_scrubbers - @cachex Pleroma.Config.get([:cachex, :provider], Cachex) - def compile_scrubbers do dir = Path.join(:code.priv_dir(:pleroma), "scrubbers") @@ -67,27 +65,20 @@ def ensure_scrubbed_html( end end - def extract_first_external_url_from_object(%{data: %{"content" => content}} = object) + @spec extract_first_external_url_from_object(Pleroma.Object.t()) :: + {:ok, String.t()} | {:error, :no_content} + def extract_first_external_url_from_object(%{data: %{"content" => content}}) when is_binary(content) do - unless object.data["fake"] do - key = "URL|#{object.id}" + url = + content + |> Floki.parse_fragment!() + |> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])") + |> Enum.take(1) + |> Floki.attribute("href") + |> Enum.at(0) - @cachex.fetch!(:scrubber_cache, key, fn _key -> - {:commit, {:ok, extract_first_external_url(content)}} - end) - else - {:ok, extract_first_external_url(content)} - end + {:ok, url} end def extract_first_external_url_from_object(_), do: {:error, :no_content} - - def extract_first_external_url(content) do - content - |> Floki.parse_fragment!() - |> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])") - |> Enum.take(1) - |> Floki.attribute("href") - |> Enum.at(0) - end end diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex index 61000bb9b..ee12a9dfb 100644 --- a/lib/pleroma/web/rich_media/helpers.ex +++ b/lib/pleroma/web/rich_media/helpers.ex @@ -8,6 +8,8 @@ defmodule Pleroma.Web.RichMedia.Helpers do alias Pleroma.Object alias Pleroma.Web.RichMedia.Parser + @cachex Pleroma.Config.get([:cachex, :provider], Cachex) + @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config) @options [ @@ -71,7 +73,24 @@ def fetch_data_for_object(object) do def fetch_data_for_activity(%Activity{data: %{"type" => "Create"}} = activity) do with true <- @config_impl.get([:rich_media, :enabled]), %Object{} = object <- Object.normalize(activity, fetch: false) do - fetch_data_for_object(object) + if object.data["fake"] do + fetch_data_for_object(object) + else + key = "URL|#{activity.id}" + + @cachex.fetch!(:scrubber_cache, key, fn _ -> + result = fetch_data_for_object(object) + + cond do + match?(%{page_url: _, rich_media: _}, result) -> + Activity.HTML.add_cache_key_for(activity.id, key) + {:commit, result} + + true -> + {:ignore, %{}} + end + end) + end else _ -> %{} end diff --git a/test/fixtures/rich_media/google.html b/test/fixtures/rich_media/google.html new file mode 100644 index 000000000..c068397a5 --- /dev/null +++ b/test/fixtures/rich_media/google.html @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/test/fixtures/rich_media/yahoo.html b/test/fixtures/rich_media/yahoo.html new file mode 100644 index 000000000..41d8c5cd9 --- /dev/null +++ b/test/fixtures/rich_media/yahoo.html @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/test/pleroma/web/rich_media/helpers_test.exs b/test/pleroma/web/rich_media/helpers_test.exs index 3ef5705ce..8f6713ef8 100644 --- a/test/pleroma/web/rich_media/helpers_test.exs +++ b/test/pleroma/web/rich_media/helpers_test.exs @@ -83,6 +83,34 @@ test "crawls valid, complete URLs" do Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) end + test "recrawls URLs on updates" do + original_url = "https://google.com/" + updated_url = "https://yahoo.com/" + + Pleroma.StaticStubbedConfigMock + |> stub(:get, fn + [:rich_media, :enabled] -> true + path -> Pleroma.Test.StaticConfig.get(path) + end) + + user = insert(:user) + {:ok, activity} = CommonAPI.post(user, %{status: "I like this site #{original_url}"}) + + assert match?( + %{page_url: ^original_url, rich_media: _}, + Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) + ) + + {:ok, _} = CommonAPI.update(user, activity, %{status: "I like this site #{updated_url}"}) + + activity = Pleroma.Activity.get_by_id(activity.id) + + assert match?( + %{page_url: ^updated_url, rich_media: _}, + Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) + ) + end + # This does not seem to work. The urls are being fetched. @tag skip: true test "refuses to crawl URLs of private network from posts" do diff --git a/test/support/http_request_mock.ex b/test/support/http_request_mock.ex index f76128312..b220fd051 100644 --- a/test/support/http_request_mock.ex +++ b/test/support/http_request_mock.ex @@ -1464,6 +1464,14 @@ def get("https://misskey.io/notes/8vs6wxufd0", _, _, _) do }} end + def get("https://google.com/", _, _, _) do + {:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/google.html")}} + end + + def get("https://yahoo.com/", _, _, _) do + {:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/yahoo.html")}} + end + def get(url, query, body, headers) do {:error, "Mock response not implemented for GET #{inspect(url)}, #{query}, #{inspect(body)}, #{inspect(headers)}"} @@ -1539,7 +1547,9 @@ def post(url, query, body, headers) do @rich_media_mocks [ "https://example.com/ogp", "https://example.com/ogp-missing-data", - "https://example.com/twitter-card" + "https://example.com/twitter-card", + "https://google.com/", + "https://yahoo.com/" ] def head(url, _query, _body, _headers) when url in @rich_media_mocks do {:ok, %Tesla.Env{status: 404, body: ""}} From 579561e97ba83183022d4bd2658522be6b6ae202 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Sun, 4 Feb 2024 23:40:11 -0500 Subject: [PATCH 2/3] URI.authority is deprecated --- lib/pleroma/web/rich_media/helpers.ex | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex index ee12a9dfb..9d6b8a38b 100644 --- a/lib/pleroma/web/rich_media/helpers.ex +++ b/lib/pleroma/web/rich_media/helpers.ex @@ -27,8 +27,7 @@ defp validate_page_url(page_url) when is_binary(page_url) do |> parse_uri(page_url) end - defp validate_page_url(%URI{host: host, scheme: "https", authority: authority}) - when is_binary(authority) do + defp validate_page_url(%URI{host: host, scheme: "https"}) do cond do host in @config_impl.get([:rich_media, :ignore_hosts], []) -> :error From 0cc038b67c231090827c1b4e71a32f65ee7c3d88 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Mon, 5 Feb 2024 00:09:37 -0500 Subject: [PATCH 3/3] Ensure URLs with IP addresses for the host do not generate previews --- lib/pleroma/web/rich_media/helpers.ex | 3 +++ test/pleroma/web/rich_media/helpers_test.exs | 12 +++++------- test/support/http_request_mock.ex | 3 ++- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex index 9d6b8a38b..1501776d9 100644 --- a/lib/pleroma/web/rich_media/helpers.ex +++ b/lib/pleroma/web/rich_media/helpers.ex @@ -29,6 +29,9 @@ defp validate_page_url(page_url) when is_binary(page_url) do defp validate_page_url(%URI{host: host, scheme: "https"}) do cond do + Linkify.Parser.ip?(host) -> + :error + host in @config_impl.get([:rich_media, :ignore_hosts], []) -> :error diff --git a/test/pleroma/web/rich_media/helpers_test.exs b/test/pleroma/web/rich_media/helpers_test.exs index 8f6713ef8..bf7372476 100644 --- a/test/pleroma/web/rich_media/helpers_test.exs +++ b/test/pleroma/web/rich_media/helpers_test.exs @@ -111,8 +111,6 @@ test "recrawls URLs on updates" do ) end - # This does not seem to work. The urls are being fetched. - @tag skip: true test "refuses to crawl URLs of private network from posts" do user = insert(:user) @@ -130,10 +128,10 @@ test "refuses to crawl URLs of private network from posts" do path -> Pleroma.Test.StaticConfig.get(path) end) - assert %{} = Helpers.fetch_data_for_activity(activity) - assert %{} = Helpers.fetch_data_for_activity(activity2) - assert %{} = Helpers.fetch_data_for_activity(activity3) - assert %{} = Helpers.fetch_data_for_activity(activity4) - assert %{} = Helpers.fetch_data_for_activity(activity5) + assert %{} == Helpers.fetch_data_for_activity(activity) + assert %{} == Helpers.fetch_data_for_activity(activity2) + assert %{} == Helpers.fetch_data_for_activity(activity3) + assert %{} == Helpers.fetch_data_for_activity(activity4) + assert %{} == Helpers.fetch_data_for_activity(activity5) end end diff --git a/test/support/http_request_mock.ex b/test/support/http_request_mock.ex index b220fd051..df3371a75 100644 --- a/test/support/http_request_mock.ex +++ b/test/support/http_request_mock.ex @@ -1549,7 +1549,8 @@ def post(url, query, body, headers) do "https://example.com/ogp-missing-data", "https://example.com/twitter-card", "https://google.com/", - "https://yahoo.com/" + "https://yahoo.com/", + "https://pleroma.local/notice/9kCP7V" ] def head(url, _query, _body, _headers) when url in @rich_media_mocks do {:ok, %Tesla.Env{status: 404, body: ""}}