Merge branch 'fix/2047-rich-media-parser' into 'develop'

RichMedia parser fix

Closes #2047

See merge request pleroma/pleroma!2941
This commit is contained in:
rinpatch 2020-09-02 09:38:43 +00:00
parent 13e606941c
commit 8c3241df44
4 changed files with 49 additions and 38 deletions

View File

@ -3,6 +3,8 @@
# SPDX-License-Identifier: AGPL-3.0-only # SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Parser do defmodule Pleroma.Web.RichMedia.Parser do
require Logger
defp parsers do defp parsers do
Pleroma.Config.get([:rich_media, :parsers]) Pleroma.Config.get([:rich_media, :parsers])
end end
@ -10,18 +12,19 @@ defp parsers do
def parse(nil), do: {:error, "No URL provided"} def parse(nil), do: {:error, "No URL provided"}
if Pleroma.Config.get(:env) == :test do if Pleroma.Config.get(:env) == :test do
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url), do: parse_url(url) def parse(url), do: parse_url(url)
else else
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url) do def parse(url) do
try do Cachex.fetch!(:rich_media_cache, url, fn _ ->
Cachex.fetch!(:rich_media_cache, url, fn _ -> with {:ok, data} <- parse_url(url) do
{:commit, parse_url(url)} {:commit, {:ok, data}}
end) else
|> set_ttl_based_on_image(url) error -> {:ignore, error}
rescue end
e -> end)
{:error, "Cachex error: #{inspect(e)}"} |> set_ttl_based_on_image(url)
end
end end
end end
@ -47,9 +50,11 @@ def ttl(data, url) do
config :pleroma, :rich_media, config :pleroma, :rich_media,
ttl_setters: [MyModule] ttl_setters: [MyModule]
""" """
@spec set_ttl_based_on_image({:ok, map()} | {:error, any()}, String.t()) ::
{:ok, map()} | {:error, any()}
def set_ttl_based_on_image({:ok, data}, url) do def set_ttl_based_on_image({:ok, data}, url) do
with {:ok, nil} <- Cachex.ttl(:rich_media_cache, url), with {:ok, nil} <- Cachex.ttl(:rich_media_cache, url),
ttl when is_number(ttl) <- get_ttl_from_image(data, url) do {:ok, ttl} when is_number(ttl) <- get_ttl_from_image(data, url) do
Cachex.expire_at(:rich_media_cache, url, ttl * 1000) Cachex.expire_at(:rich_media_cache, url, ttl * 1000)
{:ok, data} {:ok, data}
else else
@ -58,8 +63,14 @@ def set_ttl_based_on_image({:ok, data}, url) do
end end
end end
def set_ttl_based_on_image({:error, _} = error, _) do
Logger.error("parsing error: #{inspect(error)}")
error
end
defp get_ttl_from_image(data, url) do defp get_ttl_from_image(data, url) do
Pleroma.Config.get([:rich_media, :ttl_setters]) [:rich_media, :ttl_setters]
|> Pleroma.Config.get()
|> Enum.reduce({:ok, nil}, fn |> Enum.reduce({:ok, nil}, fn
module, {:ok, _ttl} -> module, {:ok, _ttl} ->
module.ttl(data, url) module.ttl(data, url)
@ -70,23 +81,16 @@ defp get_ttl_from_image(data, url) do
end end
defp parse_url(url) do defp parse_url(url) do
try do with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
{:ok, %Tesla.Env{body: html}} = Pleroma.Web.RichMedia.Helpers.rich_media_get(url) {:ok, html} <- Floki.parse_document(html) do
html html
|> parse_html()
|> maybe_parse() |> maybe_parse()
|> Map.put("url", url) |> Map.put("url", url)
|> clean_parsed_data() |> clean_parsed_data()
|> check_parsed_data() |> check_parsed_data()
rescue
e ->
{:error, "Parsing error: #{inspect(e)} #{inspect(__STACKTRACE__)}"}
end end
end end
defp parse_html(html), do: Floki.parse_document!(html)
defp maybe_parse(html) do defp maybe_parse(html) do
Enum.reduce_while(parsers(), %{}, fn parser, acc -> Enum.reduce_while(parsers(), %{}, fn parser, acc ->
case parser.parse(html, acc) do case parser.parse(html, acc) do

View File

@ -10,20 +10,15 @@ def ttl(data, _url) do
|> parse_query_params() |> parse_query_params()
|> format_query_params() |> format_query_params()
|> get_expiration_timestamp() |> get_expiration_timestamp()
else
{:error, "Not aws signed url #{inspect(image)}"}
end end
end end
defp is_aws_signed_url(""), do: nil defp is_aws_signed_url(image) when is_binary(image) and image != "" do
defp is_aws_signed_url(nil), do: nil
defp is_aws_signed_url(image) when is_binary(image) do
%URI{host: host, query: query} = URI.parse(image) %URI{host: host, query: query} = URI.parse(image)
if String.contains?(host, "amazonaws.com") and String.contains?(query, "X-Amz-Expires") do String.contains?(host, "amazonaws.com") and String.contains?(query, "X-Amz-Expires")
image
else
nil
end
end end
defp is_aws_signed_url(_), do: nil defp is_aws_signed_url(_), do: nil
@ -46,6 +41,6 @@ defp get_expiration_timestamp(params) when is_map(params) do
|> Map.get("X-Amz-Date") |> Map.get("X-Amz-Date")
|> Timex.parse("{ISO:Basic:Z}") |> Timex.parse("{ISO:Basic:Z}")
Timex.to_unix(date) + String.to_integer(Map.get(params, "X-Amz-Expires")) {:ok, Timex.to_unix(date) + String.to_integer(Map.get(params, "X-Amz-Expires"))}
end end
end end

View File

@ -21,7 +21,7 @@ test "s3 signed url is parsed correct for expiration time" do
expire_time = expire_time =
Timex.parse!(timestamp, "{ISO:Basic:Z}") |> Timex.to_unix() |> Kernel.+(valid_till) Timex.parse!(timestamp, "{ISO:Basic:Z}") |> Timex.to_unix() |> Kernel.+(valid_till)
assert expire_time == Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl.ttl(metadata, url) assert {:ok, expire_time} == Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl.ttl(metadata, url)
end end
test "s3 signed url is parsed and correct ttl is set for rich media" do test "s3 signed url is parsed and correct ttl is set for rich media" do

View File

@ -5,6 +5,8 @@
defmodule Pleroma.Web.RichMedia.ParserTest do defmodule Pleroma.Web.RichMedia.ParserTest do
use ExUnit.Case, async: true use ExUnit.Case, async: true
alias Pleroma.Web.RichMedia.Parser
setup do setup do
Tesla.Mock.mock(fn Tesla.Mock.mock(fn
%{ %{
@ -48,23 +50,29 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
%{method: :get, url: "http://example.com/empty"} -> %{method: :get, url: "http://example.com/empty"} ->
%Tesla.Env{status: 200, body: "hello"} %Tesla.Env{status: 200, body: "hello"}
%{method: :get, url: "http://example.com/malformed"} ->
%Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/malformed-data.html")}
%{method: :get, url: "http://example.com/error"} ->
{:error, :overload}
end) end)
:ok :ok
end end
test "returns error when no metadata present" do test "returns error when no metadata present" do
assert {:error, _} = Pleroma.Web.RichMedia.Parser.parse("http://example.com/empty") assert {:error, _} = Parser.parse("http://example.com/empty")
end end
test "doesn't just add a title" do test "doesn't just add a title" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/non-ogp") == assert Parser.parse("http://example.com/non-ogp") ==
{:error, {:error,
"Found metadata was invalid or incomplete: %{\"url\" => \"http://example.com/non-ogp\"}"} "Found metadata was invalid or incomplete: %{\"url\" => \"http://example.com/non-ogp\"}"}
end end
test "parses ogp" do test "parses ogp" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/ogp") == assert Parser.parse("http://example.com/ogp") ==
{:ok, {:ok,
%{ %{
"image" => "http://ia.media-imdb.com/images/rock.jpg", "image" => "http://ia.media-imdb.com/images/rock.jpg",
@ -77,7 +85,7 @@ test "parses ogp" do
end end
test "falls back to <title> when ogp:title is missing" do test "falls back to <title> when ogp:title is missing" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/ogp-missing-title") == assert Parser.parse("http://example.com/ogp-missing-title") ==
{:ok, {:ok,
%{ %{
"image" => "http://ia.media-imdb.com/images/rock.jpg", "image" => "http://ia.media-imdb.com/images/rock.jpg",
@ -90,7 +98,7 @@ test "falls back to <title> when ogp:title is missing" do
end end
test "parses twitter card" do test "parses twitter card" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/twitter-card") == assert Parser.parse("http://example.com/twitter-card") ==
{:ok, {:ok,
%{ %{
"card" => "summary", "card" => "summary",
@ -103,7 +111,7 @@ test "parses twitter card" do
end end
test "parses OEmbed" do test "parses OEmbed" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/oembed") == assert Parser.parse("http://example.com/oembed") ==
{:ok, {:ok,
%{ %{
"author_name" => "bees", "author_name" => "bees",
@ -132,6 +140,10 @@ test "parses OEmbed" do
end end
test "rejects invalid OGP data" do test "rejects invalid OGP data" do
assert {:error, _} = Pleroma.Web.RichMedia.Parser.parse("http://example.com/malformed") assert {:error, _} = Parser.parse("http://example.com/malformed")
end
test "returns error if getting page was not successful" do
assert {:error, :overload} = Parser.parse("http://example.com/error")
end end
end end