Switch from HtmlSanitizeEx to FastSanitize

This commit is contained in:
rinpatch 2019-10-29 01:18:08 +03:00
parent 2453928b57
commit 08f6837065
5 changed files with 95 additions and 94 deletions

View File

@ -3,7 +3,6 @@
# SPDX-License-Identifier: AGPL-3.0-only # SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.HTML do defmodule Pleroma.HTML do
alias HtmlSanitizeEx.Scrubber
defp get_scrubbers(scrubber) when is_atom(scrubber), do: [scrubber] defp get_scrubbers(scrubber) when is_atom(scrubber), do: [scrubber]
defp get_scrubbers(scrubbers) when is_list(scrubbers), do: scrubbers defp get_scrubbers(scrubbers) when is_list(scrubbers), do: scrubbers
@ -24,9 +23,13 @@ def filter_tags(html, scrubbers) when is_list(scrubbers) do
end) end)
end end
def filter_tags(html, scrubber), do: Scrubber.scrub(html, scrubber) def filter_tags(html, scrubber) do
{:ok, content} = FastSanitize.Sanitizer.scrub(html, scrubber)
content
end
def filter_tags(html), do: filter_tags(html, nil) def filter_tags(html), do: filter_tags(html, nil)
def strip_tags(html), do: Scrubber.scrub(html, Scrubber.StripTags) def strip_tags(html), do: filter_tags(html, FastSanitize.Sanitizer.StripTags)
def get_cached_scrubbed_html_for_activity( def get_cached_scrubbed_html_for_activity(
content, content,
@ -36,7 +39,6 @@ def get_cached_scrubbed_html_for_activity(
callback \\ fn x -> x end callback \\ fn x -> x end
) do ) do
key = "#{key}#{generate_scrubber_signature(scrubbers)}|#{activity.id}" key = "#{key}#{generate_scrubber_signature(scrubbers)}|#{activity.id}"
Cachex.fetch!(:scrubber_cache, key, fn _key -> Cachex.fetch!(:scrubber_cache, key, fn _key ->
object = Pleroma.Object.normalize(activity) object = Pleroma.Object.normalize(activity)
ensure_scrubbed_html(content, scrubbers, object.data["fake"] || false, callback) ensure_scrubbed_html(content, scrubbers, object.data["fake"] || false, callback)
@ -46,7 +48,7 @@ def get_cached_scrubbed_html_for_activity(
def get_cached_stripped_html_for_activity(content, activity, key) do def get_cached_stripped_html_for_activity(content, activity, key) do
get_cached_scrubbed_html_for_activity( get_cached_scrubbed_html_for_activity(
content, content,
HtmlSanitizeEx.Scrubber.StripTags, FastSanitize.Sanitizer.StripTags,
activity, activity,
key, key,
&HtmlEntities.decode/1 &HtmlEntities.decode/1
@ -109,13 +111,12 @@ defmodule Pleroma.HTML.Scrubber.TwitterText do
require HtmlSanitizeEx.Scrubber.Meta require HtmlSanitizeEx.Scrubber.Meta
alias HtmlSanitizeEx.Scrubber.Meta alias HtmlSanitizeEx.Scrubber.Meta
Meta.remove_cdata_sections_before_scrub()
Meta.strip_comments() Meta.strip_comments()
# links # links
Meta.allow_tag_with_uri_attributes("a", ["href", "data-user", "data-tag"], @valid_schemes) Meta.allow_tag_with_uri_attributes(:a, ["href", "data-user", "data-tag"], @valid_schemes)
Meta.allow_tag_with_this_attribute_values("a", "class", [ Meta.allow_tag_with_this_attribute_values(:a, "class", [
"hashtag", "hashtag",
"u-url", "u-url",
"mention", "mention",
@ -123,29 +124,29 @@ defmodule Pleroma.HTML.Scrubber.TwitterText do
"mention u-url" "mention u-url"
]) ])
Meta.allow_tag_with_this_attribute_values("a", "rel", [ Meta.allow_tag_with_this_attribute_values(:a, "rel", [
"tag", "tag",
"nofollow", "nofollow",
"noopener", "noopener",
"noreferrer" "noreferrer"
]) ])
Meta.allow_tag_with_these_attributes("a", ["name", "title"]) Meta.allow_tag_with_these_attributes(:a, ["name", "title"])
# paragraphs and linebreaks # paragraphs and linebreaks
Meta.allow_tag_with_these_attributes("br", []) Meta.allow_tag_with_these_attributes(:br, [])
Meta.allow_tag_with_these_attributes("p", []) Meta.allow_tag_with_these_attributes(:p, [])
# microformats # microformats
Meta.allow_tag_with_this_attribute_values("span", "class", ["h-card"]) Meta.allow_tag_with_this_attribute_values(:span, "class", ["h-card"])
Meta.allow_tag_with_these_attributes("span", []) Meta.allow_tag_with_these_attributes(:span, [])
# allow inline images for custom emoji # allow inline images for custom emoji
if Pleroma.Config.get([:markup, :allow_inline_images]) do if Pleroma.Config.get([:markup, :allow_inline_images]) do
# restrict img tags to http/https only, because of MediaProxy. # restrict img tags to http/https only, because of MediaProxy.
Meta.allow_tag_with_uri_attributes("img", ["src"], ["http", "https"]) Meta.allow_tag_with_uri_attributes(:img, ["src"], ["http", "https"])
Meta.allow_tag_with_these_attributes("img", [ Meta.allow_tag_with_these_attributes(:img, [
"width", "width",
"height", "height",
"class", "class",
@ -160,19 +161,19 @@ defmodule Pleroma.HTML.Scrubber.TwitterText do
defmodule Pleroma.HTML.Scrubber.Default do defmodule Pleroma.HTML.Scrubber.Default do
@doc "The default HTML scrubbing policy: no " @doc "The default HTML scrubbing policy: no "
require HtmlSanitizeEx.Scrubber.Meta require FastSanitize.Sanitizer.Meta
alias HtmlSanitizeEx.Scrubber.Meta alias FastSanitize.Sanitizer.Meta
# credo:disable-for-previous-line # credo:disable-for-previous-line
# No idea how to fix this one… # No idea how to fix this one…
@valid_schemes Pleroma.Config.get([:uri_schemes, :valid_schemes], []) @valid_schemes Pleroma.Config.get([:uri_schemes, :valid_schemes], [])
Meta.remove_cdata_sections_before_scrub() # Meta.remove_cdata_sections_before_scrub()
Meta.strip_comments() Meta.strip_comments()
Meta.allow_tag_with_uri_attributes("a", ["href", "data-user", "data-tag"], @valid_schemes) Meta.allow_tag_with_uri_attributes(:a, ["href", "data-user", "data-tag"], @valid_schemes)
Meta.allow_tag_with_this_attribute_values("a", "class", [ Meta.allow_tag_with_this_attribute_values(:a, "class", [
"hashtag", "hashtag",
"u-url", "u-url",
"mention", "mention",
@ -180,7 +181,7 @@ defmodule Pleroma.HTML.Scrubber.Default do
"mention u-url" "mention u-url"
]) ])
Meta.allow_tag_with_this_attribute_values("a", "rel", [ Meta.allow_tag_with_this_attribute_values(:a, "rel", [
"tag", "tag",
"nofollow", "nofollow",
"noopener", "noopener",
@ -188,37 +189,37 @@ defmodule Pleroma.HTML.Scrubber.Default do
"ugc" "ugc"
]) ])
Meta.allow_tag_with_these_attributes("a", ["name", "title"]) Meta.allow_tag_with_these_attributes(:a, ["name", "title"])
Meta.allow_tag_with_these_attributes("abbr", ["title"]) Meta.allow_tag_with_these_attributes(:abbr, ["title"])
Meta.allow_tag_with_these_attributes("b", []) Meta.allow_tag_with_these_attributes(:b, [])
Meta.allow_tag_with_these_attributes("blockquote", []) Meta.allow_tag_with_these_attributes(:blockquote, [])
Meta.allow_tag_with_these_attributes("br", []) Meta.allow_tag_with_these_attributes(:br, [])
Meta.allow_tag_with_these_attributes("code", []) Meta.allow_tag_with_these_attributes(:code, [])
Meta.allow_tag_with_these_attributes("del", []) Meta.allow_tag_with_these_attributes(:del, [])
Meta.allow_tag_with_these_attributes("em", []) Meta.allow_tag_with_these_attributes(:em, [])
Meta.allow_tag_with_these_attributes("i", []) Meta.allow_tag_with_these_attributes(:i, [])
Meta.allow_tag_with_these_attributes("li", []) Meta.allow_tag_with_these_attributes(:li, [])
Meta.allow_tag_with_these_attributes("ol", []) Meta.allow_tag_with_these_attributes(:ol, [])
Meta.allow_tag_with_these_attributes("p", []) Meta.allow_tag_with_these_attributes(:p, [])
Meta.allow_tag_with_these_attributes("pre", []) Meta.allow_tag_with_these_attributes(:pre, [])
Meta.allow_tag_with_these_attributes("strong", []) Meta.allow_tag_with_these_attributes(:strong, [])
Meta.allow_tag_with_these_attributes("sub", []) Meta.allow_tag_with_these_attributes(:sub, [])
Meta.allow_tag_with_these_attributes("sup", []) Meta.allow_tag_with_these_attributes(:sup, [])
Meta.allow_tag_with_these_attributes("u", []) Meta.allow_tag_with_these_attributes(:u, [])
Meta.allow_tag_with_these_attributes("ul", []) Meta.allow_tag_with_these_attributes(:ul, [])
Meta.allow_tag_with_this_attribute_values("span", "class", ["h-card"]) Meta.allow_tag_with_this_attribute_values(:span, "class", ["h-card"])
Meta.allow_tag_with_these_attributes("span", []) Meta.allow_tag_with_these_attributes(:span, [])
@allow_inline_images Pleroma.Config.get([:markup, :allow_inline_images]) @allow_inline_images Pleroma.Config.get([:markup, :allow_inline_images])
if @allow_inline_images do if @allow_inline_images do
# restrict img tags to http/https only, because of MediaProxy. # restrict img tags to http/https only, because of MediaProxy.
Meta.allow_tag_with_uri_attributes("img", ["src"], ["http", "https"]) Meta.allow_tag_with_uri_attributes(:img, ["src"], ["http", "https"])
Meta.allow_tag_with_these_attributes("img", [ Meta.allow_tag_with_these_attributes(:img, [
"width", "width",
"height", "height",
"class", "class",
@ -228,24 +229,24 @@ defmodule Pleroma.HTML.Scrubber.Default do
end end
if Pleroma.Config.get([:markup, :allow_tables]) do if Pleroma.Config.get([:markup, :allow_tables]) do
Meta.allow_tag_with_these_attributes("table", []) Meta.allow_tag_with_these_attributes(:table, [])
Meta.allow_tag_with_these_attributes("tbody", []) Meta.allow_tag_with_these_attributes(:tbody, [])
Meta.allow_tag_with_these_attributes("td", []) Meta.allow_tag_with_these_attributes(:td, [])
Meta.allow_tag_with_these_attributes("th", []) Meta.allow_tag_with_these_attributes(:th, [])
Meta.allow_tag_with_these_attributes("thead", []) Meta.allow_tag_with_these_attributes(:thead, [])
Meta.allow_tag_with_these_attributes("tr", []) Meta.allow_tag_with_these_attributes(:tr, [])
end end
if Pleroma.Config.get([:markup, :allow_headings]) do if Pleroma.Config.get([:markup, :allow_headings]) do
Meta.allow_tag_with_these_attributes("h1", []) Meta.allow_tag_with_these_attributes(:h1, [])
Meta.allow_tag_with_these_attributes("h2", []) Meta.allow_tag_with_these_attributes(:h2, [])
Meta.allow_tag_with_these_attributes("h3", []) Meta.allow_tag_with_these_attributes(:h3, [])
Meta.allow_tag_with_these_attributes("h4", []) Meta.allow_tag_with_these_attributes(:h4, [])
Meta.allow_tag_with_these_attributes("h5", []) Meta.allow_tag_with_these_attributes(:h5, [])
end end
if Pleroma.Config.get([:markup, :allow_fonts]) do if Pleroma.Config.get([:markup, :allow_fonts]) do
Meta.allow_tag_with_these_attributes("font", ["face"]) Meta.allow_tag_with_these_attributes(:font, ["face"])
end end
Meta.strip_everything_not_covered() Meta.strip_everything_not_covered()
@ -258,7 +259,7 @@ defmodule Pleroma.HTML.Transform.MediaProxy do
def before_scrub(html), do: html def before_scrub(html), do: html
def scrub_attribute("img", {"src", "http" <> target}) do def scrub_attribute(:img, {"src", "http" <> target}) do
media_url = media_url =
("http" <> target) ("http" <> target)
|> MediaProxy.url() |> MediaProxy.url()
@ -268,16 +269,16 @@ def scrub_attribute("img", {"src", "http" <> target}) do
def scrub_attribute(_tag, attribute), do: attribute def scrub_attribute(_tag, attribute), do: attribute
def scrub({"img", attributes, children}) do def scrub({:img, attributes, children}) do
attributes = attributes =
attributes attributes
|> Enum.map(fn attr -> scrub_attribute("img", attr) end) |> Enum.map(fn attr -> scrub_attribute(:img, attr) end)
|> Enum.reject(&is_nil(&1)) |> Enum.reject(&is_nil(&1))
{"img", attributes, children} {:img, attributes, children}
end end
def scrub({:comment, _children}), do: "" def scrub({:comment, _text, _children}), do: ""
def scrub({tag, attributes, children}), do: {tag, attributes, children} def scrub({tag, attributes, children}), do: {tag, attributes, children}
def scrub({_tag, children}), do: children def scrub({_tag, children}), do: children
@ -298,9 +299,9 @@ defmodule Pleroma.HTML.Scrubber.LinksOnly do
Meta.strip_comments() Meta.strip_comments()
# links # links
Meta.allow_tag_with_uri_attributes("a", ["href"], @valid_schemes) Meta.allow_tag_with_uri_attributes(:a, ["href"], @valid_schemes)
Meta.allow_tag_with_this_attribute_values("a", "rel", [ Meta.allow_tag_with_this_attribute_values(:a, "rel", [
"tag", "tag",
"nofollow", "nofollow",
"noopener", "noopener",
@ -309,6 +310,6 @@ defmodule Pleroma.HTML.Scrubber.LinksOnly do
"ugc" "ugc"
]) ])
Meta.allow_tag_with_these_attributes("a", ["name", "title"]) Meta.allow_tag_with_these_attributes(:a, ["name", "title"])
Meta.strip_everything_not_covered() Meta.strip_everything_not_covered()
end end

View File

@ -42,10 +42,10 @@ test "works as expected" do
this is in bold this is in bold
this is a paragraph this is a paragraph
this is a linebreak this is a linebreak
this is a link with allowed "rel" attribute: example.com this is a link with allowed &quot;rel&quot; attribute: example.com
this is a link with not allowed "rel" attribute: example.com this is a link with not allowed &quot;rel&quot; attribute: example.com
this is an image: this is an image:
alert('hacked') alert(&#39;hacked&#39;)
""" """
assert expected == HTML.strip_tags(@html_sample) assert expected == HTML.strip_tags(@html_sample)
@ -64,10 +64,10 @@ test "normalizes HTML as expected" do
this is in bold this is in bold
<p>this is a paragraph</p> <p>this is a paragraph</p>
this is a linebreak<br/> this is a linebreak<br/>
this is a link with allowed "rel" attribute: <a href="http://example.com/" rel="tag">example.com</a> this is a link with allowed &quot;rel&quot; attribute: <a href="http://example.com/" rel="tag">example.com</a>
this is a link with not allowed "rel" attribute: <a href="http://example.com/">example.com</a> this is a link with not allowed &quot;rel&quot; attribute: <a href="http://example.com/">example.com</a>
this is an image: <img src="http://example.com/image.jpg"/><br/> this is an image: <img src="http://example.com/image.jpg"/><br/>
alert('hacked') alert(&#39;hacked&#39;)
""" """
assert expected == HTML.filter_tags(@html_sample, Pleroma.HTML.Scrubber.TwitterText) assert expected == HTML.filter_tags(@html_sample, Pleroma.HTML.Scrubber.TwitterText)
@ -118,10 +118,10 @@ test "normalizes HTML as expected" do
<b>this is in bold</b> <b>this is in bold</b>
<p>this is a paragraph</p> <p>this is a paragraph</p>
this is a linebreak<br/> this is a linebreak<br/>
this is a link with allowed "rel" attribute: <a href="http://example.com/" rel="tag">example.com</a> this is a link with allowed &quot;rel&quot; attribute: <a href="http://example.com/" rel="tag">example.com</a>
this is a link with not allowed "rel" attribute: <a href="http://example.com/">example.com</a> this is a link with not allowed &quot;rel&quot; attribute: <a href="http://example.com/">example.com</a>
this is an image: <img src="http://example.com/image.jpg"/><br/> this is an image: <img src="http://example.com/image.jpg"/><br/>
alert('hacked') alert(&#39;hacked&#39;)
""" """
assert expected == HTML.filter_tags(@html_sample, Pleroma.HTML.Scrubber.Default) assert expected == HTML.filter_tags(@html_sample, Pleroma.HTML.Scrubber.Default)

View File

@ -21,10 +21,10 @@ test "it filter html tags" do
<b>this is in bold</b> <b>this is in bold</b>
<p>this is a paragraph</p> <p>this is a paragraph</p>
this is a linebreak<br/> this is a linebreak<br/>
this is a link with allowed "rel" attribute: <a href="http://example.com/" rel="tag">example.com</a> this is a link with allowed &quot;rel&quot; attribute: <a href="http://example.com/" rel="tag">example.com</a>
this is a link with not allowed "rel" attribute: <a href="http://example.com/">example.com</a> this is a link with not allowed &quot;rel&quot; attribute: <a href="http://example.com/">example.com</a>
this is an image: <img src="http://example.com/image.jpg"/><br/> this is an image: <img src="http://example.com/image.jpg"/><br/>
alert('hacked') alert(&#39;hacked&#39;)
""" """
message = %{"type" => "Create", "object" => %{"content" => @html_sample}} message = %{"type" => "Create", "object" => %{"content" => @html_sample}}

View File

@ -140,7 +140,7 @@ test "it filters out obviously bad tags when accepting a post as HTML" do
object = Object.normalize(activity) object = Object.normalize(activity)
assert object.data["content"] == "<p><b>2hu</b></p>alert('xss')" assert object.data["content"] == "<p><b>2hu</b></p>alert(&#39;xss&#39;)"
end end
test "it filters out obviously bad tags when accepting a post as Markdown" do test "it filters out obviously bad tags when accepting a post as Markdown" do
@ -156,7 +156,7 @@ test "it filters out obviously bad tags when accepting a post as Markdown" do
object = Object.normalize(activity) object = Object.normalize(activity)
assert object.data["content"] == "<p><b>2hu</b></p>alert('xss')" assert object.data["content"] == "<p><b>2hu</b></p>alert(&#39;xss&#39;)"
end end
test "it does not allow replies to direct messages that are not direct messages themselves" do test "it does not allow replies to direct messages that are not direct messages themselves" do