summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author rinpatch <rinpatch@sdf.org> 2020-02-11 00:29:25 +0300
committer rinpatch <rinpatch@sdf.org> 2020-02-11 16:17:21 +0300
commit ea1631d7e67e22eb49d608e066ef4a3555bf25f7 (patch)
tree 53e061abc3dfcc6bfa0f3024b443ac372eb7781c
parent 58299fcfb4c9f4ed63c2f791b6ebf9920ebb7659 (diff)
Make Floki use fast_html
-rw-r--r-- config/config.exs 2
-rw-r--r-- lib/pleroma/html.ex 1
-rw-r--r-- lib/pleroma/web/activity_pub/mrf/anti_link_spam_policy.ex 1
-rw-r--r-- lib/pleroma/web/metadata/rel_me.ex 6
-rw-r--r-- lib/pleroma/web/rel_me.ex 5
-rw-r--r-- lib/pleroma/web/rich_media/parser.ex 6
-rw-r--r-- test/web/rich_media/parsers/twitter_card_test.exs 22
7 files changed, 30 insertions, 13 deletions
diff --git a/config/config.exs b/config/config.exs
index 41c1ff637..364aaf776 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -612,6 +612,8 @@ config :pleroma, :modules, runtime_dir: "instance/modules"
config :pleroma, configurable_from_database: false
+config :floki, :html_parser, Floki.HTMLParser.FastHtml
+
# Import environment specific config. This must remain at the bottom
# of this file so it overrides the configuration defined above.
import_config "#{Mix.env()}.exs"
diff --git a/lib/pleroma/html.ex b/lib/pleroma/html.ex
index 11513106e..05946aa96 100644
--- a/lib/pleroma/html.ex
+++ b/lib/pleroma/html.ex
@@ -108,6 +108,7 @@ defmodule Pleroma.HTML do
Cachex.fetch!(:scrubber_cache, key, fn _key ->
result =
content
+ |> Floki.parse_fragment!()
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"]")
|> Floki.attribute("a", "href")
|> Enum.at(0)
diff --git a/lib/pleroma/web/activity_pub/mrf/anti_link_spam_policy.ex b/lib/pleroma/web/activity_pub/mrf/anti_link_spam_policy.ex
index 8abe18e29..802d10edc 100644
--- a/lib/pleroma/web/activity_pub/mrf/anti_link_spam_policy.ex
+++ b/lib/pleroma/web/activity_pub/mrf/anti_link_spam_policy.ex
@@ -17,6 +17,7 @@ defmodule Pleroma.Web.ActivityPub.MRF.AntiLinkSpamPolicy do
# does the post contain links?
defp contains_links?(%{"content" => content} = _object) do
content
+ |> Floki.parse_fragment!()
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"],a.zrl")
|> Floki.attribute("a", "href")
|> length() > 0
diff --git a/lib/pleroma/web/metadata/rel_me.ex b/lib/pleroma/web/metadata/rel_me.ex
index f87fc1973..86dcc1a3b 100644
--- a/lib/pleroma/web/metadata/rel_me.ex
+++ b/lib/pleroma/web/metadata/rel_me.ex
@@ -8,8 +8,10 @@ defmodule Pleroma.Web.Metadata.Providers.RelMe do
@impl Provider
def build_tags(%{user: user}) do
- (Floki.attribute(user.bio, "link[rel~=me]", "href") ++
- Floki.attribute(user.bio, "a[rel~=me]", "href"))
+ bio_tree = Floki.parse_fragment!(user.bio)
+
+ (Floki.attribute(bio_tree, "link[rel~=me]", "href") ++
+ Floki.attribute(bio_tree, "a[rel~=me]", "href"))
|> Enum.map(fn link ->
{:link, [rel: "me", href: link], []}
end)
diff --git a/lib/pleroma/web/rel_me.ex b/lib/pleroma/web/rel_me.ex
index 16b1a53d2..540fa65df 100644
--- a/lib/pleroma/web/rel_me.ex
+++ b/lib/pleroma/web/rel_me.ex
@@ -27,9 +27,10 @@ defmodule Pleroma.Web.RelMe do
defp parse_url(url) do
with {:ok, %Tesla.Env{body: html, status: status}} when status in 200..299 <-
Pleroma.HTTP.get(url, [], adapter: @hackney_options),
+ {:ok, html_tree} <- Floki.parse_document(html),
data <-
- Floki.attribute(html, "link[rel~=me]", "href") ++
- Floki.attribute(html, "a[rel~=me]", "href") do
+ Floki.attribute(html_tree, "link[rel~=me]", "href") ++
+ Floki.attribute(html_tree, "a[rel~=me]", "href") do
{:ok, data}
end
rescue
diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex
index c06b0a0f2..9702e90f1 100644
--- a/lib/pleroma/web/rich_media/parser.ex
+++ b/lib/pleroma/web/rich_media/parser.ex
@@ -81,18 +81,18 @@ defmodule Pleroma.Web.RichMedia.Parser do
{:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: @hackney_options)
html
- |> parse_html
+ |> parse_html()
|> maybe_parse()
|> Map.put(:url, url)
|> clean_parsed_data()
|> check_parsed_data()
rescue
e ->
- {:error, "Parsing error: #{inspect(e)}"}
+ {:error, "Parsing error: #{inspect(e)} #{inspect(__STACKTRACE__)}"}
end
end
- defp parse_html(html), do: Floki.parse(html)
+ defp parse_html(html), do: Floki.parse_document!(html)
defp maybe_parse(html) do
Enum.reduce_while(parsers(), %{}, fn parser, acc ->
diff --git a/test/web/rich_media/parsers/twitter_card_test.exs b/test/web/rich_media/parsers/twitter_card_test.exs
index 751ca614c..f2ebbde7e 100644
--- a/test/web/rich_media/parsers/twitter_card_test.exs
+++ b/test/web/rich_media/parsers/twitter_card_test.exs
@@ -7,11 +7,14 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
alias Pleroma.Web.RichMedia.Parsers.TwitterCard
test "returns error when html not contains twitter card" do
- assert TwitterCard.parse("", %{}) == {:error, "No twitter card metadata found"}
+ assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) ==
+ {:error, "No twitter card metadata found"}
end
test "parses twitter card with only name attributes" do
- html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html")
+ html =
+ File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html")
+ |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) ==
{:ok,
@@ -26,7 +29,9 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
end
test "parses twitter card with only property attributes" do
- html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html")
+ html =
+ File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html")
+ |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) ==
{:ok,
@@ -45,7 +50,9 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
end
test "parses twitter card with name & property attributes" do
- html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html")
+ html =
+ File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html")
+ |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) ==
{:ok,
@@ -73,7 +80,8 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
"YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <>
"yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg"
- html = File.read!("test/fixtures/margaret-corbin-grave-west-point.html")
+ html =
+ File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) ==
{:ok,
@@ -87,7 +95,9 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
end
test "takes first founded title in html head if there is html markup error" do
- html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html")
+ html =
+ File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html")
+ |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) ==
{:ok,