RichMedia: Do a HEAD request to check content type/length

This shouldn't be too expensive, since the connections are pooled, but it should save us some bandwidth since we won't fetch non-html files and files that are too large for us to process (especially since you can't cancel a request without closing the connection with HTTP1).
author: rinpatch <rinpatch@sdf.org> 2020-09-14 14:45:58 +0300
committer: rinpatch <rinpatch@sdf.org> 2020-09-14 14:45:58 +0300
commit: f70335002df9b2b3f47f0ccaed6aaeebfb14435f (patch)
tree: 16c15bdcdf5c5260e58fae9947075aca7d80687a
parent: f66a15c4a51e1c8f614b4c1609b2385a29762931 (diff)
3 files changed, 91 insertions, 1 deletions
diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex
index bd7f03cbe..d7a19df4a 100644
--- a/lib/pleroma/web/rich_media/helpers.ex
+++ b/lib/pleroma/web/rich_media/helpers.ex
@@ -87,6 +87,50 @@ defmodule Pleroma.Web.RichMedia.Helpers do
   def rich_media_get(url) do
     headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}]
 
-    Pleroma.HTTP.get(url, headers, @options)
+    head_check =
+      case Pleroma.HTTP.head(url, headers, @options) do
+        # If the HEAD request didn't reach the server for whatever reason,
+        # we assume the GET that comes right after won't either
+        {:error, _} = e ->
+          e
+
+        {:ok, %Tesla.Env{status: 200, headers: headers}} ->
+          with :ok <- check_content_type(headers),
+               :ok <- check_content_length(headers),
+               do: :ok
+
+        _ ->
+          :ok
+      end
+
+    with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, @options)
+  end
+
+  defp check_content_type(headers) do
+    case List.keyfind(headers, "content-type", 0) do
+      {_, content_type} ->
+        case Plug.Conn.Utils.media_type(content_type) do
+          {:ok, "text", "html", _} -> :ok
+          _ -> {:error, {:content_type, content_type}}
+        end
+
+      _ ->
+        :ok
+    end
+  end
+
+  @max_body @options[:max_body]
+  defp check_content_length(headers) do
+    case List.keyfind(headers, "content-length", 0) do
+      {_, maybe_content_length} ->
+        case Integer.parse(maybe_content_length) do
+          {content_length, ""} when content_length <= @max_body -> :ok
+          {_, ""} -> {:error, :body_too_large}
+          _ -> :ok
+        end
+
+      _ ->
+        :ok
+    end
   end
 end
diff --git a/test/support/http_request_mock.ex b/test/support/http_request_mock.ex
index 344e27f13..cb022333f 100644
--- a/test/support/http_request_mock.ex
+++ b/test/support/http_request_mock.ex
@@ -1262,4 +1262,21 @@ defmodule HttpRequestMock do
        inspect(headers)
      }"}
   end
+
+  # Most of the rich media mocks are missing HEAD requests, so we just return 404.
+  @rich_media_mocks [
+    "https://example.com/ogp",
+    "https://example.com/ogp-missing-data",
+    "https://example.com/twitter-card"
+  ]
+  def head(url, _query, _body, _headers) when url in @rich_media_mocks do
+    {:ok, %Tesla.Env{status: 404, body: ""}}
+  end
+
+  def head(url, query, body, headers) do
+    {:error,
+     "Mock response not implemented for HEAD #{inspect(url)}, #{query}, #{inspect(body)}, #{
+       inspect(headers)
+     }"}
+  end
 end
diff --git a/test/web/rich_media/parser_test.exs b/test/web/rich_media/parser_test.exs
index 21ae35f8b..d65a63121 100644
--- a/test/web/rich_media/parser_test.exs
+++ b/test/web/rich_media/parser_test.exs
@@ -56,6 +56,27 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
 
       %{method: :get, url: "http://example.com/error"} ->
         {:error, :overload}
+
+      %{
+        method: :head,
+        url: "http://example.com/huge-page"
+      } ->
+        %Tesla.Env{
+          status: 200,
+          headers: [{"content-length", "2000001"}, {"content-type", "text/html"}]
+        }
+
+      %{
+        method: :head,
+        url: "http://example.com/pdf-file"
+      } ->
+        %Tesla.Env{
+          status: 200,
+          headers: [{"content-length", "1000000"}, {"content-type", "application/pdf"}]
+        }
+
+      %{method: :head} ->
+        %Tesla.Env{status: 404, body: "", headers: []}
     end)
 
     :ok
@@ -144,4 +165,12 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
   test "returns error if getting page was not successful" do
     assert {:error, :overload} = Parser.parse("http://example.com/error")
   end
+
+  test "does a HEAD request to check if the body is too large" do
+    assert {:error, body_too_large} = Parser.parse("http://example.com/huge-page")
+  end
+
+  test "does a HEAD request to check if the body is html" do
+    assert {:error, {:content_type, _}} = Parser.parse("http://example.com/pdf-file")
+  end
 end
author	rinpatch <rinpatch@sdf.org>	2020-09-14 14:45:58 +0300
committer	rinpatch <rinpatch@sdf.org>	2020-09-14 14:45:58 +0300
commit	f70335002df9b2b3f47f0ccaed6aaeebfb14435f (patch)
tree	16c15bdcdf5c5260e58fae9947075aca7d80687a
parent	f66a15c4a51e1c8f614b4c1609b2385a29762931 (diff)