summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIvan Tashkinov <ivantashkinov@gmail.com>2020-06-10 21:02:44 +0300
committerIvan Tashkinov <ivantashkinov@gmail.com>2020-06-10 21:02:44 +0300
commit659e15896e361331a5f7b69aa62bf834d59a0ce4 (patch)
tree657b17182f85bab1b580491bafb4f8ab772f3fac
parent7aa6c82937090ca6f2298dee0ef894954ca2f129 (diff)
[#1570] Experimental feature to improve user search using Levenshtein distance calculation.1570-levenshtein-distance-user-search
Improves short queries, especially containing typos.
-rw-r--r--lib/pleroma/user.ex1
-rw-r--r--lib/pleroma/user/search.ex101
-rw-r--r--priv/repo/migrations/20200604150903_add_fuzzystrmatch_postgres_extension.exs23
-rw-r--r--test/tasks/user_test.exs5
-rw-r--r--test/user_search_test.exs28
5 files changed, 145 insertions, 13 deletions
diff --git a/lib/pleroma/user.ex b/lib/pleroma/user.ex
index c5c74d132..f0193207c 100644
--- a/lib/pleroma/user.ex
+++ b/lib/pleroma/user.ex
@@ -92,6 +92,7 @@ defmodule Pleroma.User do
field(:local, :boolean, default: true)
field(:follower_address, :string)
field(:following_address, :string)
+ field(:levenshtein_distance, :integer, virtual: true)
field(:search_rank, :float, virtual: true)
field(:search_type, :integer, virtual: true)
field(:tags, {:array, :string}, default: [])
diff --git a/lib/pleroma/user/search.ex b/lib/pleroma/user/search.ex
index cec59c372..3878e81bf 100644
--- a/lib/pleroma/user/search.ex
+++ b/lib/pleroma/user/search.ex
@@ -8,6 +8,8 @@ defmodule Pleroma.User.Search do
import Ecto.Query
@limit 20
+ @levenshtein_max_query_length 5
+ @search_rank_threshold 0
def search(query_string, opts \\ []) do
resolve = Keyword.get(opts, :resolve, false)
@@ -31,7 +33,10 @@ defmodule Pleroma.User.Search do
defp format_query(query_string) do
# Strip the beginning @ off if there is a query
- query_string = String.trim_leading(query_string, "@")
+ query_string =
+ query_string
+ |> String.trim()
+ |> String.trim_leading("@")
with [name, domain] <- String.split(query_string, "@") do
encoded_domain =
@@ -47,15 +52,33 @@ defmodule Pleroma.User.Search do
end
end
+ defp levenshtein_applicable?(query_string) do
+ String.length(query_string) <= @levenshtein_max_query_length
+ end
+
defp search_query(query_string, for_user, following) do
- for_user
- |> base_query(following)
- |> filter_blocked_user(for_user)
- |> filter_invisible_users()
- |> filter_blocked_domains(for_user)
- |> fts_search(query_string)
- |> trigram_rank(query_string)
+ query =
+ for_user
+ |> base_query(following)
+ |> filter_blocked_user(for_user)
+ |> filter_invisible_users()
+ |> filter_blocked_domains(for_user)
+
+ query =
+ if levenshtein_applicable?(query_string) do
+ query
+ |> levenshtein_distance(query_string)
+ |> fts_levenshtein_search(query_string)
+ |> trigram_levenshtein_rank(query_string)
+ else
+ query
+ |> fts_search(query_string)
+ |> trigram_rank(query_string)
+ end
+
+ query
|> boost_search_rank(for_user)
+ |> filter_by_search_rank()
|> subquery()
|> order_by(desc: :search_rank)
|> maybe_restrict_local(for_user)
@@ -78,6 +101,25 @@ defmodule Pleroma.User.Search do
)
end
+ defp fts_levenshtein_search(query, query_string) do
+ tsquery = to_tsquery(query_string)
+
+ from(
+ u in subquery(query),
+ where:
+ fragment(
+ """
+ ? <= 2 OR
+ (to_tsvector('simple', ?) || to_tsvector('simple', ?)) @@ to_tsquery('simple', ?)
+ """,
+ u.levenshtein_distance,
+ u.name,
+ u.nickname,
+ ^tsquery
+ )
+ )
+ end
+
defp to_tsquery(query_string) do
String.trim_trailing(query_string, "@" <> local_domain())
|> String.replace(~r/[!-\/|@|[-`|{-~|:-?]+/, " ")
@@ -87,6 +129,45 @@ defmodule Pleroma.User.Search do
|> Enum.join(" | ")
end
+ # Trigram-based rank with bonus for close Levenshtein distance b/w query and nickname
+ defp trigram_levenshtein_rank(query, query_string) do
+ from(
+ u in subquery(query),
+ select_merge: %{
+ search_rank:
+ fragment(
+ "similarity(?, trim(? || ' ' || coalesce(?, ''))) + \
+ (CASE WHEN ? = 0 THEN 1.0 \
+ WHEN ? = 1 AND length(?) > 1 THEN 0.5
+ WHEN ? = 2 AND length(?) > 3 THEN 0.2
+ ELSE 0 END)",
+ ^query_string,
+ u.nickname,
+ u.name,
+ u.levenshtein_distance,
+ u.levenshtein_distance,
+ ^query_string,
+ u.levenshtein_distance,
+ ^query_string
+ )
+ }
+ )
+ end
+
+ defp levenshtein_distance(query, query_string) do
+ from(
+ u in query,
+ select_merge: %{
+ levenshtein_distance:
+ fragment(
+ "levenshtein(?, regexp_replace(?, '@.+', ''))",
+ ^query_string,
+ u.nickname
+ )
+ }
+ )
+ end
+
defp trigram_rank(query, query_string) do
from(
u in query,
@@ -185,4 +266,8 @@ defmodule Pleroma.User.Search do
end
defp boost_search_rank(query, _for_user), do: query
+
+ defp filter_by_search_rank(query) do
+ from(u in subquery(query), where: u.search_rank > @search_rank_threshold)
+ end
end
diff --git a/priv/repo/migrations/20200604150903_add_fuzzystrmatch_postgres_extension.exs b/priv/repo/migrations/20200604150903_add_fuzzystrmatch_postgres_extension.exs
new file mode 100644
index 000000000..7f6f60ebb
--- /dev/null
+++ b/priv/repo/migrations/20200604150903_add_fuzzystrmatch_postgres_extension.exs
@@ -0,0 +1,23 @@
+defmodule Pleroma.Repo.Migrations.AddFuzzystrmatchPostgresExtension do
+ use Ecto.Migration
+
+ require Logger
+
+ def up do
+ Logger.warn("ATTENTION ATTENTION ATTENTION\n")
+
+ Logger.warn(
+ "This will try to create the pg_trgm extension on your database. If your database user does NOT have the necessary rights, you will have to do it manually and re-run the migrations.\nYou can probably do this by running the following:\n"
+ )
+
+ Logger.warn(
+ "sudo -u postgres psql pleroma_dev -c \"create extension if not exists fuzzystrmatch\"\n"
+ )
+
+ execute("create extension if not exists fuzzystrmatch")
+ end
+
+ def down do
+ execute("drop extension if exists fuzzystrmatch")
+ end
+end
diff --git a/test/tasks/user_test.exs b/test/tasks/user_test.exs
index b55aa1cdb..d4c4b5a14 100644
--- a/test/tasks/user_test.exs
+++ b/test/tasks/user_test.exs
@@ -436,11 +436,14 @@ defmodule Mix.Tasks.Pleroma.UserTest do
{:ok, user} = User.follow(user, kawen)
- assert [moon.id, kawen.id] == User.Search.search("moon") |> Enum.map(& &1.id)
+ # One "typo" in nickname makes `moot` score better than `kawen` despite of name match
+ assert [moon.id, moot.id, kawen.id] == User.Search.search("moon") |> Enum.map(& &1.id)
+
res = User.search("moo") |> Enum.map(& &1.id)
assert moon.id in res
assert moot.id in res
assert kawen.id in res
+
assert [moon.id, kawen.id] == User.Search.search("moon fediverse") |> Enum.map(& &1.id)
assert [kawen.id, moon.id] ==
diff --git a/test/user_search_test.exs b/test/user_search_test.exs
index 17c63322a..9ff00d7f9 100644
--- a/test/user_search_test.exs
+++ b/test/user_search_test.exs
@@ -37,6 +37,10 @@ defmodule Pleroma.UserSearchTest do
assert length(User.search("john", limit: 3, offset: 3)) == 2
end
+ defp clear_virtual_fields(user) do
+ Map.merge(user, %{search_rank: nil, search_type: nil, levenshtein_distance: nil})
+ end
+
test "finds a user by full or partial nickname" do
user = insert(:user, %{nickname: "john"})
@@ -44,8 +48,7 @@ defmodule Pleroma.UserSearchTest do
assert user ==
User.search(query)
|> List.first()
- |> Map.put(:search_rank, nil)
- |> Map.put(:search_type, nil)
+ |> clear_virtual_fields()
end)
end
@@ -56,8 +59,8 @@ defmodule Pleroma.UserSearchTest do
assert user ==
User.search(query)
|> List.first()
- |> Map.put(:search_rank, nil)
- |> Map.put(:search_type, nil)
+ |> Map.merge(%{search_rank: nil, search_type: nil, levenshtein_distance: nil})
+ |> clear_virtual_fields()
end)
end
@@ -68,6 +71,23 @@ defmodule Pleroma.UserSearchTest do
assert [u2.id, u1.id] == Enum.map(User.search("bar word"), & &1.id)
end
+ test "considers Levenshtein distance between query and nickname for short queries" do
+ clear_config([:instance, :limit_to_local_content], false)
+
+ user = insert(:user, %{nickname: "hj@shigusegubu.club"})
+ insert(:user, %{nickname: "xyz@sample.com"})
+ insert(:user, %{nickname: "zyx@hj.com"})
+
+ # Note: "h.j." and "hhhj" are matched since 4+ char queries allow for 2 typos
+ for query <- ["hj", "hhj", "h j", "lj", "hi", "jj", "h.j.", "hhhj"] do
+ assert [user.id] == Enum.map(User.search(query), & &1.id)
+ end
+
+ for query <- ["ajay", "gg"] do
+ assert [] == User.search(query)
+ end
+ end
+
test "finds users, boosting ranks of friends and followers" do
u1 = insert(:user)
u2 = insert(:user, %{name: "Doe"})