From 659e15896e361331a5f7b69aa62bf834d59a0ce4 Mon Sep 17 00:00:00 2001 From: Ivan Tashkinov Date: Wed, 10 Jun 2020 21:02:44 +0300 Subject: [#1570] Experimental feature to improve user search using Levenshtein distance calculation. Improves short queries, especially containing typos. --- lib/pleroma/user.ex | 1 + lib/pleroma/user/search.ex | 101 +++++++++++++++++++-- ...150903_add_fuzzystrmatch_postgres_extension.exs | 23 +++++ test/tasks/user_test.exs | 5 +- test/user_search_test.exs | 28 +++++- 5 files changed, 145 insertions(+), 13 deletions(-) create mode 100644 priv/repo/migrations/20200604150903_add_fuzzystrmatch_postgres_extension.exs diff --git a/lib/pleroma/user.ex b/lib/pleroma/user.ex index c5c74d132..f0193207c 100644 --- a/lib/pleroma/user.ex +++ b/lib/pleroma/user.ex @@ -92,6 +92,7 @@ defmodule Pleroma.User do field(:local, :boolean, default: true) field(:follower_address, :string) field(:following_address, :string) + field(:levenshtein_distance, :integer, virtual: true) field(:search_rank, :float, virtual: true) field(:search_type, :integer, virtual: true) field(:tags, {:array, :string}, default: []) diff --git a/lib/pleroma/user/search.ex b/lib/pleroma/user/search.ex index cec59c372..3878e81bf 100644 --- a/lib/pleroma/user/search.ex +++ b/lib/pleroma/user/search.ex @@ -8,6 +8,8 @@ defmodule Pleroma.User.Search do import Ecto.Query @limit 20 + @levenshtein_max_query_length 5 + @search_rank_threshold 0 def search(query_string, opts \\ []) do resolve = Keyword.get(opts, :resolve, false) @@ -31,7 +33,10 @@ def search(query_string, opts \\ []) do defp format_query(query_string) do # Strip the beginning @ off if there is a query - query_string = String.trim_leading(query_string, "@") + query_string = + query_string + |> String.trim() + |> String.trim_leading("@") with [name, domain] <- String.split(query_string, "@") do encoded_domain = @@ -47,15 +52,33 @@ defp format_query(query_string) do end end + defp levenshtein_applicable?(query_string) do + String.length(query_string) <= @levenshtein_max_query_length + end + defp search_query(query_string, for_user, following) do - for_user - |> base_query(following) - |> filter_blocked_user(for_user) - |> filter_invisible_users() - |> filter_blocked_domains(for_user) - |> fts_search(query_string) - |> trigram_rank(query_string) + query = + for_user + |> base_query(following) + |> filter_blocked_user(for_user) + |> filter_invisible_users() + |> filter_blocked_domains(for_user) + + query = + if levenshtein_applicable?(query_string) do + query + |> levenshtein_distance(query_string) + |> fts_levenshtein_search(query_string) + |> trigram_levenshtein_rank(query_string) + else + query + |> fts_search(query_string) + |> trigram_rank(query_string) + end + + query |> boost_search_rank(for_user) + |> filter_by_search_rank() |> subquery() |> order_by(desc: :search_rank) |> maybe_restrict_local(for_user) @@ -78,6 +101,25 @@ defp fts_search(query, query_string) do ) end + defp fts_levenshtein_search(query, query_string) do + tsquery = to_tsquery(query_string) + + from( + u in subquery(query), + where: + fragment( + """ + ? <= 2 OR + (to_tsvector('simple', ?) || to_tsvector('simple', ?)) @@ to_tsquery('simple', ?) + """, + u.levenshtein_distance, + u.name, + u.nickname, + ^tsquery + ) + ) + end + defp to_tsquery(query_string) do String.trim_trailing(query_string, "@" <> local_domain()) |> String.replace(~r/[!-\/|@|[-`|{-~|:-?]+/, " ") @@ -87,6 +129,45 @@ defp to_tsquery(query_string) do |> Enum.join(" | ") end + # Trigram-based rank with bonus for close Levenshtein distance b/w query and nickname + defp trigram_levenshtein_rank(query, query_string) do + from( + u in subquery(query), + select_merge: %{ + search_rank: + fragment( + "similarity(?, trim(? || ' ' || coalesce(?, ''))) + \ + (CASE WHEN ? = 0 THEN 1.0 \ + WHEN ? = 1 AND length(?) > 1 THEN 0.5 + WHEN ? = 2 AND length(?) > 3 THEN 0.2 + ELSE 0 END)", + ^query_string, + u.nickname, + u.name, + u.levenshtein_distance, + u.levenshtein_distance, + ^query_string, + u.levenshtein_distance, + ^query_string + ) + } + ) + end + + defp levenshtein_distance(query, query_string) do + from( + u in query, + select_merge: %{ + levenshtein_distance: + fragment( + "levenshtein(?, regexp_replace(?, '@.+', ''))", + ^query_string, + u.nickname + ) + } + ) + end + defp trigram_rank(query, query_string) do from( u in query, @@ -185,4 +266,8 @@ defp boost_search_rank(query, %User{} = for_user) do end defp boost_search_rank(query, _for_user), do: query + + defp filter_by_search_rank(query) do + from(u in subquery(query), where: u.search_rank > @search_rank_threshold) + end end diff --git a/priv/repo/migrations/20200604150903_add_fuzzystrmatch_postgres_extension.exs b/priv/repo/migrations/20200604150903_add_fuzzystrmatch_postgres_extension.exs new file mode 100644 index 000000000..7f6f60ebb --- /dev/null +++ b/priv/repo/migrations/20200604150903_add_fuzzystrmatch_postgres_extension.exs @@ -0,0 +1,23 @@ +defmodule Pleroma.Repo.Migrations.AddFuzzystrmatchPostgresExtension do + use Ecto.Migration + + require Logger + + def up do + Logger.warn("ATTENTION ATTENTION ATTENTION\n") + + Logger.warn( + "This will try to create the pg_trgm extension on your database. If your database user does NOT have the necessary rights, you will have to do it manually and re-run the migrations.\nYou can probably do this by running the following:\n" + ) + + Logger.warn( + "sudo -u postgres psql pleroma_dev -c \"create extension if not exists fuzzystrmatch\"\n" + ) + + execute("create extension if not exists fuzzystrmatch") + end + + def down do + execute("drop extension if exists fuzzystrmatch") + end +end diff --git a/test/tasks/user_test.exs b/test/tasks/user_test.exs index b55aa1cdb..d4c4b5a14 100644 --- a/test/tasks/user_test.exs +++ b/test/tasks/user_test.exs @@ -436,11 +436,14 @@ test "it returns users matching" do {:ok, user} = User.follow(user, kawen) - assert [moon.id, kawen.id] == User.Search.search("moon") |> Enum.map(& &1.id) + # One "typo" in nickname makes `moot` score better than `kawen` despite of name match + assert [moon.id, moot.id, kawen.id] == User.Search.search("moon") |> Enum.map(& &1.id) + res = User.search("moo") |> Enum.map(& &1.id) assert moon.id in res assert moot.id in res assert kawen.id in res + assert [moon.id, kawen.id] == User.Search.search("moon fediverse") |> Enum.map(& &1.id) assert [kawen.id, moon.id] == diff --git a/test/user_search_test.exs b/test/user_search_test.exs index 17c63322a..9ff00d7f9 100644 --- a/test/user_search_test.exs +++ b/test/user_search_test.exs @@ -37,6 +37,10 @@ test "accepts offset parameter" do assert length(User.search("john", limit: 3, offset: 3)) == 2 end + defp clear_virtual_fields(user) do + Map.merge(user, %{search_rank: nil, search_type: nil, levenshtein_distance: nil}) + end + test "finds a user by full or partial nickname" do user = insert(:user, %{nickname: "john"}) @@ -44,8 +48,7 @@ test "finds a user by full or partial nickname" do assert user == User.search(query) |> List.first() - |> Map.put(:search_rank, nil) - |> Map.put(:search_type, nil) + |> clear_virtual_fields() end) end @@ -56,8 +59,8 @@ test "finds a user by full or partial name" do assert user == User.search(query) |> List.first() - |> Map.put(:search_rank, nil) - |> Map.put(:search_type, nil) + |> Map.merge(%{search_rank: nil, search_type: nil, levenshtein_distance: nil}) + |> clear_virtual_fields() end) end @@ -68,6 +71,23 @@ test "finds users, considering density of matched tokens" do assert [u2.id, u1.id] == Enum.map(User.search("bar word"), & &1.id) end + test "considers Levenshtein distance between query and nickname for short queries" do + clear_config([:instance, :limit_to_local_content], false) + + user = insert(:user, %{nickname: "hj@shigusegubu.club"}) + insert(:user, %{nickname: "xyz@sample.com"}) + insert(:user, %{nickname: "zyx@hj.com"}) + + # Note: "h.j." and "hhhj" are matched since 4+ char queries allow for 2 typos + for query <- ["hj", "hhj", "h j", "lj", "hi", "jj", "h.j.", "hhhj"] do + assert [user.id] == Enum.map(User.search(query), & &1.id) + end + + for query <- ["ajay", "gg"] do + assert [] == User.search(query) + end + end + test "finds users, boosting ranks of friends and followers" do u1 = insert(:user) u2 = insert(:user, %{name: "Doe"}) -- cgit v1.2.3