autocomplete: tune autocorrect algorithm.
Tune autocorrect to produce fewer false positives. Previously we used trigram similarity; now we use Levenshtein edit distance with a dynamic typo threshold based on the length of the search string. Trigram similarity could correct large transpositions (e.g. `miku_hatsune` -> `hatsune_miku`), but it was bad at correcting small typos. Levenshtein distance is good at correcting small typos, but it can't correct large transpositions.
This commit is contained in:
@@ -100,7 +100,7 @@ class AutocompleteService
|
||||
end
|
||||
|
||||
def tag_autocorrect_matches(string)
|
||||
tags = Tag.nonempty.fuzzy_name_matches(string).order_similarity(string).limit(limit)
|
||||
tags = Tag.nonempty.autocorrect_matches(string).limit(limit)
|
||||
|
||||
tags.map do |tag|
|
||||
{ type: "tag", label: tag.pretty_name, value: tag.name, category: tag.category, post_count: tag.post_count }
|
||||
|
||||
@@ -234,16 +234,19 @@ class Tag < ApplicationRecord
|
||||
end
|
||||
|
||||
module SearchMethods
|
||||
def autocorrect_matches(name)
|
||||
tags = fuzzy_name_matches(name).order_similarity(name)
|
||||
end
|
||||
|
||||
# ref: https://www.postgresql.org/docs/current/static/pgtrgm.html#idm46428634524336
|
||||
def order_similarity(name)
|
||||
# trunc(3 * sim) reduces the similarity score from a range of 0.0 -> 1.0 to just 0, 1, or 2.
|
||||
# This groups tags first by approximate similarity, then by largest tags within groups of similar tags.
|
||||
order(Arel.sql("trunc(3 * similarity(name, #{connection.quote(name)})) DESC"), "post_count DESC", "name DESC")
|
||||
order(Arel.sql("levenshtein(left(name, 255), #{connection.quote(name)}), tags.post_count DESC, tags.name ASC"))
|
||||
end
|
||||
|
||||
# ref: https://www.postgresql.org/docs/current/static/pgtrgm.html#idm46428634524336
|
||||
def fuzzy_name_matches(name)
|
||||
where("tags.name % ?", name)
|
||||
max_distance = [name.size / 4, 3].max.floor.to_i
|
||||
where("tags.name % ?", name).where("levenshtein(left(name, 255), ?) < ?", name, max_distance)
|
||||
end
|
||||
|
||||
def name_matches(name)
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
class AddExtensionFuzzyStrMatch < ActiveRecord::Migration[6.0]
|
||||
def change
|
||||
enable_extension "fuzzystrmatch"
|
||||
end
|
||||
end
|
||||
@@ -9,6 +9,21 @@ SET xmloption = content;
|
||||
SET client_min_messages = warning;
|
||||
SET row_security = off;
|
||||
|
||||
|
||||
--
|
||||
-- Name: fuzzystrmatch; Type: EXTENSION; Schema: -; Owner: -
|
||||
--
|
||||
|
||||
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch WITH SCHEMA public;
|
||||
|
||||
|
||||
--
|
||||
-- Name: EXTENSION fuzzystrmatch; Type: COMMENT; Schema: -; Owner: -
|
||||
--
|
||||
|
||||
COMMENT ON EXTENSION fuzzystrmatch IS 'determine similarities and distance between strings';
|
||||
|
||||
|
||||
--
|
||||
-- Name: pg_trgm; Type: EXTENSION; Schema: -; Owner: -
|
||||
--
|
||||
@@ -7420,6 +7435,7 @@ INSERT INTO "schema_migrations" (version) VALUES
|
||||
('20200520060951'),
|
||||
('20200803022359'),
|
||||
('20200816175151'),
|
||||
('20201201211748');
|
||||
('20201201211748'),
|
||||
('20201213052805');
|
||||
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ class TagsControllerTest < ActionDispatch::IntegrationTest
|
||||
should respond_to_search(name_matches: "hatsune_miku").with { @miku }
|
||||
should respond_to_search(name_normalize: "HATSUNE_MIKU ").with { @miku }
|
||||
should respond_to_search(name_or_alias_matches: "miku").with { @miku }
|
||||
should respond_to_search(fuzzy_name_matches: "miku_hatsune", order: "similarity").with { @miku }
|
||||
should respond_to_search(fuzzy_name_matches: "hatsune_mika", order: "similarity").with { @miku }
|
||||
should respond_to_search(name: "empty", hide_empty: "true").with { [] }
|
||||
should respond_to_search(name: "empty", hide_empty: "false").with { [@empty] }
|
||||
|
||||
|
||||
Reference in New Issue
Block a user