autocomplete: switch to word-based tag matching.

Switch autocomplete to match individual words in the tag, instead of
only matching the start of the tag.

For example, "hair" matches any tag containing the word "hair", not just tags
starting with "hair". "long_hair" matches all tags containing the words "long"
and "hair", which includes "very_long_hair" and "absurdly_long_hair".

Words can be in any order and words can be left out. So "closed_eye" matches
"one_eye_closed". "asuka_langley_souryuu" matches "souryuu_asuka_langley".

This has several advantages:

* You can search characters by first name. For example, "miku" matches "hatsune_miku".
  "zelda" matches both "princess_zelda" and "the_legend_of_zelda".
* You can find the right tag even if you get the word order wrong, or forget a word.
  For example, "eyes_closed" matches "closed_eyes". "hair_over_eye" matches "hair_over_one_eye".
* You can find more related tags. For example, searching "skirt" shows all tags
  containing the word "skirt", not just tags starting with "skirt".

The downside is that this may break muscle memory by changing the autocomplete order of
some tags. This is an acceptable trade-off.

You can get the old behavior by writing a "*" at the end of the tag. For
example, searching "skirt*" gives the same results as before.
This commit is contained in:
evazion
2022-09-01 17:41:54 -05:00
parent ec382357b8
commit f8e4e5724f
3 changed files with 122 additions and 15 deletions

View File

@@ -78,8 +78,7 @@ class AutocompleteService
#
# @return [Array<Hash>] the autocomplete results
def autocomplete_tag_query
if parsed_query.metatags.one?
metatag = parsed_query.metatags.first
if metatag.present?
autocomplete_metatag(metatag.name, metatag.value)
else
tag = Tag.normalize_name(query)
@@ -105,25 +104,67 @@ class AutocompleteService
results = tag_other_name_matches(string)
elsif string.starts_with?("/")
results = tag_abbreviation_matches(string)
results = results.sort_by do |r|
[r[:antecedent].to_s.size, -r[:post_count]]
end
results = results.uniq { |r| r[:value] }.take(limit)
elsif string.include?("*")
results = tag_matches(string)
else
results = tag_matches(string + "*")
results = tag_wildcard_matches(string)
elsif Tag.parsable_into_words?(string) # do a word match if the search contains at least 2 contiguous letters or numbers
results = tag_word_matches(string)
results = tag_autocorrect_matches(string) if results.blank?
else
results = tag_prefix_matches(string)
end
results
end
# Find tags or tag aliases matching a wildcard search.
# Find tags or tag aliases containing all the words in the search string, in any order.
# Example: "haruhi_suzumiya" => "suzumiya_haruhi_no_yuuutsu"
#
# Rank results with exact matches first (unless it's a small tag), then substring matches
# next (e.g. tags where the words are in the same order and next to each other), then word
# matches last (e.g. tags where the words are in a different order, or not next to each other).
#
# @param string [String] the string to complete
# @return [Array<Hash>] the autocomplete results
def tag_matches(string)
def tag_word_matches(string)
  words = Tag.parse_query(string)

  # Candidates come from two sources: tags whose own words match, and tags that
  # are the consequent of an active alias whose antecedent tag's words match.
  by_name = Tag.nonempty.where_all_in_array_like(:words, words)
  aliased_names = TagAlias.active.joins(:antecedent_tag).where_all_in_array_like("tags.words", words).select(:consequent_name)
  by_alias = Tag.nonempty.where(name: aliased_names)

  subquery = "((#{by_name.to_sql}) UNION (#{by_alias.to_sql})) AS tags"
  candidates = Tag.from(subquery).includes(:consequent_aliases).order(post_count: :desc, name: :asc).limit(100)

  results = candidates.map do |tag|
    {
      type: "tag-word",
      label: tag.pretty_name,
      value: tag.name,
      category: tag.category,
      post_count: tag.post_count,
      antecedent: tag.tag_alias_for_word_pattern(string)&.antecedent_name,
    }
  end

  # Sort keys, most significant first: tags with more than 100 posts, exact
  # matches, substring matches, higher post count, then tag name as tiebreaker.
  # The name being compared is the matched alias when there is one.
  ranked = results.sort_by do |result|
    matched_name = result[:antecedent] || result[:value]
    count = result[:post_count]

    [
      count > 100 ? -1 : 0,
      matched_name == string ? -1 : 0,
      matched_name.include?(string) ? -1 : 0,
      -count,
      result[:value],
    ]
  end

  ranked.take(limit)
end
# Find tags or tag aliases whose name begins with the given search string.
# Implemented as a wildcard search with a trailing "*" appended.
#
# @param string [String] the string to complete
# @return [Array<Hash>] the autocomplete results
def tag_prefix_matches(string)
  prefix_pattern = string + "*"
  tag_wildcard_matches(prefix_pattern)
end
# Find tags or tag aliases matching a wildcard search.
#
# @param string [String] the string to complete
# @return [Array<Hash>] the autocomplete results
def tag_wildcard_matches(string)
name_matches = Tag.nonempty.name_matches(string).order(post_count: :desc).limit(limit)
alias_matches = Tag.nonempty.alias_matches(string).order(post_count: :desc).limit(limit)
union = "((#{name_matches.to_sql}) UNION (#{alias_matches.to_sql})) AS tags"
@@ -148,9 +189,13 @@ class AutocompleteService
string += "*" unless string.include?("*")
tags = Tag.nonempty.abbreviation_matches(string).order(post_count: :desc).limit(limit)
tags.map do |tag|
results = tags.map do |tag|
{ type: "tag-abbreviation", label: tag.pretty_name, value: tag.name, category: tag.category, post_count: tag.post_count, antecedent: "/" + tag.abbreviation }
end.sort_by do |r|
[r[:antecedent].to_s.size, -r[:post_count]]
end
results.uniq { |r| r[:value] }.take(limit)
end
# Find tags matching a misspelled tag.
@@ -346,5 +391,9 @@ class AutocompleteService
PostQuery.new(query)
end
memoize :autocomplete_results, :parsed_query
# The single metatag of the parsed query, or nil when this isn't a tag query
# or the query doesn't contain exactly one metatag.
def metatag
  return unless type == :tag_query

  parsed_query.metatags.first if parsed_query.metatags.one?
end
memoize :autocomplete_results, :parsed_query, :metatag
end

View File

@@ -125,9 +125,24 @@ module Searchable
where("? ~<< ANY(#{qualified_column_for(attr)})", "(?#{flags})#{regex}")
end
# Perform a Postgres full-text search on an array of strings. The query is passed
# through as a raw tsquery, so it must already be escaped by the caller.
# For best performance the column should have an `array_to_tsvector(column) using gin` index.
#
# @param attr [Symbol, String] the array column to search
# @param query [String] an already-escaped tsquery expression
# @see https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY
def where_array_to_tsvector_matches(attr, query)
  column = qualified_column_for(attr)
  where("array_to_tsvector(#{column}) @@ ?::tsquery", query)
end
# Match rows where any element of the array column starts with `value`.
# The value is escaped and turned into a tsquery prefix match (`:*`).
#
# @param attr [Symbol, String] the array column to search
# @param value [String] the unescaped prefix to look for
def where_any_in_array_starts_with(attr, value)
  where_array_to_tsvector_matches(attr, value.to_escaped_for_tsquery + ":*")
end
# Match rows where the array column satisfies every pattern (patterns are ANDed
# together in the tsquery). A pattern ending in "*" is a prefix match.
#
# @param attr [Symbol, String] the array column to search
# @param patterns [Array<String>] the wildcard patterns that must all match
def where_all_in_array_like(attr, patterns)
  tsquery = escape_patterns_for_tsquery(patterns).join(" & ")
  where_array_to_tsvector_matches(attr, tsquery)
end
# Match rows where the array column satisfies at least one pattern (patterns are
# ORed together in the tsquery). A pattern ending in "*" is a prefix match.
#
# @param attr [Symbol, String] the array column to search
# @param patterns [Array<String>] the wildcard patterns, any of which may match
def where_any_in_array_like(attr, patterns)
  tsquery = escape_patterns_for_tsquery(patterns).join(" | ")
  where_array_to_tsvector_matches(attr, tsquery)
end
def where_text_includes_lower(attr, values)
@@ -614,9 +629,21 @@ module Searchable
private
# Return the table-qualified column name for an attribute. An attribute that
# already contains a "." (e.g. "tags.words") is taken to be qualified and is
# returned unchanged.
def qualified_column_for(attr)
  if attr.to_s.include?(".")
    attr
  else
    "#{table_name}.#{column_for_attribute(attr).name}"
  end
end
# Escape each wildcard pattern for use in a tsquery. A pattern ending in "*"
# becomes a prefix match (`:*`); any other pattern is escaped as-is.
#
# @param patterns [Array<String>] An array of wildcard patterns to escape for a tsquery search.
# @return [Array<String>] the escaped tsquery terms, in the same order
def escape_patterns_for_tsquery(patterns)
  patterns.map do |pattern|
    next pattern.to_escaped_for_tsquery unless pattern.ends_with?("*")

    pattern.delete_suffix("*").to_escaped_for_tsquery + ":*"
  end
end
# Convert a column name or a raw SQL fragment to an Arel node.
#
# @param field [String, Arel::Nodes::Node] an Arel node, the name of a table

View File

@@ -279,6 +279,24 @@ class Tag < ApplicationRecord
# True if the name contains at least two consecutive ASCII letters or digits.
def parsable_into_words?(name)
  two_alphanumerics = /[a-zA-Z0-9]{2}/
  name.match?(two_alphanumerics)
end
# True if every pattern in the `query` matches at least one word of the `string`.
# Patterns are compared against each word with `ilike?`.
#
#   Tag.includes_all_words?("holding_hands", ["hand*", "hold*"]) => true
def includes_all_words?(string, query)
  tag_words = parse_words(string)

  query.all? do |pattern|
    tag_words.any? { |word| word.ilike?(pattern) }
  end
end
# Parse a string into a query for performing a word-based search.
#
# The last word gets a trailing "*" (so it can match a partially typed word)
# unless the string ends with a word delimiter. If the string yields no words
# at all, the empty list is returned unchanged.
#
#   Tag.parse_query("holding_hand") => ["holding", "hand*"]
#   Tag.parse_query("looking_at_") => ["looking", "at"]
#   Tag.parse_query("") => []
def parse_query(string)
  query = parse_words(string)

  # With no words there is no last word to extend; `query[-1] += "*"` would
  # raise NoMethodError on nil.
  return query if query.empty?

  query[-1] += "*" unless string.match?(/[#{WORD_DELIMITERS}]\z/)
  query
end
end
end
@@ -452,6 +470,19 @@ class Tag < ApplicationRecord
end
end
# If this tag has aliases, find the shortest alias matching the given pattern.
#
# An alias is only returned when the tag's own name does not already contain
# all the words in the query; otherwise nil is returned. Among matching aliases,
# the one with the shortest antecedent name wins, with ties broken by name.
def tag_alias_for_word_pattern(query)
  query = Tag.parse_query(query)

  # Whether the tag's own name matches doesn't depend on the alias, so check it
  # once up front instead of once per alias.
  return nil if Tag.includes_all_words?(name, query)

  aliases = consequent_aliases.sort_by { |ca| [ca.antecedent_name.size, ca.antecedent_name] }
  aliases.find { |tag_alias| Tag.includes_all_words?(tag_alias.antecedent_name, query) }
end
# True if `aliased_tag` is present (i.e. not blank).
def is_aliased?
  !aliased_tag.blank?
end