From f8e4e5724f5990422c898a8ba62191a295972848 Mon Sep 17 00:00:00 2001 From: evazion Date: Thu, 1 Sep 2022 17:41:54 -0500 Subject: [PATCH] autocomplete: switch to word-based tag matching. Switch autocomplete to match individual words in the tag, instead of only matching the start of the tag. For example, "hair" matches any tag containing the word "hair", not just tags starting with "hair". "long_hair" matches all tags containing the words "long" and "hair", which includes "very_long_hair" and "absurdly_long_hair". The words can be in any order, and the search doesn't have to include every word in the tag. So "closed_eye" matches "one_eye_closed". "asuka_langley_souryuu" matches "souryuu_asuka_langley". This has several advantages: * You can search for characters by first name. For example, "miku" matches "hatsune_miku". "zelda" matches both "princess_zelda" and "the_legend_of_zelda". * You can find the right tag even if you get the word order wrong, or forget a word. For example, "eyes_closed" matches "closed_eyes". "hair_over_eye" matches "hair_over_one_eye". * You can find more related tags. For example, searching "skirt" shows all tags containing the word "skirt", not just tags starting with "skirt". The downside is that this may break muscle memory by changing the autocomplete order of some tags. This is an acceptable trade-off. You can get the old behavior by writing a "*" at the end of the tag. For example, searching "skirt*" gives the same results as before. 
--- app/logical/autocomplete_service.rb | 77 +++++++++++++++++++++++------ app/logical/concerns/searchable.rb | 29 ++++++++++- app/models/tag.rb | 31 ++++++++++++ 3 files changed, 122 insertions(+), 15 deletions(-) diff --git a/app/logical/autocomplete_service.rb b/app/logical/autocomplete_service.rb index 424a00dd3..693b19684 100644 --- a/app/logical/autocomplete_service.rb +++ b/app/logical/autocomplete_service.rb @@ -78,8 +78,7 @@ class AutocompleteService # # @return [Array] the autocomplete results def autocomplete_tag_query - if parsed_query.metatags.one? - metatag = parsed_query.metatags.first + if metatag.present? autocomplete_metatag(metatag.name, metatag.value) else tag = Tag.normalize_name(query) @@ -105,25 +104,67 @@ class AutocompleteService results = tag_other_name_matches(string) elsif string.starts_with?("/") results = tag_abbreviation_matches(string) - results = results.sort_by do |r| - [r[:antecedent].to_s.size, -r[:post_count]] - end - - results = results.uniq { |r| r[:value] }.take(limit) elsif string.include?("*") - results = tag_matches(string) - else - results = tag_matches(string + "*") + results = tag_wildcard_matches(string) + elsif Tag.parsable_into_words?(string) # do a word match if the search contains at least 2 contiguous letters or numbers + results = tag_word_matches(string) results = tag_autocorrect_matches(string) if results.blank? + else + results = tag_prefix_matches(string) end results end - # Find tags or tag aliases matching a wildcard search. + # Find tags or tag aliases containing all the words in the search string, in any order. + # Example: "haruhi_suzumiya" => "suzumiya_haruhi_no_yuuutsu" + # + # Rank results with exact matches first (unless it's a small tag), then substring matches + # next (e.g. tags where the words are in the same order and next to each other), then word + # matches last (e.g. tag where the words are in a different order, or not next to each other). 
+ # # @param string [String] the string to complete # @return [Array] the autocomplete results - def tag_matches(string) + def tag_word_matches(string) + query = Tag.parse_query(string) + + name_matches = Tag.nonempty.where_all_in_array_like(:words, query) + alias_matches = Tag.nonempty.where(name: TagAlias.active.joins(:antecedent_tag).where_all_in_array_like("tags.words", query).select(:consequent_name)) + union = "((#{name_matches.to_sql}) UNION (#{alias_matches.to_sql})) AS tags" + tags = Tag.from(union).includes(:consequent_aliases).order(post_count: :desc, name: :asc).limit(100) + + results = tags.map do |tag| + antecedent = tag.tag_alias_for_word_pattern(string)&.antecedent_name + { type: "tag-word", label: tag.pretty_name, value: tag.name, category: tag.category, post_count: tag.post_count, antecedent: antecedent } + end + + results = results.sort_by do |result| + name = result[:antecedent] || result[:value] + post_count = result[:post_count] + + large = post_count > 100 ? 1 : 0 + exact = name == string ? 1 : 0 + substr = name.include?(string) ? 1 : 0 + + [-large, -exact, -substr, -post_count, result[:value]] + end + + results.take(limit) + end + + # Find tags or tag aliases starting with the given search string. + # + # @param string [String] the string to complete + # @return [Array] the autocomplete results + def tag_prefix_matches(string) + tag_wildcard_matches(string + "*") + end + + # Find tags or tag aliases matching a wildcard search. 
+ # + # @param string [String] the string to complete + # @return [Array] the autocomplete results + def tag_wildcard_matches(string) name_matches = Tag.nonempty.name_matches(string).order(post_count: :desc).limit(limit) alias_matches = Tag.nonempty.alias_matches(string).order(post_count: :desc).limit(limit) union = "((#{name_matches.to_sql}) UNION (#{alias_matches.to_sql})) AS tags" @@ -148,9 +189,13 @@ class AutocompleteService string += "*" unless string.include?("*") tags = Tag.nonempty.abbreviation_matches(string).order(post_count: :desc).limit(limit) - tags.map do |tag| + results = tags.map do |tag| { type: "tag-abbreviation", label: tag.pretty_name, value: tag.name, category: tag.category, post_count: tag.post_count, antecedent: "/" + tag.abbreviation } + end.sort_by do |r| + [r[:antecedent].to_s.size, -r[:post_count]] end + + results.uniq { |r| r[:value] }.take(limit) end # Find tags matching a mispelled tag. @@ -346,5 +391,9 @@ class AutocompleteService PostQuery.new(query) end - memoize :autocomplete_results, :parsed_query + def metatag + parsed_query.metatags.first if type == :tag_query && parsed_query.metatags.one? + end + + memoize :autocomplete_results, :parsed_query, :metatag end diff --git a/app/logical/concerns/searchable.rb b/app/logical/concerns/searchable.rb index 232ffbb9f..85a722695 100644 --- a/app/logical/concerns/searchable.rb +++ b/app/logical/concerns/searchable.rb @@ -125,9 +125,24 @@ module Searchable where("? ~<< ANY(#{qualified_column_for(attr)})", "(?#{flags})#{regex}") end + # Perform a Postgres full-text search on an array of strings. Assumes the query is already escaped. # The column should have a `array_to_tsvector(column) using gin` index for best performance. 
+ # + # @see https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY + def where_array_to_tsvector_matches(attr, query) + where("array_to_tsvector(#{qualified_column_for(attr)}) @@ ?::tsquery", query) + end + def where_any_in_array_starts_with(attr, value) - where("array_to_tsvector(#{qualified_column_for(attr)}) @@ ?", value.to_escaped_for_tsquery + ":*") + where_array_to_tsvector_matches(attr, value.to_escaped_for_tsquery + ":*") + end + + def where_all_in_array_like(attr, patterns) + where_array_to_tsvector_matches(attr, escape_patterns_for_tsquery(patterns).join(" & ")) + end + + def where_any_in_array_like(attr, patterns) + where_array_to_tsvector_matches(attr, escape_patterns_for_tsquery(patterns).join(" | ")) end def where_text_includes_lower(attr, values) @@ -614,9 +629,21 @@ module Searchable private def qualified_column_for(attr) + return attr if attr.to_s.include?(".") "#{table_name}.#{column_for_attribute(attr).name}" end + # @param patterns [Array] An array of wildcard patterns to escape for a tsquery search. + def escape_patterns_for_tsquery(patterns) + patterns.map do |pattern| + if pattern.ends_with?("*") + pattern.delete_suffix("*").to_escaped_for_tsquery + ":*" + else + pattern.to_escaped_for_tsquery + end + end + end + # Convert a column name or a raw SQL fragment to an Arel node. # # @param field [String, Arel::Nodes::Node] an Arel node, the name of a table diff --git a/app/models/tag.rb b/app/models/tag.rb index d968cd3ad..aad38d8ca 100644 --- a/app/models/tag.rb +++ b/app/models/tag.rb @@ -279,6 +279,24 @@ class Tag < ApplicationRecord def parsable_into_words?(name) name.match?(/[a-zA-Z0-9]{2}/) end + + # True if the `string` contains all the words in the `query`. + # + # Tag.includes_all_words?("holding_hands", ["hand*", "hold*"]) => true + def includes_all_words?(string, query) + words = parse_words(string) + query.all? { |pattern| words.any? 
{ |word| word.ilike?(pattern) }} + end + + # Parse a string into a query for performing a word-based search. + # + # Tag.parse_query("holding_hand") => ["holding", "hand*"] + # Tag.parse_query("looking_at_") => ["looking", "at"] + def parse_query(string) + query = parse_words(string) + query[-1] += "*" unless string.match?(/[#{WORD_DELIMITERS}]\z/) + query + end end end @@ -452,6 +470,19 @@ class Tag < ApplicationRecord end end + # Find the shortest alias of this tag that contains all the words in the query, but only when the tag's own name doesn't already contain them. Returns nil otherwise. + def tag_alias_for_word_pattern(query) + query = Tag.parse_query(query) + aliases = consequent_aliases.sort_by { |ca| [ca.antecedent_name.size, ca.antecedent_name] } + + aliases.find do |tag_alias| + name_matches = Tag.includes_all_words?(name, query) + antecedent_matches = Tag.includes_all_words?(tag_alias.antecedent_name, query) + + antecedent_matches && !name_matches + end + end + def is_aliased? aliased_tag.present? end