Rewrite related tags implementation.

Rewrite the implementation of related tags to be simpler, faster, and more accurate:

* Related tags are now calculated by taking a random sample of 1000 posts, finding the top 250 most frequent tags among those posts, then ordering those tags by cosine similarity.
* Related tags can generally be calculated in 50-300ms at these sample sizes. Very high sample sizes (25,000+ posts) are still relatively fast (1-3 seconds), but they generally don't improve accuracy much.
* Related tags are now cached in Redis rather than in the tags table. The related_tags column in the tags table is no longer used.
* Only the related tags in the search taglist are cached. The related tags returned by the 'Related tags' button are not cached.
* The cache lifetime is a fixed 4 hours.
* The 'Related tags' button now works with metatags.
* The /related_tag page now works with metatags and multitag searches.

Fixes #4134, #4146.
2019-08-30 19:08:56 -05:00
parent 7b8584e3b0
commit 6dd331745a
11 changed files with 99 additions and 256 deletions
--- a/app/logical/cache.rb
+++ b/app/logical/cache.rb
@@ -14,8 +14,8 @@ class Cache
    keys_to_values_hash
  end

-  def self.get(key, expiry_in_seconds = nil, &block)
-    Rails.cache.fetch(key, expires_in: expiry_in_seconds, &block)
+  def self.get(key, expiry_in_seconds = nil, **options, &block)
+    Rails.cache.fetch(key, expires_in: expiry_in_seconds, **options, &block)
  end

  def self.put(key, value, expiry_in_seconds = nil)
--- a/app/logical/related_tag_calculator.rb
+++ b/app/logical/related_tag_calculator.rb
@@ -1,78 +1,32 @@
-class RelatedTagCalculator
-  MAX_RESULTS = 25
+module RelatedTagCalculator
+  def self.similar_tags_for_search(tag_query, search_sample_size: 1000, tag_sample_size: 250, category: nil)
+    search_count = Post.fast_count(tag_query)
+    search_sample_size = [search_count, search_sample_size].min
+    return [] if search_sample_size <= 0

-  def self.calculate_from_sample_to_array(tags, category_constraint = nil)
-    convert_hash_to_array(calculate_from_sample(tags, Danbooru.config.post_sample_size, category_constraint))
-  end
-
-  def self.calculate_from_posts_to_array(posts)
-    convert_hash_to_array(calculate_from_posts(posts))
-  end
-
-  def self.calculate_from_posts(posts)
-    counts = Hash.new {|h, k| h[k] = 0}
-
-    posts.flat_map(&:tag_array).each do |tag|
-      counts[tag] += 1
+    tags = frequent_tags_for_search(tag_query, search_sample_size: search_sample_size, category: category).limit(tag_sample_size)
+    tags = tags.sort_by do |tag|
+      # cosine distance(tag1, tag2) = 1 - {{tag1 tag2}} / sqrt({{tag1}} * {{tag2}})
+      1 - tag.overlap_count / (Math.sqrt(tag.post_count * search_count.to_f))
    end

-    counts
+    tags
  end

-  def self.calculate_similar_from_sample(tag)
-    # this uses cosine similarity to produce more useful
-    # related tags, but is more db intensive
-    counts = Hash.new {|h, k| h[k] = 0}
+  def self.frequent_tags_for_search(tag_query, search_sample_size: 1000, category: nil)
+    sample_posts = Post.tag_match(tag_query).reorder(:md5).limit(search_sample_size)
+    tag_counts = Post.from(sample_posts).with_unflattened_tags.group("tag").select("tag, COUNT(*) AS overlap_count")

-    CurrentUser.without_safe_mode do
-      Post.with_timeout(5_000, [], {:tags => tag}) do
-        Post.tag_match(tag).limit(400).reorder("posts.md5").pluck(:tag_string).each do |tag_string|
-          tag_string.split.each do |tag|
-            counts[tag] += 1
-          end
-        end
-      end
-    end
-
-    tag_record = Tag.find_by_name(tag)
-    candidates = convert_hash_to_array(counts, 100)
-    similar_counts = Hash.new {|h, k| h[k] = 0}
-    CurrentUser.without_safe_mode do
-      PostReadOnly.with_timeout(5_000, nil, {:tags => tag}) do
-        candidates.each do |ctag, _|
-          acount = PostReadOnly.tag_match("#{tag} #{ctag}").count
-          ctag_record = Tag.find_by_name(ctag)
-          div = Math.sqrt(tag_record.post_count * ctag_record.post_count)
-          if div != 0
-            c = acount / div
-            similar_counts[ctag] = c
-          end
-        end
-      end
-    end
-
-    convert_hash_to_array(similar_counts)
+    tags = Tag.from(tag_counts).joins("JOIN tags ON tags.name = tag")
+    tags = tags.select("tags.*, overlap_count")
+    tags = tags.where("tags.post_count > 0")
+    tags = tags.where(category: category) if category.present?
+    tags = tags.order("overlap_count DESC, tags.post_count DESC, tags.name")
+    tags
  end

-  def self.calculate_from_sample(tags, sample_size, category_constraint = nil, max_results = MAX_RESULTS)
-    Post.with_timeout(5_000, [], {:tags => tags}) do
-      sample = Post.sample(tags, sample_size)
-      posts_with_tags = Post.from(sample).with_unflattened_tags
-
-      if category_constraint
-        posts_with_tags = posts_with_tags.joins("JOIN tags ON tags.name = tag").where("tags.category" => category_constraint)
-      end
-
-      counts = posts_with_tags.order("count_all DESC").limit(max_results).group("tag").count(:all)
-      counts
-    end
-  end
-
-  def self.convert_hash_to_array(hash, limit = MAX_RESULTS)
-    hash.to_a.sort_by {|x| [-x[1], x[0]] }.slice(0, limit)
-  end
-
-  def self.convert_hash_to_string(hash)
-    convert_hash_to_array(hash).flatten.join(" ")
+  def self.frequent_tags_for_posts(posts)
+    tags_with_counts = posts.flat_map(&:tag_array).group_by(&:itself).transform_values(&:size)
+    tags_with_counts.sort_by { |tag_name, count| [-count, tag_name] }.map(&:first)
  end
 end
--- a/app/logical/related_tag_query.rb
+++ b/app/logical/related_tag_query.rb
@@ -18,9 +18,9 @@ class RelatedTagQuery
    if query =~ /\*/
      pattern_matching_tags
    elsif category.present?
-      related_tags_by_category
+      RelatedTagCalculator.frequent_tags_for_search(query, category: Tag.categories.value_for(category)).take(25).pluck(:name)
    elsif query.present?
-      related_tags
+      RelatedTagCalculator.similar_tags_for_search(query).take(25).map(&:name)
    else
      []
    end
@@ -82,20 +82,6 @@ protected
    Tag.name_matches(query).where("post_count > 0").order("post_count desc").limit(50).sort_by {|x| x.name}.map(&:name)
  end

-  def related_tags
-    tag = Tag.find_by_name(query.strip)
-
-    if tag
-      tag.related_tag_array.map(&:first)
-    else
-      []
-    end
-  end
-
-  def related_tags_by_category
-    RelatedTagCalculator.calculate_from_sample_to_array(query, Tag.categories.value_for(category)).map(&:first)
-  end
-
  def wiki_page
    WikiPage.titled(query).first
  end