Rewrite related tags implementation.

Rewrite the implementation of related tags to be simpler, faster, and
more accurate:

* The related tags are now calculated by taking a pseudo-random sample of
  up to 1000 posts matching the search (ordered by MD5), finding the top
  250 most frequent tags among those posts, then ordering those tags by
  their cosine similarity to the search.

* Related tags can generally be calculated in 50-300ms at these sample
  sizes. Very high sample sizes (25000+ posts) are still relatively fast
  (1-3 seconds), but generally they don't improve accuracy much.

* Related tags are now cached in Redis rather than in the tags table.
  The related_tags column in the tags table is no longer used.

* Only the related tags in the search taglist are cached. The related
  tags returned by the 'Related tags' button are not cached.

* The cache lifetime is a fixed 4 hours.

* The 'Related tags' button now works with metatags.

* The /related_tag page now works with metatags and multitag searches.

Fixes #4134, #4146.
This commit is contained in:
evazion
2019-08-30 19:08:56 -05:00
parent 7b8584e3b0
commit 6dd331745a
11 changed files with 99 additions and 256 deletions

View File

@@ -1,27 +1,8 @@
# Controller exposing related-tag lookups (`show`) and a key-protected
# endpoint for writing related tags onto a Tag record (`update`).
class RelatedTagsController < ApplicationController
respond_to :json, :xml, :js, :html, except: [:update]
before_action :require_reportbooru_key, only: [:update]
skip_forgery_protection only: [:update]
respond_to :json, :xml, :js, :html
# Builds a RelatedTagQuery from params[:query] and params[:category] for the
# current user and responds with it in the requested format.
def show
@query = RelatedTagQuery.new(query: params[:query], category: params[:category], user: CurrentUser.user)
respond_with(@query)
end
# Looks up the tag named params[:name], replaces its related_tags with
# params[:related_tags], stamps related_tags_updated_at with the current time,
# optionally overwrites post_count, saves, and answers with an empty 200.
# NOTE(review): no nil check is visible here — if find_by_name returns nil for
# an unknown name, the assignment on the next line would presumably raise.
def update
@tag = Tag.find_by_name(params[:name])
@tag.related_tags = params[:related_tags]
@tag.related_tags_updated_at = Time.now
# post_count is only touched when the caller actually supplied one.
@tag.post_count = params[:post_count] if params[:post_count].present?
@tag.save
head :ok
end
protected
# before_action guard for `update`: raises User::PrivilegeError unless a
# reportbooru key is configured and params[:key] equals it.
def require_reportbooru_key
unless Danbooru.config.reportbooru_key.present? && params[:key] == Danbooru.config.reportbooru_key
raise User::PrivilegeError
end
end
end

View File

@@ -1,7 +0,0 @@
# Background job, queued on :default, that delegates to `tag.update_related`.
class UpdateRelatedTagsJob < ApplicationJob
queue_as :default
# tag - the record on which `update_related` is invoked.
def perform(tag)
tag.update_related
end
end

View File

@@ -14,8 +14,8 @@ class Cache
keys_to_values_hash
end
def self.get(key, expiry_in_seconds = nil, &block)
Rails.cache.fetch(key, expires_in: expiry_in_seconds, &block)
def self.get(key, expiry_in_seconds = nil, **options, &block)
Rails.cache.fetch(key, expires_in: expiry_in_seconds, **options, &block)
end
def self.put(key, value, expiry_in_seconds = nil)

View File

@@ -1,78 +1,32 @@
class RelatedTagCalculator
MAX_RESULTS = 25
module RelatedTagCalculator
def self.similar_tags_for_search(tag_query, search_sample_size: 1000, tag_sample_size: 250, category: nil)
search_count = Post.fast_count(tag_query)
search_sample_size = [search_count, search_sample_size].min
return [] if search_sample_size <= 0
def self.calculate_from_sample_to_array(tags, category_constraint = nil)
convert_hash_to_array(calculate_from_sample(tags, Danbooru.config.post_sample_size, category_constraint))
end
def self.calculate_from_posts_to_array(posts)
convert_hash_to_array(calculate_from_posts(posts))
end
def self.calculate_from_posts(posts)
counts = Hash.new {|h, k| h[k] = 0}
posts.flat_map(&:tag_array).each do |tag|
counts[tag] += 1
tags = frequent_tags_for_search(tag_query, search_sample_size: search_sample_size, category: category).limit(tag_sample_size)
tags = tags.sort_by do |tag|
# cosine distance(tag1, tag2) = 1 - {{tag1 tag2}} / sqrt({{tag1}} * {{tag2}})
1 - tag.overlap_count / (Math.sqrt(tag.post_count * search_count.to_f))
end
counts
tags
end
def self.calculate_similar_from_sample(tag)
# this uses cosine similarity to produce more useful
# related tags, but is more db intensive
counts = Hash.new {|h, k| h[k] = 0}
def self.frequent_tags_for_search(tag_query, search_sample_size: 1000, category: nil)
sample_posts = Post.tag_match(tag_query).reorder(:md5).limit(search_sample_size)
tag_counts = Post.from(sample_posts).with_unflattened_tags.group("tag").select("tag, COUNT(*) AS overlap_count")
CurrentUser.without_safe_mode do
Post.with_timeout(5_000, [], {:tags => tag}) do
Post.tag_match(tag).limit(400).reorder("posts.md5").pluck(:tag_string).each do |tag_string|
tag_string.split.each do |tag|
counts[tag] += 1
end
end
end
end
tag_record = Tag.find_by_name(tag)
candidates = convert_hash_to_array(counts, 100)
similar_counts = Hash.new {|h, k| h[k] = 0}
CurrentUser.without_safe_mode do
PostReadOnly.with_timeout(5_000, nil, {:tags => tag}) do
candidates.each do |ctag, _|
acount = PostReadOnly.tag_match("#{tag} #{ctag}").count
ctag_record = Tag.find_by_name(ctag)
div = Math.sqrt(tag_record.post_count * ctag_record.post_count)
if div != 0
c = acount / div
similar_counts[ctag] = c
end
end
end
end
convert_hash_to_array(similar_counts)
tags = Tag.from(tag_counts).joins("JOIN tags ON tags.name = tag")
tags = tags.select("tags.*, overlap_count")
tags = tags.where("tags.post_count > 0")
tags = tags.where(category: category) if category.present?
tags = tags.order("overlap_count DESC, tags.post_count DESC, tags.name")
tags
end
def self.calculate_from_sample(tags, sample_size, category_constraint = nil, max_results = MAX_RESULTS)
Post.with_timeout(5_000, [], {:tags => tags}) do
sample = Post.sample(tags, sample_size)
posts_with_tags = Post.from(sample).with_unflattened_tags
if category_constraint
posts_with_tags = posts_with_tags.joins("JOIN tags ON tags.name = tag").where("tags.category" => category_constraint)
end
counts = posts_with_tags.order("count_all DESC").limit(max_results).group("tag").count(:all)
counts
end
end
def self.convert_hash_to_array(hash, limit = MAX_RESULTS)
hash.to_a.sort_by {|x| [-x[1], x[0]] }.slice(0, limit)
end
def self.convert_hash_to_string(hash)
convert_hash_to_array(hash).flatten.join(" ")
def self.frequent_tags_for_posts(posts)
tags_with_counts = posts.flat_map(&:tag_array).group_by(&:itself).transform_values(&:size)
tags_with_counts.sort_by { |tag_name, count| [-count, tag_name] }.map(&:first)
end
end

View File

@@ -18,9 +18,9 @@ class RelatedTagQuery
if query =~ /\*/
pattern_matching_tags
elsif category.present?
related_tags_by_category
RelatedTagCalculator.frequent_tags_for_search(query, category: Tag.categories.value_for(category)).take(25).pluck(:name)
elsif query.present?
related_tags
RelatedTagCalculator.similar_tags_for_search(query).take(25).map(&:name)
else
[]
end
@@ -82,20 +82,6 @@ protected
Tag.name_matches(query).where("post_count > 0").order("post_count desc").limit(50).sort_by {|x| x.name}.map(&:name)
end
def related_tags
tag = Tag.find_by_name(query.strip)
if tag
tag.related_tag_array.map(&:first)
else
[]
end
end
def related_tags_by_category
RelatedTagCalculator.calculate_from_sample_to_array(query, Tag.categories.value_for(category)).map(&:first)
end
def wiki_page
WikiPage.titled(query).first
end

View File

@@ -1,5 +1,4 @@
class Tag < ApplicationRecord
COSINE_SIMILARITY_RELATED_TAG_THRESHOLD = 300
COUNT_METATAGS = %w[
comment_count deleted_comment_count active_comment_count
note_count deleted_note_count active_note_count
@@ -852,57 +851,6 @@ class Tag < ApplicationRecord
end
end
# Instance methods that maintain a tag's `related_tags` string and its
# `related_tags_updated_at` timestamp.
module RelationMethods
# Recomputes related_tags from a post sample and saves the record.
# Does nothing unless should_update_related? is true. An
# ActiveRecord::StatementInvalid raised anywhere in the method is swallowed
# by the bare rescue clause below.
def update_related
return unless should_update_related?
# The calculation runs scoped to the first user at a loopback address.
CurrentUser.scoped(User.first, "127.0.0.1") do
self.related_tags = RelatedTagCalculator.calculate_from_sample_to_array(name).join(" ")
end
self.related_tags_updated_at = Time.now
# For tags with more than 20 posts, fix_post_count runs only when the random
# draw from 0...post_count comes up 0 or 1, i.e. occasionally.
fix_post_count if post_count > 20 && rand(post_count) <= 1
save
rescue ActiveRecord::StatementInvalid
end
# Schedules a refresh of related_tags when they are stale and no refresh was
# scheduled recently (tracked through the "urt:<hash>" cache entry).
def update_related_if_outdated
key = Cache.hash(name)
if Cache.get("urt:#{key}").nil? && should_update_related?
# Tags below the threshold are refreshed by a background job; larger tags
# are handed off through an SQS message instead.
if post_count < COSINE_SIMILARITY_RELATED_TAG_THRESHOLD
UpdateRelatedTagsJob.perform_later(self)
else
sqs = SqsService.new(Danbooru.config.aws_sqs_reltagcalc_url)
sqs.send_message("calculate #{name}")
# The timestamp is bumped right away, before any result comes back.
self.related_tags_updated_at = Time.now
save
end
Cache.put("urt:#{key}", true, 600) # mutex to prevent redundant updates
end
end
# Returns sqrt(post_count) clamped to the range 24..720 (24 * 30).
# should_update_related? interprets the result as a number of hours.
def related_cache_expiry
# Negative post counts are treated as zero before taking the square root.
base = Math.sqrt([post_count, 0].max)
if base > 24 * 30
24 * 30
elsif base < 24
24
else
base
end
end
# True when related_tags or its timestamp is blank, or when the timestamp is
# older than related_cache_expiry hours.
def should_update_related?
related_tags.blank? || related_tags_updated_at.blank? || related_tags_updated_at < related_cache_expiry.hours.ago
end
# Triggers update_related_if_outdated, then returns related_tags split on
# single spaces and grouped into pairs (e.g. [["bkub", "1"]]).
def related_tag_array
update_related_if_outdated
related_tags.to_s.split(/ /).in_groups_of(2)
end
end
module SearchMethods
def empty
where("tags.post_count <= 0")
@@ -1023,6 +971,5 @@ class Tag < ApplicationRecord
extend StatisticsMethods
extend NameMethods
extend ParseMethods
include RelationMethods
extend SearchMethods
end

View File

@@ -1,5 +1,7 @@
module PostSetPresenters
class Post < Base
MAX_TAGS = 25
attr_accessor :post_set
delegate :posts, :to => :post_set
@@ -8,7 +10,7 @@ module PostSetPresenters
end
def tag_set_presenter
@tag_set_presenter ||= TagSetPresenter.new(related_tags)
@tag_set_presenter ||= TagSetPresenter.new(related_tags.take(MAX_TAGS))
end
def post_previews_html(template, options = {})
@@ -19,55 +21,42 @@ module PostSetPresenters
if post_set.is_pattern_search?
pattern_tags
elsif post_set.is_saved_search?
["search:all"] + SavedSearch.labels_for(CurrentUser.user.id).map {|x| "search:#{x}"}
saved_search_tags
elsif post_set.is_empty_tag? || post_set.tag_string == "order:rank"
popular_tags
elsif post_set.is_single_tag?
related_tags_for_single(post_set.tag_string)
elsif post_set.unordered_tag_array.size == 1
related_tags_for_single(post_set.unordered_tag_array.first)
elsif Tag.has_metatag?(post_set.tag_array, *Tag::SUBQUERY_METATAGS)
calculate_related_tags_from_post_set
similar_tags
else
calculate_related_tags_from_post_set
frequent_tags
end
end
def popular_tags
if PopularSearchService.enabled?
PopularSearchService.new(Date.today).tags.slice(0, 25)
PopularSearchService.new(Date.today).tags
else
Tag.trending
end
end
def similar_tags
Cache.get("similar_tags:#{post_set.tag_string}", 4.hours, race_condition_ttl: 60.seconds) do
ApplicationRecord.with_timeout(1_000, []) do
RelatedTagCalculator.similar_tags_for_search(post_set.tag_string).take(MAX_TAGS).pluck(:name)
end
end
end
def frequent_tags
RelatedTagCalculator.frequent_tags_for_posts(post_set.posts).take(MAX_TAGS)
end
def pattern_tags
Tag.name_matches(post_set.tag_string).select("name").limit(Danbooru.config.tag_query_limit).order("post_count DESC").map(&:name)
Tag.name_matches(post_set.tag_string).order(post_count: :desc).limit(MAX_TAGS).pluck(:name)
end
def related_tags_for_group
normalized_tags = Tag.normalize_query(post_set.tag_string, normalize_aliases: false)
Cache.get("PostSetPresenters::Post#related_tags_for_group(#{normalized_tags})", 5.minutes) do
RelatedTagCalculator.calculate_from_sample_to_array(normalized_tags).map(&:first)
end
end
def related_tags_for_single(tag_string)
tag = Tag.find_by_name(tag_string.downcase)
if tag
tag.related_tag_array.map(&:first)
else
calculate_related_tags_from_post_set
end
end
def calculate_related_tags_from_post_set
RelatedTagCalculator.calculate_from_posts_to_array(post_set.posts).map(&:first)
end
def saved_search_labels
SavedSearch.labels_for(CurrentUser.user.id).map {|x| "search:#{x}"}
def saved_search_tags
["search:all"] + SavedSearch.labels_for(CurrentUser.user.id).map {|x| "search:#{x}"}
end
def tag_list_html(**options)

View File

@@ -2,11 +2,20 @@ require 'test_helper'
# Integration tests for the related tags controller's show action.
class RelatedTagsControllerTest < ActionDispatch::IntegrationTest
context "The related tags controller" do
# Every test starts with one post tagged "touhou", so the query below
# matches an existing post.
setup do
create(:post, tag_string: "touhou")
end
context "show action" do
# Default-format request for the "touhou" query must succeed.
should "work" do
get related_tag_path, params: { query: "touhou" }
assert_response :success
end
# The same query requested as JSON must also succeed.
should "work for .json responses" do
get related_tag_path(format: :json), params: { query: "touhou" }
assert_response :success
end
end
end
end

View File

@@ -12,57 +12,55 @@ class RelatedTagCalculatorTest < ActiveSupport::TestCase
CurrentUser.ip_addr = nil
end
context "A related tag calculator" do
context "for a post set" do
setup do
FactoryBot.create(:post, :tag_string => "aaa bbb ccc ddd")
FactoryBot.create(:post, :tag_string => "aaa bbb ccc")
FactoryBot.create(:post, :tag_string => "aaa bbb")
@posts = Post.tag_match("aaa")
end
context "RelatedTagCalculator" do
context "#frequent_tags_for_posts" do
should "calculate the most frequent tags for a set of posts" do
create(:post, tag_string: "aaa bbb ccc ddd")
create(:post, tag_string: "aaa bbb ccc")
create(:post, tag_string: "aaa bbb")
posts = Post.tag_match("aaa")
should "calculate the related tags" do
assert_equal({"aaa"=>3, "bbb"=>3, "ccc"=>2, "ddd"=>1}, RelatedTagCalculator.calculate_from_posts(@posts))
assert_equal(%w[aaa bbb ccc ddd], RelatedTagCalculator.frequent_tags_for_posts(posts))
end
end
should "calculate related tags for a tag" do
posts = []
posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc ddd")
posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc")
posts << FactoryBot.create(:post, :tag_string => "aaa bbb")
context "#frequent_tags_for_search" do
should "calculate the most frequent tags for a single tag search" do
create(:post, tag_string: "aaa bbb ccc ddd")
create(:post, tag_string: "aaa bbb ccc")
create(:post, tag_string: "aaa bbb")
assert_equal({"aaa" => 3, "bbb" => 3, "ccc" => 2, "ddd" => 1}, RelatedTagCalculator.calculate_from_sample("aaa", 10))
assert_equal(%w[aaa bbb ccc ddd], RelatedTagCalculator.frequent_tags_for_search("aaa").pluck(:name))
end
should "calculate the most frequent tags for a multiple tag search" do
create(:post, tag_string: "aaa bbb ccc")
create(:post, tag_string: "aaa bbb ccc ddd")
create(:post, tag_string: "aaa eee fff")
assert_equal(%w[aaa bbb ccc ddd], RelatedTagCalculator.frequent_tags_for_search("aaa bbb").pluck(:name))
end
should "calculate the most frequent tags with a category constraint" do
create(:post, tag_string: "aaa bbb art:ccc copy:ddd")
create(:post, tag_string: "aaa bbb art:ccc")
create(:post, tag_string: "aaa bbb")
assert_equal(%w[aaa bbb], RelatedTagCalculator.frequent_tags_for_search("aaa", category: Tag.categories.general).pluck(:name))
assert_equal(%w[ccc], RelatedTagCalculator.frequent_tags_for_search("aaa", category: Tag.categories.artist).pluck(:name))
end
end
should "calculate related tags for multiple tag" do
posts = []
posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc")
posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc ddd")
posts << FactoryBot.create(:post, :tag_string => "aaa eee fff")
context "#similar_tags_for_search" do
should "calculate the most similar tags for a search" do
create(:post, tag_string: "1girl solo", rating: "s")
create(:post, tag_string: "1girl solo", rating: "q")
create(:post, tag_string: "1girl 1boy", rating: "q")
assert_equal({"aaa"=>2, "bbb"=>2, "ddd"=>1, "ccc"=>2}, RelatedTagCalculator.calculate_from_sample("aaa bbb", 10))
end
should "calculate typed related tags for a tag" do
posts = []
posts << FactoryBot.create(:post, :tag_string => "aaa bbb art:ccc copy:ddd")
posts << FactoryBot.create(:post, :tag_string => "aaa bbb art:ccc")
posts << FactoryBot.create(:post, :tag_string => "aaa bbb")
assert_equal({"ccc" => 2}, RelatedTagCalculator.calculate_from_sample("aaa", 10, Tag.categories.artist))
assert_equal({"ddd" => 1}, RelatedTagCalculator.calculate_from_sample("aaa", 10, Tag.categories.copyright))
end
should "convert a hash into string format" do
posts = []
posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc ddd")
posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc")
posts << FactoryBot.create(:post, :tag_string => "aaa bbb")
tag = Tag.find_by_name("aaa")
counts = RelatedTagCalculator.calculate_from_sample("aaa", 10)
assert_equal("aaa 3 bbb 3 ccc 2 ddd 1", RelatedTagCalculator.convert_hash_to_string(counts))
assert_equal(%w[1girl solo 1boy], RelatedTagCalculator.similar_tags_for_search("1girl").pluck(:name))
assert_equal(%w[1girl 1boy solo], RelatedTagCalculator.similar_tags_for_search("rating:q").pluck(:name))
assert_equal(%w[solo 1girl], RelatedTagCalculator.similar_tags_for_search("solo").pluck(:name))
end
end
end
end

View File

@@ -31,7 +31,6 @@ class RelatedTagQueryTest < ActiveSupport::TestCase
context "for a tag that already exists" do
setup do
Tag.find_by_name("aaa").update_related
@query = RelatedTagQuery.new(query: "aaa")
end
@@ -59,8 +58,6 @@ class RelatedTagQueryTest < ActiveSupport::TestCase
@ta = FactoryBot.create(:tag_alias, antecedent_name: "xyz", consequent_name: "aaa")
@wp = FactoryBot.create(:wiki_page, title: "aaa", body: "blah [[foo|blah]] [[FOO]] [[bar]] blah")
@query = RelatedTagQuery.new(query: "xyz")
Tag.find_by_name("aaa").update_related
end
should "take wiki tags from the consequent's wiki" do

View File

@@ -283,15 +283,4 @@ class TagTest < ActiveSupport::TestCase
assert_equal(1, tag.reload.post_count)
end
end
context "The #related_tag_array method" do
should "update the related tags" do
create(:post, tag_string: "bkub")
tag = Tag.find_by_name("bkub")
assert_nil(tag.related_tags)
perform_enqueued_jobs { tag.related_tag_array }
assert_equal([["bkub", "1"]], tag.reload.related_tag_array)
end
end
end