Rewrite related tags implementation.

Rewrite the implementation of related tags to be simpler, faster, and
more accurate:

* The related tags are now calculated by taking a random sample of 1000
  posts, finding the top 250 most frequent tags among those posts, then
  ordering those tags by cosine similarity.

* Related tags can generally be calculated in 50-300ms at these sample
  sizes. Very high sample sizes (25000+ posts) are still relatively fast
  (1-3 seconds), but generally they don't improve accuracy much.

* Related tags are now cached in redis rather than in the tags table.
  The related_tags column in the tags table is no longer used.

* Only the related tags in the search taglist are cached. The related
  tags returned by the 'Related tags' button are not cached.

* The cache lifetime is a fixed 4 hours.

* The 'Related tags' button now works with metatags.

* The /related_tag page now works with metatags and multitag searches.

Fixes #4134, #4146.
This commit is contained in:
evazion
2019-08-30 19:08:56 -05:00
parent 7b8584e3b0
commit 6dd331745a
11 changed files with 99 additions and 256 deletions

View File

@@ -14,8 +14,8 @@ class Cache
keys_to_values_hash
end
def self.get(key, expiry_in_seconds = nil, &block)
Rails.cache.fetch(key, expires_in: expiry_in_seconds, &block)
# Fetches `key` through Rails.cache.fetch.
#
# key               - cache key handed to Rails.cache.fetch.
# expiry_in_seconds - passed on as the :expires_in option (nil by default).
# options           - additional keyword options forwarded unchanged; an
#                     :expires_in given here takes precedence over
#                     expiry_in_seconds.
# block             - forwarded to Rails.cache.fetch.
#
# Returns whatever Rails.cache.fetch returns.
def self.get(key, expiry_in_seconds = nil, **options, &block)
  fetch_options = { expires_in: expiry_in_seconds }.merge(options)
  Rails.cache.fetch(key, **fetch_options, &block)
end
def self.put(key, value, expiry_in_seconds = nil)

View File

@@ -1,78 +1,32 @@
class RelatedTagCalculator
MAX_RESULTS = 25
module RelatedTagCalculator
def self.similar_tags_for_search(tag_query, search_sample_size: 1000, tag_sample_size: 250, category: nil)
search_count = Post.fast_count(tag_query)
search_sample_size = [search_count, search_sample_size].min
return [] if search_sample_size <= 0
# Runs calculate_from_sample for `tags` with the configured
# Danbooru.config.post_sample_size and the optional category constraint,
# then passes the result through convert_hash_to_array.
def self.calculate_from_sample_to_array(tags, category_constraint = nil)
  sampled_counts = calculate_from_sample(tags, Danbooru.config.post_sample_size, category_constraint)
  convert_hash_to_array(sampled_counts)
end
# Runs calculate_from_posts on `posts` and passes the result through
# convert_hash_to_array.
def self.calculate_from_posts_to_array(posts)
  tag_counts = calculate_from_posts(posts)
  convert_hash_to_array(tag_counts)
end
def self.calculate_from_posts(posts)
counts = Hash.new {|h, k| h[k] = 0}
posts.flat_map(&:tag_array).each do |tag|
counts[tag] += 1
tags = frequent_tags_for_search(tag_query, search_sample_size: search_sample_size, category: category).limit(tag_sample_size)
tags = tags.sort_by do |tag|
# cosine distance(tag1, tag2) = 1 - {{tag1 tag2}} / sqrt({{tag1}} * {{tag2}})
1 - tag.overlap_count / (Math.sqrt(tag.post_count * search_count.to_f))
end
counts
tags
end
def self.calculate_similar_from_sample(tag)
# this uses cosine similarity to produce more useful
# related tags, but is more db intensive
counts = Hash.new {|h, k| h[k] = 0}
def self.frequent_tags_for_search(tag_query, search_sample_size: 1000, category: nil)
sample_posts = Post.tag_match(tag_query).reorder(:md5).limit(search_sample_size)
tag_counts = Post.from(sample_posts).with_unflattened_tags.group("tag").select("tag, COUNT(*) AS overlap_count")
CurrentUser.without_safe_mode do
Post.with_timeout(5_000, [], {:tags => tag}) do
Post.tag_match(tag).limit(400).reorder("posts.md5").pluck(:tag_string).each do |tag_string|
tag_string.split.each do |tag|
counts[tag] += 1
end
end
end
end
tag_record = Tag.find_by_name(tag)
candidates = convert_hash_to_array(counts, 100)
similar_counts = Hash.new {|h, k| h[k] = 0}
CurrentUser.without_safe_mode do
PostReadOnly.with_timeout(5_000, nil, {:tags => tag}) do
candidates.each do |ctag, _|
acount = PostReadOnly.tag_match("#{tag} #{ctag}").count
ctag_record = Tag.find_by_name(ctag)
div = Math.sqrt(tag_record.post_count * ctag_record.post_count)
if div != 0
c = acount / div
similar_counts[ctag] = c
end
end
end
end
convert_hash_to_array(similar_counts)
tags = Tag.from(tag_counts).joins("JOIN tags ON tags.name = tag")
tags = tags.select("tags.*, overlap_count")
tags = tags.where("tags.post_count > 0")
tags = tags.where(category: category) if category.present?
tags = tags.order("overlap_count DESC, tags.post_count DESC, tags.name")
tags
end
# Counts tag occurrences in a sample of posts matching `tags`.
#
# The work runs inside Post.with_timeout(5_000, ...). The sample comes from
# Post.sample(tags, sample_size); when category_constraint is truthy the
# query is joined against the tags table and restricted to that category.
# The block evaluates to the grouped count, ordered by "count_all DESC" and
# limited to max_results.
def self.calculate_from_sample(tags, sample_size, category_constraint = nil, max_results = MAX_RESULTS)
  Post.with_timeout(5_000, [], { tags: tags }) do
    tagged_posts = Post.from(Post.sample(tags, sample_size)).with_unflattened_tags
    tagged_posts = tagged_posts.joins("JOIN tags ON tags.name = tag").where("tags.category" => category_constraint) if category_constraint

    tagged_posts.order("count_all DESC").limit(max_results).group("tag").count(:all)
  end
end
# Turns `hash` into an array of [key, value] pairs sorted by descending
# value, with ties ordered by ascending key, and keeps at most `limit`
# pairs from the front.
def self.convert_hash_to_array(hash, limit = MAX_RESULTS)
  ranked = hash.to_a.sort_by { |entry| [-entry[1], entry[0]] }
  ranked[0, limit]
end
def self.convert_hash_to_string(hash)
convert_hash_to_array(hash).flatten.join(" ")
# Ranks the tags found on the given posts by how often they occur.
#
# posts - a collection whose elements respond to #tag_array.
#
# Returns the tag names ordered by descending occurrence count; tags with
# the same count are ordered by name.
def self.frequent_tags_for_posts(posts)
  occurrences = posts.flat_map(&:tag_array).each_with_object(Hash.new(0)) do |tag_name, seen|
    seen[tag_name] += 1
  end

  ranked = occurrences.sort_by { |tag_name, count| [-count, tag_name] }
  ranked.map { |tag_name, _count| tag_name }
end
end

View File

@@ -18,9 +18,9 @@ class RelatedTagQuery
if query =~ /\*/
pattern_matching_tags
elsif category.present?
related_tags_by_category
RelatedTagCalculator.frequent_tags_for_search(query, category: Tag.categories.value_for(category)).take(25).pluck(:name)
elsif query.present?
related_tags
RelatedTagCalculator.similar_tags_for_search(query).take(25).map(&:name)
else
[]
end
@@ -82,20 +82,6 @@ protected
Tag.name_matches(query).where("post_count > 0").order("post_count desc").limit(50).sort_by {|x| x.name}.map(&:name)
end
# Looks up the tag named by the stripped query and returns the first element
# of each entry in its related_tag_array. Returns an empty array when
# Tag.find_by_name finds no such tag.
def related_tags
  tag = Tag.find_by_name(query.strip)
  return [] unless tag

  tag.related_tag_array.map(&:first)
end
# Asks RelatedTagCalculator.calculate_from_sample_to_array for the query,
# constrained to the value Tag.categories maps the category to, and returns
# the first element of each resulting entry.
def related_tags_by_category
  sampled = RelatedTagCalculator.calculate_from_sample_to_array(query, Tag.categories.value_for(category))
  sampled.map { |entry| entry.first }
end
# Returns the first record of WikiPage.titled(query), or whatever `first`
# yields when nothing matches.
def wiki_page
  matching_pages = WikiPage.titled(query)
  matching_pages.first
end