Rewrite related tags implementation.
Rewrite the implementation of related tags to be simpler, faster, and more accurate:

* The related tags are now calculated by taking a random sample of 1000 posts, finding the top 250 most frequent tags among those posts, then ordering those tags by cosine similarity.
* Related tags can generally be calculated in 50-300ms at these sample sizes. Very high sample sizes (25000+ posts) are still relatively fast (1-3 seconds), but generally they don't improve accuracy much.
* Related tags are now cached in Redis rather than in the tags table. The related_tags column in the tags table is no longer used.
* Only the related tags in the search taglist are cached. The related tags returned by the 'Related tags' button are not cached.
* The cache lifetime is a fixed 4 hours.
* The 'Related tags' button now works with metatags.
* The /related_tag page now works with metatags and multitag searches.

Fixes #4134, #4146.
This commit is contained in:
@@ -14,8 +14,8 @@ class Cache
|
||||
keys_to_values_hash
|
||||
end
|
||||
|
||||
def self.get(key, expiry_in_seconds = nil, &block)
|
||||
Rails.cache.fetch(key, expires_in: expiry_in_seconds, &block)
|
||||
def self.get(key, expiry_in_seconds = nil, **options, &block)
|
||||
Rails.cache.fetch(key, expires_in: expiry_in_seconds, **options, &block)
|
||||
end
|
||||
|
||||
def self.put(key, value, expiry_in_seconds = nil)
|
||||
|
||||
@@ -1,78 +1,32 @@
|
||||
class RelatedTagCalculator
|
||||
MAX_RESULTS = 25
|
||||
module RelatedTagCalculator
|
||||
def self.similar_tags_for_search(tag_query, search_sample_size: 1000, tag_sample_size: 250, category: nil)
|
||||
search_count = Post.fast_count(tag_query)
|
||||
search_sample_size = [search_count, search_sample_size].min
|
||||
return [] if search_sample_size <= 0
|
||||
|
||||
# Counts tags over a sample of posts matching +tags+ and returns the counts
# as an array, as produced by convert_hash_to_array.
#
# The sample size is read from Danbooru.config.post_sample_size.
#
# @param tags the search passed through to calculate_from_sample
# @param category_constraint optional tag category filter, passed through to
#   calculate_from_sample (nil means no filter)
# @return [Array] the result of convert_hash_to_array on the computed counts
def self.calculate_from_sample_to_array(tags, category_constraint = nil)
  tag_counts = calculate_from_sample(tags, Danbooru.config.post_sample_size, category_constraint)
  convert_hash_to_array(tag_counts)
end
|
||||
|
||||
# Counts tags across the given posts and returns the counts as an array,
# as produced by convert_hash_to_array.
#
# @param posts the posts handed to calculate_from_posts
# @return [Array] the result of convert_hash_to_array on the computed counts
def self.calculate_from_posts_to_array(posts)
  tag_counts = calculate_from_posts(posts)
  convert_hash_to_array(tag_counts)
end
|
||||
|
||||
def self.calculate_from_posts(posts)
|
||||
counts = Hash.new {|h, k| h[k] = 0}
|
||||
|
||||
posts.flat_map(&:tag_array).each do |tag|
|
||||
counts[tag] += 1
|
||||
tags = frequent_tags_for_search(tag_query, search_sample_size: search_sample_size, category: category).limit(tag_sample_size)
|
||||
tags = tags.sort_by do |tag|
|
||||
# cosine distance(tag1, tag2) = 1 - {{tag1 tag2}} / sqrt({{tag1}} * {{tag2}})
|
||||
1 - tag.overlap_count / (Math.sqrt(tag.post_count * search_count.to_f))
|
||||
end
|
||||
|
||||
counts
|
||||
tags
|
||||
end
|
||||
|
||||
def self.calculate_similar_from_sample(tag)
|
||||
# this uses cosine similarity to produce more useful
|
||||
# related tags, but is more db intensive
|
||||
counts = Hash.new {|h, k| h[k] = 0}
|
||||
def self.frequent_tags_for_search(tag_query, search_sample_size: 1000, category: nil)
|
||||
sample_posts = Post.tag_match(tag_query).reorder(:md5).limit(search_sample_size)
|
||||
tag_counts = Post.from(sample_posts).with_unflattened_tags.group("tag").select("tag, COUNT(*) AS overlap_count")
|
||||
|
||||
CurrentUser.without_safe_mode do
|
||||
Post.with_timeout(5_000, [], {:tags => tag}) do
|
||||
Post.tag_match(tag).limit(400).reorder("posts.md5").pluck(:tag_string).each do |tag_string|
|
||||
tag_string.split.each do |tag|
|
||||
counts[tag] += 1
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
tag_record = Tag.find_by_name(tag)
|
||||
candidates = convert_hash_to_array(counts, 100)
|
||||
similar_counts = Hash.new {|h, k| h[k] = 0}
|
||||
CurrentUser.without_safe_mode do
|
||||
PostReadOnly.with_timeout(5_000, nil, {:tags => tag}) do
|
||||
candidates.each do |ctag, _|
|
||||
acount = PostReadOnly.tag_match("#{tag} #{ctag}").count
|
||||
ctag_record = Tag.find_by_name(ctag)
|
||||
div = Math.sqrt(tag_record.post_count * ctag_record.post_count)
|
||||
if div != 0
|
||||
c = acount / div
|
||||
similar_counts[ctag] = c
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
convert_hash_to_array(similar_counts)
|
||||
tags = Tag.from(tag_counts).joins("JOIN tags ON tags.name = tag")
|
||||
tags = tags.select("tags.*, overlap_count")
|
||||
tags = tags.where("tags.post_count > 0")
|
||||
tags = tags.where(category: category) if category.present?
|
||||
tags = tags.order("overlap_count DESC, tags.post_count DESC, tags.name")
|
||||
tags
|
||||
end
|
||||
|
||||
# Counts how often each tag occurs in a sample of posts matching +tags+.
#
# The whole query runs inside Post.with_timeout with a 5_000 timeout value.
#
# @param tags the search handed to Post.sample
# @param sample_size the sample size handed to Post.sample
# @param category_constraint when truthy, only tags whose tags.category
#   matches this value are counted
# @param max_results maximum number of grouped rows to return
# @return the grouped count (tag => count), ordered by count descending
def self.calculate_from_sample(tags, sample_size, category_constraint = nil, max_results = MAX_RESULTS)
  Post.with_timeout(5_000, [], {:tags => tags}) do
    tagged_posts = Post.from(Post.sample(tags, sample_size)).with_unflattened_tags

    # Restrict to a single tag category by joining against the tags table.
    if category_constraint
      tagged_posts = tagged_posts.joins("JOIN tags ON tags.name = tag").where("tags.category" => category_constraint)
    end

    tagged_posts.order("count_all DESC").limit(max_results).group("tag").count(:all)
  end
end
|
||||
|
||||
# Turns a name => count mapping into an array of [name, count] pairs,
# ordered by count descending and then by name ascending, truncated to
# the first +limit+ pairs.
#
# @param hash the name => count mapping
# @param limit [Integer] maximum number of pairs to return
# @return [Array<Array>] the top [name, count] pairs
def self.convert_hash_to_array(hash, limit = MAX_RESULTS)
  ranked_pairs = hash.to_a.sort_by { |pair| [-pair[1], pair[0]] }
  ranked_pairs[0, limit]
end
|
||||
|
||||
def self.convert_hash_to_string(hash)
|
||||
convert_hash_to_array(hash).flatten.join(" ")
|
||||
# Returns the tag names used by the given posts, most frequent first.
# Ties are broken by tag name in ascending order.
#
# @param posts a collection whose elements respond to +tag_array+
# @return [Array] tag names ordered by descending frequency, then name
def self.frequent_tags_for_posts(posts)
  occurrences = posts.flat_map(&:tag_array).each_with_object(Hash.new(0)) do |tag_name, tally|
    tally[tag_name] += 1
  end

  ranked = occurrences.sort_by { |tag_name, count| [-count, tag_name] }
  ranked.map(&:first)
end
|
||||
end
|
||||
|
||||
@@ -18,9 +18,9 @@ class RelatedTagQuery
|
||||
if query =~ /\*/
|
||||
pattern_matching_tags
|
||||
elsif category.present?
|
||||
related_tags_by_category
|
||||
RelatedTagCalculator.frequent_tags_for_search(query, category: Tag.categories.value_for(category)).take(25).pluck(:name)
|
||||
elsif query.present?
|
||||
related_tags
|
||||
RelatedTagCalculator.similar_tags_for_search(query).take(25).map(&:name)
|
||||
else
|
||||
[]
|
||||
end
|
||||
@@ -82,20 +82,6 @@ protected
|
||||
Tag.name_matches(query).where("post_count > 0").order("post_count desc").limit(50).sort_by {|x| x.name}.map(&:name)
|
||||
end
|
||||
|
||||
# Looks up the tag named by the whitespace-stripped query and returns the
# first element of each entry in its related_tag_array.
#
# @return [Array] the related tag entries' first elements, or an empty
#   array when no tag with that name is found
def related_tags
  matching_tag = Tag.find_by_name(query.strip)
  return [] unless matching_tag

  matching_tag.related_tag_array.map(&:first)
end
|
||||
|
||||
# Computes related tags for the query from a post sample, restricted to the
# tag category value that Tag.categories.value_for returns for +category+.
#
# @return [Array] the first element of each entry returned by
#   RelatedTagCalculator.calculate_from_sample_to_array
def related_tags_by_category
  ranked = RelatedTagCalculator.calculate_from_sample_to_array(query, Tag.categories.value_for(category))
  ranked.map(&:first)
end
|
||||
|
||||
# Returns the first result of WikiPage.titled for the current query.
#
# @return the first matching wiki page (whatever +first+ yields when there
#   is no match)
def wiki_page
  titled_pages = WikiPage.titled(query)
  titled_pages.first
end
|
||||
|
||||
Reference in New Issue
Block a user