Rewrite related tags implementation.

Rewrite the implementation of related tags to be simpler, faster, and
more accurate:

* The related tags are now calculated by taking a random sample of 1000
  posts, finding the top 250 most frequent tags among those posts, then
  ordering those tags by cosine similarity.

* Related tags can generally be calculated in 50-300ms at these sample
  sizes. Very high sample sizes (25000+ posts) are still relatively fast
  (1-3 seconds), but generally they don't improve accuracy much.

* Related tags are now cached in Redis rather than in the tags table.
  The `related_tags` column in the tags table is no longer used.

* Only the related tags in the search taglist are cached. The related
  tags returned by the 'Related tags' button are not cached.

* The cache lifetime is a fixed 4 hours.

* The 'Related tags' button now works with metatags.

* The /related_tag page now works with metatags and multitag searches.

Fixes #4134, #4146.
This commit is contained in:
evazion
2019-08-30 19:08:56 -05:00
parent 7b8584e3b0
commit 6dd331745a
11 changed files with 99 additions and 256 deletions

View File

@@ -1,5 +1,4 @@
class Tag < ApplicationRecord
COSINE_SIMILARITY_RELATED_TAG_THRESHOLD = 300
COUNT_METATAGS = %w[
comment_count deleted_comment_count active_comment_count
note_count deleted_note_count active_note_count
@@ -852,57 +851,6 @@ class Tag < ApplicationRecord
end
end
module RelationMethods
def update_related
return unless should_update_related?
CurrentUser.scoped(User.first, "127.0.0.1") do
self.related_tags = RelatedTagCalculator.calculate_from_sample_to_array(name).join(" ")
end
self.related_tags_updated_at = Time.now
fix_post_count if post_count > 20 && rand(post_count) <= 1
save
rescue ActiveRecord::StatementInvalid
end
def update_related_if_outdated
key = Cache.hash(name)
if Cache.get("urt:#{key}").nil? && should_update_related?
if post_count < COSINE_SIMILARITY_RELATED_TAG_THRESHOLD
UpdateRelatedTagsJob.perform_later(self)
else
sqs = SqsService.new(Danbooru.config.aws_sqs_reltagcalc_url)
sqs.send_message("calculate #{name}")
self.related_tags_updated_at = Time.now
save
end
Cache.put("urt:#{key}", true, 600) # mutex to prevent redundant updates
end
end
def related_cache_expiry
base = Math.sqrt([post_count, 0].max)
if base > 24 * 30
24 * 30
elsif base < 24
24
else
base
end
end
def should_update_related?
related_tags.blank? || related_tags_updated_at.blank? || related_tags_updated_at < related_cache_expiry.hours.ago
end
def related_tag_array
update_related_if_outdated
related_tags.to_s.split(/ /).in_groups_of(2)
end
end
module SearchMethods
def empty
where("tags.post_count <= 0")
@@ -1023,6 +971,5 @@ class Tag < ApplicationRecord
extend StatisticsMethods
extend NameMethods
extend ParseMethods
include RelationMethods
extend SearchMethods
end