diff --git a/app/controllers/related_tags_controller.rb b/app/controllers/related_tags_controller.rb index bbd170940..ec86073aa 100644 --- a/app/controllers/related_tags_controller.rb +++ b/app/controllers/related_tags_controller.rb @@ -1,27 +1,8 @@ class RelatedTagsController < ApplicationController - respond_to :json, :xml, :js, :html, except: [:update] - before_action :require_reportbooru_key, only: [:update] - skip_forgery_protection only: [:update] + respond_to :json, :xml, :js, :html def show @query = RelatedTagQuery.new(query: params[:query], category: params[:category], user: CurrentUser.user) respond_with(@query) end - - def update - @tag = Tag.find_by_name(params[:name]) - @tag.related_tags = params[:related_tags] - @tag.related_tags_updated_at = Time.now - @tag.post_count = params[:post_count] if params[:post_count].present? - @tag.save - head :ok - end - - protected - - def require_reportbooru_key - unless Danbooru.config.reportbooru_key.present? && params[:key] == Danbooru.config.reportbooru_key - raise User::PrivilegeError - end - end end diff --git a/app/jobs/update_related_tags_job.rb b/app/jobs/update_related_tags_job.rb deleted file mode 100644 index d538d5945..000000000 --- a/app/jobs/update_related_tags_job.rb +++ /dev/null @@ -1,7 +0,0 @@ -class UpdateRelatedTagsJob < ApplicationJob - queue_as :default - - def perform(tag) - tag.update_related - end -end diff --git a/app/logical/cache.rb b/app/logical/cache.rb index 329824bdf..8641c6c8a 100644 --- a/app/logical/cache.rb +++ b/app/logical/cache.rb @@ -14,8 +14,8 @@ class Cache keys_to_values_hash end - def self.get(key, expiry_in_seconds = nil, &block) - Rails.cache.fetch(key, expires_in: expiry_in_seconds, &block) + def self.get(key, expiry_in_seconds = nil, **options, &block) + Rails.cache.fetch(key, expires_in: expiry_in_seconds, **options, &block) end def self.put(key, value, expiry_in_seconds = nil) diff --git a/app/logical/related_tag_calculator.rb b/app/logical/related_tag_calculator.rb 
# Calculates tags related to a search or to a set of posts, either by raw
# frequency or by cosine similarity to the search.
module RelatedTagCalculator
  # Returns the tags most similar to a search, most similar first.
  #
  # Candidates are the most frequent tags found in a sample of the posts
  # matching the search; they are then ranked by cosine distance between the
  # candidate tag and the search.
  #
  # @param tag_query [String] the search to find similar tags for
  # @param search_sample_size [Integer] maximum number of matching posts to sample
  # @param tag_sample_size [Integer] maximum number of candidate tags to rank
  # @param category [Object, nil] optional tag category value to restrict candidates to
  # @return [Array] tag records (each also exposing +overlap_count+); empty
  #   when the search has no countable posts
  def self.similar_tags_for_search(tag_query, search_sample_size: 1000, tag_sample_size: 250, category: nil)
    search_count = Post.fast_count(tag_query)
    # NOTE(review): guards against the count being unavailable; without it the
    # `min` below raises on nil. Confirm whether Post.fast_count can return nil.
    return [] if search_count.nil?

    search_sample_size = [search_count, search_sample_size].min
    return [] if search_sample_size <= 0

    candidates = frequent_tags_for_search(tag_query, search_sample_size: search_sample_size, category: category).limit(tag_sample_size)

    # sort_by is not stable, so the candidate's position is used as a
    # tiebreaker: tags with equal distance keep the ordering produced by
    # frequent_tags_for_search instead of coming back in arbitrary order.
    candidates.to_a.sort_by.with_index do |tag, position|
      [cosine_distance(tag.overlap_count, tag.post_count, search_count), position]
    end
  end

  # Returns the tags that occur most often in a sample of the posts matching
  # a search.
  #
  # @param tag_query [String] the search to sample posts from
  # @param search_sample_size [Integer] maximum number of posts to sample
  # @param category [Object, nil] optional tag category value to restrict results to
  # @return [ActiveRecord::Relation] tags selected with an extra
  #   +overlap_count+ column (occurrences within the sample), ordered by
  #   overlap_count, then post_count (both descending), then name
  def self.frequent_tags_for_search(tag_query, search_sample_size: 1000, category: nil)
    # Ordering by md5 replaces the search's own ordering before the limit is
    # applied (presumably to get a deterministic, pseudo-random sample).
    sample_posts = Post.tag_match(tag_query).reorder(:md5).limit(search_sample_size)
    tag_counts = Post.from(sample_posts).with_unflattened_tags.group("tag").select("tag, COUNT(*) AS overlap_count")

    tags = Tag.from(tag_counts).joins("JOIN tags ON tags.name = tag")
    tags = tags.select("tags.*, overlap_count")
    tags = tags.where("tags.post_count > 0")
    tags = tags.where(category: category) if category.present?
    tags.order("overlap_count DESC, tags.post_count DESC, tags.name")
  end

  # Returns the names of the tags used by the given posts, most frequently
  # used first; ties are broken alphabetically.
  #
  # @param posts [Enumerable] objects responding to +tag_array+
  # @return [Array<String>] tag names
  def self.frequent_tags_for_posts(posts)
    tags_with_counts = posts.flat_map(&:tag_array).group_by(&:itself).transform_values(&:size)
    tags_with_counts.sort_by { |tag_name, count| [-count, tag_name] }.map(&:first)
  end

  # cosine distance(tag1, tag2) = 1 - {{tag1 tag2}} / sqrt({{tag1}} * {{tag2}})
  #
  # @param overlap_count [Numeric] posts counted as having both the tag and the search
  # @param tag_count [Numeric] post count of the candidate tag
  # @param search_count [Numeric] post count of the search
  # @return [Float] distance; smaller means more similar
  def self.cosine_distance(overlap_count, tag_count, search_count)
    1 - overlap_count / Math.sqrt(tag_count * search_count.to_f)
  end
  private_class_method :cosine_distance
end
@@ -18,9 +18,9 @@ class RelatedTagQuery if query =~ /\*/ pattern_matching_tags elsif category.present? - related_tags_by_category + RelatedTagCalculator.frequent_tags_for_search(query, category: Tag.categories.value_for(category)).take(25).pluck(:name) elsif query.present? - related_tags + RelatedTagCalculator.similar_tags_for_search(query).take(25).map(&:name) else [] end @@ -82,20 +82,6 @@ protected Tag.name_matches(query).where("post_count > 0").order("post_count desc").limit(50).sort_by {|x| x.name}.map(&:name) end - def related_tags - tag = Tag.find_by_name(query.strip) - - if tag - tag.related_tag_array.map(&:first) - else - [] - end - end - - def related_tags_by_category - RelatedTagCalculator.calculate_from_sample_to_array(query, Tag.categories.value_for(category)).map(&:first) - end - def wiki_page WikiPage.titled(query).first end diff --git a/app/models/tag.rb b/app/models/tag.rb index 450eea994..6885106ee 100644 --- a/app/models/tag.rb +++ b/app/models/tag.rb @@ -1,5 +1,4 @@ class Tag < ApplicationRecord - COSINE_SIMILARITY_RELATED_TAG_THRESHOLD = 300 COUNT_METATAGS = %w[ comment_count deleted_comment_count active_comment_count note_count deleted_note_count active_note_count @@ -852,57 +851,6 @@ class Tag < ApplicationRecord end end - module RelationMethods - def update_related - return unless should_update_related? - - CurrentUser.scoped(User.first, "127.0.0.1") do - self.related_tags = RelatedTagCalculator.calculate_from_sample_to_array(name).join(" ") - end - self.related_tags_updated_at = Time.now - fix_post_count if post_count > 20 && rand(post_count) <= 1 - save - rescue ActiveRecord::StatementInvalid - end - - def update_related_if_outdated - key = Cache.hash(name) - - if Cache.get("urt:#{key}").nil? && should_update_related? 
- if post_count < COSINE_SIMILARITY_RELATED_TAG_THRESHOLD - UpdateRelatedTagsJob.perform_later(self) - else - sqs = SqsService.new(Danbooru.config.aws_sqs_reltagcalc_url) - sqs.send_message("calculate #{name}") - self.related_tags_updated_at = Time.now - save - end - - Cache.put("urt:#{key}", true, 600) # mutex to prevent redundant updates - end - end - - def related_cache_expiry - base = Math.sqrt([post_count, 0].max) - if base > 24 * 30 - 24 * 30 - elsif base < 24 - 24 - else - base - end - end - - def should_update_related? - related_tags.blank? || related_tags_updated_at.blank? || related_tags_updated_at < related_cache_expiry.hours.ago - end - - def related_tag_array - update_related_if_outdated - related_tags.to_s.split(/ /).in_groups_of(2) - end - end - module SearchMethods def empty where("tags.post_count <= 0") @@ -1023,6 +971,5 @@ class Tag < ApplicationRecord extend StatisticsMethods extend NameMethods extend ParseMethods - include RelationMethods extend SearchMethods end diff --git a/app/presenters/post_set_presenters/post.rb b/app/presenters/post_set_presenters/post.rb index 1aad52b26..a57b60bc0 100644 --- a/app/presenters/post_set_presenters/post.rb +++ b/app/presenters/post_set_presenters/post.rb @@ -1,5 +1,7 @@ module PostSetPresenters class Post < Base + MAX_TAGS = 25 + attr_accessor :post_set delegate :posts, :to => :post_set @@ -8,7 +10,7 @@ module PostSetPresenters end def tag_set_presenter - @tag_set_presenter ||= TagSetPresenter.new(related_tags) + @tag_set_presenter ||= TagSetPresenter.new(related_tags.take(MAX_TAGS)) end def post_previews_html(template, options = {}) @@ -19,55 +21,42 @@ module PostSetPresenters if post_set.is_pattern_search? pattern_tags elsif post_set.is_saved_search? - ["search:all"] + SavedSearch.labels_for(CurrentUser.user.id).map {|x| "search:#{x}"} + saved_search_tags elsif post_set.is_empty_tag? || post_set.tag_string == "order:rank" popular_tags elsif post_set.is_single_tag? 
- related_tags_for_single(post_set.tag_string) - elsif post_set.unordered_tag_array.size == 1 - related_tags_for_single(post_set.unordered_tag_array.first) - elsif Tag.has_metatag?(post_set.tag_array, *Tag::SUBQUERY_METATAGS) - calculate_related_tags_from_post_set + similar_tags else - calculate_related_tags_from_post_set + frequent_tags end end def popular_tags if PopularSearchService.enabled? - PopularSearchService.new(Date.today).tags.slice(0, 25) + PopularSearchService.new(Date.today).tags else Tag.trending end end + def similar_tags + Cache.get("similar_tags:#{post_set.tag_string}", 4.hours, race_condition_ttl: 60.seconds) do + ApplicationRecord.with_timeout(1_000, []) do + RelatedTagCalculator.similar_tags_for_search(post_set.tag_string).take(MAX_TAGS).pluck(:name) + end + end + end + + def frequent_tags + RelatedTagCalculator.frequent_tags_for_posts(post_set.posts).take(MAX_TAGS) + end + def pattern_tags - Tag.name_matches(post_set.tag_string).select("name").limit(Danbooru.config.tag_query_limit).order("post_count DESC").map(&:name) + Tag.name_matches(post_set.tag_string).order(post_count: :desc).limit(MAX_TAGS).pluck(:name) end - def related_tags_for_group - normalized_tags = Tag.normalize_query(post_set.tag_string, normalize_aliases: false) - Cache.get("PostSetPresenters::Post#related_tags_for_group(#{normalized_tags})", 5.minutes) do - RelatedTagCalculator.calculate_from_sample_to_array(normalized_tags).map(&:first) - end - end - - def related_tags_for_single(tag_string) - tag = Tag.find_by_name(tag_string.downcase) - - if tag - tag.related_tag_array.map(&:first) - else - calculate_related_tags_from_post_set - end - end - - def calculate_related_tags_from_post_set - RelatedTagCalculator.calculate_from_posts_to_array(post_set.posts).map(&:first) - end - - def saved_search_labels - SavedSearch.labels_for(CurrentUser.user.id).map {|x| "search:#{x}"} + def saved_search_tags + ["search:all"] + SavedSearch.labels_for(CurrentUser.user.id).map {|x| "search:#{x}"} 
end def tag_list_html(**options) diff --git a/test/functional/related_tags_controller_test.rb b/test/functional/related_tags_controller_test.rb index b7fa741ac..4b0c2c81a 100644 --- a/test/functional/related_tags_controller_test.rb +++ b/test/functional/related_tags_controller_test.rb @@ -2,11 +2,20 @@ require 'test_helper' class RelatedTagsControllerTest < ActionDispatch::IntegrationTest context "The related tags controller" do + setup do + create(:post, tag_string: "touhou") + end + context "show action" do should "work" do get related_tag_path, params: { query: "touhou" } assert_response :success end + + should "work for .json responses" do + get related_tag_path(format: :json), params: { query: "touhou" } + assert_response :success + end end end end diff --git a/test/unit/related_tag_calculator_test.rb b/test/unit/related_tag_calculator_test.rb index 250cb4d47..58f3071dc 100644 --- a/test/unit/related_tag_calculator_test.rb +++ b/test/unit/related_tag_calculator_test.rb @@ -12,57 +12,55 @@ class RelatedTagCalculatorTest < ActiveSupport::TestCase CurrentUser.ip_addr = nil end - context "A related tag calculator" do - context "for a post set" do - setup do - FactoryBot.create(:post, :tag_string => "aaa bbb ccc ddd") - FactoryBot.create(:post, :tag_string => "aaa bbb ccc") - FactoryBot.create(:post, :tag_string => "aaa bbb") - @posts = Post.tag_match("aaa") - end + context "RelatedTagCalculator" do + context "#frequent_tags_for_posts" do + should "calculate the most frequent tags for a set of posts" do + create(:post, tag_string: "aaa bbb ccc ddd") + create(:post, tag_string: "aaa bbb ccc") + create(:post, tag_string: "aaa bbb") + posts = Post.tag_match("aaa") - should "calculate the related tags" do - assert_equal({"aaa"=>3, "bbb"=>3, "ccc"=>2, "ddd"=>1}, RelatedTagCalculator.calculate_from_posts(@posts)) + assert_equal(%w[aaa bbb ccc ddd], RelatedTagCalculator.frequent_tags_for_posts(posts)) end end - should "calculate related tags for a tag" do - posts = [] - 
posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc ddd") - posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc") - posts << FactoryBot.create(:post, :tag_string => "aaa bbb") + context "#frequent_tags_for_search" do + should "calculate the most frequent tags for a single tag search" do + create(:post, tag_string: "aaa bbb ccc ddd") + create(:post, tag_string: "aaa bbb ccc") + create(:post, tag_string: "aaa bbb") - assert_equal({"aaa" => 3, "bbb" => 3, "ccc" => 2, "ddd" => 1}, RelatedTagCalculator.calculate_from_sample("aaa", 10)) + assert_equal(%w[aaa bbb ccc ddd], RelatedTagCalculator.frequent_tags_for_search("aaa").pluck(:name)) + end + + should "calculate the most frequent tags for a multiple tag search" do + create(:post, tag_string: "aaa bbb ccc") + create(:post, tag_string: "aaa bbb ccc ddd") + create(:post, tag_string: "aaa eee fff") + + assert_equal(%w[aaa bbb ccc ddd], RelatedTagCalculator.frequent_tags_for_search("aaa bbb").pluck(:name)) + end + + should "calculate the most frequent tags with a category constraint" do + create(:post, tag_string: "aaa bbb art:ccc copy:ddd") + create(:post, tag_string: "aaa bbb art:ccc") + create(:post, tag_string: "aaa bbb") + + assert_equal(%w[aaa bbb], RelatedTagCalculator.frequent_tags_for_search("aaa", category: Tag.categories.general).pluck(:name)) + assert_equal(%w[ccc], RelatedTagCalculator.frequent_tags_for_search("aaa", category: Tag.categories.artist).pluck(:name)) + end end - should "calculate related tags for multiple tag" do - posts = [] - posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc") - posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc ddd") - posts << FactoryBot.create(:post, :tag_string => "aaa eee fff") + context "#similar_tags_for_search" do + should "calculate the most similar tags for a search" do + create(:post, tag_string: "1girl solo", rating: "s") + create(:post, tag_string: "1girl solo", rating: "q") + create(:post, tag_string: "1girl 1boy", rating: 
"q") - assert_equal({"aaa"=>2, "bbb"=>2, "ddd"=>1, "ccc"=>2}, RelatedTagCalculator.calculate_from_sample("aaa bbb", 10)) - end - - should "calculate typed related tags for a tag" do - posts = [] - posts << FactoryBot.create(:post, :tag_string => "aaa bbb art:ccc copy:ddd") - posts << FactoryBot.create(:post, :tag_string => "aaa bbb art:ccc") - posts << FactoryBot.create(:post, :tag_string => "aaa bbb") - - assert_equal({"ccc" => 2}, RelatedTagCalculator.calculate_from_sample("aaa", 10, Tag.categories.artist)) - assert_equal({"ddd" => 1}, RelatedTagCalculator.calculate_from_sample("aaa", 10, Tag.categories.copyright)) - end - - should "convert a hash into string format" do - posts = [] - posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc ddd") - posts << FactoryBot.create(:post, :tag_string => "aaa bbb ccc") - posts << FactoryBot.create(:post, :tag_string => "aaa bbb") - - tag = Tag.find_by_name("aaa") - counts = RelatedTagCalculator.calculate_from_sample("aaa", 10) - assert_equal("aaa 3 bbb 3 ccc 2 ddd 1", RelatedTagCalculator.convert_hash_to_string(counts)) + assert_equal(%w[1girl solo 1boy], RelatedTagCalculator.similar_tags_for_search("1girl").pluck(:name)) + assert_equal(%w[1girl 1boy solo], RelatedTagCalculator.similar_tags_for_search("rating:q").pluck(:name)) + assert_equal(%w[solo 1girl], RelatedTagCalculator.similar_tags_for_search("solo").pluck(:name)) + end end end end diff --git a/test/unit/related_tag_query_test.rb b/test/unit/related_tag_query_test.rb index 8a2c48855..47748d5f7 100644 --- a/test/unit/related_tag_query_test.rb +++ b/test/unit/related_tag_query_test.rb @@ -31,7 +31,6 @@ class RelatedTagQueryTest < ActiveSupport::TestCase context "for a tag that already exists" do setup do - Tag.find_by_name("aaa").update_related @query = RelatedTagQuery.new(query: "aaa") end @@ -59,8 +58,6 @@ class RelatedTagQueryTest < ActiveSupport::TestCase @ta = FactoryBot.create(:tag_alias, antecedent_name: "xyz", consequent_name: "aaa") @wp = 
FactoryBot.create(:wiki_page, title: "aaa", body: "blah [[foo|blah]] [[FOO]] [[bar]] blah") @query = RelatedTagQuery.new(query: "xyz") - - Tag.find_by_name("aaa").update_related end should "take wiki tags from the consequent's wiki" do diff --git a/test/unit/tag_test.rb b/test/unit/tag_test.rb index 5ccfd3608..cc68a799d 100644 --- a/test/unit/tag_test.rb +++ b/test/unit/tag_test.rb @@ -283,15 +283,4 @@ class TagTest < ActiveSupport::TestCase assert_equal(1, tag.reload.post_count) end end - - context "The #related_tag_array method" do - should "update the related tags" do - create(:post, tag_string: "bkub") - tag = Tag.find_by_name("bkub") - - assert_nil(tag.related_tags) - perform_enqueued_jobs { tag.related_tag_array } - assert_equal([["bkub", "1"]], tag.reload.related_tag_array) - end - end end