From c280760b44b16ef43f1c4fa35ce8a0f2291fa455 Mon Sep 17 00:00:00 2001 From: r888888888 Date: Sat, 2 Jan 2016 09:38:24 -0800 Subject: [PATCH] add cosine similarity algo for related tag calculator --- Gemfile.lock | 3 -- app/logical/related_tag_calculator.rb | 41 +++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index f61479045..9f360122b 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -384,6 +384,3 @@ DEPENDENCIES vcr webmock whenever - -BUNDLED WITH - 1.10.0 diff --git a/app/logical/related_tag_calculator.rb b/app/logical/related_tag_calculator.rb index 2215a6976..e290f3216 100644 --- a/app/logical/related_tag_calculator.rb +++ b/app/logical/related_tag_calculator.rb @@ -2,7 +2,7 @@ class RelatedTagCalculator def self.find_tags(tag, limit) CurrentUser.without_safe_mode do Post.with_timeout(5_000, []) do - Post.tag_match(tag).limit(limit).select("posts.tag_string").reorder("posts.md5").map(&:tag_string) + Post.tag_match(tag).limit(limit).select("posts.tag_string").reorder("posts.md5").pluck(:tag_string) end end end @@ -34,6 +34,41 @@ class RelatedTagCalculator counts end + def self.calculate_similar_from_sample(tag) + # this uses cosine similarity to produce more useful + # related tags, but is more db intensive + counts = Hash.new {|h, k| h[k] = 0} + + CurrentUser.without_safe_mode do + Post.with_timeout(5_000, []) do + Post.tag_match(tag).limit(400).select("posts.tag_string").reorder("posts.md5").pluck(:tag_string).each do |tag_string| + tag_string.scan(/\S+/).each do |tag| + counts[tag] += 1 + end + end + end + end + + tag_record = Tag.find_by_name(tag) + candidates = convert_hash_to_array(counts, 100) + similar_counts = Hash.new {|h, k| h[k] = 0} + CurrentUser.without_safe_mode do + Post.with_timeout(5_000) do + candidates.each do |ctag, _| + acount = Post.tag_match("#{tag} #{ctag}").count + ctag_record = Tag.find_by_name(ctag) + div = Math.sqrt(tag_record.post_count * ctag_record.post_count) + if div != 0 + c = acount / div + similar_counts[ctag] = c + end + end + end + end + + convert_hash_to_array(similar_counts) + end + def self.calculate_from_sample(tags, limit, category_constraint = nil) counts = Hash.new {|h, k| h[k] = 0} @@ -56,8 +91,8 @@ class RelatedTagCalculator counts end - def self.convert_hash_to_array(hash) - hash.to_a.sort_by {|x| -x[1]}.slice(0, 25) + def self.convert_hash_to_array(hash, limit = 25) + hash.to_a.sort_by {|x| -x[1]}.slice(0, limit) end def self.convert_hash_to_string(hash)