Add cosine similarity algorithm to the related tag calculator

This commit is contained in:
r888888888
2016-01-02 09:38:24 -08:00
parent 18c84e4b0f
commit c280760b44
2 changed files with 38 additions and 6 deletions

View File

@@ -384,6 +384,3 @@ DEPENDENCIES
vcr
webmock
whenever
BUNDLED WITH
1.10.0

View File

@@ -2,7 +2,7 @@ class RelatedTagCalculator
# Collects the tag_string column of up to `limit` posts matching `tag`,
# ordered by posts.md5.
#
# The query runs inside CurrentUser.without_safe_mode and
# Post.with_timeout(5_000, []). Presumably 5_000 is a timeout in
# milliseconds and [] is the value yielded on timeout -- TODO confirm
# against Post.with_timeout.
def self.find_tags(tag, limit)
CurrentUser.without_safe_mode do
Post.with_timeout(5_000, []) do
# NOTE(review): the two statements below are the removed/added pair of
# this diff hunk (the hunk header reports 7 lines per side, 8 are shown).
# The first, pre-commit form maps over the loaded records; its value is
# not used when both lines are read together.
Post.tag_match(tag).limit(limit).select("posts.tag_string").reorder("posts.md5").map(&:tag_string)
# Post-commit form: pluck reads the tag_string column values directly.
Post.tag_match(tag).limit(limit).select("posts.tag_string").reorder("posts.md5").pluck(:tag_string)
end
end
end
@@ -34,6 +34,41 @@ class RelatedTagCalculator
counts
end
# Ranks tags related to +tag+ by cosine similarity.
#
# Candidates are the 100 most frequent tags found in the tag strings of up
# to 400 posts matching +tag+. Each candidate is then scored as
#
#   count(posts matching "tag candidate") / sqrt(post_count(tag) * post_count(candidate))
#
# This gives more useful related tags than raw sample counts, but is more
# database intensive: one Tag lookup and one count query per candidate.
#
# @param tag [String] the tag to find related tags for
# @return [Array<Array>] [name, score] pairs as produced by
#   convert_hash_to_array (highest score first, default limit); an empty
#   array when no Tag record exists for +tag+
def self.calculate_similar_from_sample(tag)
  tag_record = Tag.find_by_name(tag)
  # Without a Tag record there is no post count to normalise by, so bail
  # out before doing any sampling work instead of failing on nil later.
  return [] unless tag_record

  counts = Hash.new(0)

  CurrentUser.without_safe_mode do
    Post.with_timeout(5_000, []) do
      Post.tag_match(tag).limit(400).select("posts.tag_string").reorder("posts.md5").pluck(:tag_string).each do |tag_string|
        # The block variable is deliberately not called `tag`, to avoid
        # shadowing the method parameter.
        tag_string.scan(/\S+/).each do |name|
          counts[name] += 1
        end
      end
    end
  end

  candidates = convert_hash_to_array(counts, 100)
  similar_counts = {}

  CurrentUser.without_safe_mode do
    Post.with_timeout(5_000) do
      candidates.each do |ctag, _|
        ctag_record = Tag.find_by_name(ctag)
        # A sampled tag string may mention a tag that has no Tag record.
        next unless ctag_record

        # Check the normaliser before the count query: a zero product
        # would divide by zero and a negative one would make Math.sqrt
        # raise, so such candidates are skipped without hitting the db.
        product = tag_record.post_count * ctag_record.post_count
        next unless product > 0

        acount = Post.tag_match("#{tag} #{ctag}").count
        # Math.sqrt returns a Float, so this is float division.
        similar_counts[ctag] = acount / Math.sqrt(product)
      end
    end
  end

  convert_hash_to_array(similar_counts)
end
def self.calculate_from_sample(tags, limit, category_constraint = nil)
counts = Hash.new {|h, k| h[k] = 0}
@@ -56,8 +91,8 @@ class RelatedTagCalculator
counts
end
def self.convert_hash_to_array(hash)
hash.to_a.sort_by {|x| -x[1]}.slice(0, 25)
# Turns a tag => score hash into an array of [tag, score] pairs ordered
# from highest to lowest score, keeping at most +limit+ pairs.
#
# @param hash [Hash] scores keyed by tag name
# @param limit [Integer] maximum number of pairs to keep (default 25)
# @return [Array<Array>] the top +limit+ [tag, score] pairs
def self.convert_hash_to_array(hash, limit = 25)
  pairs = hash.to_a
  ranked = pairs.sort_by { |_name, score| -score }
  ranked[0, limit]
end
def self.convert_hash_to_string(hash)