tags: populate words column.

Add code for parsing tags into words and for populating the `words` column
in the tags table.
This commit is contained in:
evazion
2022-09-01 23:51:07 -05:00
parent e058cfba4d
commit ec382357b8
3 changed files with 88 additions and 0 deletions

View File

@@ -171,6 +171,11 @@ class Tag < ApplicationRecord
end
concerning :NameMethods do
# Assign the tag's name and keep the derived `words` attribute in sync with it.
#
# `name` is the raw value handed to the setter. A nil name gets an empty word
# list instead of being passed to Tag.parse_words, which calls String methods
# on its argument and would raise NoMethodError on nil.
# NOTE(review): presumably a missing name is reported by the model's
# validations afterwards -- confirm.
def name=(name)
  super(name)
  self.words = name.nil? ? [] : Tag.parse_words(name)
end
# The tag name in display form: every underscore is replaced by a space.
def pretty_name
  name.gsub("_", " ")
end
@@ -236,6 +241,47 @@ class Tag < ApplicationRecord
end
end
concerning :WordMethods do
  # Characters that delimit words in tags. A run of one or more of these
  # characters separates two words.
  WORD_DELIMITERS = " _+:;!./()-".freeze

  # Matches a run of word delimiters. The capture group makes String#split
  # keep the delimiters in its result. The delimiters are escaped so that none
  # of them can be misread as character class syntax (e.g. "-" as a range),
  # regardless of the order they are listed in.
  WORD_DELIMITER_REGEX = /([#{Regexp.escape(WORD_DELIMITERS)}]+)/

  class_methods do
    # Split the tag at word boundaries, keeping the delimiters as separate
    # elements. Tags that can't be parsed into words are returned whole.
    #
    #   Tag.split_words("jeanne_d'arc_alter_(fate)") => ["jeanne", "_", "d'arc", "_", "alter", "_(", "fate", ")"]
    #   Tag.split_words("k-on!") => ["k", "-", "on", "!"]
    #   Tag.split_words("<o>_<o>") => ["<o>_<o>"]
    def split_words(name)
      return [name] unless parsable_into_words?(name)

      # A leading delimiter makes split emit an empty first element; drop it.
      name.split(WORD_DELIMITER_REGEX).compact_blank
    end

    # Parse the tag into plain words, dropping the delimiters and stripping
    # punctuation from both ends of each word. Punctuation inside a word is
    # kept. Tags that can't be parsed into words are returned whole.
    #
    #   Tag.parse_words("jeanne_d'arc_alter_(fate)") => ["jeanne", "d'arc", "alter", "fate"]
    #   Tag.parse_words("k-on!") => ["k", "on"]
    #   Tag.parse_words("<o>_<o>") => ["<o>_<o>"]
    def parse_words(name)
      return [name] unless parsable_into_words?(name)

      # Delimiter-only pieces become empty after stripping and are discarded.
      split_words(name).map do |word|
        word.remove(/\A[^a-zA-Z0-9]+|[^a-zA-Z0-9]+\z/)
      end.compact_blank
    end

    # True if the tag can be parsed into words (it contains at least 2
    # contiguous ASCII letters or digits).
    #
    #   Tag.parsable_into_words?("k-on!") => true
    #   Tag.parsable_into_words?("<o>_<o>") => false
    #   Tag.parsable_into_words?("m.u.g.e.n") => false
    def parsable_into_words?(name)
      name.match?(/[a-zA-Z0-9]{2}/)
    end
  end
end
module SearchMethods
def autocorrect_matches(name)
fuzzy_name_matches(name).order_similarity(name)

View File

@@ -0,0 +1,8 @@
#!/usr/bin/env ruby
require_relative "base"
# Backfill the `words` column of every tag from its name, printing each tag
# after it has been updated.
Tag.find_each do |tag|
  parsed_words = Tag.parse_words(tag.name)
  tag.update_columns(words: parsed_words)
  p(tag)
end

View File

@@ -140,6 +140,40 @@ class TagTest < ActiveSupport::TestCase
end
end
should "parse tag names into words" do
  # Names that are split into words at the delimiters, with punctuation
  # stripped from the ends of each word.
  split_into_words = {
    "very_long_hair" => %w[very long hair],
    "k-on!" => %w[k on],
    ".hack//" => %w[hack],
    "re:zero" => %w[re zero],
    "#compass" => %w[compass],
    "me!me!me!" => %w[me me me],
    "d.gray-man" => %w[d gray man],
    "steins;gate" => %w[steins gate],
    "ssss.gridman" => %w[ssss gridman],
    "yu-gi-oh!_5d's" => %w[yu gi oh 5d's],
    "jack-o'-lantern" => %w[jack o lantern],
    "d.va_(overwatch)" => %w[d va overwatch],
    "rosario+vampire" => %w[rosario vampire],
    "girls'_frontline" => %w[girls frontline],
    "fate/grand_order" => %w[fate grand order],
    "yorha_no._2_type_b" => %w[yorha no 2 type b],
    "love_live!_sunshine!!" => %w[love live sunshine],
    "jeanne_d'arc_alter_(ver._shinjuku_1999)_(fate)" => %w[jeanne d'arc alter ver shinjuku 1999 fate],
  }

  # Names without two contiguous letters or digits are kept as a single word.
  kept_whole = {
    ":o" => %w[:o],
    "o_o" => %w[o_o],
    "^_^" => %w[^_^],
    "^^^" => %w[^^^],
    "c.c." => %w[c.c.],
    '\||/' => %w[\||/],
    '\(^o^)/' => %w[\(^o^)/],
    "<o>_<o>" => %w[<o>_<o>],
    "<|>_<|>" => %w[<|>_<|>],
    "k-----s" => %w[k-----s],
    "m.u.g.e.n" => %w[m.u.g.e.n],
  }

  [split_into_words, kept_whole].each do |cases|
    cases.each do |tag_name, expected_words|
      assert_equal(expected_words, Tag.new(name: tag_name).words)
    end
  end
end
context "during name validation" do
# tags with spaces or uppercase are allowed because they are normalized
# to lowercase with underscores.