From ec382357b87e2887d8db7229f5cf53acfde06b45 Mon Sep 17 00:00:00 2001 From: evazion Date: Thu, 1 Sep 2022 23:51:07 -0500 Subject: [PATCH] tags: populate `words` column. Add code for parsing tags into words and for populating the `words` column in the tags table. --- app/models/tag.rb | 46 +++++++++++++++++++++++++++++++ script/fixes/113_add_tag_words.rb | 8 ++++++ test/unit/tag_test.rb | 34 +++++++++++++++++++++++ 3 files changed, 88 insertions(+) create mode 100755 script/fixes/113_add_tag_words.rb diff --git a/app/models/tag.rb b/app/models/tag.rb index 0a6c0597c..d968cd3ad 100644 --- a/app/models/tag.rb +++ b/app/models/tag.rb @@ -171,6 +171,11 @@ class Tag < ApplicationRecord end concerning :NameMethods do + def name=(name) + super(name) + self.words = Tag.parse_words(name) + end + def pretty_name name.tr("_", " ") end @@ -236,6 +241,47 @@ class Tag < ApplicationRecord end end + concerning :WordMethods do + # Characters that delimit words in tags. + WORD_DELIMITERS = " _+:;!.\/()-" + WORD_DELIMITER_REGEX = /([#{WORD_DELIMITERS}]+)/ + + class_methods do + # Split the tag at word boundaries. + # + # Tag.split_words("jeanne_d'arc_alter_(fate)") => ["jeanne", "_", "d'arc", "_", "alter", "_(", "fate", ")"] + # Tag.split_words("k-on!") => ["k", "-", "on", "!"] + # Tag.split_words("_") => ["_"] + def split_words(name) + return [name] if !parsable_into_words?(name) + + name.split(WORD_DELIMITER_REGEX).compact_blank + end + + # Parse the tag into plain words, removing punctuation and delimiters. + # + # Tag.parse_words("jeanne_d'arc_alter_(fate)") => ["jeanne", "d'arc", "alter", "fate"] + # Tag.parse_words("k-on!") => ["k", "on"] + # Tag.parse_words("_") => ["_"] + def parse_words(name) + return [name] if !parsable_into_words?(name) + + split_words(name).map do |word| + word.remove(/\A[^a-zA-Z0-9]+|[^a-zA-Z0-9]+\z/) + end.compact_blank + end + + # True if the tag can be parsed into words (it contains at least 2 contiguous letters or numbers). 
+ # + # Tag.parsable_into_words?("k-on!") => true + # Tag.parsable_into_words?("_") => false + # Tag.parsable_into_words?("m.u.g.e.n") => false + def parsable_into_words?(name) + name.match?(/[a-zA-Z0-9]{2}/) + end + end + end + module SearchMethods def autocorrect_matches(name) fuzzy_name_matches(name).order_similarity(name) diff --git a/script/fixes/113_add_tag_words.rb b/script/fixes/113_add_tag_words.rb new file mode 100755 index 000000000..177027580 --- /dev/null +++ b/script/fixes/113_add_tag_words.rb @@ -0,0 +1,8 @@ +#!/usr/bin/env ruby + +require_relative "base" + +Tag.find_each do |tag| + tag.update_columns(words: Tag.parse_words(tag.name)) + p tag +end diff --git a/test/unit/tag_test.rb b/test/unit/tag_test.rb index 9b22b43f0..2e6c3b38c 100644 --- a/test/unit/tag_test.rb +++ b/test/unit/tag_test.rb @@ -140,6 +140,40 @@ class TagTest < ActiveSupport::TestCase end end + should "parse tag names into words" do + assert_equal(%w[very long hair], Tag.new(name: "very_long_hair").words) + + assert_equal(%w[k on], Tag.new(name: "k-on!").words) + assert_equal(%w[hack], Tag.new(name: ".hack//").words) + assert_equal(%w[re zero], Tag.new(name: "re:zero").words) + assert_equal(%w[compass], Tag.new(name: "#compass").words) + assert_equal(%w[me me me], Tag.new(name: "me!me!me!").words) + assert_equal(%w[d gray man], Tag.new(name: "d.gray-man").words) + assert_equal(%w[steins gate], Tag.new(name: "steins;gate").words) + assert_equal(%w[ssss gridman], Tag.new(name: "ssss.gridman").words) + assert_equal(%w[yu gi oh 5d's], Tag.new(name: "yu-gi-oh!_5d's").words) + assert_equal(%w[jack o lantern], Tag.new(name: "jack-o'-lantern").words) + assert_equal(%w[d va overwatch], Tag.new(name: "d.va_(overwatch)").words) + assert_equal(%w[rosario vampire], Tag.new(name: "rosario+vampire").words) + assert_equal(%w[girls frontline], Tag.new(name: "girls'_frontline").words) + assert_equal(%w[fate grand order], Tag.new(name: "fate/grand_order").words) + assert_equal(%w[yorha no 2 type b], 
Tag.new(name: "yorha_no._2_type_b").words) + assert_equal(%w[love live sunshine], Tag.new(name: "love_live!_sunshine!!").words) + assert_equal(%w[jeanne d'arc alter ver shinjuku 1999 fate], Tag.new(name: "jeanne_d'arc_alter_(ver._shinjuku_1999)_(fate)").words) + + assert_equal(%w[:o], Tag.new(name: ":o").words) + assert_equal(%w[o_o], Tag.new(name: "o_o").words) + assert_equal(%w[^_^], Tag.new(name: "^_^").words) + assert_equal(%w[^^^], Tag.new(name: "^^^").words) + assert_equal(%w[c.c.], Tag.new(name: "c.c.").words) + assert_equal(%w[\||/], Tag.new(name: '\||/').words) + assert_equal(%w[\(^o^)/], Tag.new(name: '\(^o^)/').words) + assert_equal(%w[_], Tag.new(name: "_").words) + assert_equal(%w[<|>_<|>], Tag.new(name: "<|>_<|>").words) + assert_equal(%w[k-----s], Tag.new(name: "k-----s").words) + assert_equal(%w[m.u.g.e.n], Tag.new(name: "m.u.g.e.n").words) + end + context "during name validation" do # tags with spaces or uppercase are allowed because they are normalized # to lowercase with underscores.