From 5962152ee353c75cfa74b64d7f6dbb1d06956e78 Mon Sep 17 00:00:00 2001 From: evazion Date: Sat, 9 Jan 2021 23:38:38 -0600 Subject: [PATCH] wiki pages, artists: fix normalization of other names. Fix wiki pages and artists to normalize other names more consistently and correctly. For wiki pages, we strip leading / trailing / repeated underscores to fix user typos, and we normalize to NFKC form to make search more consistent. For artists, we allow leading / trailing / repeated underscores because some artist names have these, and we normalize to NFC form because some artists have weird names that would be lost by NFKC form. This does make search less consistent. --- app/models/artist.rb | 34 +++++++++---------- app/models/wiki_page.rb | 8 ++--- ...1_renormalize_other_names_in_wiki_pages.rb | 17 ++++++++++ .../072_renormalize_other_names_in_artists.rb | 17 ++++++++++ test/unit/artist_test.rb | 28 ++++++++++++--- test/unit/wiki_page_test.rb | 30 +++++++++++----- 6 files changed, 98 insertions(+), 36 deletions(-) create mode 100755 script/fixes/071_renormalize_other_names_in_wiki_pages.rb create mode 100755 script/fixes/072_renormalize_other_names_in_artists.rb diff --git a/app/models/artist.rb b/app/models/artist.rb index 56f1baee8..a9e1e3c4e 100644 --- a/app/models/artist.rb +++ b/app/models/artist.rb @@ -3,11 +3,13 @@ class Artist < ApplicationRecord class RevertError < StandardError; end attr_accessor :url_string_changed - array_attribute :other_names + deletable - before_validation :normalize_name - before_validation :normalize_other_names + normalize :name, :normalize_name + normalize :other_names, :normalize_other_names + array_attribute :other_names # XXX must come after `normalize :other_names` + validate :validate_tag_category validates :name, tag_name: true, uniqueness: true before_save :update_tag_category @@ -55,27 +57,26 @@ class Artist < ApplicationRecord end end - module NameMethods - extend ActiveSupport::Concern - - module ClassMethods + concerning :NameMethods do + class_methods do def normalize_name(name) name.to_s.mb_chars.downcase.strip.gsub(/ /, '_').to_s end - end - def normalize_name - self.name = Artist.normalize_name(name) + def normalize_other_names(other_names) + other_names.map { |name| normalize_other_name(name) }.uniq.reject(&:blank?) + end + + # XXX Differences from wiki page other names: allow uppercase, use NFC + # instead of NFKC, and allow repeated, leading, and trailing underscores. + def normalize_other_name(other_name) + other_name.to_s.unicode_normalize(:nfc).normalize_whitespace.squish.tr(" ", "_") + end end def pretty_name name.tr("_", " ") end - - def normalize_other_names - self.other_names = other_names.map { |x| Artist.normalize_name(x) }.uniq - self.other_names -= [name] - end end module VersionMethods @@ -145,8 +146,6 @@ class Artist < ApplicationRecord artist = Artist.new(params) end - artist.normalize_name - artist.normalize_other_names artist end end @@ -284,7 +283,6 @@ class Artist < ApplicationRecord end include UrlMethods - include NameMethods include VersionMethods extend FactoryMethods include TagMethods diff --git a/app/models/wiki_page.rb b/app/models/wiki_page.rb index 2f0fceb7d..d9da10bd4 100644 --- a/app/models/wiki_page.rb +++ b/app/models/wiki_page.rb @@ -3,12 +3,12 @@ class WikiPage < ApplicationRecord META_WIKIS = ["list_of_", "tag_group:", "pool_group:", "howto:", "about:", "help:", "template:"] - before_validation :normalize_other_names before_save :update_dtext_links, if: :dtext_links_changed? after_save :create_version normalize :title, :normalize_title normalize :body, :normalize_text + normalize :other_names, :normalize_other_names validates :title, tag_name: true, presence: true, uniqueness: true, if: :title_changed? validates :body, presence: true, unless: -> { is_deleted? || other_names.present? } @@ -153,12 +153,12 @@ class WikiPage < ApplicationRecord title.to_s.downcase.delete_prefix("~").gsub(/[[:space:]]+/, "_").squeeze("_").gsub(/\A_|_\z/, "") end - def normalize_other_names - self.other_names = other_names.map { |name| WikiPage.normalize_other_name(name) }.uniq + def self.normalize_other_names(other_names) + other_names.map { |name| normalize_other_name(name) }.uniq.reject(&:blank?) end def self.normalize_other_name(name) - name.unicode_normalize(:nfkc).gsub(/[[:space:]]+/, " ").strip.tr(" ", "_") + name.to_s.unicode_normalize(:nfkc).normalize_whitespace.gsub(/[[:space:]]+/, "_").squeeze("_").gsub(/\A_|_\z/, "") end def category_name diff --git a/script/fixes/071_renormalize_other_names_in_wiki_pages.rb b/script/fixes/071_renormalize_other_names_in_wiki_pages.rb new file mode 100755 index 000000000..2fa7611c5 --- /dev/null +++ b/script/fixes/071_renormalize_other_names_in_wiki_pages.rb @@ -0,0 +1,17 @@ +#!/usr/bin/env ruby + +require_relative "../../config/environment" + +CurrentUser.scoped(User.system) do + WikiPage.transaction do + WikiPage.find_each do |wp| + old_other_names = wp.other_names + wp.other_names = old_other_names + + if wp.changed? + p [wp.id, wp.changes] + wp.save! + end + end + end +end diff --git a/script/fixes/072_renormalize_other_names_in_artists.rb b/script/fixes/072_renormalize_other_names_in_artists.rb new file mode 100755 index 000000000..6c9e1fc88 --- /dev/null +++ b/script/fixes/072_renormalize_other_names_in_artists.rb @@ -0,0 +1,17 @@ +#!/usr/bin/env ruby + +require_relative "../../config/environment" + +CurrentUser.scoped(User.system) do + Artist.transaction do + Artist.find_each do |artist| + old_other_names = artist.other_names + artist.other_names = old_other_names + + if artist.changed? + p [artist.id, artist.changes] + artist.save! + end + end + end +end diff --git a/test/unit/artist_test.rb b/test/unit/artist_test.rb index a7fefeb7e..7fdaa3471 100644 --- a/test/unit/artist_test.rb +++ b/test/unit/artist_test.rb @@ -384,9 +384,27 @@ class ArtistTest < ActiveSupport::TestCase end end - should "normalize its other names" do - artist = FactoryBot.create(:artist, name: "a1", other_names: "a1 aaa aaa AAA bbb ccc_ddd") - assert_equal("aaa bbb ccc_ddd", artist.other_names_string) + context "the #normalize_other_names method" do + subject { build(:artist) } + + should normalize_attribute(:other_names).from([" foo"]).to(["foo"]) + should normalize_attribute(:other_names).from(["foo "]).to(["foo"]) + should normalize_attribute(:other_names).from(["___foo"]).to(["___foo"]) + should normalize_attribute(:other_names).from(["foo___"]).to(["foo___"]) + should normalize_attribute(:other_names).from(["foo\n"]).to(["foo"]) + should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"]) + should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"]) + should normalize_attribute(:other_names).from(["foo___bar"]).to(["foo___bar"]) + should normalize_attribute(:other_names).from([" _Foo Bar_ "]).to(["_Foo_Bar_"]) + should normalize_attribute(:other_names).from(["foo 1", "bar 2"]).to(["foo_1", "bar_2"]) + should normalize_attribute(:other_names).from(["foo", nil, "", " ", "bar"]).to(["foo", "bar"]) + should normalize_attribute(:other_names).from([nil, "", " "]).to([]) + should normalize_attribute(:other_names).from(["pokémon".unicode_normalize(:nfd)]).to(["pokémon".unicode_normalize(:nfkc)]) + should normalize_attribute(:other_names).from(["foo", "foo"]).to(["foo"]) + + should normalize_attribute(:other_names).from("foo foo").to(["foo"]) + should normalize_attribute(:other_names).from("foo bar").to(["foo", "bar"]) + should normalize_attribute(:other_names).from("_foo_ Bar").to(["_foo_", "Bar"]) end should "search on its name should return results" do @@ -550,7 +568,7 @@ class ArtistTest < ActiveSupport::TestCase artist = Artist.new_with_defaults(source: source) assert_equal("niceandcool", artist.name) - assert_equal("nice_and_cool", artist.other_names_string) + assert_equal("Nice_and_Cool niceandcool", artist.other_names_string) assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/users/906442") assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/stacc/niceandcool") end @@ -561,7 +579,7 @@ class ArtistTest < ActiveSupport::TestCase artist = Artist.new_with_defaults(name: "test_artist") assert_equal("test_artist", artist.name) - assert_equal("nice_and_cool niceandcool", artist.other_names_string) + assert_equal("Nice_and_Cool niceandcool", artist.other_names_string) assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/users/906442") assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/stacc/niceandcool") end diff --git a/test/unit/wiki_page_test.rb b/test/unit/wiki_page_test.rb index 6f0f557bf..9b37af9f3 100644 --- a/test/unit/wiki_page_test.rb +++ b/test/unit/wiki_page_test.rb @@ -18,15 +18,6 @@ class WikiPageTest < ActiveSupport::TestCase @wiki_page = FactoryBot.create(:wiki_page, :title => "HOT POTATO", :other_names => "foo*bar baz") end - should "normalize its title" do - assert_equal("hot_potato", @wiki_page.title) - end - - should "normalize its other names" do - @wiki_page.update(:other_names => "foo*bar baz baz 加賀(艦これ)") - assert_equal(%w[foo*bar baz 加賀(艦これ)], @wiki_page.other_names) - end - should "search by title" do matches = WikiPage.titled("hot potato") assert_equal(1, matches.count) @@ -91,6 +82,27 @@ class WikiPageTest < ActiveSupport::TestCase end end + context "the #normalize_other_names method" do + subject { build(:wiki_page) } + + should normalize_attribute(:other_names).from([" foo"]).to(["foo"]) + should normalize_attribute(:other_names).from(["foo "]).to(["foo"]) + should normalize_attribute(:other_names).from(["___foo"]).to(["foo"]) + should normalize_attribute(:other_names).from(["foo___"]).to(["foo"]) + should normalize_attribute(:other_names).from(["foo\n"]).to(["foo"]) + should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"]) + should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"]) + should normalize_attribute(:other_names).from(["foo___bar"]).to(["foo_bar"]) + should normalize_attribute(:other_names).from([" _Foo Bar_ "]).to(["Foo_Bar"]) + should normalize_attribute(:other_names).from(["foo 1", "bar 2"]).to(["foo_1", "bar_2"]) + should normalize_attribute(:other_names).from(["foo", nil, "", " ", "bar"]).to(["foo", "bar"]) + should normalize_attribute(:other_names).from([nil, "", " "]).to([]) + should normalize_attribute(:other_names).from(["pokémon".unicode_normalize(:nfd)]).to(["pokémon".unicode_normalize(:nfkc)]) + should normalize_attribute(:other_names).from(["ABC"]).to(["ABC"]) + should normalize_attribute(:other_names).from(["foo", "foo"]).to(["foo"]) + should normalize_attribute(:other_names).from(%w[foo*bar baz baz 加賀(艦これ)]).to(%w[foo*bar baz 加賀(艦これ)]) + end + context "during title validation" do should normalize_attribute(:title).from(" foo ").to("foo") should normalize_attribute(:title).from("~foo").to("foo")