wiki pages, artists: fix normalization of other names.

Fix wiki pages and artists to normalize other names more consistently
and correctly.

For wiki pages, we strip leading / trailing / repeated underscores to
fix user typos, and we normalize to NFKC form to make search more consistent.

For artists, we allow leading / trailing / repeated underscores because
some artist names have these, and we normalize to NFC form because some
artists have weird names that would be lost by NFKC form. This does make
search less consistent.
This commit is contained in:
evazion
2021-01-09 23:38:38 -06:00
parent e788d8d0b6
commit 5962152ee3
6 changed files with 98 additions and 36 deletions

View File

@@ -3,11 +3,13 @@ class Artist < ApplicationRecord
class RevertError < StandardError; end class RevertError < StandardError; end
attr_accessor :url_string_changed attr_accessor :url_string_changed
array_attribute :other_names
deletable deletable
before_validation :normalize_name normalize :name, :normalize_name
before_validation :normalize_other_names normalize :other_names, :normalize_other_names
array_attribute :other_names # XXX must come after `normalize :other_names`
validate :validate_tag_category validate :validate_tag_category
validates :name, tag_name: true, uniqueness: true validates :name, tag_name: true, uniqueness: true
before_save :update_tag_category before_save :update_tag_category
@@ -55,27 +57,26 @@ class Artist < ApplicationRecord
end end
end end
module NameMethods concerning :NameMethods do
extend ActiveSupport::Concern class_methods do
module ClassMethods
def normalize_name(name) def normalize_name(name)
name.to_s.mb_chars.downcase.strip.gsub(/ /, '_').to_s name.to_s.mb_chars.downcase.strip.gsub(/ /, '_').to_s
end end
end
def normalize_name def normalize_other_names(other_names)
self.name = Artist.normalize_name(name) other_names.map { |name| normalize_other_name(name) }.uniq.reject(&:blank?)
end
# XXX Differences from wiki page other names: allow uppercase, use NFC
# instead of NFKC, and allow repeated, leading, and trailing underscores.
def normalize_other_name(other_name)
other_name.to_s.unicode_normalize(:nfc).normalize_whitespace.squish.tr(" ", "_")
end
end end
def pretty_name def pretty_name
name.tr("_", " ") name.tr("_", " ")
end end
def normalize_other_names
self.other_names = other_names.map { |x| Artist.normalize_name(x) }.uniq
self.other_names -= [name]
end
end end
module VersionMethods module VersionMethods
@@ -145,8 +146,6 @@ class Artist < ApplicationRecord
artist = Artist.new(params) artist = Artist.new(params)
end end
artist.normalize_name
artist.normalize_other_names
artist artist
end end
end end
@@ -284,7 +283,6 @@ class Artist < ApplicationRecord
end end
include UrlMethods include UrlMethods
include NameMethods
include VersionMethods include VersionMethods
extend FactoryMethods extend FactoryMethods
include TagMethods include TagMethods

View File

@@ -3,12 +3,12 @@ class WikiPage < ApplicationRecord
META_WIKIS = ["list_of_", "tag_group:", "pool_group:", "howto:", "about:", "help:", "template:"] META_WIKIS = ["list_of_", "tag_group:", "pool_group:", "howto:", "about:", "help:", "template:"]
before_validation :normalize_other_names
before_save :update_dtext_links, if: :dtext_links_changed? before_save :update_dtext_links, if: :dtext_links_changed?
after_save :create_version after_save :create_version
normalize :title, :normalize_title normalize :title, :normalize_title
normalize :body, :normalize_text normalize :body, :normalize_text
normalize :other_names, :normalize_other_names
validates :title, tag_name: true, presence: true, uniqueness: true, if: :title_changed? validates :title, tag_name: true, presence: true, uniqueness: true, if: :title_changed?
validates :body, presence: true, unless: -> { is_deleted? || other_names.present? } validates :body, presence: true, unless: -> { is_deleted? || other_names.present? }
@@ -153,12 +153,12 @@ class WikiPage < ApplicationRecord
title.to_s.downcase.delete_prefix("~").gsub(/[[:space:]]+/, "_").squeeze("_").gsub(/\A_|_\z/, "") title.to_s.downcase.delete_prefix("~").gsub(/[[:space:]]+/, "_").squeeze("_").gsub(/\A_|_\z/, "")
end end
def normalize_other_names def self.normalize_other_names(other_names)
self.other_names = other_names.map { |name| WikiPage.normalize_other_name(name) }.uniq other_names.map { |name| normalize_other_name(name) }.uniq.reject(&:blank?)
end end
def self.normalize_other_name(name) def self.normalize_other_name(name)
name.unicode_normalize(:nfkc).gsub(/[[:space:]]+/, " ").strip.tr(" ", "_") name.to_s.unicode_normalize(:nfkc).normalize_whitespace.gsub(/[[:space:]]+/, "_").squeeze("_").gsub(/\A_|_\z/, "")
end end
def category_name def category_name

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env ruby
require_relative "../../config/environment"
CurrentUser.scoped(User.system) do
WikiPage.transaction do
WikiPage.find_each do |wp|
old_other_names = wp.other_names
wp.other_names = old_other_names
if wp.changed?
p [wp.id, wp.changes]
wp.save!
end
end
end
end

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env ruby
require_relative "../../config/environment"
CurrentUser.scoped(User.system) do
Artist.transaction do
Artist.find_each do |artist|
old_other_names = artist.other_names
artist.other_names = old_other_names
if artist.changed?
p [artist.id, artist.changes]
artist.save!
end
end
end
end

View File

@@ -384,9 +384,27 @@ class ArtistTest < ActiveSupport::TestCase
end end
end end
should "normalize its other names" do context "the #normalize_other_names method" do
artist = FactoryBot.create(:artist, name: "a1", other_names: "a1 aaa aaa AAA bbb ccc_ddd") subject { build(:artist) }
assert_equal("aaa bbb ccc_ddd", artist.other_names_string)
should normalize_attribute(:other_names).from([" foo"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo "]).to(["foo"])
should normalize_attribute(:other_names).from(["___foo"]).to(["___foo"])
should normalize_attribute(:other_names).from(["foo___"]).to(["foo___"])
should normalize_attribute(:other_names).from(["foo\n"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"])
should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"])
should normalize_attribute(:other_names).from(["foo___bar"]).to(["foo___bar"])
should normalize_attribute(:other_names).from([" _Foo Bar_ "]).to(["_Foo_Bar_"])
should normalize_attribute(:other_names).from(["foo 1", "bar 2"]).to(["foo_1", "bar_2"])
should normalize_attribute(:other_names).from(["foo", nil, "", " ", "bar"]).to(["foo", "bar"])
should normalize_attribute(:other_names).from([nil, "", " "]).to([])
should normalize_attribute(:other_names).from(["pokémon".unicode_normalize(:nfd)]).to(["pokémon".unicode_normalize(:nfkc)])
should normalize_attribute(:other_names).from(["foo", "foo"]).to(["foo"])
should normalize_attribute(:other_names).from("foo foo").to(["foo"])
should normalize_attribute(:other_names).from("foo bar").to(["foo", "bar"])
should normalize_attribute(:other_names).from("_foo_ Bar").to(["_foo_", "Bar"])
end end
should "search on its name should return results" do should "search on its name should return results" do
@@ -550,7 +568,7 @@ class ArtistTest < ActiveSupport::TestCase
artist = Artist.new_with_defaults(source: source) artist = Artist.new_with_defaults(source: source)
assert_equal("niceandcool", artist.name) assert_equal("niceandcool", artist.name)
assert_equal("nice_and_cool", artist.other_names_string) assert_equal("Nice_and_Cool niceandcool", artist.other_names_string)
assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/users/906442") assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/users/906442")
assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/stacc/niceandcool") assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/stacc/niceandcool")
end end
@@ -561,7 +579,7 @@ class ArtistTest < ActiveSupport::TestCase
artist = Artist.new_with_defaults(name: "test_artist") artist = Artist.new_with_defaults(name: "test_artist")
assert_equal("test_artist", artist.name) assert_equal("test_artist", artist.name)
assert_equal("nice_and_cool niceandcool", artist.other_names_string) assert_equal("Nice_and_Cool niceandcool", artist.other_names_string)
assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/users/906442") assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/users/906442")
assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/stacc/niceandcool") assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/stacc/niceandcool")
end end

View File

@@ -18,15 +18,6 @@ class WikiPageTest < ActiveSupport::TestCase
@wiki_page = FactoryBot.create(:wiki_page, :title => "HOT POTATO", :other_names => "foo*bar baz") @wiki_page = FactoryBot.create(:wiki_page, :title => "HOT POTATO", :other_names => "foo*bar baz")
end end
should "normalize its title" do
assert_equal("hot_potato", @wiki_page.title)
end
should "normalize its other names" do
@wiki_page.update(:other_names => "foo*bar baz baz 加賀(艦これ)")
assert_equal(%w[foo*bar baz 加賀(艦これ)], @wiki_page.other_names)
end
should "search by title" do should "search by title" do
matches = WikiPage.titled("hot potato") matches = WikiPage.titled("hot potato")
assert_equal(1, matches.count) assert_equal(1, matches.count)
@@ -91,6 +82,27 @@ class WikiPageTest < ActiveSupport::TestCase
end end
end end
context "the #normalize_other_names method" do
subject { build(:wiki_page) }
should normalize_attribute(:other_names).from([" foo"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo "]).to(["foo"])
should normalize_attribute(:other_names).from(["___foo"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo___"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo\n"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"])
should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"])
should normalize_attribute(:other_names).from(["foo___bar"]).to(["foo_bar"])
should normalize_attribute(:other_names).from([" _Foo Bar_ "]).to(["Foo_Bar"])
should normalize_attribute(:other_names).from(["foo 1", "bar 2"]).to(["foo_1", "bar_2"])
should normalize_attribute(:other_names).from(["foo", nil, "", " ", "bar"]).to(["foo", "bar"])
should normalize_attribute(:other_names).from([nil, "", " "]).to([])
should normalize_attribute(:other_names).from(["pokémon".unicode_normalize(:nfd)]).to(["pokémon".unicode_normalize(:nfkc)])
should normalize_attribute(:other_names).from([""]).to(["ABC"])
should normalize_attribute(:other_names).from(["foo", "foo"]).to(["foo"])
should normalize_attribute(:other_names).from(%w[foo*bar baz baz 加賀(艦これ)]).to(%w[foo*bar baz 加賀(艦これ)])
end
context "during title validation" do context "during title validation" do
should normalize_attribute(:title).from(" foo ").to("foo") should normalize_attribute(:title).from(" foo ").to("foo")
should normalize_attribute(:title).from("~foo").to("foo") should normalize_attribute(:title).from("~foo").to("foo")