wiki pages, artists: fix normalization of other names.

Fix wiki pages and artists to normalize other names more consistently
and correctly.

For wiki pages, we strip leading / trailing / repeated underscores to
fix user typos, and we normalize to NFKC form to make search more consistent.

For artists, we allow leading / trailing / repeated underscores because
some artist names have these, and we normalize to NFC form because some
artists have weird names that would be lost by NFKC form. This does make
search less consistent.
This commit is contained in:
evazion
2021-01-09 23:38:38 -06:00
parent e788d8d0b6
commit 5962152ee3
6 changed files with 98 additions and 36 deletions

View File

@@ -3,11 +3,13 @@ class Artist < ApplicationRecord
class RevertError < StandardError; end
attr_accessor :url_string_changed
array_attribute :other_names
deletable
before_validation :normalize_name
before_validation :normalize_other_names
normalize :name, :normalize_name
normalize :other_names, :normalize_other_names
array_attribute :other_names # XXX must come after `normalize :other_names`
validate :validate_tag_category
validates :name, tag_name: true, uniqueness: true
before_save :update_tag_category
@@ -55,27 +57,26 @@ class Artist < ApplicationRecord
end
end
module NameMethods
extend ActiveSupport::Concern
module ClassMethods
concerning :NameMethods do
class_methods do
def normalize_name(name)
name.to_s.mb_chars.downcase.strip.gsub(/ /, '_').to_s
end
end
def normalize_name
self.name = Artist.normalize_name(name)
def normalize_other_names(other_names)
other_names.map { |name| normalize_other_name(name) }.uniq.reject(&:blank?)
end
# XXX Differences from wiki page other names: allow uppercase, use NFC
# instead of NFKC, and allow repeated, leading, and trailing underscores.
def normalize_other_name(other_name)
other_name.to_s.unicode_normalize(:nfc).normalize_whitespace.squish.tr(" ", "_")
end
end
def pretty_name
name.tr("_", " ")
end
def normalize_other_names
self.other_names = other_names.map { |x| Artist.normalize_name(x) }.uniq
self.other_names -= [name]
end
end
module VersionMethods
@@ -145,8 +146,6 @@ class Artist < ApplicationRecord
artist = Artist.new(params)
end
artist.normalize_name
artist.normalize_other_names
artist
end
end
@@ -284,7 +283,6 @@ class Artist < ApplicationRecord
end
include UrlMethods
include NameMethods
include VersionMethods
extend FactoryMethods
include TagMethods

View File

@@ -3,12 +3,12 @@ class WikiPage < ApplicationRecord
META_WIKIS = ["list_of_", "tag_group:", "pool_group:", "howto:", "about:", "help:", "template:"]
before_validation :normalize_other_names
before_save :update_dtext_links, if: :dtext_links_changed?
after_save :create_version
normalize :title, :normalize_title
normalize :body, :normalize_text
normalize :other_names, :normalize_other_names
validates :title, tag_name: true, presence: true, uniqueness: true, if: :title_changed?
validates :body, presence: true, unless: -> { is_deleted? || other_names.present? }
@@ -153,12 +153,12 @@ class WikiPage < ApplicationRecord
title.to_s.downcase.delete_prefix("~").gsub(/[[:space:]]+/, "_").squeeze("_").gsub(/\A_|_\z/, "")
end
def normalize_other_names
self.other_names = other_names.map { |name| WikiPage.normalize_other_name(name) }.uniq
def self.normalize_other_names(other_names)
other_names.map { |name| normalize_other_name(name) }.uniq.reject(&:blank?)
end
def self.normalize_other_name(name)
name.unicode_normalize(:nfkc).gsub(/[[:space:]]+/, " ").strip.tr(" ", "_")
name.to_s.unicode_normalize(:nfkc).normalize_whitespace.gsub(/[[:space:]]+/, "_").squeeze("_").gsub(/\A_|_\z/, "")
end
def category_name

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env ruby
require_relative "../../config/environment"
CurrentUser.scoped(User.system) do
WikiPage.transaction do
WikiPage.find_each do |wp|
old_other_names = wp.other_names
wp.other_names = old_other_names
if wp.changed?
p [wp.id, wp.changes]
wp.save!
end
end
end
end

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env ruby
require_relative "../../config/environment"
CurrentUser.scoped(User.system) do
Artist.transaction do
Artist.find_each do |artist|
old_other_names = artist.other_names
artist.other_names = old_other_names
if artist.changed?
p [artist.id, artist.changes]
artist.save!
end
end
end
end

View File

@@ -384,9 +384,27 @@ class ArtistTest < ActiveSupport::TestCase
end
end
should "normalize its other names" do
artist = FactoryBot.create(:artist, name: "a1", other_names: "a1 aaa aaa AAA bbb ccc_ddd")
assert_equal("aaa bbb ccc_ddd", artist.other_names_string)
context "the #normalize_other_names method" do
subject { build(:artist) }
should normalize_attribute(:other_names).from([" foo"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo "]).to(["foo"])
should normalize_attribute(:other_names).from(["___foo"]).to(["___foo"])
should normalize_attribute(:other_names).from(["foo___"]).to(["foo___"])
should normalize_attribute(:other_names).from(["foo\n"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"])
should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"])
should normalize_attribute(:other_names).from(["foo___bar"]).to(["foo___bar"])
should normalize_attribute(:other_names).from([" _Foo Bar_ "]).to(["_Foo_Bar_"])
should normalize_attribute(:other_names).from(["foo 1", "bar 2"]).to(["foo_1", "bar_2"])
should normalize_attribute(:other_names).from(["foo", nil, "", " ", "bar"]).to(["foo", "bar"])
should normalize_attribute(:other_names).from([nil, "", " "]).to([])
should normalize_attribute(:other_names).from(["pokémon".unicode_normalize(:nfd)]).to(["pokémon".unicode_normalize(:nfkc)])
should normalize_attribute(:other_names).from(["foo", "foo"]).to(["foo"])
should normalize_attribute(:other_names).from("foo foo").to(["foo"])
should normalize_attribute(:other_names).from("foo bar").to(["foo", "bar"])
should normalize_attribute(:other_names).from("_foo_ Bar").to(["_foo_", "Bar"])
end
should "search on its name should return results" do
@@ -550,7 +568,7 @@ class ArtistTest < ActiveSupport::TestCase
artist = Artist.new_with_defaults(source: source)
assert_equal("niceandcool", artist.name)
assert_equal("nice_and_cool", artist.other_names_string)
assert_equal("Nice_and_Cool niceandcool", artist.other_names_string)
assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/users/906442")
assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/stacc/niceandcool")
end
@@ -561,7 +579,7 @@ class ArtistTest < ActiveSupport::TestCase
artist = Artist.new_with_defaults(name: "test_artist")
assert_equal("test_artist", artist.name)
assert_equal("nice_and_cool niceandcool", artist.other_names_string)
assert_equal("Nice_and_Cool niceandcool", artist.other_names_string)
assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/users/906442")
assert_includes(artist.urls.map(&:url), "https://www.pixiv.net/stacc/niceandcool")
end

View File

@@ -18,15 +18,6 @@ class WikiPageTest < ActiveSupport::TestCase
@wiki_page = FactoryBot.create(:wiki_page, :title => "HOT POTATO", :other_names => "foo*bar baz")
end
should "normalize its title" do
assert_equal("hot_potato", @wiki_page.title)
end
should "normalize its other names" do
@wiki_page.update(:other_names => "foo*bar baz baz 加賀(艦これ)")
assert_equal(%w[foo*bar baz 加賀(艦これ)], @wiki_page.other_names)
end
should "search by title" do
matches = WikiPage.titled("hot potato")
assert_equal(1, matches.count)
@@ -91,6 +82,27 @@ class WikiPageTest < ActiveSupport::TestCase
end
end
context "the #normalize_other_names method" do
subject { build(:wiki_page) }
should normalize_attribute(:other_names).from([" foo"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo "]).to(["foo"])
should normalize_attribute(:other_names).from(["___foo"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo___"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo\n"]).to(["foo"])
should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"])
should normalize_attribute(:other_names).from(["foo bar"]).to(["foo_bar"])
should normalize_attribute(:other_names).from(["foo___bar"]).to(["foo_bar"])
should normalize_attribute(:other_names).from([" _Foo Bar_ "]).to(["Foo_Bar"])
should normalize_attribute(:other_names).from(["foo 1", "bar 2"]).to(["foo_1", "bar_2"])
should normalize_attribute(:other_names).from(["foo", nil, "", " ", "bar"]).to(["foo", "bar"])
should normalize_attribute(:other_names).from([nil, "", " "]).to([])
should normalize_attribute(:other_names).from(["pokémon".unicode_normalize(:nfd)]).to(["pokémon".unicode_normalize(:nfkc)])
should normalize_attribute(:other_names).from([""]).to(["ABC"])
should normalize_attribute(:other_names).from(["foo", "foo"]).to(["foo"])
should normalize_attribute(:other_names).from(%w[foo*bar baz baz 加賀(艦これ)]).to(%w[foo*bar baz 加賀(艦これ)])
end
context "during title validation" do
should normalize_attribute(:title).from(" foo ").to("foo")
should normalize_attribute(:title).from("~foo").to("foo")