diff --git a/app/logical/concerns/normalizable.rb b/app/logical/concerns/normalizable.rb
new file mode 100644
index 000000000..7cad97e61
--- /dev/null
+++ b/app/logical/concerns/normalizable.rb
@@ -0,0 +1,32 @@
+# Adds a `normalize` class macro that passes every value assigned to an
+# attribute through a class-level normalizer method before it is stored.
+module Normalizable
+  extend ActiveSupport::Concern
+
+  class_methods do
+    # Overrides the `attribute=` writer so the assigned value is first
+    # transformed by the class method named `method_name`.
+    #
+    # @param attribute [Symbol] name of the attribute whose writer is wrapped
+    # @param method_name [Symbol] class method that normalizes the value
+    def normalize(attribute, method_name)
+      define_method("#{attribute}=") do |value|
+        # `send` lets this reach private normalizers such as `normalize_text`.
+        normalized_value = self.class.send(method_name, value)
+        super(normalized_value)
+      end
+    end
+
+    private
+
+    # Converts text to Unicode NFC, normalizes its whitespace, and strips
+    # leading and trailing whitespace.
+    #
+    # @param text [String, nil] raw value; nil is treated as an empty string
+    # @return [String] the normalized text
+    def normalize_text(text)
+      # `to_s` keeps assigning nil from raising NoMethodError.
+      text.to_s.unicode_normalize(:nfc).normalize_whitespace.strip
+    end
+  end
+end
diff --git a/app/models/application_record.rb b/app/models/application_record.rb
index e162b39dd..fc8391890 100644
--- a/app/models/application_record.rb
+++ b/app/models/application_record.rb
@@ -3,6 +3,7 @@ class ApplicationRecord < ActiveRecord::Base
 
   include Deletable
   include Mentionable
+  include Normalizable
   extend HasBitFlags
   extend Searchable
 
diff --git a/app/models/wiki_page.rb b/app/models/wiki_page.rb
index 79922b98c..236ba0410 100644
--- a/app/models/wiki_page.rb
+++ b/app/models/wiki_page.rb
@@ -3,11 +3,13 @@ class WikiPage < ApplicationRecord
 
   META_WIKIS = ["list_of_", "tag_group:", "pool_group:", "howto:", "about:", "help:", "template:"]
 
-  before_validation :normalize_title
   before_validation :normalize_other_names
   before_save :update_dtext_links, if: :dtext_links_changed?
   after_save :create_version
 
+  normalize :title, :normalize_title
+  normalize :body, :normalize_text
+
   validates :title, tag_name: true, presence: true, uniqueness: true, if: :title_changed?
   validates :body, presence: true, unless: -> { is_deleted? || other_names.present? }
   validate :validate_rename
@@ -151,10 +153,6 @@ class WikiPage < ApplicationRecord
     title.to_s.downcase.delete_prefix("~").gsub(/[[:space:]]+/, "_").gsub(/__/, "_").gsub(/\A_|_\z/, "")
   end
 
-  def normalize_title
-    self.title = WikiPage.normalize_title(title)
-  end
-
   def normalize_other_names
     self.other_names = other_names.map { |name| WikiPage.normalize_other_name(name) }.uniq
   end
diff --git a/config/initializers/core_extensions.rb b/config/initializers/core_extensions.rb
index ef7dba48d..2456c6b9a 100644
--- a/config/initializers/core_extensions.rb
+++ b/config/initializers/core_extensions.rb
@@ -41,6 +41,24 @@ module Danbooru
         pattern = Regexp.escape(pattern).gsub(/\\\*/, ".*")
         match?(/\A#{pattern}\z/i)
       end
+
+      # Returns a copy of the string with its whitespace canonicalized:
+      # horizontal spaces become ASCII spaces, zero width characters are
+      # removed, and line endings become CRLF.
+      #
+      # @return [String] the normalized string
+      def normalize_whitespace
+        # Normalize various horizontal space characters to ASCII space.
+        text = gsub(/\p{Zs}|\t/, " ")
+
+        # Strip various zero width space characters.
+        text = text.gsub(/[\u180E\u200B\u200C\u200D\u2060\uFEFF]/, "")
+
+        # Normalize various line ending characters to CRLF.
+        text = text.gsub(/\r?\n|\r|\v|\f|\u0085|\u2028|\u2029/, "\r\n")
+
+        text
+      end
     end
   end
 end
diff --git a/test/unit/string_test.rb b/test/unit/string_test.rb
index 47224ba77..247c637e0 100644
--- a/test/unit/string_test.rb
+++ b/test/unit/string_test.rb
@@ -14,4 +14,32 @@ class StringTest < ActiveSupport::TestCase
       assert_equal('%*%', '*\**'.to_escaped_for_sql_like)
     end
   end
+
+  context "String#normalize_whitespace" do
+    should "normalize unicode spaces" do
+      assert_equal("foo bar", "foo bar".normalize_whitespace)
+      assert_equal("foo bar", "foo\u00A0bar".normalize_whitespace)
+      assert_equal("foo bar", "foo\u3000bar".normalize_whitespace)
+    end
+
+    should "strip zero width characters" do
+      assert_equal("foobar", "foo\u180Ebar".normalize_whitespace)
+      assert_equal("foobar", "foo\u200Bbar".normalize_whitespace)
+      assert_equal("foobar", "foo\u200Cbar".normalize_whitespace)
+      assert_equal("foobar", "foo\u200Dbar".normalize_whitespace)
+      assert_equal("foobar", "foo\u2060bar".normalize_whitespace)
+      assert_equal("foobar", "foo\uFEFFbar".normalize_whitespace)
+    end
+
+    should "normalize line endings" do
+      assert_equal("foo\r\nbar", "foo\r\nbar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\nbar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\rbar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\vbar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\fbar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\u0085bar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\u2028bar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\u2029bar".normalize_whitespace)
+    end
+  end
 end
diff --git a/test/unit/wiki_page_test.rb b/test/unit/wiki_page_test.rb
index 32391cfb8..150561e3b 100644
--- a/test/unit/wiki_page_test.rb
+++ b/test/unit/wiki_page_test.rb
@@ -78,6 +78,24 @@ class WikiPageTest < ActiveSupport::TestCase
       end
     end
 
+    context "the wiki body" do
+      should "be normalized to NFC" do
+        # \u00E9: é; \u0301: acute accent
+        @wiki = create(:wiki_page, body: "Poke\u0301mon")
+        assert_equal("Pok\u00E9mon", @wiki.body)
+      end
+
+      should "normalize line endings and trim spaces" do
+        @wiki = create(:wiki_page, body: " foo\nbar\n")
+        assert_equal("foo\r\nbar", @wiki.body)
+      end
+
+      should "treat a nil body as empty text" do
+        @wiki = build(:wiki_page, body: nil)
+        assert_equal("", @wiki.body)
+      end
+    end
+
     context "during title validation" do
       # these values are allowed because they're normalized first
       should allow_value(" foo ").for(:title).on(:create)