From efb836ac028d49082fa2d7d5879d7de51498eed3 Mon Sep 17 00:00:00 2001
From: evazion
Date: Mon, 21 Dec 2020 03:23:19 -0600
Subject: [PATCH] wikis: normalize Unicode characters in wiki bodies.

* Introduce an abstraction for normalizing attributes. Very loosely
  modeled after https://github.com/fnando/normalize_attributes.
* Normalize wiki bodies to Unicode NFC form.
* Normalize Unicode space characters in wiki bodies (strip zero width
  spaces, normalize line endings to CRLF, normalize Unicode spaces to
  ASCII spaces).
* Trim spaces from the start and end of wiki page bodies. This may cause
  wiki page diffs to show spaces being removed even when the user didn't
  explicitly remove the spaces themselves.
* Treat nil as an empty string when normalizing text, so that assigning
  nil to a normalized attribute doesn't raise NoMethodError.
---
 app/logical/concerns/normalizable.rb   | 18 +++++++++++++++++
 app/models/application_record.rb       |  1 +
 app/models/wiki_page.rb                |  8 +++-----
 config/initializers/core_extensions.rb | 13 ++++++++++++
 test/unit/string_test.rb               | 28 ++++++++++++++++++++++++++
 test/unit/wiki_page_test.rb            | 13 ++++++++++++
 6 files changed, 76 insertions(+), 5 deletions(-)
 create mode 100644 app/logical/concerns/normalizable.rb

diff --git a/app/logical/concerns/normalizable.rb b/app/logical/concerns/normalizable.rb
new file mode 100644
index 000000000..7cad97e61
--- /dev/null
+++ b/app/logical/concerns/normalizable.rb
@@ -0,0 +1,18 @@
+module Normalizable
+  extend ActiveSupport::Concern
+
+  class_methods do
+    def normalize(attribute, method_name)
+      define_method("#{attribute}=") do |value|
+        normalized_value = self.class.send(method_name, value)
+        super(normalized_value)
+      end
+    end
+
+    private
+
+    def normalize_text(text)
+      text.to_s.unicode_normalize(:nfc).normalize_whitespace.strip # to_s: nil becomes ""
+    end
+  end
+end
diff --git a/app/models/application_record.rb b/app/models/application_record.rb
index e162b39dd..fc8391890 100644
--- a/app/models/application_record.rb
+++ b/app/models/application_record.rb
@@ -3,6 +3,7 @@ class ApplicationRecord < ActiveRecord::Base
 
   include Deletable
   include Mentionable
+  include Normalizable
   extend HasBitFlags
   extend Searchable
 
diff --git a/app/models/wiki_page.rb b/app/models/wiki_page.rb
index 79922b98c..236ba0410 100644
--- a/app/models/wiki_page.rb
+++ b/app/models/wiki_page.rb
@@ -3,11 +3,13 @@ class WikiPage < ApplicationRecord
 
   META_WIKIS = ["list_of_", "tag_group:", "pool_group:", "howto:", "about:", "help:", "template:"]
 
-  before_validation :normalize_title
   before_validation :normalize_other_names
   before_save :update_dtext_links, if: :dtext_links_changed?
   after_save :create_version
 
+  normalize :title, :normalize_title
+  normalize :body, :normalize_text
+
   validates :title, tag_name: true, presence: true, uniqueness: true, if: :title_changed?
   validates :body, presence: true, unless: -> { is_deleted? || other_names.present? }
   validate :validate_rename
@@ -151,10 +153,6 @@ class WikiPage < ApplicationRecord
     title.to_s.downcase.delete_prefix("~").gsub(/[[:space:]]+/, "_").gsub(/__/, "_").gsub(/\A_|_\z/, "")
   end
 
-  def normalize_title
-    self.title = WikiPage.normalize_title(title)
-  end
-
   def normalize_other_names
     self.other_names = other_names.map { |name| WikiPage.normalize_other_name(name) }.uniq
   end
diff --git a/config/initializers/core_extensions.rb b/config/initializers/core_extensions.rb
index ef7dba48d..2456c6b9a 100644
--- a/config/initializers/core_extensions.rb
+++ b/config/initializers/core_extensions.rb
@@ -41,6 +41,19 @@ module Danbooru
         pattern = Regexp.escape(pattern).gsub(/\\\*/, ".*")
         match?(/\A#{pattern}\z/i)
       end
+
+      def normalize_whitespace
+        # Normalize various horizontal space characters to ASCII space.
+        text = gsub(/\p{Zs}|\t/, " ")
+
+        # Strip various zero width space characters.
+        text = text.gsub(/[\u180E\u200B\u200C\u200D\u2060\uFEFF]/, "")
+
+        # Normalize various line ending characters to CRLF.
+        text = text.gsub(/\r?\n|\r|\v|\f|\u0085|\u2028|\u2029/, "\r\n")
+
+        text
+      end
     end
   end
 end
diff --git a/test/unit/string_test.rb b/test/unit/string_test.rb
index 47224ba77..247c637e0 100644
--- a/test/unit/string_test.rb
+++ b/test/unit/string_test.rb
@@ -14,4 +14,32 @@ class StringTest < ActiveSupport::TestCase
       assert_equal('%*%', '*\**'.to_escaped_for_sql_like)
     end
   end
+
+  context "String#normalize_whitespace" do
+    should "normalize unicode spaces" do
+      assert_equal("foo bar", "foo bar".normalize_whitespace)
+      assert_equal("foo bar", "foo\u00A0bar".normalize_whitespace)
+      assert_equal("foo bar", "foo\u3000bar".normalize_whitespace)
+    end
+
+    should "strip zero width characters" do
+      assert_equal("foobar", "foo\u180Ebar".normalize_whitespace)
+      assert_equal("foobar", "foo\u200Bbar".normalize_whitespace)
+      assert_equal("foobar", "foo\u200Cbar".normalize_whitespace)
+      assert_equal("foobar", "foo\u200Dbar".normalize_whitespace)
+      assert_equal("foobar", "foo\u2060bar".normalize_whitespace)
+      assert_equal("foobar", "foo\uFEFFbar".normalize_whitespace)
+    end
+
+    should "normalize line endings" do
+      assert_equal("foo\r\nbar", "foo\r\nbar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\nbar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\rbar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\vbar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\fbar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\u0085bar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\u2028bar".normalize_whitespace)
+      assert_equal("foo\r\nbar", "foo\u2029bar".normalize_whitespace)
+    end
+  end
 end
diff --git a/test/unit/wiki_page_test.rb b/test/unit/wiki_page_test.rb
index 32391cfb8..150561e3b 100644
--- a/test/unit/wiki_page_test.rb
+++ b/test/unit/wiki_page_test.rb
@@ -78,6 +78,19 @@ class WikiPageTest < ActiveSupport::TestCase
       end
     end
 
+    context "the wiki body" do
+      should "be normalized to NFC" do
+        # \u00E9: é; \u0301: acute accent
+        @wiki = create(:wiki_page, body: "Poke\u0301mon")
+        assert_equal("Pok\u00E9mon", @wiki.body)
+      end
+
+      should "normalize line endings and trim spaces" do
+        @wiki = create(:wiki_page, body: " foo\nbar\n")
+        assert_equal("foo\r\nbar", @wiki.body)
+      end
+    end
+
     context "during title validation" do
       # these values are allowed because they're normalized first
       should allow_value(" foo ").for(:title).on(:create)