wikis: normalize Unicode characters in wiki bodies.

* Introduce an abstraction for normalizing attributes. Very loosely
  modeled after https://github.com/fnando/normalize_attributes.
* Normalize wiki bodies to Unicode NFC form.
* Normalize Unicode space characters in wiki bodies (strip zero width
  spaces, normalize line endings to CRLF, normalize Unicode spaces to
  ASCII spaces).
* Trim spaces from the start and end of wiki page bodies. This may cause
  wiki page diffs to show spaces being removed even when the user didn't
  explicitly remove the spaces themselves.
This commit is contained in:
evazion
2020-12-21 03:23:19 -06:00
parent 48ff7c42cd
commit efb836ac02
6 changed files with 76 additions and 5 deletions

View File

@@ -0,0 +1,18 @@
# Concern that adds a `normalize` class macro for cleaning up attribute values
# at assignment time.
#
# Example:
#
#   normalize :body, :normalize_text
module Normalizable
  extend ActiveSupport::Concern

  class_methods do
    # Redefine the `attribute=` writer so that every assigned value is first
    # passed through the class-level method `method_name`, and the result is
    # then handed to the original writer via `super`.
    #
    # @param attribute [Symbol, String] the name of the attribute to normalize
    # @param method_name [Symbol, String] the name of a class method taking the
    #   raw value and returning the normalized value
    def normalize(attribute, method_name)
      define_method("#{attribute}=") do |value|
        # `send` is used so that private class-level normalizers (such as
        # `normalize_text` below) can be used as well.
        normalized_value = self.class.send(method_name, value)
        super(normalized_value)
      end
    end

    private

    # Normalize free-form text: convert it to Unicode NFC form, normalize
    # whitespace with `String#normalize_whitespace`, and trim leading and
    # trailing whitespace.
    #
    # @param text [String, nil] the raw text
    # @return [String] the normalized text; nil is normalized to ""
    def normalize_text(text)
      # `to_s` keeps an explicit nil assignment from raising NoMethodError.
      text.to_s.unicode_normalize(:nfc).normalize_whitespace.strip
    end
  end
end

View File

@@ -3,6 +3,7 @@ class ApplicationRecord < ActiveRecord::Base
include Deletable
include Mentionable
# Normalizable supplies the `normalize` class macro used by models to clean up
# attribute values on assignment.
include Normalizable
extend HasBitFlags
extend Searchable

View File

@@ -3,11 +3,13 @@ class WikiPage < ApplicationRecord
# Title prefixes; presumably these identify "meta" wiki pages — the usage is
# not visible in this part of the file, confirm against callers.
META_WIKIS = ["list_of_", "tag_group:", "pool_group:", "howto:", "about:", "help:", "template:"]
# NOTE(review): `normalize :title` below already normalizes the title on every
# assignment, so this callback looks redundant — confirm before removing.
before_validation :normalize_title
before_validation :normalize_other_names
# Only runs `update_dtext_links` when `dtext_links_changed?` is true.
before_save :update_dtext_links, if: :dtext_links_changed?
after_save :create_version
# Normalize these attributes at assignment time, using the named class methods.
normalize :title, :normalize_title
normalize :body, :normalize_text
# The title is only validated when it has changed.
validates :title, tag_name: true, presence: true, uniqueness: true, if: :title_changed?
# A body is required unless the page is deleted or has other names.
validates :body, presence: true, unless: -> { is_deleted? || other_names.present? }
validate :validate_rename
@@ -151,10 +153,6 @@ class WikiPage < ApplicationRecord
title.to_s.downcase.delete_prefix("~").gsub(/[[:space:]]+/, "_").gsub(/__/, "_").gsub(/\A_|_\z/, "")
end
# Replace the current title with its normalized form, as computed by
# `WikiPage.normalize_title`.
def normalize_title
  normalized = WikiPage.normalize_title(title)
  self.title = normalized
end
# Normalize every entry of `other_names` with `WikiPage.normalize_other_name`
# and drop the duplicates that remain after normalization.
def normalize_other_names
  normalized_names = other_names.map { |other_name| WikiPage.normalize_other_name(other_name) }
  self.other_names = normalized_names.uniq
end

View File

@@ -41,6 +41,19 @@ module Danbooru
pattern = Regexp.escape(pattern).gsub(/\\\*/, ".*")
match?(/\A#{pattern}\z/i)
end
# Return a copy of the string with its whitespace normalized, in three passes:
#
# 1. Horizontal space characters (Unicode space separators and tabs) become
#    ASCII spaces.
# 2. Zero width space characters are removed.
# 3. Line ending characters become CRLF.
def normalize_whitespace
  gsub(/\p{Zs}|\t/, " ")
    .gsub(/[\u180E\u200B\u200C\u200D\u2060\uFEFF]/, "")
    .gsub(/\r?\n|\r|\v|\f|\u0085|\u2028|\u2029/, "\r\n")
end
end
end
end

View File

@@ -14,4 +14,32 @@ class StringTest < ActiveSupport::TestCase
assert_equal('%*%', '*\**'.to_escaped_for_sql_like)
end
end
context "String#normalize_whitespace" do
  should "normalize unicode spaces" do
    # Every input must come out with a single ASCII space between the words.
    ["foo bar", "foo\u00A0bar", "foo\u3000bar"].each do |input|
      assert_equal("foo bar", input.normalize_whitespace)
    end
  end

  should "strip zero width characters" do
    # Every input contains one zero width character that must be removed.
    zero_width_inputs = [
      "foo\u180Ebar",
      "foo\u200Bbar",
      "foo\u200Cbar",
      "foo\u200Dbar",
      "foo\u2060bar",
      "foo\uFEFFbar",
    ]

    zero_width_inputs.each do |input|
      assert_equal("foobar", input.normalize_whitespace)
    end
  end

  should "normalize line endings" do
    # Every input contains one line ending that must be converted to CRLF.
    line_ending_inputs = [
      "foo\r\nbar",
      "foo\nbar",
      "foo\rbar",
      "foo\vbar",
      "foo\fbar",
      "foo\u0085bar",
      "foo\u2028bar",
      "foo\u2029bar",
    ]

    line_ending_inputs.each do |input|
      assert_equal("foo\r\nbar", input.normalize_whitespace)
    end
  end
end
end

View File

@@ -78,6 +78,19 @@ class WikiPageTest < ActiveSupport::TestCase
end
end
context "the wiki body" do
  should "be normalized to NFC" do
    # Decomposed form: "e" followed by \u0301 (combining acute accent).
    decomposed = "Poke\u0301mon"
    # Composed form: \u00E9 (é as a single codepoint).
    composed = "Pok\u00E9mon"

    @wiki = create(:wiki_page, body: decomposed)
    assert_equal(composed, @wiki.body)
  end

  should "normalize line endings and trim spaces" do
    raw_body = " foo\nbar\n"
    expected_body = "foo\r\nbar"

    @wiki = create(:wiki_page, body: raw_body)
    assert_equal(expected_body, @wiki.body)
  end
end
context "during title validation" do
# these values are allowed because they're normalized first
should allow_value(" foo ").for(:title).on(:create)