wikis: normalize Unicode characters in wiki bodies.
* Introduce an abstraction for normalizing attributes. Very loosely modeled after https://github.com/fnando/normalize_attributes.
* Normalize wiki bodies to Unicode NFC form.
* Normalize Unicode space characters in wiki bodies (strip zero width spaces, normalize line endings to CRLF, normalize Unicode spaces to ASCII spaces).
* Trim spaces from the start and end of wiki page bodies. This may cause wiki page diffs to show spaces being removed even when the user didn't explicitly remove the spaces themselves.
This commit is contained in:
18
app/logical/concerns/normalizable.rb
Normal file
18
app/logical/concerns/normalizable.rb
Normal file
@@ -0,0 +1,18 @@
|
||||
# Adds a class-level `normalize` macro that wraps an attribute's writer so
# every assigned value is passed through a normalizer before it is stored.
#
# Usage (inside a class that includes this concern):
#
#   normalize :body, :normalize_text
module Normalizable
  extend ActiveSupport::Concern

  class_methods do
    # Redefine the `attribute=` writer so the value is normalized on assignment.
    #
    # @param attribute [Symbol, String] name of the attribute whose writer is wrapped
    # @param method_name [Symbol] name of a class-level method that takes the
    #   raw value and returns the normalized value
    def normalize(attribute, method_name)
      define_method("#{attribute}=") do |value|
        # `send` keeps normalizers declared under `private` (see below) callable.
        normalized_value = self.class.send(method_name, value)

        # Explicit arguments are required: implicit `super` is not allowed
        # inside a method created with `define_method`.
        super(normalized_value)
      end
    end

    private

    # General-purpose text normalizer: converts to Unicode NFC form, normalizes
    # whitespace via String#normalize_whitespace, then trims leading and
    # trailing whitespace.
    #
    # The value is coerced with `to_s` first, so assigning `nil` yields an
    # empty string instead of raising NoMethodError.
    #
    # @param text [String, nil] the raw attribute value
    # @return [String] the normalized text
    def normalize_text(text)
      text.to_s.unicode_normalize(:nfc).normalize_whitespace.strip
    end
  end
end
|
||||
@@ -3,6 +3,7 @@ class ApplicationRecord < ActiveRecord::Base
|
||||
|
||||
include Deletable
|
||||
include Mentionable
|
||||
include Normalizable
|
||||
extend HasBitFlags
|
||||
extend Searchable
|
||||
|
||||
|
||||
@@ -3,11 +3,13 @@ class WikiPage < ApplicationRecord
|
||||
|
||||
META_WIKIS = ["list_of_", "tag_group:", "pool_group:", "howto:", "about:", "help:", "template:"]
|
||||
|
||||
before_validation :normalize_title
|
||||
before_validation :normalize_other_names
|
||||
before_save :update_dtext_links, if: :dtext_links_changed?
|
||||
after_save :create_version
|
||||
|
||||
normalize :title, :normalize_title
|
||||
normalize :body, :normalize_text
|
||||
|
||||
validates :title, tag_name: true, presence: true, uniqueness: true, if: :title_changed?
|
||||
validates :body, presence: true, unless: -> { is_deleted? || other_names.present? }
|
||||
validate :validate_rename
|
||||
@@ -151,10 +153,6 @@ class WikiPage < ApplicationRecord
|
||||
title.to_s.downcase.delete_prefix("~").gsub(/[[:space:]]+/, "_").gsub(/__/, "_").gsub(/\A_|_\z/, "")
|
||||
end
|
||||
|
||||
# Replace this page's title with its normalized form, as computed by the
# class-level WikiPage.normalize_title.
def normalize_title
  normalized = WikiPage.normalize_title(title)
  self.title = normalized
end
|
||||
|
||||
# Normalize each entry of `other_names` with WikiPage.normalize_other_name,
# then drop duplicates that result from normalization.
def normalize_other_names
  normalized_names = other_names.map do |other_name|
    WikiPage.normalize_other_name(other_name)
  end

  self.other_names = normalized_names.uniq
end
|
||||
|
||||
@@ -41,6 +41,19 @@ module Danbooru
|
||||
pattern = Regexp.escape(pattern).gsub(/\\\*/, ".*")
|
||||
match?(/\A#{pattern}\z/i)
|
||||
end
|
||||
|
||||
# Return a copy of the string with whitespace canonicalized. Three passes are
# applied in order:
#
# 1. Unicode space separators (category Zs) and tabs become an ASCII space.
# 2. Zero width characters (U+180E, U+200B, U+200C, U+200D, U+2060, U+FEFF)
#    are removed.
# 3. Every line ending variant (CRLF, LF, CR, VT, FF, NEL, LS, PS) becomes CRLF.
#
# NOTE(review): U+200C/U+200D are also used for emoji sequences and script
# shaping; stripping them looks intentional here -- confirm if that matters.
#
# @return [String] the normalized string; the receiver is not modified
def normalize_whitespace
  gsub(/\p{Zs}|\t/, " ")
    .gsub(/[\u180E\u200B\u200C\u200D\u2060\uFEFF]/, "")
    .gsub(/\r?\n|\r|\v|\f|\u0085|\u2028|\u2029/, "\r\n")
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -14,4 +14,32 @@ class StringTest < ActiveSupport::TestCase
|
||||
assert_equal('%*%', '*\**'.to_escaped_for_sql_like)
|
||||
end
|
||||
end
|
||||
|
||||
# Covers the three passes of String#normalize_whitespace: space
# normalization, zero width character stripping, and line ending
# normalization.
context "String#normalize_whitespace" do
  should "normalize unicode spaces" do
    ["foo bar", "foo\u00A0bar", "foo\u3000bar"].each do |input|
      assert_equal("foo bar", input.normalize_whitespace)
    end
  end

  should "strip zero width characters" do
    inputs = [
      "foo\u180Ebar",
      "foo\u200Bbar",
      "foo\u200Cbar",
      "foo\u200Dbar",
      "foo\u2060bar",
      "foo\uFEFFbar",
    ]

    inputs.each do |input|
      assert_equal("foobar", input.normalize_whitespace)
    end
  end

  should "normalize line endings" do
    inputs = [
      "foo\r\nbar",
      "foo\nbar",
      "foo\rbar",
      "foo\vbar",
      "foo\fbar",
      "foo\u0085bar",
      "foo\u2028bar",
      "foo\u2029bar",
    ]

    inputs.each do |input|
      assert_equal("foo\r\nbar", input.normalize_whitespace)
    end
  end
end
|
||||
end
|
||||
|
||||
@@ -78,6 +78,19 @@ class WikiPageTest < ActiveSupport::TestCase
|
||||
end
|
||||
end
|
||||
|
||||
# Checks that assigning a wiki body runs it through the text normalizer.
context "the wiki body" do
  should "be normalized to NFC" do
    # "e" followed by U+0301 (combining acute accent) composes to U+00E9 (é).
    decomposed_body = "Poke\u0301mon"
    @wiki = create(:wiki_page, body: decomposed_body)

    assert_equal("Pok\u00E9mon", @wiki.body)
  end

  should "normalize line endings and trim spaces" do
    # Leading space and trailing newline are trimmed; the inner LF becomes CRLF.
    untrimmed_body = " foo\nbar\n"
    @wiki = create(:wiki_page, body: untrimmed_body)

    assert_equal("foo\r\nbar", @wiki.body)
  end
end
|
||||
|
||||
context "during title validation" do
|
||||
# these values are allowed because they're normalized first
|
||||
should allow_value(" foo ").for(:title).on(:create)
|
||||
|
||||
Reference in New Issue
Block a user