From d18dc573fb151f82a06ad7bf36794b7488a0fcbd Mon Sep 17 00:00:00 2001 From: evazion Date: Sun, 10 Jan 2021 02:46:20 -0600 Subject: [PATCH] artists: fix misnormalization of emoji in other names. Fix `normalize_whitespace` to not strip zero-width joiner characters (U+200D). These characters are used in emoji and stripping them breaks some artist other names that use emoji. --- config/initializers/core_extensions.rb | 5 +++-- test/unit/artist_test.rb | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/config/initializers/core_extensions.rb b/config/initializers/core_extensions.rb index 2456c6b9a..7b8b87249 100644 --- a/config/initializers/core_extensions.rb +++ b/config/initializers/core_extensions.rb @@ -46,8 +46,9 @@ module Danbooru # Normalize various horizontal space characters to ASCII space. text = gsub(/\p{Zs}|\t/, " ") - # Strip various zero width space characters. - text = text.gsub(/[\u180E\u200B\u200C\u200D\u2060\uFEFF]/, "") + # Strip various zero width space characters. Zero width joiner (200D) + # is allowed because it's used in emoji. + text = text.gsub(/[\u180E\u200B\u200C\u2060\uFEFF]/, "") # Normalize various line ending characters to CRLF. text = text.gsub(/\r?\n|\r|\v|\f|\u0085|\u2028|\u2029/, "\r\n") diff --git a/test/unit/artist_test.rb b/test/unit/artist_test.rb index 7fdaa3471..9d17b8c0f 100644 --- a/test/unit/artist_test.rb +++ b/test/unit/artist_test.rb @@ -401,6 +401,7 @@ class ArtistTest < ActiveSupport::TestCase should normalize_attribute(:other_names).from([nil, "", " "]).to([]) should normalize_attribute(:other_names).from(["pokémon".unicode_normalize(:nfd)]).to(["pokémon".unicode_normalize(:nfkc)]) should normalize_attribute(:other_names).from(["foo", "foo"]).to(["foo"]) + should normalize_attribute(:other_names).from(["🏳️‍🌈"]).to(["🏳️‍🌈"]) should normalize_attribute(:other_names).from("foo foo").to(["foo"]) should normalize_attribute(:other_names).from("foo bar").to(["foo", "bar"])