diff --git a/app/logical/sources/strategies/twitter.rb b/app/logical/sources/strategies/twitter.rb index 8bb42f523..723664170 100644 --- a/app/logical/sources/strategies/twitter.rb +++ b/app/logical/sources/strategies/twitter.rb @@ -106,7 +106,7 @@ module Sources::Strategies end url_replacements = url_replacements.to_h - desc = artist_commentary_desc + desc = artist_commentary_desc.unicode_normalize(:nfkc) desc = CGI::unescapeHTML(desc) desc = desc.gsub(%r!https?://t\.co/[a-zA-Z0-9]+!i, url_replacements) desc = desc.gsub(%r!#([^[:space:]]+)!, '"#\\1":[https://twitter.com/hashtag/\\1]') diff --git a/test/unit/sources/twitter_test.rb b/test/unit/sources/twitter_test.rb index f6a0272eb..91089f0b4 100644 --- a/test/unit/sources/twitter_test.rb +++ b/test/unit/sources/twitter_test.rb @@ -193,5 +193,25 @@ module Sources assert_equal(tags, @site.tags) end end + + context "A tweet containing non-normalized Unicode text" do + should "be normalized to nfkc" do + site = Sources::Strategies.find("https://twitter.com/aprilarcus/status/367557195186970624") + desc1 = "𝖸𝗈 𝐔𝐧đĸ𝐜𝐨𝐝𝐞 𝗅 𝗁𝖾𝗋đ–Ŋ 𝕌 𝗅𝗂𝗄𝖾 𝑡đ‘Ļ𝑝𝑒𝑓𝑎𝑐𝑒𝑠 𝗌𝗈 𝗐𝖾 𝗉𝗎𝗍 𝗌𝗈𝗆𝖾 𝚌𝚘𝚍𝚎𝚙𝚘𝚒𝚗𝚝𝚜 𝗂𝗇 𝗒𝗈𝗎𝗋 𝔖𝔲𝔭𝔭𝔩đ”ĸđ”Ēđ”ĸđ”Ģđ”ąđ”žđ”¯đ”ļ 𝔚𝔲𝔩𝔱đ”Ļ𝔩đ”Ļđ”Ģđ”¤đ”ŗđ”žđ”Š 𝔓𝔩𝔞đ”Ģđ”ĸ 𝗌𝗈 𝗒𝗈𝗎 đ–ŧđ–ē𝗇 𝓮𝓷đ“Ŧ𝓸𝓭𝓮 𝕗𝕠𝕟đ•Ĩ𝕤 𝗂𝗇 𝗒𝗈𝗎𝗋 𝒇𝒐𝒏𝒕𝒔." + desc2 = "Yo Unicode l herd U like typefaces so we put some codepoints in your Supplementary Wultilingval Plane so you can encode fonts in your fonts." + + assert_equal(desc1, site.artist_commentary_desc) + assert_equal(desc2, site.dtext_artist_commentary_desc) + end + + should "normalize full-width hashtags" do + site = Sources::Strategies.find("https://twitter.com/corpsmanWelt/status/1037724260075069441") + desc1 = %{æ–°ã—ã„ãŠã¨ã‚‚ã ãĄ\nīŧƒã‘もぎフãƒŦãƒŗã‚ē https://t.co/sEAuu16yAQ} + desc2 = %{æ–°ã—ã„ãŠã¨ã‚‚ã ãĄ\n"#けもぎフãƒŦãƒŗã‚ē":[https://twitter.com/hashtag/けもぎフãƒŦãƒŗã‚ē]} + + assert_equal(desc1, site.artist_commentary_desc) + assert_equal(desc2, site.dtext_artist_commentary_desc) + end + end end end