twitter: normalize artist commentaries to nfkc (#3719)

Fixes hashtags not being interpreted when the author uses a fullwidth
number sign (＃, U+FF03) instead of the ASCII number sign (#, U+0023).

ref: https://github.com/r888888888/danbooru/issues/3719#issuecomment-419535610
This commit is contained in:
evazion
2018-09-10 21:45:50 -05:00
parent a0ebd90409
commit 9a980367f6
2 changed files with 21 additions and 1 deletions

View File

@@ -106,7 +106,7 @@ module Sources::Strategies
end
url_replacements = url_replacements.to_h
desc = artist_commentary_desc
desc = artist_commentary_desc.unicode_normalize(:nfkc)
desc = CGI::unescapeHTML(desc)
desc = desc.gsub(%r!https?://t\.co/[a-zA-Z0-9]+!i, url_replacements)
desc = desc.gsub(%r!#([^[:space:]]+)!, '"#\\1":[https://twitter.com/hashtag/\\1]')

View File

@@ -193,5 +193,25 @@ module Sources
assert_equal(tags, @site.tags)
end
end
# Verifies that the DText commentary produced for a tweet is NFKC-normalized,
# while the raw commentary (artist_commentary_desc) is returned exactly as the
# author wrote it.
# NOTE(review): both cases resolve a real tweet URL through
# Sources::Strategies.find; presumably this needs network access or recorded
# fixtures -- confirm against the test setup.
context "A tweet containing non-normalized Unicode text" do
  should "be normalized to nfkc" do
    site = Sources::Strategies.find("https://twitter.com/aprilarcus/status/367557195186970624")

    # desc1 is the raw text, written with styled mathematical letters;
    # desc2 is the plain-letter text expected after normalization.
    desc1 = "𝖸𝗈 𝐔𝐧𝐢𝐜𝐨𝐝𝐞 𝗅 𝗁𝖾𝗋𝖽 𝕌 𝗅𝗂𝗄𝖾 𝑡𝑦𝑝𝑒𝑓𝑎𝑐𝑒𝑠 𝗌𝗈 𝗐𝖾 𝗉𝗎𝗍 𝗌𝗈𝗆𝖾 𝚌𝚘𝚍𝚎𝚙𝚘𝚒𝚗𝚝𝚜 𝗂𝗇 𝗒𝗈𝗎𝗋 𝔖𝔲𝔭𝔭𝔩𝔢𝔪𝔢𝔫𝔱𝔞𝔯𝔶 𝔚𝔲𝔩𝔱𝔦𝔩𝔦𝔫𝔤𝔳𝔞𝔩 𝔓𝔩𝔞𝔫𝔢 𝗌𝗈 𝗒𝗈𝗎 𝖼𝖺𝗇 𝓮𝓷𝓬𝓸𝓭𝓮 𝕗𝕠𝕟𝕥𝕤 𝗂𝗇 𝗒𝗈𝗎𝗋 𝒇𝒐𝒏𝒕𝒔."
    desc2 = "Yo Unicode l herd U like typefaces so we put some codepoints in your Supplementary Wultilingval Plane so you can encode fonts in your fonts."

    assert_equal(desc1, site.artist_commentary_desc)
    assert_equal(desc2, site.dtext_artist_commentary_desc)
  end

  should "normalize full-width hashtags" do
    site = Sources::Strategies.find("https://twitter.com/corpsmanWelt/status/1037724260075069441")

    # The raw commentary must keep the fullwidth number sign (U+FF03): with an
    # ASCII "#" here the test would pass without any normalization and would
    # not cover the case it is named for. The DText form uses the ASCII "#"
    # and links the hashtag.
    desc1 = %{新しいおともだち\n＃けものフレンズ https://t.co/sEAuu16yAQ}
    desc2 = %{新しいおともだち\n"#けものフレンズ":[https://twitter.com/hashtag/けものフレンズ]}

    assert_equal(desc1, site.artist_commentary_desc)
    assert_equal(desc2, site.dtext_artist_commentary_desc)
  end
end
end
end