From 61c043c6b1959520101cd836338143367d0d6e64 Mon Sep 17 00:00:00 2001 From: evazion Date: Mon, 31 Jan 2022 10:56:27 -0600 Subject: [PATCH] posts: normalize Unicode to NFC form in post sources. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix strings like "pokémon" (NFD form) and "pokémon" (NFC form) being considered different strings in sources. Also add a fix script to fix existing sources. There were only 15 posts with unnormalized sources. --- app/models/post.rb | 8 +++---- script/fixes/097_normalize_post_sources.rb | 14 +++++++++++ test/unit/post_test.rb | 9 +++++++ test/unit/upload_service_test.rb | 28 ---------------------- 4 files changed, 26 insertions(+), 33 deletions(-) create mode 100755 script/fixes/097_normalize_post_sources.rb diff --git a/app/models/post.rb b/app/models/post.rb index 5c417a4ae..78ba38c16 100644 --- a/app/models/post.rb +++ b/app/models/post.rb @@ -1,6 +1,4 @@ # frozen_string_literal: true -# normalize unicode in non-web sources -# normalize percent-encode unicode in source urls class Post < ApplicationRecord class RevertError < StandardError; end @@ -14,9 +12,9 @@ class Post < ApplicationRecord deletable + normalize :source, :normalize_source before_validation :merge_old_changes before_validation :normalize_tags - before_validation :strip_source before_validation :parse_pixiv_id before_validation :blank_out_nonexistent_parents before_validation :remove_parent_loops @@ -1334,8 +1332,8 @@ class Post < ApplicationRecord self end - def strip_source - self.source = source.try(:strip) + def self.normalize_source(source) + source.to_s.strip.unicode_normalize(:nfc) end def mark_as_translated(params) diff --git a/script/fixes/097_normalize_post_sources.rb b/script/fixes/097_normalize_post_sources.rb new file mode 100755 index 000000000..0f58b4107 --- /dev/null +++ b/script/fixes/097_normalize_post_sources.rb @@ -0,0 +1,14 @@ +#!/usr/bin/env ruby + +require_relative "base" + +with_confirmation do + CurrentUser.scoped(User.system, "127.0.0.1") do + Post.where("source ~ '[^[:ascii:]]'").find_each do |post| + next if post.source.unicode_normalize(:nfc) == post.source + + post.update!(source: post.source) + puts({ id: post.id, old_source: post.source_before_last_save, new_source: post.source }) + end + end +end diff --git a/test/unit/post_test.rb b/test/unit/post_test.rb index 5818676dd..fcb6a917c 100644 --- a/test/unit/post_test.rb +++ b/test/unit/post_test.rb @@ -1340,6 +1340,15 @@ class PostTest < ActiveSupport::TestCase end context "with a source" do + context "that contains unicode characters" do + should "normalize the source to NFC form" do + source1 = "poke\u0301mon" # pokémon (nfd form) + source2 = "pok\u00e9mon" # pokémon (nfc form) + @post.update!(source: source1) + assert_equal(source2, @post.source) + end + end + context "that is not from pixiv" do should "clear the pixiv id" do @post.pixiv_id = 1234 diff --git a/test/unit/upload_service_test.rb b/test/unit/upload_service_test.rb index ab7ba85a6..b00c78785 100644 --- a/test/unit/upload_service_test.rb +++ b/test/unit/upload_service_test.rb @@ -294,32 +294,4 @@ class UploadServiceTest < ActiveSupport::TestCase end end end - - context "#start!" do - subject { UploadService } - - setup do - @source = "https://cdn.donmai.us/original/d3/4e/d34e4cf0a437a5d65f8e82b7bcd02606.jpg" - CurrentUser.user = travel_to(1.month.ago) do - FactoryBot.create(:user) - end - CurrentUser.ip_addr = "127.0.0.1" - end - - teardown do - CurrentUser.user = nil - CurrentUser.ip_addr = nil - end - - context "with a source containing unicode characters" do - should "normalize unicode characters in the source field" do - source1 = "poke\u0301mon" # pokémon (nfd form) - source2 = "pok\u00e9mon" # pokémon (nfc form) - service = subject.new(source: source1, rating: "s", file: upload_file("test/files/test.jpg")) - - assert_nothing_raised { @upload = service.start! } - assert_equal(source2, @upload.source) - end - end - end end