From c21af0c8539d4687bbe00909b7fce04aafbe4209 Mon Sep 17 00:00:00 2001 From: BrokenEagle Date: Fri, 29 May 2020 22:33:46 +0000 Subject: [PATCH 1/2] Fix invalid artist URLs being allowed The problem was that the Addressable parser does not catch all invalid URL cases, so some extra checks were added in. - hostname must contain a dot This accounts for URLs of the following type: http://http://something.com which has a hostname of http. The artist URL tests were also updated with cases which test all validation errors. --- app/models/artist_url.rb | 11 ++++++++++- test/unit/artist_url_test.rb | 14 +++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/app/models/artist_url.rb b/app/models/artist_url.rb index e39d3db65..0f7854d5f 100644 --- a/app/models/artist_url.rb +++ b/app/models/artist_url.rb @@ -120,9 +120,18 @@ class ArtistUrl < ApplicationRecord end end + def validate_scheme(uri) + errors[:url] << "'#{uri}' must begin with http:// or https:// " unless uri.scheme.in?(%w[http https]) + end + + def validate_hostname(uri) + errors[:url] << "'#{uri}' has a hostname '#{uri.host}' that does not contain a dot" unless uri.host&.include?('.') + end + def validate_url_format uri = Addressable::URI.parse(url) - errors[:url] << "'#{uri}' must begin with http:// or https:// " if !uri.scheme.in?(%w[http https]) + validate_scheme(uri) + validate_hostname(uri) rescue Addressable::URI::InvalidURIError => error errors[:url] << "'#{uri}' is malformed: #{error}" end diff --git a/test/unit/artist_url_test.rb b/test/unit/artist_url_test.rb index b17dc7cef..76eca5459 100644 --- a/test/unit/artist_url_test.rb +++ b/test/unit/artist_url_test.rb @@ -24,10 +24,18 @@ class ArtistUrlTest < ActiveSupport::TestCase end should "disallow invalid urls" do - url = FactoryBot.build(:artist_url, url: "www.example.com") + urls = [ + FactoryBot.build(:artist_url, url: "www.example.com"), + FactoryBot.build(:artist_url, url: ":www.example.com"), + FactoryBot.build(:artist_url, url: "http://http://www.example.com"), + ] - assert_equal(false, url.valid?) - assert_match(/must begin with http/, url.errors.full_messages.join) + assert_equal(false, urls[0].valid?) + assert_match(/must begin with http/, urls[0].errors.full_messages.join) + assert_equal(false, urls[1].valid?) + assert_match(/is malformed/, urls[1].errors.full_messages.join) + assert_equal(false, urls[2].valid?) + assert_match(/that does not contain a dot/, urls[2].errors.full_messages.join) end should "always add a trailing slash when normalized" do From ed9135bcf3b8c715a9a56f61b6b3a4c4e5a3d88c Mon Sep 17 00:00:00 2001 From: BrokenEagle Date: Fri, 29 May 2020 22:34:32 +0000 Subject: [PATCH 2/2] Perform some scheme and hostname normalization on the URL itself - Converts scheme and hostname to lowercase - Converts unicode hostnames into Punycode This all gets done before the normalized URL gets assigned. Additionally, this removes the dead commented out line for Nicoseiga. --- app/models/artist_url.rb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/app/models/artist_url.rb b/app/models/artist_url.rb index 0f7854d5f..54c743bbe 100644 --- a/app/models/artist_url.rb +++ b/app/models/artist_url.rb @@ -20,11 +20,9 @@ class ArtistUrl < ApplicationRecord nil else url = url.sub(%r!^https://!, "http://") - url = url.sub(%r!^http://([^/]+)!i) { |domain| domain.downcase } url = url.sub(%r!^http://blog\d+\.fc2!, "http://blog.fc2") url = url.sub(%r!^http://blog-imgs-\d+\.fc2!, "http://blog.fc2") url = url.sub(%r!^http://blog-imgs-\d+-\w+\.fc2!, "http://blog.fc2") - # url = url.sub(%r!^(http://seiga.nicovideo.jp/user/illust/\d+)\?.+!, '\1/') url = url.sub(%r!^http://pictures.hentai-foundry.com//!, "http://pictures.hentai-foundry.com/") # XXX should be handled by pixiv strategy. @@ -105,7 +103,15 @@ class ArtistUrl < ApplicationRecord end def normalize + # Perform some normalization with Addressable on the URL itself + # - Converts scheme and hostname to downcase + # - Converts unicode hostname to Punycode + uri = Addressable::URI.parse(url) + uri.site = uri.normalized_site + self.url = uri.to_s self.normalized_url = self.class.normalize(url) + rescue Addressable::URI::InvalidURIError + # Don't bother normalizing the URL if there is errors end def initialize_normalized_url