Perform some scheme and hostname normalization on the URL itself

- Converts scheme and hostname to lowercase
- Converts unicode hostnames into Punycode

This all gets done before the normalized URL gets assigned.

Additionally, this removes the dead, commented-out line for Nicoseiga.
This commit is contained in:
BrokenEagle
2020-05-29 22:34:32 +00:00
parent c21af0c853
commit ed9135bcf3

View File

@@ -20,11 +20,9 @@ class ArtistUrl < ApplicationRecord
nil nil
else else
url = url.sub(%r!^https://!, "http://") url = url.sub(%r!^https://!, "http://")
url = url.sub(%r!^http://([^/]+)!i) { |domain| domain.downcase }
url = url.sub(%r!^http://blog\d+\.fc2!, "http://blog.fc2") url = url.sub(%r!^http://blog\d+\.fc2!, "http://blog.fc2")
url = url.sub(%r!^http://blog-imgs-\d+\.fc2!, "http://blog.fc2") url = url.sub(%r!^http://blog-imgs-\d+\.fc2!, "http://blog.fc2")
url = url.sub(%r!^http://blog-imgs-\d+-\w+\.fc2!, "http://blog.fc2") url = url.sub(%r!^http://blog-imgs-\d+-\w+\.fc2!, "http://blog.fc2")
# url = url.sub(%r!^(http://seiga.nicovideo.jp/user/illust/\d+)\?.+!, '\1/')
url = url.sub(%r!^http://pictures.hentai-foundry.com//!, "http://pictures.hentai-foundry.com/") url = url.sub(%r!^http://pictures.hentai-foundry.com//!, "http://pictures.hentai-foundry.com/")
# XXX should be handled by pixiv strategy. # XXX should be handled by pixiv strategy.
@@ -105,7 +103,15 @@ class ArtistUrl < ApplicationRecord
end end
def normalize def normalize
# Perform some normalization with Addressable on the URL itself
# - Converts scheme and hostname to lowercase
# - Converts unicode hostname to Punycode
uri = Addressable::URI.parse(url)
uri.site = uri.normalized_site
self.url = uri.to_s
self.normalized_url = self.class.normalize(url) self.normalized_url = self.class.normalize(url)
rescue Addressable::URI::InvalidURIError
# Don't bother normalizing the URL if there are errors
end end
def initialize_normalized_url def initialize_normalized_url