Perform some scheme and hostname normalization on the URL itself

- Converts scheme and hostname to lowercase
- Converts unicode hostnames into Punycode

This all gets done before the normalized URL gets assigned.

Additionally, this removes the dead, commented-out line for Nicoseiga.
This commit is contained in:
BrokenEagle
2020-05-29 22:34:32 +00:00
parent c21af0c853
commit ed9135bcf3

View File

@@ -20,11 +20,9 @@ class ArtistUrl < ApplicationRecord
nil
else
url = url.sub(%r!^https://!, "http://")
url = url.sub(%r!^http://([^/]+)!i) { |domain| domain.downcase }
url = url.sub(%r!^http://blog\d+\.fc2!, "http://blog.fc2")
url = url.sub(%r!^http://blog-imgs-\d+\.fc2!, "http://blog.fc2")
url = url.sub(%r!^http://blog-imgs-\d+-\w+\.fc2!, "http://blog.fc2")
# url = url.sub(%r!^(http://seiga.nicovideo.jp/user/illust/\d+)\?.+!, '\1/')
url = url.sub(%r!^http://pictures.hentai-foundry.com//!, "http://pictures.hentai-foundry.com/")
# XXX should be handled by pixiv strategy.
@@ -105,7 +103,15 @@ class ArtistUrl < ApplicationRecord
end
def normalize
# Perform some normalization with Addressable on the URL itself
# - Converts scheme and hostname to downcase
# - Converts unicode hostname to Punycode
uri = Addressable::URI.parse(url)
uri.site = uri.normalized_site
self.url = uri.to_s
self.normalized_url = self.class.normalize(url)
rescue Addressable::URI::InvalidURIError
# Don't bother normalizing the URL if there is errors
end
def initialize_normalized_url