sources: refactor normalize_for_source.

`normalize_for_source` was used to convert image URLs to page URLs when displaying sources
on the post show page. Move all the code for converting image URLs to page URLs from
`Sources::Strategies#normalize_for_source` to `Source::URL#page_url`.

Before we had to be very careful in source strategies not to make any network calls in
`normalize_for_source`, since it was used in the view for the post show page. Now all the
code for generating page URLs is isolated in Source::URL, which makes source strategies
simpler. It also makes it easier to check if a source is an image URL or page URL, and if
the image URL is convertible to a page URL, which will make autotagging bad_link or
bad_source feasible.

Finally, this fixes it to generate better page URLs in a handful of cases:

* https://www.artstation.com/artwork/qPVGP instead of https://anubis1982918.artstation.com/projects/qPVGP
* https://yande.re/post/show?md5=b4b1d11facd1700544554e4805d47bb6s instead of https://yande.re/post?tags=md5:b4b1d11facd1700544554e4805d47bb6
* http://gallery.minitokyo.net/view/365677 instead of http://gallery.minitokyo.net/download/365677
* https://valkyriecrusade.fandom.com/wiki/File:Crimson_Hatsune_H.png instead of https://valkyriecrusade.wikia.com/wiki/File:Crimson_Hatsune_H.png
* https://rule34.paheal.net/post/view/852405 instead of https://rule34.paheal.net/post/list/md5:854806addcd3b1246424e7cea49afe31/1
This commit is contained in:
evazion
2022-03-23 00:41:56 -05:00
parent 770f850c66
commit 3aa5cab2aa
59 changed files with 471 additions and 484 deletions

View File

@@ -23,7 +23,6 @@ module Sources
Strategies::Foundation,
Strategies::Plurk,
Strategies::Tinami,
Strategies::TwitPic,
Strategies::Fantia,
]
end
@@ -36,9 +35,5 @@ module Sources
def self.canonical(url, referer)
find(url, referer).canonical_url
end
def self.normalize_source(url)
find(url).normalize_for_source || url
end
end
end

View File

@@ -52,16 +52,6 @@ module Sources::Strategies
end
end
def normalize_for_source
return if project_id.blank?
if artist_name_from_url.present?
"https://#{artist_name_from_url}.artstation.com/projects/#{project_id}"
else
"https://www.artstation.com/artwork/#{project_id}"
end
end
def image_urls_from_api
api_response[:assets].to_a.map do |asset|
if asset[:asset_type] == "image"

View File

@@ -138,12 +138,6 @@ module Sources
end
memoize :http_downloader
# Given a post/image url, this is the normalized url that will be displayed in a post's page in its stead.
# This function should never make any network call, even indirectly. Return nil to never normalize.
def normalize_for_source
nil
end
def artists
ArtistFinder.find_artists(profile_url)
end

View File

@@ -65,10 +65,6 @@ module Sources
end
end
def normalize_for_source
page_url_from_image_url
end
def profile_url
return nil if artist_name.blank?
"https://www.deviantart.com/#{artist_name.downcase}"

View File

@@ -34,19 +34,6 @@ module Sources
end
end
def normalize_for_source
if illust_id.present?
if artist_name_from_url.present?
"https://#{artist_name_from_url}.fanbox.cc/posts/#{illust_id}"
elsif artist_id_from_url.present?
"https://www.pixiv.net/fanbox/creator/#{artist_id_from_url}/post/#{illust_id}"
end
elsif artist_id_from_url.present?
# Cover images
"https://www.pixiv.net/fanbox/creator/#{artist_id_from_url}"
end
end
def profile_url
return if artist_name.blank?

View File

@@ -128,10 +128,6 @@ module Sources::Strategies
DText.from_html(artist_commentary_desc)
end
def normalize_for_source
page_url
end
def work_type
parsed_url.work_type || parsed_referer&.work_type
end

View File

@@ -76,10 +76,6 @@ module Sources
DText.from_html(artist_commentary_desc)
end
def normalize_for_source
page_url
end
def api_response
return {} if page.nil?

View File

@@ -68,10 +68,6 @@ module Sources
DText.from_html(artist_commentary_desc).gsub(/\A[[:space:]]+|[[:space:]]+\z/, "").gsub(/\n+/, "\n")
end
def normalize_for_source
page_url
end
def illust_id
parsed_url.work_id || parsed_referer&.work_id
end

View File

@@ -47,10 +47,6 @@ module Sources
page&.search(".ct .text, .content .text, .posts .photo .text").to_a.compact.first&.to_html
end
def normalize_for_source
page_url
end
def illust_id
parsed_url.work_id || parsed_referer&.work_id
end

View File

@@ -79,10 +79,6 @@ module Sources::Strategies
api_response.tags
end
def normalize_for_source
page_url
end
def dtext_artist_commentary_desc
DText.from_html(artist_commentary_desc) do |element|
if element.name == "a"

View File

@@ -25,17 +25,6 @@ module Sources
image_urls.first
end
def normalize_for_source
id = post_id_from_url
md5 = post_md5_from_url
if id.present?
"https://#{domain}/post/show/#{id}"
elsif md5.present?
"https://#{domain}/post?tags=md5:#{md5}"
end
end
def tags
api_response[:tags].to_s.split.map do |tag|
[tag, "https://#{domain}/post?tags=#{CGI.escape(tag)}"]

View File

@@ -84,10 +84,6 @@ module Sources
end
end
def normalize_for_source
page_url
end
def user_name
parsed_url.username || parsed_referer&.username
end

View File

@@ -66,10 +66,6 @@ module Sources
end.gsub(/[^\w]im(\d+)/, ' seiga #\1 ').chomp
end
def normalize_for_source
page_url
end
def tag_name
return if api_client&.user_id.blank?
"nicoseiga#{api_client.user_id}"

View File

@@ -109,12 +109,6 @@ module Sources
artist_id_from_url || artist_id_from_page
end
def normalize_for_source
return if illust_id.blank?
"https://nijie.info/view.php?id=#{illust_id}"
end
def doujin?
page&.at("#dojin_left").present?
end

View File

@@ -18,112 +18,6 @@ module Sources
def artists
ArtistFinder.find_artists(url)
end
def normalize_for_source
case url
when %r{\Ahttp://www\.karabako\.net/images(?:ub)?/karabako_(\d+)(?:_\d+)?\.}i
"http://www.karabako.net/post/view/#{$1}"
# XXX http://twipple.jp is defunct
# http://p.twpl.jp/show/orig/myRVs
when %r{\Ahttp://p\.twpl\.jp/show/(?:large|orig)/([a-z0-9]+)}i
"http://p.twipple.jp/#{$1}"
when %r{\Ahttps?://blog(?:(?:-imgs-)?\d*(?:-origin)?)?\.fc2\.com/(?:(?:[^/]/){3}|(?:[^/]/))([^/]+)/(?:file/)?([^.]+\.[^?]+)}i
username = $1
filename = $2
"http://#{username}.blog.fc2.com/img/#{filename}/"
when %r{\Ahttps?://diary(\d)?\.fc2\.com/user/([^/]+)/img/(\d+)_(\d+)/(\d+)\.}i
server_id = $1
username = $2
year = $3
month = $4
day = $5
"http://diary#{server_id}.fc2.com/cgi-sys/ed.cgi/#{username}?Y=#{year}&M=#{month}&D=#{day}"
when %r{\Ahttps?://(?:fbcdn-)?s(?:content|photos)-[^/]+\.(?:fbcdn|akamaihd)\.net/hphotos-.+/\d+_(\d+)_(?:\d+_){1,3}[no]\.}i
"https://www.facebook.com/photo.php?fbid=#{$1}"
when %r{\Ahttps?://c(?:s|han|[1-4])\.sankakucomplex\.com/data(?:/sample)?/(?:[a-f0-9]{2}/){2}(?:sample-|preview)?([a-f0-9]{32})}i
"https://chan.sankakucomplex.com/en/post/show?md5=#{$1}"
when %r{\Ahttps?://(?:www|s(?:tatic|[1-4]))\.zerochan\.net/.+(?:\.|\/)(\d+)(?:\.(?:jpe?g?|png))?\z}i
"https://www.zerochan.net/#{$1}#full"
when %r{\Ahttps?://static[1-6]?\.minitokyo\.net/(?:downloads|view)/(?:\d{2}/){2}(\d+)}i
"http://gallery.minitokyo.net/download/#{$1}"
# https://gelbooru.com//images/ee/5c/ee5c9a69db9602c95debdb9b98fb3e3e.jpeg
# http://simg.gelbooru.com//images/2003/edd1d2b3881cf70c3acf540780507531.png
# https://simg3.gelbooru.com//samples/0b/3a/sample_0b3ae5e225072b8e391c827cb470d29c.jpg
when %r{\Ahttps?://(?:\w+\.)?gelbooru\.com//?(?:images|samples)/(?:\d+|\h\h/\h\h)/(?:sample_)?(?<md5>\h{32})\.}i
"https://gelbooru.com/index.php?page=post&s=list&tags=md5:#{$~[:md5]}"
when %r{\Ahttps?://(?:slot\d*\.)?im(?:g|ages)\d*\.wikia\.(?:nocookie\.net|com)/(?:_{2}cb\d{14}/)?([^/]+)(?:/[a-z]{2})?/images/(?:(?:thumb|archive)?/)?[a-f0-9]/[a-f0-9]{2}/(?:\d{14}(?:!|%21))?([^/]+)}i
subdomain = $1
filename = $2
"https://#{subdomain}.wikia.com/wiki/File:#{filename}"
when %r{\Ahttps?://vignette(?:\d*)\.wikia\.nocookie\.net/([^/]+)/images/[a-f0-9]/[a-f0-9]{2}/([^/]+)}i
subdomain = $1
filename = $2
"https://#{subdomain}.wikia.com/wiki/File:#{filename}"
when %r{\Ahttps?://e-shuushuu.net/images/\d{4}-(?:\d{2}-){2}(\d+)}i
"https://e-shuushuu.net/image/#{$1}"
when %r{\Ahttps?://jpg\.nijigen-daiaru\.com/(\d+)}i
"http://nijigen-daiaru.com/book.php?idb=#{$1}"
when %r{\Ahttps?://sozai\.doujinantena\.com/contents_jpg/([a-f0-9]{32})/}i
"http://doujinantena.com/page.php?id=#{$1}"
when %r{\Ahttps?://rule34-(?:data-\d{3}|images)\.paheal\.net/(?:_images/)?([a-f0-9]{32})}i
"https://rule34.paheal.net/post/list/md5:#{$1}/1"
when %r{\Ahttps?://shimmie\.katawa-shoujo\.com/image/(\d+)}i
"https://shimmie.katawa-shoujo.com/post/view/#{$1}"
when %r{\Ahttps://(?:(?:\w+\.)?rule34\.xxx|img\.booru\.org/(?:rule34|r34))(?:/(?:img/rule34|r34))?/{1,2}images/\d+/([a-f0-9]{32})\.}i
"https://rule34.xxx/index.php?page=post&s=list&md5=#{$1}"
when %r{(\Ahttps?://.+)/diarypro/d(?:ata/upfile/|iary\.cgi\?mode=image&upfile=)(\d+)}i
base_url = $1
entry_no = $2
"#{base_url}/diarypro/diary.cgi?no=#{entry_no}"
# XXX site is defunct
when %r{\Ahttps?://i(?:\d)?\.minus\.com/(?:i|j)([^\.]{12,})}i
"http://minus.com/i/#{$1}"
# http://art59.photozou.jp/pub/212/1986212/photo/118493247_org.v1534644005.jpg
# http://kura3.photozou.jp/pub/794/1481794/photo/161537258_org.v1364829097.jpg
when %r{\Ahttps?://\w+\.photozou\.jp/pub/\d+/(?<artist_id>\d+)/photo/(?<photo_id>\d+)_.*$}i
"https://photozou.jp/photo/show/#{$~[:artist_id]}/#{$~[:photo_id]}"
# http://img.toranoana.jp/popup_img/04/0030/09/76/040030097695-2p.jpg
# http://img.toranoana.jp/popup_img18/04/0010/22/87/040010228714-1p.jpg
# http://img.toranoana.jp/popup_blimg/04/0030/08/30/040030083068-1p.jpg
# https://ecdnimg.toranoana.jp/ec/img/04/0030/65/34/040030653417-6p.jpg
when %r{\Ahttps?://(?:\w+\.)?toranoana\.jp/(?:popup_(?:bl)?img\d*|ec/img)/\d{2}/\d{4}/\d{2}/\d{2}/(?<work_id>\d+)}i
"https://ec.toranoana.jp/tora_r/ec/item/#{$~[:work_id]}/"
# https://a.hitomi.la/galleries/907838/1.png
# https://0a.hitomi.la/galleries/1169701/23.png
# https://aa.hitomi.la/galleries/990722/003_01_002.jpg
# https://la.hitomi.la/galleries/1054851/001_main_image.jpg
when %r{\Ahttps?://\w+\.hitomi\.la/galleries/(?<gallery_id>\d+)/(?<image_id>\d+)\w*\.[a-z]+\z}i
"https://hitomi.la/reader/#{$~[:gallery_id]}.html##{$~[:image_id].to_i}"
# https://aa.hitomi.la/galleries/883451/t_rena1g.png
when %r{\Ahttps?://\w+\.hitomi\.la/galleries/(?<gallery_id>\d+)/\w*\.[a-z]+\z}i
"https://hitomi.la/galleries/#{$~[:gallery_id]}.html"
else
nil
end
end
end
end
end

View File

@@ -96,11 +96,6 @@ module Sources
api_illust[:description]
end
def normalize_for_source
return nil if illust_id.blank?
"https://www.pixiv.net/artworks/#{illust_id}"
end
def tag_name
moniker
end

View File

@@ -110,10 +110,6 @@ module Sources
end.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")
end
def normalize_for_source
page_url
end
memoize :page, :page_json, :api_replies
end
end

View File

@@ -48,10 +48,6 @@ module Sources
"https://skeb.jp/@#{artist_name}/works/#{illust_id}"
end
def normalize_for_source
page_url
end
def api_url
return nil unless artist_name.present? && illust_id.present?
"https://skeb.jp/api/users/#{artist_name}/works/#{illust_id}"

View File

@@ -83,10 +83,6 @@ module Sources::Strategies
super(tag)
end
def normalize_for_source
parsed_url.page_url
end
def dtext_artist_commentary_desc
DText.from_html(artist_commentary_desc).strip
end

View File

@@ -1,14 +0,0 @@
# frozen_string_literal: true
# @see Source::URL::TwitPic
module Sources::Strategies
class TwitPic < Base
def match?
Source::URL::TwitPic === parsed_url
end
def normalize_for_source
parsed_url.page_url || url
end
end
end

View File

@@ -93,14 +93,6 @@ module Sources::Strategies
api_response[:full_text].to_s
end
def normalize_for_source
if tag_name_from_url.present? && status_id.present?
"https://twitter.com/#{tag_name_from_url}/status/#{status_id}"
elsif status_id.present?
"https://twitter.com/i/web/status/#{status_id}"
end
end
def tags
api_response.dig(:entities, :hashtags).to_a.map do |hashtag|
[hashtag[:text], "https://twitter.com/hashtag/#{hashtag[:text]}"]
@@ -150,7 +142,7 @@ module Sources::Strategies
end
def tag_name_from_url
parsed_url.twitter_username || parsed_referer&.twitter_username
parsed_url.username || parsed_referer&.username
end
memoize :api_response

View File

@@ -87,10 +87,6 @@ module Sources
end
end
def normalize_for_source
parsed_url.normalized_url
end
def api_response
return {} if (mobile_url = parsed_url.mobile_url || parsed_referer&.mobile_url).blank?