From d8e2f2ee334001bdb963430e6e3d570cc430e362 Mon Sep 17 00:00:00 2001 From: nonamethanks Date: Mon, 7 Mar 2022 16:06:44 +0100 Subject: [PATCH] sources: factor out Source::URL::Weibo Additionally, fixed some broken tests and changed normalization for urls of album type to point to the mobile version instead, because they're only visible to logged-in users. --- app/logical/source/url.rb | 1 + app/logical/source/url/weibo.rb | 95 +++++++++++++++ app/logical/sources/strategies/weibo.rb | 150 +++--------------------- test/unit/sources/weibo_test.rb | 20 +--- 4 files changed, 119 insertions(+), 147 deletions(-) create mode 100644 app/logical/source/url/weibo.rb diff --git a/app/logical/source/url.rb b/app/logical/source/url.rb index a3db149fd..b6612906d 100644 --- a/app/logical/source/url.rb +++ b/app/logical/source/url.rb @@ -31,6 +31,7 @@ module Source Source::URL::Plurk, Source::URL::Skeb, Source::URL::TwitPic, + Source::URL::Weibo, ] # Parse a URL into a subclass of Source::URL, or raise an exception if the URL is not a valid HTTP or HTTPS URL. 
diff --git a/app/logical/source/url/weibo.rb b/app/logical/source/url/weibo.rb
new file mode 100644
index 000000000..c4c353b44
--- /dev/null
+++ b/app/logical/source/url/weibo.rb
@@ -0,0 +1,116 @@
+# frozen_string_literal: true
+
+# Parses Weibo URLs (weibo.com, weibo.cn and sinaimg.cn) into artist and post
+# IDs, and rebuilds full-size image, profile and post URLs from those IDs.
+class Source::URL::Weibo < Source::URL
+  # The "large" variant of a sinaimg.cn image URL; only set by #parse when the
+  # parsed URL is an image URL.
+  attr_reader :full_image_url
+
+  # @param url [#domain] the URL to test
+  # @return [Boolean] whether the URL's domain is one of the Weibo domains
+  def self.match?(url)
+    url.domain.in?(["weibo.com", "weibo.cn", "sinaimg.cn"])
+  end
+
+  # Matches the host and path segments against the known URL shapes and stores
+  # the IDs found in instance variables. Numeric IDs must span the whole path
+  # segment, so a segment that merely contains a digit is not taken for an ID.
+  # A URL matching none of the shapes leaves every ID unset.
+  def parse
+    case [host, *path_segments]
+
+    # http://ww1.sinaimg.cn/large/69917555gw1f6ggdghk28j20c87lbhdt.jpg
+    # https://wx1.sinaimg.cn/large/002NQ2vhly1gqzqfk1agfj62981aw4qr02.jpg
+    # http://ww4.sinaimg.cn/mw690/77a2d531gw1f4u411ws3aj20m816fagg.jpg (sample)
+    # https://wx4.sinaimg.cn/orj360/e3930166gy1g546bz86cij20u00u040y.jpg (sample)
+    # http://ww3.sinaimg.cn/mw1024/0065kjmOgw1fabcanrzx6j30f00lcjwv.jpg (sample)
+    in /\w+\.sinaimg\.cn/ => image_host, _, file
+      @full_image_url = "https://#{image_host}/large/#{file}"
+
+    # http://tw.weibo.com/1300957955/3786333853668537
+    in "tw.weibo.com", /\w+/, /\A\d+\z/ => illust_long_id
+      @illust_long_id = illust_long_id
+
+    # http://weibo.com/3357910224/EEHA1AyJP
+    # https://www.weibo.com/5501756072/IF9fugHzj?from=page_1005055501756072_profile&wvr=6&mod=weibotime
+    in /(\w+\.)?weibo\.(com|cn)/, /\A\d+\z/ => artist_short_id, /\w+/ => illust_base62_id
+      @artist_short_id = artist_short_id
+      @illust_base62_id = illust_base62_id
+
+    # http://photo.weibo.com/2125874520/wbphotos/large/mid/4194742441135220/pid/7eb64558gy1fnbryb5nzoj20dw10419t
+    # http://photo.weibo.com/5732523783/talbum/detail/photo_id/4029784374069389?prel=p6_3
+    in "photo.weibo.com", /\A\d+\z/ => artist_short_id, _, _, _, /\A\d+\z/ => illust_long_id, *
+      @artist_short_id = artist_short_id
+      @illust_long_id = illust_long_id
+
+    # https://m.weibo.cn/detail/4506950043618873
+    in "m.weibo.cn", "detail", /\A\d+\z/ => illust_long_id
+      # The numeric ID is the long ID, which #mobile_url turns back into a /detail/ URL.
+      @illust_long_id = illust_long_id
+
+    # https://m.weibo.cn/status/J33G4tH1B
+    in "m.weibo.cn", "status", /\w+/ => illust_base62_id
+      @illust_base62_id = illust_base62_id
+
+    # https://www.weibo.com/u/5501756072
+    # https://m.weibo.cn/profile/5501756072
+    # https://m.weibo.cn/u/5501756072
+    in _, ("u" | "profile"), /\A\d+\z/ => artist_short_id
+      @artist_short_id = artist_short_id
+
+    # https://www.weibo.com/5501756072
+    in _, /\A\d+\z/ => artist_short_id
+      @artist_short_id = artist_short_id
+
+    # https://www.weibo.com/p/1005055501756072
+    in _, "p", /\A\d+\z/ => artist_long_id
+      @artist_long_id = artist_long_id
+
+    else
+      # Unrecognized URL shape: leave every ID unset.
+    end
+  end
+
+  # @return [Boolean] whether the URL is a direct sinaimg.cn image URL
+  def image_url?
+    full_image_url.present?
+  end
+
+  # @return [Array<String>] every profile URL that the parsed IDs allow building
+  def profile_urls
+    [profile_short_url, profile_long_url].compact
+  end
+
+  # @return [String, nil] the /u/ profile URL, or nil without a short artist ID
+  def profile_short_url
+    return if @artist_short_id.blank?
+    "https://www.weibo.com/u/#{@artist_short_id}"
+  end
+
+  # @return [String, nil] the /p/ profile URL, or nil without a long artist ID
+  def profile_long_url
+    return if @artist_long_id.blank?
+    "https://www.weibo.com/p/#{@artist_long_id}"
+  end
+
+  # @return [String, nil] the m.weibo.cn post URL, preferring the long numeric
+  #   ID over the base62 ID; nil when neither was parsed
+  def mobile_url
+    if @illust_long_id.present?
+      "https://m.weibo.cn/detail/#{@illust_long_id}"
+    elsif @illust_base62_id.present?
+      "https://m.weibo.cn/status/#{@illust_base62_id}"
+    end
+  end
+
+  # @return [String, nil] the desktop post URL when both the artist ID and the
+  #   base62 post ID are known, otherwise the mobile post URL (nil if none)
+  def normalized_url
+    if @artist_short_id.present? && @illust_base62_id.present?
+      "https://www.weibo.com/#{@artist_short_id}/#{@illust_base62_id}"
+    else
+      mobile_url
+    end
+  end
+end
diff --git a/app/logical/sources/strategies/weibo.rb b/app/logical/sources/strategies/weibo.rb index ff8b55212..c1281a894 100644 --- a/app/logical/sources/strategies/weibo.rb +++ b/app/logical/sources/strategies/weibo.rb @@ -1,91 +1,35 @@ # frozen_string_literal: true -# Image URLS -# * http://ww1.sinaimg.cn/large/69917555gw1f6ggdghk28j20c87lbhdt.jpg -# * https://wx1.sinaimg.cn/large/002NQ2vhly1gqzqfk1agfj62981aw4qr02.jpg (more than 32 characters in hash) -# -# Image Samples -# * http://ww4.sinaimg.cn/mw690/77a2d531gw1f4u411ws3aj20m816fagg.jpg -# * https://wx4.sinaimg.cn/orj360/e3930166gy1g546bz86cij20u00u040y.jpg -# * http://ww3.sinaimg.cn/mw1024/0065kjmOgw1fabcanrzx6j30f00lcjwv.jpg -# -# Page URLS -# * http://weibo.com/3357910224/EEHA1AyJP -# * https://www.weibo.com/5501756072/IF9fugHzj?from=page_1005055501756072_profile&wvr=6&mod=weibotime -# -# * http://photo.weibo.com/5732523783/talbum/detail/photo_id/4029784374069389?prel=p6_3 -# * http://photo.weibo.com/2125874520/wbphotos/large/mid/4194742441135220/pid/7eb64558gy1fnbryb5nzoj20dw10419t -# * http://tw.weibo.com/1300957955/3786333853668537 -# -# * https://m.weibo.cn/detail/4506950043618873 -# * https://m.weibo.cn/status/J33G4tH1B -# -# Video -# * https://www.weibo.com/5501756072/IF9fugHzj -# -# Profile URLS -# ### Short ID -# * https://www.weibo.com/5501756072 -# * https://www.weibo.com/u/5501756072 -# * https://m.weibo.cn/profile/5501756072 -# * https://m.weibo.cn/u/5501756072 -# ### Long ID -# * https://www.weibo.com/p/1005055501756072 - +# @see Source::URL::Weibo module Sources module Strategies class Weibo < Base - PROFILE_URL_1 = %r{\Ahttps?://(?:(?:www|m)\.)?weibo\.c(?:om|n)/(?:(?:u|profile)/)?(?\d+)\z}i - PROFILE_URL_2 = %r{\Ahttps?://photo\.weibo\.com/(?\d+)}i - PROFILE_URL_3 = %r{\Ahttps?://(?:www\.)?weibo\.com/p/(?\d+)}i - PAGE_URL_1 = %r{\Ahttps?://(?:www\.)?weibo\.com/(?\d+)/(?\w+)(?:\?.*)?\z}i - PAGE_URL_2 = 
%r{#{PROFILE_URL_2}/(?:wbphotos/large/mid|talbum/detail/photo_id)/(?\d+)(?:/pid/(?\w{32}))?}i - PAGE_URL_3 = %r{\Ahttps?://m\.weibo\.cn/(?:detail/(?\d+)|status/(?\w+))}i - PAGE_URL_4 = %r{\Ahttps?://tw\.weibo\.com/(?:(?\d+)|\w+)/(?\d+)}i - - IMAGE_URL = %r{\Ahttps?://\w+\.sinaimg\.cn/\w+/(?\w+)\.}i - - def domains - ["weibo.com", "weibo.cn", "weibocdn.com", "sinaimg.cn"] + def match? + Source::URL::Weibo === parsed_url end def site_name - "Weibo" + parsed_url.site_name end def image_urls - urls = [] - - if url =~ IMAGE_URL - urls << self.class.convert_image_to_large(url) + if parsed_url.image_url? + [parsed_url.full_image_url] elsif api_response.present? if api_response["pics"].present? - urls += api_response["pics"].to_a.map { |pic| self.class.convert_image_to_large(pic["url"]) } + api_response["pics"].pluck("url").map { |url| Source::URL.parse(url).full_image_url } elsif api_response.dig("page_info", "type") == "video" variants = api_response["page_info"]["media_info"].to_h.values + api_response["page_info"]["urls"].to_h.values - urls << variants.max_by do |variant| + largest_video = variants.max_by do |variant| if /template=(?\d+)x(?\d+)/ =~ variant.to_s width.to_i * height.to_i else 0 end end + [largest_video] end - else - urls << url - end - - urls - end - - def image_url - image_id = url[PAGE_URL_2, :image_id] if url =~ PAGE_URL_2 - - if image_id.present? - image_urls.select { |i| i[IMAGE_URL, :image_id] == image_id }.compact.first - else - image_urls.first end end @@ -94,47 +38,30 @@ module Sources end def page_url - if api_response.present? - artist_id = api_response["user"]["id"] - illust_id = api_response["bid"] - "https://www.weibo.com/#{artist_id}/#{illust_id}" - elsif url =~ IMAGE_URL - self.class.convert_image_to_large(url) - else - url - end + return nil unless api_response.present? 
+ + artist_id = api_response["user"]["id"] + illust_base62_id = api_response["bid"] + "https://www.weibo.com/#{artist_id}/#{illust_base62_id}" end def tags return [] if api_response.blank? matches = api_response["text"]&.scan(/surl-text">#(.*?)#