diff --git a/app/logical/sources/strategies/weibo.rb b/app/logical/sources/strategies/weibo.rb index 84d5d8b75..b6710ef6b 100644 --- a/app/logical/sources/strategies/weibo.rb +++ b/app/logical/sources/strategies/weibo.rb @@ -32,16 +32,16 @@ module Sources module Strategies class Weibo < Base - PROFILE_URL_1 = %r{https?://(?:(?:www|m)\.)?weibo\.c(?:om|n)/(?:(?:u|profile)/)?(?\d+)\z}i - PROFILE_URL_2 = %r{https?://photo\.weibo\.com/(?\d+)}i - PROFILE_URL_3 = %r{https?://(?:www\.)?weibo\.com/p/(?\d+)}i + PROFILE_URL_1 = %r{\Ahttps?://(?:(?:www|m)\.)?weibo\.c(?:om|n)/(?:(?:u|profile)/)?(?\d+)\z}i + PROFILE_URL_2 = %r{\Ahttps?://photo\.weibo\.com/(?\d+)}i + PROFILE_URL_3 = %r{\Ahttps?://(?:www\.)?weibo\.com/p/(?\d+)}i - PAGE_URL_1 = %r{https?://(?:www\.)?weibo\.com/(?\d+)/(?\w+)(?:\?.*)?\z}i + PAGE_URL_1 = %r{\Ahttps?://(?:www\.)?weibo\.com/(?\d+)/(?\w+)(?:\?.*)?\z}i PAGE_URL_2 = %r{#{PROFILE_URL_2}/(?:wbphotos/large/mid|talbum/detail/photo_id)/(?\d+)(?:/pid/(?\w{32}))?}i - PAGE_URL_3 = %r{https?://m\.weibo\.cn/(detail/(?\d+)|status/(?\w+))}i - PAGE_URL_4 = %r{https?://tw\.weibo\.com/(?:(?\d+)|\w+)/(?\d+)}i + PAGE_URL_3 = %r{\Ahttps?://m\.weibo\.cn/(detail/(?\d+)|status/(?\w+))}i + PAGE_URL_4 = %r{\Ahttps?://tw\.weibo\.com/(?:(?\d+)|\w+)/(?\d+)}i - IMAGE_URL = %r{https?://\w{3}\.sinaimg\.cn/\w+/(?\w{32})\.}i + IMAGE_URL = %r{\Ahttps?://\w{3}\.sinaimg\.cn/\w+/(?\w{32})\.}i def domains ["weibo.com", "weibo.cn", "weibocdn.com", "sinaimg.cn"] @@ -168,6 +168,21 @@ module Sources profile_url || url end + def normalize_for_source + return url if url =~ PAGE_URL_2 + artist_id = artist_short_id_from_url + + if artist_id.present? + if illust_base62_id.present? + "https://www.weibo.com/#{artist_id}/#{illust_base62_id}" + elsif illust_long_id.present? + "https://photo.weibo.com/#{artist_id}/talbum/detail/photo_id/#{illust_long_id}" + end + elsif mobile_url.present? + mobile_url + end + end + def self.convert_image_to_large(url) url.gsub(%r{.cn/\w+/(\w+)}, '.cn/large/\1') end @@ -181,7 +196,7 @@ module Sources end def artist_short_id_from_url - [url, referer_url].compact.map { |x| x[PROFILE_URL_1, :artist_short_id] || x[PROFILE_URL_2, :artist_short_id] || x[PAGE_URL_4, :artist_short_id] }.compact.first + [url, referer_url].compact.map { |x| x[PROFILE_URL_1, :artist_short_id] || x[PROFILE_URL_2, :artist_short_id] || x[PAGE_URL_1, :artist_short_id] || x[PAGE_URL_4, :artist_short_id] }.compact.first end def artist_short_id diff --git a/test/unit/sources/weibo_test.rb b/test/unit/sources/weibo_test.rb index 246e8620f..574d4e0c9 100644 --- a/test/unit/sources/weibo_test.rb +++ b/test/unit/sources/weibo_test.rb @@ -88,5 +88,27 @@ module Sources assert_equal("https://www.weibo.com/2125874520/FDKGo4Lk0", site.canonical_url) end end + + context "normalizing for source" do + should "normalize correctly" do + source1 = "https://www.weibo.com/3150932560/H4cFbeKKA?from=page_1005053150932560_profile&wvr=6&mod=weibotime" + source2 = "http://photo.weibo.com/2125874520/wbphotos/large/mid/4242129997905387/pid/7eb64558ly1friyzhj44lj20dw2qxe81" + source3 = "https://m.weibo.cn/status/4173757483008088?luicode=20000061&lfid=4170879204256635" + source4 = "https://tw.weibo.com/SEINEN/4098035921690224" + + assert_equal("https://www.weibo.com/3150932560/H4cFbeKKA", Sources::Strategies.normalize_source(source1)) + assert_equal(source2, Sources::Strategies.normalize_source(source2)) + assert_equal("https://m.weibo.cn/status/4173757483008088", Sources::Strategies.normalize_source(source3)) + assert_equal("https://m.weibo.cn/detail/4098035921690224", Sources::Strategies.normalize_source(source4)) + end + + should "avoid normalizing unnormalizable urls" do + bad_source1 = "https://weibo.com/u/" + bad_source2 = "https://www.weibo.com/4ubergine/photos" + + assert_equal(bad_source1, Sources::Strategies.normalize_source(bad_source1)) + assert_equal(bad_source2, Sources::Strategies.normalize_source(bad_source2)) + end + end end end