Weibo: add source normalization
This commit is contained in:
@@ -32,16 +32,16 @@
|
|||||||
module Sources
|
module Sources
|
||||||
module Strategies
|
module Strategies
|
||||||
class Weibo < Base
|
class Weibo < Base
|
||||||
PROFILE_URL_1 = %r{https?://(?:(?:www|m)\.)?weibo\.c(?:om|n)/(?:(?:u|profile)/)?(?<artist_short_id>\d+)\z}i
|
PROFILE_URL_1 = %r{\Ahttps?://(?:(?:www|m)\.)?weibo\.c(?:om|n)/(?:(?:u|profile)/)?(?<artist_short_id>\d+)\z}i
|
||||||
PROFILE_URL_2 = %r{https?://photo\.weibo\.com/(?<artist_short_id>\d+)}i
|
PROFILE_URL_2 = %r{\Ahttps?://photo\.weibo\.com/(?<artist_short_id>\d+)}i
|
||||||
PROFILE_URL_3 = %r{https?://(?:www\.)?weibo\.com/p/(?<artist_long_id>\d+)}i
|
PROFILE_URL_3 = %r{\Ahttps?://(?:www\.)?weibo\.com/p/(?<artist_long_id>\d+)}i
|
||||||
|
|
||||||
PAGE_URL_1 = %r{https?://(?:www\.)?weibo\.com/(?<artist_short_id>\d+)/(?<illust_base62_id>\w+)(?:\?.*)?\z}i
|
PAGE_URL_1 = %r{\Ahttps?://(?:www\.)?weibo\.com/(?<artist_short_id>\d+)/(?<illust_base62_id>\w+)(?:\?.*)?\z}i
|
||||||
PAGE_URL_2 = %r{#{PROFILE_URL_2}/(?:wbphotos/large/mid|talbum/detail/photo_id)/(?<illust_long_id>\d+)(?:/pid/(?<image_id>\w{32}))?}i
|
PAGE_URL_2 = %r{#{PROFILE_URL_2}/(?:wbphotos/large/mid|talbum/detail/photo_id)/(?<illust_long_id>\d+)(?:/pid/(?<image_id>\w{32}))?}i
|
||||||
PAGE_URL_3 = %r{https?://m\.weibo\.cn/(detail/(?<illust_long_id>\d+)|status/(?<illust_base62_id>\w+))}i
|
PAGE_URL_3 = %r{\Ahttps?://m\.weibo\.cn/(detail/(?<illust_long_id>\d+)|status/(?<illust_base62_id>\w+))}i
|
||||||
PAGE_URL_4 = %r{https?://tw\.weibo\.com/(?:(?<artist_short_id>\d+)|\w+)/(?<illust_long_id>\d+)}i
|
PAGE_URL_4 = %r{\Ahttps?://tw\.weibo\.com/(?:(?<artist_short_id>\d+)|\w+)/(?<illust_long_id>\d+)}i
|
||||||
|
|
||||||
IMAGE_URL = %r{https?://\w{3}\.sinaimg\.cn/\w+/(?<image_id>\w{32})\.}i
|
IMAGE_URL = %r{\Ahttps?://\w{3}\.sinaimg\.cn/\w+/(?<image_id>\w{32})\.}i
|
||||||
|
|
||||||
def domains
|
def domains
|
||||||
["weibo.com", "weibo.cn", "weibocdn.com", "sinaimg.cn"]
|
["weibo.com", "weibo.cn", "weibocdn.com", "sinaimg.cn"]
|
||||||
@@ -168,6 +168,21 @@ module Sources
|
|||||||
profile_url || url
|
profile_url || url
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Builds the URL that should be stored as the source for this post.
#
# Resolution order:
# 1. If +url+ already matches PAGE_URL_2 it is returned untouched.
# 2. If a numeric artist id can be read from the URLs, a www.weibo.com page
#    URL is built from the base62 illust id, or else a photo.weibo.com URL is
#    built from the long illust id.
# 3. Without an artist id, +mobile_url+ is used when it is present.
#
# @return [String, nil] the normalized URL, or nil when none of the cases
#   above applies (including "artist id known but no illust id")
def normalize_for_source
  # Only a yes/no answer is needed here, so match? avoids building MatchData.
  return url if PAGE_URL_2.match?(url)

  artist_id = artist_short_id_from_url

  # No artist id: the mobile URL is the only remaining candidate.
  unless artist_id.present?
    return mobile_url.present? ? mobile_url : nil
  end

  # The base62 id is checked first; the long id is only consulted when the
  # base62 id is missing.
  return "https://www.weibo.com/#{artist_id}/#{illust_base62_id}" if illust_base62_id.present?
  return "https://photo.weibo.com/#{artist_id}/talbum/detail/photo_id/#{illust_long_id}" if illust_long_id.present?

  nil
end
|
||||||
|
|
||||||
# Rewrites the path segment that follows ".cn/" in +url+ to "large", leaving
# the segment after it (captured as group 1) unchanged.
#
# The dot is escaped so that only a literal ".cn/" is matched; an unescaped
# dot matches any character and would also rewrite text such as "xcn/".
#
# @param url [String] the URL to rewrite
# @return [String] a new string with every match replaced; equal to +url+
#   when nothing matches
def self.convert_image_to_large(url)
  url.gsub(%r{\.cn/\w+/(\w+)}, '.cn/large/\1')
end
|
||||||
@@ -181,7 +196,7 @@ module Sources
|
|||||||
end
|
end
|
||||||
|
|
||||||
# Extracts the numeric (short) artist id from +url+, falling back to
# +referer_url+.
#
# For each candidate URL the patterns are tried in the order listed below and
# the first +artist_short_id+ capture that is set wins. PAGE_URL_4 can match
# without setting that capture (non-numeric account name), in which case the
# search simply continues.
#
# @return [String, nil] the captured id, or nil when no pattern yields one
def artist_short_id_from_url
  patterns = [PROFILE_URL_1, PROFILE_URL_2, PAGE_URL_1, PAGE_URL_4]

  # nil URLs are dropped first; url takes precedence over referer_url.
  [url, referer_url].compact.each do |candidate|
    patterns.each do |pattern|
      short_id = candidate[pattern, :artist_short_id]
      return short_id if short_id
    end
  end

  nil
end
|
||||||
|
|
||||||
def artist_short_id
|
def artist_short_id
|
||||||
|
|||||||
@@ -88,5 +88,27 @@ module Sources
|
|||||||
assert_equal("https://www.weibo.com/2125874520/FDKGo4Lk0", site.canonical_url)
|
assert_equal("https://www.weibo.com/2125874520/FDKGo4Lk0", site.canonical_url)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context "normalizing for source" do
  should "normalize correctly" do
    # Raw source URL => the value normalize_source is expected to return.
    expectations = {
      "https://www.weibo.com/3150932560/H4cFbeKKA?from=page_1005053150932560_profile&wvr=6&mod=weibotime" =>
        "https://www.weibo.com/3150932560/H4cFbeKKA",
      "http://photo.weibo.com/2125874520/wbphotos/large/mid/4242129997905387/pid/7eb64558ly1friyzhj44lj20dw2qxe81" =>
        "http://photo.weibo.com/2125874520/wbphotos/large/mid/4242129997905387/pid/7eb64558ly1friyzhj44lj20dw2qxe81",
      "https://m.weibo.cn/status/4173757483008088?luicode=20000061&lfid=4170879204256635" =>
        "https://m.weibo.cn/status/4173757483008088",
      "https://tw.weibo.com/SEINEN/4098035921690224" =>
        "https://m.weibo.cn/detail/4098035921690224"
    }

    expectations.each do |raw, normalized|
      assert_equal(normalized, Sources::Strategies.normalize_source(raw))
    end
  end

  should "avoid normalizing unnormalizable urls" do
    # These URLs are expected to come back exactly as given.
    untouched = [
      "https://weibo.com/u/",
      "https://www.weibo.com/4ubergine/photos"
    ]

    untouched.each do |raw|
      assert_equal(raw, Sources::Strategies.normalize_source(raw))
    end
  end
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
Reference in New Issue
Block a user