Merge pull request #4488 from nonamethanks/add_weibo_support

Add Weibo support
2020-05-27 16:53:14 -05:00
parent 2c60a51f64 5c7307a1c9
commit feeea6602c
4 changed files with 312 additions and 1 deletions
--- a/app/logical/artist_finder.rb
+++ b/app/logical/artist_finder.rb
@@ -107,6 +107,8 @@ module ArtistFinder
    "ustream.tv/user", # http://www.ustream.tv/user/kazaputi
    "vk.com", # https://vk.com/id425850679
    "weibo.com", # http://www.weibo.com/5536681649
+    "weibo.com/u",
+    "weibo.com/p",
    "wp.com",
    "yande.re",
    "youtube.com",
--- a/app/logical/sources/strategies.rb
+++ b/app/logical/sources/strategies.rb
@@ -12,7 +12,8 @@ module Sources
        Strategies::Nijie,
        Strategies::Pawoo,
        Strategies::Moebooru,
-        Strategies::HentaiFoundry
+        Strategies::HentaiFoundry,
+        Strategies::Weibo
      ]
    end

--- a/app/logical/sources/strategies/weibo.rb
+++ b/app/logical/sources/strategies/weibo.rb
@@ -0,0 +1,216 @@
+# Image URLs
+# * http://ww1.sinaimg.cn/large/69917555gw1f6ggdghk28j20c87lbhdt.jpg
+#
+# Image Samples
+# * http://ww4.sinaimg.cn/mw690/77a2d531gw1f4u411ws3aj20m816fagg.jpg
+# * https://wx4.sinaimg.cn/orj360/e3930166gy1g546bz86cij20u00u040y.jpg
+# * http://ww3.sinaimg.cn/mw1024/0065kjmOgw1fabcanrzx6j30f00lcjwv.jpg
+#
+# Page URLs
+# * http://weibo.com/3357910224/EEHA1AyJP
+# * https://www.weibo.com/5501756072/IF9fugHzj?from=page_1005055501756072_profile&wvr=6&mod=weibotime
+#
+# * http://photo.weibo.com/5732523783/talbum/detail/photo_id/4029784374069389?prel=p6_3
+# * http://photo.weibo.com/2125874520/wbphotos/large/mid/4194742441135220/pid/7eb64558gy1fnbryb5nzoj20dw10419t
+# * http://tw.weibo.com/1300957955/3786333853668537
+#
+# * https://m.weibo.cn/detail/4506950043618873
+# * https://m.weibo.cn/status/J33G4tH1B
+#
+# Video
+# * https://www.weibo.com/5501756072/IF9fugHzj
+#
+# Profile URLs
+# ### Short ID
+# * https://www.weibo.com/5501756072
+# * https://www.weibo.com/u/5501756072
+# * https://m.weibo.cn/profile/5501756072
+# * https://m.weibo.cn/u/5501756072
+# ### Long ID
+# * https://www.weibo.com/p/1005055501756072
+
+module Sources
+  module Strategies
+    class Weibo < Base
+      PROFILE_URL_1 = %r{https?://(?:(?:www|m)\.)?weibo\.c(?:om|n)/(?:(?:u|profile)/)?(?<artist_short_id>\d+)\z}i
+      PROFILE_URL_2 = %r{https?://photo\.weibo\.com/(?<artist_short_id>\d+)}i
+      PROFILE_URL_3 = %r{https?://(?:www\.)?weibo\.com/p/(?<artist_long_id>\d+)}i
+
+      PAGE_URL_1    = %r{https?://(?:www\.)?weibo\.com/(?<artist_short_id>\d+)/(?<illust_base62_id>\w+)(?:\?.*)?\z}i
+      PAGE_URL_2    = %r{#{PROFILE_URL_2}/(?:wbphotos/large/mid|talbum/detail/photo_id)/(?<illust_long_id>\d+)(?:/pid/(?<image_id>\w{32}))?}i
+      PAGE_URL_3    = %r{https?://m\.weibo\.cn/(detail/(?<illust_long_id>\d+)|status/(?<illust_base62_id>\w+))}i
+      PAGE_URL_4    = %r{https?://tw\.weibo\.com/(?:(?<artist_short_id>\d+)|\w+)/(?<illust_long_id>\d+)}i
+
+      IMAGE_URL     = %r{https?://\w{3}\.sinaimg\.cn/\w+/(?<image_id>\w{32})\.}i
+
+      def domains
+        ["weibo.com", "weibo.cn", "weibocdn.com", "sinaimg.cn"]
+      end
+
+      def site_name
+        "Weibo"
+      end
+
+      def image_urls
+        urls = []
+
+        if url =~ IMAGE_URL
+          urls << self.class.convert_image_to_large(url)
+        elsif api_response.present?
+          if api_response["pics"].present?
+            urls += api_response["pics"].to_a.map { |pic| self.class.convert_image_to_large(pic["url"]) }
+          elsif api_response.dig("page_info", "type") == "video"
+            variants = api_response["page_info"]["media_info"].to_h.values + api_response["page_info"]["urls"].to_h.values
+            urls << variants.max_by do |variant|
+              if /template=(?<width>\d+)x(?<height>\d+)/ =~ variant.to_s
+                width.to_i * height.to_i
+              else
+                0
+              end
+            end
+          end
+        else
+          urls << url
+        end
+
+        urls
+      end
+
+      def image_url
+        image_id = url[PAGE_URL_2, :image_id] if url =~ PAGE_URL_2
+
+        if image_id.present?
+          image_urls.select { |i| i[IMAGE_URL, :image_id] == image_id }.compact.first
+        else
+          image_urls.first
+        end
+      end
+
+      def preview_urls
+        image_urls.map { |img| img.gsub(%r{.cn/\w+/(\w+)}, '.cn/orj360/\1') }
+      end
+
+      def page_url
+        if api_response.present?
+          artist_id = api_response["user"]["id"]
+          illust_id = api_response["bid"]
+          "https://www.weibo.com/#{artist_id}/#{illust_id}"
+        elsif url =~ IMAGE_URL
+          self.class.convert_image_to_large(url)
+        else
+          url
+        end
+      end
+
+      def tags
+        return [] if api_response.blank?
+
+        matches = api_response["text"]&.scan(/surl-text">#(.*?)#</).to_a.map { |m| m[0] }
+
+        matches.map do |match|
+          [match, "https://s.weibo.com/weibo/#{match}"]
+        end
+      end
+
+      def profile_urls
+        [profile_short_url, profile_long_url].compact
+      end
+
+      def profile_url
+        profile_urls.first
+      end
+
+      def profile_short_url
+        return if artist_short_id.blank?
+
+        "https://www.weibo.com/u/#{artist_short_id}"
+      end
+
+      def profile_long_url
+        return if artist_long_id.blank?
+
+        "https://www.weibo.com/p/#{artist_long_id}"
+      end
+
+      def artist_commentary_desc
+        return if api_response.blank?
+
+        api_response["text"]
+      end
+
+      def dtext_artist_commentary_desc
+        DText.from_html(artist_commentary_desc) do |element|
+          if element["href"].present?
+            href = Addressable::URI.heuristic_parse(element["href"])
+            href.site ||= "https://www.weibo.com"
+            href.scheme ||= "https"
+            element["href"] = href.to_s
+          end
+
+          if element["src"].present?
+            src = Addressable::URI.heuristic_parse(element["src"])
+            src.scheme ||= "https"
+            element["src"] = src.to_s
+          end
+        end
+      end
+
+      def normalized_for_artist_finder
+        url =~ %r{weibo\.com/(u|p)/\d+\z}i
+      end
+
+      def normalizable_for_artist_finder?
+        artist_short_id_from_url.present? || artist_long_id.present?
+      end
+
+      def normalize_for_artist_finder
+        profile_url = profile_short_url || profile_long_url
+        profile_url || url
+      end
+
+      def self.convert_image_to_large(url)
+        url.gsub(%r{.cn/\w+/(\w+)}, '.cn/large/\1')
+      end
+
+      def illust_long_id
+        [url, referer_url].compact.map { |x| x[PAGE_URL_2, :illust_long_id] || x[PAGE_URL_3, :illust_long_id] || x[PAGE_URL_4, :illust_long_id] }.compact.first
+      end
+
+      def illust_base62_id
+        [url, referer_url].compact.map { |x| x[PAGE_URL_1, :illust_base62_id] || x[PAGE_URL_3, :illust_base62_id] }.compact.first
+      end
+
+      def artist_short_id_from_url
+        [url, referer_url].compact.map { |x| x[PROFILE_URL_1, :artist_short_id] || x[PROFILE_URL_2, :artist_short_id] || x[PAGE_URL_4, :artist_short_id] }.compact.first
+      end
+
+      def artist_short_id
+        artist_short_id_from_url || api_response&.dig("user", "id")
+      end
+
+      def artist_long_id
+        [url, referer_url].compact.map { |x| x[PROFILE_URL_3, :artist_long_id] }.compact.first
+      end
+
+      def mobile_url
+        if illust_long_id.present?
+          "https://m.weibo.cn/detail/#{illust_long_id}"
+        elsif illust_base62_id.present?
+          "https://m.weibo.cn/status/#{illust_base62_id}"
+        end
+      end
+
+      def api_response
+        return nil if mobile_url.blank?
+
+        resp = Danbooru::Http.cache(1.minute).get(mobile_url)
+        json_string = resp.to_s[/var \$render_data = \[(.*)\]\[0\]/m, 1]
+
+        return nil if json_string.blank?
+
+        JSON.parse(json_string)["status"]
+      end
+      memoize :api_response
+    end
+  end
+end
--- a/test/unit/sources/weibo_test.rb
+++ b/test/unit/sources/weibo_test.rb
@@ -0,0 +1,92 @@
+require 'test_helper'
+
+module Sources
+  class WeiboTest < ActiveSupport::TestCase
+    context "A post with multiple pictures" do
+      setup do
+        @site = Sources::Strategies.find("https://www.weibo.com/5501756072/J2UNKfbqV?type=comment#_rnd1590548401855")
+      end
+
+      should "extract all the image urls" do
+        urls = %w[
+          https://wx1.sinaimg.cn/large/0060kO5aly1gezsyt5xvhj30ok0sgtc9.jpg
+          https://wx3.sinaimg.cn/large/0060kO5aly1gezsyuaas1j30go0sgjtj.jpg
+          https://wx3.sinaimg.cn/large/0060kO5aly1gezsys1ai9j30gi0sg0v9.jpg
+        ]
+        assert_equal(urls, @site.image_urls)
+      end
+
+      should "get the correct commentary" do
+        assert_not_nil(@site.artist_commentary_desc)
+      end
+
+      should "get the profile url" do
+        assert_equal("https://www.weibo.com/u/5501756072", @site.profile_url)
+      end
+
+      should "set the right source" do
+        assert_equal("https://www.weibo.com/5501756072/J2UNKfbqV", @site.canonical_url)
+      end
+
+      should "download an image" do
+        assert_downloaded(134_721, @site.image_url)
+      end
+
+      should "get the tags" do
+        tags = [
+          %w[fgo https://s.weibo.com/weibo/fgo],
+          %w[Alter组 https://s.weibo.com/weibo/Alter组]
+        ]
+        assert_equal(tags, @site.tags)
+      end
+
+      should "find the correct artist" do
+        @artist = FactoryBot.create(:artist, name: "nipi27", url_string: "https://www.weibo.com/u/5501756072")
+        assert_equal([@artist], @site.artists)
+      end
+    end
+
+    context "A deleted or not existing picture" do
+      should "still find the artist name" do
+        site = Sources::Strategies.find("https://www.weibo.com/5501756072/AsdAsdAsd")
+        artist = FactoryBot.create(:artist, name: "nipi27", url_string: "https://www.weibo.com/u/5501756072")
+
+        assert_equal([artist], site.artists)
+      end
+    end
+
+    context "A post with video" do
+      should "get the correct video" do
+        site = Sources::Strategies.find("https://www.weibo.com/5501756072/IF9fugHzj")
+
+        assert_downloaded(7_676_656, site.image_url)
+      end
+    end
+
+    context "A direct image sample upload" do
+      should "get the largest version" do
+        sample = Sources::Strategies.find("https://wx3.sinaimg.cn/mw690/a00fa34cly1gf62g2n8z3j21yu2jo1ky.jpg")
+
+        assert_equal("https://wx3.sinaimg.cn/large/a00fa34cly1gf62g2n8z3j21yu2jo1ky.jpg", sample.image_url)
+      end
+    end
+
+    context "An album url for a post with multiple pictures" do
+      should "upload the right picture rather than just the first" do
+        site = Sources::Strategies.find("http://photo.weibo.com/2125874520/wbphotos/large/mid/4194742441135220/pid/7eb64558gy1fnbryb5nzoj20dw10419t")
+
+        assert_equal("https://wx4.sinaimg.cn/large/7eb64558gy1fnbryb5nzoj20dw10419t.jpg", site.image_url)
+      end
+    end
+
+    context "An upload from the batch bookmarklet" do
+      should "set the right source" do
+        url = "https://wx1.sinaimg.cn/large/7eb64558gy1fnbryriihwj20dw104wtu.jpg"
+        ref = "http://photo.weibo.com/2125874520/wbphotos/large/mid/4194742441135220/pid/7eb64558gy1fnbryb5nzoj20dw10419t"
+        site = Sources::Strategies.find(url, ref)
+
+        assert_equal("https://www.weibo.com/2125874520/FDKGo4Lk0", site.canonical_url)
+      end
+    end
+  end
+end