Add bilibili support

2022-11-04 12:11:08 +01:00
parent a244ec5a3b
commit fc4d620002
5 changed files with 415 additions and 0 deletions
--- a/app/logical/source/extractor.rb
+++ b/app/logical/source/extractor.rb
@@ -56,6 +56,7 @@ module Source
      Source::Extractor::Anifty,
      Source::Extractor::Furaffinity,
      Source::Extractor::Reddit,
+      Source::Extractor::Bilibili,
    ]

    # Should return true if the extractor is configured correctly. Return false
--- a/app/logical/source/extractor/bilibili.rb
+++ b/app/logical/source/extractor/bilibili.rb
@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+
+# @see Source::URL::Bilibili
+module Source
+  class Extractor
+    class Bilibili < Source::Extractor
+      def match?
+        Source::URL::Bilibili === parsed_url
+      end
+
+      def image_urls
+        if parsed_url&.full_image_url.present?
+          [parsed_url.full_image_url]
+        elsif data.present?
+          if t_work_id.present?
+            image_urls = data.dig("modules", "module_dynamic", "major", "draw", "items").to_a.pluck("src")
+          elsif h_work_id.present?
+            image_urls = data.dig("item", "pictures").to_a.pluck("img_src")
+          end
+          image_urls.to_a.compact.map { |u| Source::URL.parse(u).full_image_url || u }
+        elsif article_id.present?
+          page&.search("#article-content img").to_a.pluck("data-src").compact.map { |u| Source::URL.parse(URI.join("https://", u)).full_image_url || u }
+        else
+          [parsed_url.original_url]
+        end
+      end
+
+      def page_url
+        t_work_page || parsed_url.page_url || parsed_referer&.page_url
+      end
+
+      def t_work_page
+        return unless t_work_id.present?
+        "https://t.bilibili.com/#{data["id_str"]}"
+      end
+
+      def artist_commentary_title
+        if article_id.present?
+          page&.at(".article-container .title")&.text&.squish&.strip
+        end
+      end
+
+      def artist_commentary_desc
+        if t_work_id.present?
+          data.dig("modules", "module_dynamic", "desc", "rich_text_nodes").map do |text_node|
+            case text_node["type"]
+            when "RICH_TEXT_NODE_TYPE_BV"
+              "<a href='#{URI.join("https://", text_node["jump_url"])}'>#{text_node["text"]}</a>"
+            when "RICH_TEXT_NODE_TYPE_EMOJI"
+              " #{text_node.dig("emoji", "icon_url")} "
+            else # RICH_TEXT_NODE_TYPE_AT (mentions), RICH_TEXT_NODE_TYPE_TEXT (text), RICH_TEXT_NODE_TYPE_TOPIC (hashtags)
+              text_node["text"]
+            end
+          end.join
+        elsif h_work_id.present?
+          data.dig("item", "description")
+        elsif article_id.present?
+          page&.at("#article-content")&.to_html
+        end
+      end
+
+      def dtext_artist_commentary_desc
+        DText.from_html(artist_commentary_desc)
+      end
+
+      def tags
+        if t_work_id.present?
+          tag_list = data.dig("modules", "module_dynamic", "desc", "rich_text_nodes").to_a.select { |n| n["type"] == "RICH_TEXT_NODE_TYPE_TOPIC" }.map { |tag| tag["text"].gsub(/(^#|#$)/, "") }
+        elsif h_work_id.present?
+          tag_list = data.dig("item", "tags").to_a.pluck(:tag)
+        else # bilibili.com/read/:id posts have no tags that I could find
+          return []
+        end
+
+        tag_list.map { |tag| [tag, "https://t.bilibili.com/topic/name/#{tag}"] }
+      end
+
+      def artist_name
+        if t_work_id.present?
+          data.dig("modules", "module_author", "name")
+        elsif h_work_id.present?
+          data.dig("user", "name")
+        elsif article_id.present?
+          page&.at(".article-container .up-name")&.text&.squish&.strip
+        end
+      end
+
+      def tag_name
+        return unless artist_id.present?
+        "bilibili_#{artist_id}"
+      end
+
+      def artist_id
+        artist_id_from_data || parsed_url.artist_id || parsed_referer&.artist_id
+      end
+
+      def artist_id_from_data
+        if t_work_id.present?
+          data.dig("modules", "module_author", "mid")
+        elsif h_work_id.present?
+          data.dig("user", "uid")
+        elsif article_id.present?
+          artist_url = page&.at(".article-container .up-name")&.[]("href")
+          Source::URL.parse(URI.join("https://", artist_url))&.artist_id
+        end
+      end
+
+      def profile_url
+        return nil if artist_id.blank?
+        "https://space.bilibili.com/#{artist_id}"
+      end
+
+      def t_work_id
+        # for a repost this will be the ID of the repost, not the original one
+        parsed_url.t_work_id || parsed_referer&.t_work_id
+      end
+
+      def h_work_id
+        parsed_url.h_work_id || parsed_referer&.h_work_id
+      end
+
+      def article_id
+        parsed_url.article_id || parsed_referer&.article_id
+      end
+
+      def page
+        return unless page_url.present?
+        response = http.cache(1.minute).get(page_url)
+        return response.parse unless response.status != 200
+      end
+
+      def get_json(url)
+        response = http.cache(1.minute).get(url)
+        return {} unless response.status == 200
+        JSON.parse(response).with_indifferent_access
+      rescue JSON::ParserError
+        {}
+      end
+
+      def data
+        if t_work_id.present?
+          data = get_json("https://api.bilibili.com/x/polymer/web-dynamic/v1/detail?timezone_offset=-60&id=#{t_work_id}")
+          if data.dig("data", "item", "orig", "id_str").present? # it means it's a repost
+            data.dig("data", "item", "orig")
+          else
+            data.dig("data", "item").to_h
+          end
+        elsif h_work_id.present?
+          data = get_json("https://api.vc.bilibili.com/link_draw/v1/doc/detail?doc_id=#{h_work_id}")
+          data["data"].to_h
+        else
+          {}
+        end
+      end
+
+      memoize :data, :page
+    end
+  end
+end
--- a/app/logical/source/url.rb
+++ b/app/logical/source/url.rb
@@ -59,6 +59,7 @@ module Source
      Source::URL::Weibo,
      Source::URL::Anifty,
      Source::URL::Furaffinity,
+      Source::URL::Bilibili,
    ]

    # Parse a URL into a subclass of Source::URL, or raise an exception if the URL is not a valid HTTP or HTTPS URL.
--- a/app/logical/source/url/bilibili.rb
+++ b/app/logical/source/url/bilibili.rb
@@ -0,0 +1,107 @@
+# frozen_string_literal: true
+
+# Unsupported:
+# * https://www.bilibili.com/festival/arknights2022?bvid=BV1sr4y1e7gQ
+# * https://game.bilibili.com/sssj/#character
+# * http://i0.hdslb.com/Wallpaper/bilibili_chun.jpg
+# * https://www.bilibili.com/html/bizhi.html
+
+module Source
+  class URL
+    class Bilibili < Source::URL
+      attr_reader :file, :t_work_id, :h_work_id, :video_id, :article_id, :artist_id
+
+      def self.match?(url)
+        url.domain.in?(["bilibili.com", "hdslb.com"])
+      end
+
+      # Destructures `[subdomain, domain, *path_segments]` and records whichever
+      # identifiers the URL carries (@file, @artist_id, @t_work_id, @h_work_id,
+      # @article_id, @video_id). Arms are tried top to bottom, so the more
+      # specific `new_dyn` image arm must stay ahead of the generic `bfs` one.
+      # URLs matching no arm fall through to `else` and set nothing.
+      #
+      # `$1`/`$2` are the capture groups of the regex in the arm that matched.
+      #
+      # NOTE(review): self.match? only accepts bilibili.com and hdslb.com, so the
+      # "bilibili.tv" alternatives in the video arms look unreachable - confirm.
+      def parse
+        case [subdomain, domain, *path_segments]
+
+        # https://i0.hdslb.com/bfs/new_dyn/675526fd8baa2f75d7ea0e7ea957bc0811742550.jpg@1036w.webp
+        # https://i0.hdslb.com/bfs/new_dyn/716a9733fc804d11d823cfacb7a3c78b11742550.jpg@208w_208h_1e_1c.webp
+        # ($1 is the filename without the "@..." suffix; $2 is the run of 8+ digits
+        # after the first 32 characters, which is stored as the artist ID)
+        in _, "hdslb.com", "bfs", "new_dyn", /^(\w{32}(\d{8,})\.\w+)(?:@\w+\.\w+)?$/ => file
+          @file = $1
+          @artist_id = $2
+
+        # https://i0.hdslb.com/bfs/album/37f77871d417c76a08a9467527e9670810c4c442.gif@1036w.webp
+        # https://i0.hdslb.com/bfs/album/37f77871d417c76a08a9467527e9670810c4c442.gif
+        # https://i0.hdslb.com/bfs/article/48e75b3871fa5ed62b4e3a16bf60f52f96b1b3b1.jpg@942w_1334h_progressive.webp
+        in  _, "hdslb.com", "bfs", subsite, /^(\w{40}\.\w+)(?:@\w+\.\w+)?$/ => file
+          @file = $1
+
+        # https://i0.hdslb.com/bfs/activity-plat/static/2cf2b9af5d3c5781d611d6e36f405144/E738vcDvd3.png
+        in  _, "hdslb.com", "bfs", subsite, "static", subpath, /^\w+\.\w+$/ => file
+        # pass
+        # (the URL is recognized, but @file is left unset, so #full_image_url returns nil for it)
+
+        # https://t.bilibili.com/686082748803186697
+        # https://t.bilibili.com/723052706467414039?spm_id_from=333.999.0.0 (quoted repost)
+        in "t", "bilibili.com", /^\d+$/ => t_work_id
+          @t_work_id = t_work_id
+
+        # https://m.bilibili.com/dynamic/612214375070704555
+        in "m", "bilibili.com", "dynamic", /^\d+$/ => t_work_id
+          @t_work_id = t_work_id
+
+        # https://h.bilibili.com/83341894
+        in "h", "bilibili.com", /^\d+$/ => h_work_id
+          @h_work_id = h_work_id
+
+        # https://www.bilibili.com/p/h5/8773541
+        in ("www" | ""), "bilibili.com", "p", "h5", /^\d+$/ => h_work_id
+          @h_work_id = h_work_id
+
+        # https://www.bilibili.com/read/cv7360489
+        # (only the digits after "cv" are kept; #page_url adds the prefix back)
+        in ("www" | ""), "bilibili.com", "read", /^cv(\d+)$/
+          @article_id = $1
+
+        # https://space.bilibili.com/355143
+        # https://space.bilibili.com/476725595/dynamic
+        # https://space.bilibili.com/476725595/video
+        in "space", "bilibili.com", /^\d+$/ => artist_id, *rest
+          @artist_id = artist_id
+
+        # https://www.bilibili.com/video/BV1dY4y1u7Vi/
+        # http://www.bilibili.tv/video/av439451/
+        in ("www" | "m" | ""), ("bilibili.com" | "bilibili.tv"), "video", video_id
+          @video_id = video_id
+
+        # https://www.bilibili.com/s/video/BV18b4y1X7av
+        in ("www" | "m" | ""), ("bilibili.com" | "bilibili.tv"), "s", "video", video_id
+          @video_id = video_id
+
+        else
+          nil
+        end
+      end
+
+      def image_url?
+        domain == "hdslb.com"
+      end
+
+      def full_image_url
+        if file.present?
+          original_url.gsub(/(\.\w+)@\w+\.\w+$/, "\\1")
+        end
+      end
+
+      def page_url
+        if t_work_id.present?
+          "https://t.bilibili.com/#{t_work_id}"
+        elsif h_work_id.present?
+          "https://h.bilibili.com/#{h_work_id}"
+        elsif article_id.present?
+          "https://www.bilibili.com/read/cv#{article_id}"
+        elsif video_id.present?
+          "https://www.bilibili.com/video/#{video_id}"
+        end
+      end
+
+      def profile_url
+        if artist_id.present?
+          "https://space.bilibili.com/#{artist_id}"
+        end
+      end
+    end
+  end
+end