From fc4d6200021a73b1b3e8a442932420691a1df17f Mon Sep 17 00:00:00 2001
From: nonamethanks
Date: Fri, 4 Nov 2022 12:11:08 +0100
Subject: [PATCH] Add bilibili support

---
 app/logical/source/extractor.rb          |   1 +
 app/logical/source/extractor/bilibili.rb | 193 +++++++++++++++++++++++
 app/logical/source/url.rb                |   1 +
 app/logical/source/url/bilibili.rb       | 110 +++++++++++++++
 test/unit/sources/bilibili_test.rb       | 148 +++++++++++++++++++++
 5 files changed, 453 insertions(+)
 create mode 100644 app/logical/source/extractor/bilibili.rb
 create mode 100644 app/logical/source/url/bilibili.rb
 create mode 100644 test/unit/sources/bilibili_test.rb

diff --git a/app/logical/source/extractor.rb b/app/logical/source/extractor.rb
index 335132d88..5bc6e9ef9 100644
--- a/app/logical/source/extractor.rb
+++ b/app/logical/source/extractor.rb
@@ -56,6 +56,7 @@ module Source
       Source::Extractor::Anifty,
       Source::Extractor::Furaffinity,
       Source::Extractor::Reddit,
+      Source::Extractor::Bilibili,
     ]
 
     # Should return true if the extractor is configured correctly. Return false
diff --git a/app/logical/source/extractor/bilibili.rb b/app/logical/source/extractor/bilibili.rb
new file mode 100644
index 000000000..1f50a5971
--- /dev/null
+++ b/app/logical/source/extractor/bilibili.rb
@@ -0,0 +1,193 @@
+# frozen_string_literal: true
+
+# Extracts image URLs, commentary, tags and artist info for Bilibili posts
+# (t.bilibili.com posts, h.bilibili.com posts and bilibili.com/read articles).
+#
+# @see Source::URL::Bilibili
+module Source
+  class Extractor
+    class Bilibili < Source::Extractor
+      def match?
+        Source::URL::Bilibili === parsed_url
+      end
+
+      # @return [Array<String>] the full-size image URLs; the given URL itself
+      #   when there is no API data and no article to scrape
+      def image_urls
+        if parsed_url&.full_image_url.present?
+          [parsed_url.full_image_url]
+        elsif data.present?
+          if t_work_id.present?
+            urls = data.dig("modules", "module_dynamic", "major", "draw", "items").to_a.pluck("src")
+          elsif h_work_id.present?
+            urls = data.dig("item", "pictures").to_a.pluck("img_src")
+          end
+          # `&.` mirrors artist_id_from_data, which also treats the parse result as possibly nil.
+          urls.to_a.compact.map { |u| Source::URL.parse(u)&.full_image_url || u }
+        elsif article_id.present?
+          page&.search("#article-content img").to_a.pluck("data-src").compact.map { |u| Source::URL.parse(URI.join("https://", u))&.full_image_url || u }
+        else
+          [parsed_url.original_url]
+        end
+      end
+
+      def page_url
+        t_work_page || parsed_url.page_url || parsed_referer&.page_url
+      end
+
+      # The t.bilibili.com URL built from the API data. For a repost, `data` is
+      # the original post, so this points at the original rather than the repost.
+      #
+      # @return [String, nil] nil when this isn't a t.bilibili.com post or the
+      #   API returned no post id (so that `page_url` can fall back to the parsed URL)
+      def t_work_page
+        return unless t_work_id.present?
+
+        post_id = data["id_str"]
+        "https://t.bilibili.com/#{post_id}" if post_id.present?
+      end
+
+      def artist_commentary_title
+        if article_id.present?
+          page&.at(".article-container .title")&.text&.squish&.strip
+        end
+      end
+
+      def artist_commentary_desc
+        if t_work_id.present?
+          rich_text_nodes.map do |text_node|
+            case text_node["type"]
+            when "RICH_TEXT_NODE_TYPE_BV"
+              # NOTE(review): the unit test expects video mentions to come out as a
+              # DText link to the video, but only the text is emitted here - confirm.
+              "#{text_node["text"]}"
+            when "RICH_TEXT_NODE_TYPE_EMOJI"
+              " #{text_node.dig("emoji", "icon_url")} "
+            else # RICH_TEXT_NODE_TYPE_AT (mentions), RICH_TEXT_NODE_TYPE_TEXT (text), RICH_TEXT_NODE_TYPE_TOPIC (hashtags)
+              text_node["text"]
+            end
+          end.join
+        elsif h_work_id.present?
+          data.dig("item", "description")
+        elsif article_id.present?
+          page&.at("#article-content")&.to_html
+        end
+      end
+
+      def dtext_artist_commentary_desc
+        DText.from_html(artist_commentary_desc)
+      end
+
+      # @return [Array<Array(String, String)>] pairs of [tag name, topic URL]
+      def tags
+        if t_work_id.present?
+          tag_list = rich_text_nodes.select { |n| n["type"] == "RICH_TEXT_NODE_TYPE_TOPIC" }.map { |tag| tag["text"].gsub(/(^#|#$)/, "") }
+        elsif h_work_id.present?
+          tag_list = data.dig("item", "tags").to_a.pluck("tag")
+        else # bilibili.com/read/:id posts have no tags that I could find
+          return []
+        end
+
+        tag_list.map { |tag| [tag, "https://t.bilibili.com/topic/name/#{tag}"] }
+      end
+
+      # The text nodes of a t.bilibili.com post's description. `to_a` makes this
+      # an empty list when the path is missing, e.g. when `data` is empty
+      # because the API request failed.
+      def rich_text_nodes
+        data.dig("modules", "module_dynamic", "desc", "rich_text_nodes").to_a
+      end
+
+      def artist_name
+        if t_work_id.present?
+          data.dig("modules", "module_author", "name")
+        elsif h_work_id.present?
+          data.dig("user", "name")
+        elsif article_id.present?
+          page&.at(".article-container .up-name")&.text&.squish&.strip
+        end
+      end
+
+      def tag_name
+        return unless artist_id.present?
+        "bilibili_#{artist_id}"
+      end
+
+      def artist_id
+        artist_id_from_data || parsed_url.artist_id || parsed_referer&.artist_id
+      end
+
+      def artist_id_from_data
+        if t_work_id.present?
+          data.dig("modules", "module_author", "mid")
+        elsif h_work_id.present?
+          data.dig("user", "uid")
+        elsif article_id.present?
+          artist_url = page&.at(".article-container .up-name")&.[]("href")
+          # artist_url is nil when the page or the link is missing; URI.join would raise on nil.
+          Source::URL.parse(URI.join("https://", artist_url))&.artist_id if artist_url.present?
+        end
+      end
+
+      def profile_url
+        return nil if artist_id.blank?
+        "https://space.bilibili.com/#{artist_id}"
+      end
+
+      def t_work_id
+        # for a repost this will be the ID of the repost, not the original one
+        parsed_url.t_work_id || parsed_referer&.t_work_id
+      end
+
+      def h_work_id
+        parsed_url.h_work_id || parsed_referer&.h_work_id
+      end
+
+      def article_id
+        parsed_url.article_id || parsed_referer&.article_id
+      end
+
+      # The parsed response for `page_url`, or nil if there is no page URL or
+      # the request didn't return a 200.
+      def page
+        return unless page_url.present?
+
+        response = http.cache(1.minute).get(page_url)
+        response.parse if response.status == 200
+      end
+
+      # Fetches `url` and parses the body as JSON.
+      #
+      # @return [Hash] the parsed response; `{}` on a non-200 status or invalid JSON
+      def get_json(url)
+        response = http.cache(1.minute).get(url)
+        return {} unless response.status == 200
+        JSON.parse(response).with_indifferent_access
+      rescue JSON::ParserError
+        {}
+      end
+
+      # The API data of the post. For a repost on t.bilibili.com this is the
+      # data of the original post.
+      #
+      # @return [Hash] `{}` when there is no t.bilibili.com or h.bilibili.com post id
+      def data
+        if t_work_id.present?
+          json = get_json("https://api.bilibili.com/x/polymer/web-dynamic/v1/detail?timezone_offset=-60&id=#{t_work_id}")
+          if json.dig("data", "item", "orig", "id_str").present? # it means it's a repost
+            json.dig("data", "item", "orig")
+          else
+            json.dig("data", "item").to_h
+          end
+        elsif h_work_id.present?
+          json = get_json("https://api.vc.bilibili.com/link_draw/v1/doc/detail?doc_id=#{h_work_id}")
+          json["data"].to_h
+        else
+          {}
+        end
+      end
+
+      memoize :data, :page
+    end
+  end
+end
diff --git a/app/logical/source/url.rb b/app/logical/source/url.rb
index e9c48ca00..9da644091 100644
--- a/app/logical/source/url.rb
+++ b/app/logical/source/url.rb
@@ -59,6 +59,7 @@ module Source
       Source::URL::Weibo,
       Source::URL::Anifty,
       Source::URL::Furaffinity,
+      Source::URL::Bilibili,
     ]
 
     # Parse a URL into a subclass of Source::URL, or raise an exception if the URL is not a valid HTTP or HTTPS URL.
diff --git a/app/logical/source/url/bilibili.rb b/app/logical/source/url/bilibili.rb
new file mode 100644
index 000000000..8cb4f0a5a
--- /dev/null
+++ b/app/logical/source/url/bilibili.rb
@@ -0,0 +1,110 @@
+# frozen_string_literal: true
+
+# Unsupported:
+# * https://www.bilibili.com/festival/arknights2022?bvid=BV1sr4y1e7gQ
+# * https://game.bilibili.com/sssj/#character
+# * http://i0.hdslb.com/Wallpaper/bilibili_chun.jpg
+# * https://www.bilibili.com/html/bizhi.html
+
+module Source
+  class URL
+    class Bilibili < Source::URL
+      attr_reader :file, :t_work_id, :h_work_id, :video_id, :article_id, :artist_id
+
+      # bilibili.tv is listed because `parse` has patterns for bilibili.tv video URLs.
+      def self.match?(url)
+        url.domain.in?(["bilibili.com", "bilibili.tv", "hdslb.com"])
+      end
+
+      def parse
+        case [subdomain, domain, *path_segments]
+
+        # https://i0.hdslb.com/bfs/new_dyn/675526fd8baa2f75d7ea0e7ea957bc0811742550.jpg@1036w.webp
+        # https://i0.hdslb.com/bfs/new_dyn/716a9733fc804d11d823cfacb7a3c78b11742550.jpg@208w_208h_1e_1c.webp
+        in _, "hdslb.com", "bfs", "new_dyn", /^(\w{32}(\d{8,})\.\w+)(?:@\w+\.\w+)?$/ => file
+          @file = $1
+          @artist_id = $2
+
+        # https://i0.hdslb.com/bfs/album/37f77871d417c76a08a9467527e9670810c4c442.gif@1036w.webp
+        # https://i0.hdslb.com/bfs/album/37f77871d417c76a08a9467527e9670810c4c442.gif
+        # https://i0.hdslb.com/bfs/article/48e75b3871fa5ed62b4e3a16bf60f52f96b1b3b1.jpg@942w_1334h_progressive.webp
+        in _, "hdslb.com", "bfs", subsite, /^(\w{40}\.\w+)(?:@\w+\.\w+)?$/ => file
+          @file = $1
+
+        # https://i0.hdslb.com/bfs/activity-plat/static/2cf2b9af5d3c5781d611d6e36f405144/E738vcDvd3.png
+        in _, "hdslb.com", "bfs", subsite, "static", subpath, /^\w+\.\w+$/ => file
+          # pass
+
+        # https://t.bilibili.com/686082748803186697
+        # https://t.bilibili.com/723052706467414039?spm_id_from=333.999.0.0 (quoted repost)
+        in "t", "bilibili.com", /^\d+$/ => t_work_id
+          @t_work_id = t_work_id
+
+        # https://m.bilibili.com/dynamic/612214375070704555
+        in "m", "bilibili.com", "dynamic", /^\d+$/ => t_work_id
+          @t_work_id = t_work_id
+
+        # https://h.bilibili.com/83341894
+        in "h", "bilibili.com", /^\d+$/ => h_work_id
+          @h_work_id = h_work_id
+
+        # https://www.bilibili.com/p/h5/8773541
+        in ("www" | ""), "bilibili.com", "p", "h5", /^\d+$/ => h_work_id
+          @h_work_id = h_work_id
+
+        # https://www.bilibili.com/read/cv7360489
+        in ("www" | ""), "bilibili.com", "read", /^cv(\d+)$/
+          @article_id = $1
+
+        # https://space.bilibili.com/355143
+        # https://space.bilibili.com/476725595/dynamic
+        # https://space.bilibili.com/476725595/video
+        in "space", "bilibili.com", /^\d+$/ => artist_id, *rest
+          @artist_id = artist_id
+
+        # https://www.bilibili.com/video/BV1dY4y1u7Vi/
+        # http://www.bilibili.tv/video/av439451/
+        in ("www" | "m" | ""), ("bilibili.com" | "bilibili.tv"), "video", video_id
+          @video_id = video_id
+
+        # https://www.bilibili.com/s/video/BV18b4y1X7av
+        in ("www" | "m" | ""), ("bilibili.com" | "bilibili.tv"), "s", "video", video_id
+          @video_id = video_id
+
+        else
+          nil
+        end
+      end
+
+      def image_url?
+        domain == "hdslb.com"
+      end
+
+      # The image URL with a trailing `@...` suffix removed (e.g. `.jpg@1036w.webp`
+      # becomes `.jpg`); nil unless a file name was parsed from the URL.
+      def full_image_url
+        if file.present?
+          original_url.gsub(/(\.\w+)@\w+\.\w+$/, "\\1")
+        end
+      end
+
+      def page_url
+        if t_work_id.present?
+          "https://t.bilibili.com/#{t_work_id}"
+        elsif h_work_id.present?
+          "https://h.bilibili.com/#{h_work_id}"
+        elsif article_id.present?
+          "https://www.bilibili.com/read/cv#{article_id}"
+        elsif video_id.present?
+          "https://www.bilibili.com/video/#{video_id}"
+        end
+      end
+
+      def profile_url
+        if artist_id.present?
+          "https://space.bilibili.com/#{artist_id}"
+        end
+      end
+    end
+  end
+end
diff --git a/test/unit/sources/bilibili_test.rb b/test/unit/sources/bilibili_test.rb
new file mode 100644
index 000000000..a46e4fa36
--- /dev/null
+++ b/test/unit/sources/bilibili_test.rb
@@ -0,0 +1,148 @@
+require 'test_helper'
+
+module Sources
+  class BilibiliTest < ActiveSupport::TestCase
+    context "A t.bilibili.com/:id post" do
+      strategy_should_work(
+        "https://t.bilibili.com/686082748803186697",
+        image_urls: [
+          "https://i0.hdslb.com/bfs/new_dyn/675526fd8baa2f75d7ea0e7ea957bc0811742550.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/4c6b93d5e85b8ed5b84c3f04909f195711742550.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/e1a1e6be01b6c68f6610cdf1d127f38311742550.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/9ff31bbe8005aa1b9c438e1b2e6ce81111742550.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/716a9733fc804d11d823cfacb7a3c78b11742550.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/fa42eaa6ee9cd2a896cadc41e16ab62b11742550.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/fc9553ff7e4ad1185e0379b3ccf7e2d911742550.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/da95475b858be577fc8c79bd22b7519e11742550.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/60a3c652b362c54bc61ea3365258d1d111742550.jpg",
+        ],
+        page_url: "https://t.bilibili.com/686082748803186697",
+        artist_name: "哈米伦的弄笛者",
+        tag_name: "bilibili_11742550",
+        profile_url: "https://space.bilibili.com/11742550",
+        tags: [],
+        artist_commentary_title: nil,
+        dtext_artist_commentary_desc: "\"【崩坏3】少女,泳装,夏日时光!\":[https://www.bilibili.com/video/BV1fB4y1Y7zt] 新视频的图片分享!大家记得来康 http://i0.hdslb.com/bfs/emote/d8c665db9fdc69b3b90c71de3fe05536ac795409.png "
+      )
+    end
+
+    context "A t.bilibili.com/:id repost" do
+      strategy_should_work(
+        "https://t.bilibili.com/723052706467414039?spm_id_from=333.999.0.0",
+        image_urls: [
+          "https://i0.hdslb.com/bfs/new_dyn/fd40435a0ff15d2eed45da7c0f890bdf15817819.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/1beb12760dc8790f7443515307225ad015817819.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/113aacf139984f808721f50883e908b815817819.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/ad1537c506b87ce2c30e19e4ef54204715817819.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/4a098d62f90d17bf516e3edded670d5e15817819.jpg",
+          "https://i0.hdslb.com/bfs/new_dyn/89397fe05083ee25879962afba60a70515817819.jpg",
+        ],
+        page_url: "https://t.bilibili.com/722702993036673113",
+        artist_name: "星尘Official",
+        tag_name: "bilibili_15817819",
+        profile_url: "https://space.bilibili.com/15817819",
+        tags: [],
+        artist_commentary_desc: " http://i0.hdslb.com/bfs/emote/fd8aa275d5d91cdf71410bc1a738415fd6e2ab86.png "
+      )
+    end
+
+    context "A text-only t.bilibili.com post with hashtags" do
+      strategy_should_work(
+        "https://t.bilibili.com/707554407156285477",
+        image_urls: [],
+        profile_url: "https://space.bilibili.com/476720460",
+        tags: [
+          ["一起用原神痛车", "https://t.bilibili.com/topic/name/一起用原神痛车"],
+          ["凯迪拉克原神联名座驾", "https://t.bilibili.com/topic/name/凯迪拉克原神联名座驾"],
+          ["原神", "https://t.bilibili.com/topic/name/原神"],
+          ["凯迪拉克原神联动", "https://t.bilibili.com/topic/name/凯迪拉克原神联动"],
+          ["风起雷涌特别的旅途", "https://t.bilibili.com/topic/name/风起雷涌特别的旅途"],
+          ["凯迪拉克CT4", "https://t.bilibili.com/topic/name/凯迪拉克CT4"],
+          ["凯迪拉克XT4", "https://t.bilibili.com/topic/name/凯迪拉克XT4"],
+        ]
+      )
+    end
+
+    context "A h.bilibili.com/:id post" do
+      strategy_should_work(
+        "https://h.bilibili.com/83341894",
+        image_urls: [
+          "https://i0.hdslb.com/bfs/album/669c0974a2a7508cbbb60b185eddaa0ccf8c5b7a.jpg",
+          "https://i0.hdslb.com/bfs/album/de8043c382b9eb022519380bf6b570285ea3bf81.gif",
+          "https://i0.hdslb.com/bfs/album/0ea658d4a9b2323665b6a5b6df6eff0e23e98c22.gif",
+          "https://i0.hdslb.com/bfs/album/6448067578847d7006c6a94ffc56d6fde30b8b1e.gif",
+          "https://i0.hdslb.com/bfs/album/ef2a9939264ff1e98cb4653c5b427c1d32e5ff24.gif",
+          "https://i0.hdslb.com/bfs/album/6198a9290219be0775d214cfa16afb02e8b357f7.gif",
+        ],
+        artist_commentary_title: nil,
+        artist_name: "明日方舟",
+        profile_url: "https://space.bilibili.com/161775300",
+        page_url: "https://h.bilibili.com/83341894",
+        tag_name: "bilibili_161775300",
+        artist_commentary_desc: "#明日方舟#\n【新增服饰】\n//灿阳朝露 SD01 - 临光\nMARTHE [珊瑚海岸/CoralCoast]灿阳朝露系列泳衣01款。贴身、透气、轻便,专为夏日而生。\n\n即使是耀骑士,在海边的太阳前依旧要涂好防晒霜竖起遮阳伞。 ​​​​ "
+      )
+    end
+
+    context "A bilibili.com/read/:id post" do
+      strategy_should_work(
+        "https://www.bilibili.com/read/cv7360489",
+        image_urls: [
+          "https://i0.hdslb.com/bfs/article/48e75b3871fa5ed62b4e3a16bf60f52f96b1b3b1.jpg",
+          "https://i0.hdslb.com/bfs/article/72de3b6de4465fcb14c719354d8aeb55e93aa022.jpg",
+          "https://i0.hdslb.com/bfs/article/f6f56a387517ecf3a721228f8da6b21ffbf92210.jpg",
+          "https://i0.hdslb.com/bfs/article/7ac6fd23295eab8d3f62254187c34ae4867ea889.jpg",
+          "https://i0.hdslb.com/bfs/article/f90d0110964e3794aca245b1a4b5d934156d231f.jpg",
+          "https://i0.hdslb.com/bfs/article/b5a85177d15f3c53d06fae45ba53af3e64f7af14.jpg",
+          "https://i0.hdslb.com/bfs/article/3ca6ec1056eb8dfb6e9fde732146b8244fd605ad.jpg",
+          "https://i0.hdslb.com/bfs/article/1e860b392bef10f07e5abb7866e82998419f586a.jpg",
+          "https://i0.hdslb.com/bfs/article/2d392a5ab0676e153355d850c13a93f16d5eb7a0.jpg",
+          "https://i0.hdslb.com/bfs/article/e19cb5691afbe77c003b535759cda619b2d813cb.jpg",
+        ],
+        page_url: "https://www.bilibili.com/read/cv7360489",
+        artist_name: "时光印记2016",
+        tag_name: "bilibili_285452636",
+        profile_url: "https://space.bilibili.com/285452636",
+        artist_commentary_title: "斗罗大陆 4,觉醒后的古月娜(第一期)",
+        dtext_artist_commentary_desc: "\n\n超喜欢2345678910\n\n不定时更新,兴趣爱好!\n\n",
+        tags: []
+      )
+    end
+
+    context "A bilibili image url" do
+      strategy_should_work(
+        "https://i0.hdslb.com/bfs/activity-plat/static/2cf2b9af5d3c5781d611d6e36f405144/E738vcDvd3.png",
+        image_urls: ["https://i0.hdslb.com/bfs/activity-plat/static/2cf2b9af5d3c5781d611d6e36f405144/E738vcDvd3.png"],
+        profile_url: "",
+        artist_id: "",
+        page_url: ""
+      )
+    end
+
+    context "A bilibili image url with embedded artist ID" do
+      strategy_should_work(
+        "https://i0.hdslb.com/bfs/new_dyn/675526fd8baa2f75d7ea0e7ea957bc0811742550.jpg@1036w.webp",
+        image_urls: ["https://i0.hdslb.com/bfs/new_dyn/675526fd8baa2f75d7ea0e7ea957bc0811742550.jpg"],
+        artist_id: "11742550",
+        profile_url: "https://space.bilibili.com/11742550"
+      )
+    end
+
+    should "Parse Bilibili URLs correctly" do
+      assert_equal("https://h.bilibili.com/8773541", Source::URL.page_url("https://www.bilibili.com/p/h5/8773541"))
+      assert_equal("https://t.bilibili.com/612214375070704555", Source::URL.page_url("https://m.bilibili.com/dynamic/612214375070704555"))
+      assert_equal("https://www.bilibili.com/video/av439451", Source::URL.page_url("http://www.bilibili.tv/video/av439451"))
+
+      assert(Source::URL.page_url?("https://t.bilibili.com/612214375070704555"))
+      assert(Source::URL.page_url?("https://h.bilibili.com/8773541"))
+      assert(Source::URL.page_url?("https://www.bilibili.com/read/cv7360489"))
+      assert(Source::URL.page_url?("https://www.bilibili.com/video/BV1dY4y1u7Vi"))
+
+      assert(Source::URL.image_url?("https://i0.hdslb.com/bfs/new_dyn/675526fd8baa2f75d7ea0e7ea957bc0811742550.jpg"))
+      assert(Source::URL.image_url?("https://i0.hdslb.com/bfs/album/37f77871d417c76a08a9467527e9670810c4c442.gif"))
+
+      assert(Source::URL.profile_url?("https://space.bilibili.com/355143"))
+
+      assert_not(Source::URL.profile_url?("https://space.bilibili.com"))
+    end
+  end
+end