Merge pull request #5329 from nonamethanks/feat-bilibili

Add bilibili support
This commit is contained in:
evazion
2022-11-09 01:17:18 -06:00
committed by GitHub
8 changed files with 417 additions and 10 deletions

View File

@@ -56,6 +56,7 @@ module Source
Source::Extractor::Anifty,
Source::Extractor::Furaffinity,
Source::Extractor::Reddit,
Source::Extractor::Bilibili,
]
# Should return true if the extractor is configured correctly. Return false

View File

@@ -0,0 +1,159 @@
# frozen_string_literal: true
# @see Source::URL::Bilibili
module Source
class Extractor
class Bilibili < Source::Extractor
# Whether the URL handed to this extractor was parsed as a Bilibili URL.
#
# @return [Boolean]
def match?
  parsed_url.is_a?(Source::URL::Bilibili)
end
# The full-size image URLs for the post or image this extractor was given.
#
# Resolution order:
# 1. the given URL is itself an image with a known full-size form;
# 2. the API payload (`data`) of a t.bilibili.com or h.bilibili.com post;
# 3. the <img> tags scraped from a bilibili.com/read article page;
# 4. otherwise the given URL, unchanged.
#
# @return [Array<String>]
def image_urls
  if parsed_url&.full_image_url.present?
    [parsed_url.full_image_url]
  elsif data.present?
    sources =
      if t_work_id.present?
        data.dig("modules", "module_dynamic", "major", "draw", "items").to_a.pluck("src")
      elsif h_work_id.present?
        data.dig("item", "pictures").to_a.pluck("img_src")
      end

    # `&.` keeps the raw URL when it can't be parsed, instead of raising on nil.
    sources.to_a.compact.map { |src| Source::URL.parse(src)&.full_image_url || src }
  elsif article_id.present?
    page&.search("#article-content img").to_a.pluck("data-src").compact.map do |src|
      # data-src may be protocol-relative ("//i0.hdslb.com/..."); make it
      # absolute so that the fallback value is a usable URL too.
      absolute_url = URI.join("https://", src).to_s
      Source::URL.parse(absolute_url)&.full_image_url || absolute_url
    end
  else
    [parsed_url.original_url]
  end
end
# The canonical page URL of the post: the one reported by the API for
# t.bilibili.com posts, else the one derived from the given URL, else the
# one derived from the referer.
#
# @return [String, nil]
def page_url
  from_api = t_work_page
  return from_api if from_api

  from_url = parsed_url.page_url
  return from_url if from_url

  parsed_referer&.page_url
end
# The page URL of a t.bilibili.com post, built from the ID the API reports.
# For a repost, `data` holds the original post, so this points at the
# original rather than at the repost.
#
# Returns nil when the URL isn't a t.bilibili.com post or when the API gave
# no ID (e.g. the request failed), so that callers can fall back to the URL
# they were given instead of getting a bare "https://t.bilibili.com/".
#
# @return [String, nil]
def t_work_page
  return unless t_work_id.present?

  id_str = data["id_str"]
  return unless id_str.present?

  "https://t.bilibili.com/#{id_str}"
end
# The commentary title. Only bilibili.com/read articles have one; it is
# scraped from the article page. Other post types return nil.
#
# @return [String, nil]
def artist_commentary_title
  return unless article_id.present?

  title_node = page&.at(".article-container .title")
  title_node&.text&.squish&.strip
end
# The commentary body, as HTML or plain text depending on the post type.
#
# * t.bilibili.com posts: the API's rich text nodes joined into one string.
#   Video references become <a> links, emoji become their icon URL, and all
#   other nodes contribute their plain text.
# * h.bilibili.com posts: the description field of the API payload.
# * bilibili.com/read articles: the HTML of the article content element.
#
# @return [String, nil]
def artist_commentary_desc
  if t_work_id.present?
    # `.to_a` covers posts without a description and failed API requests,
    # where the node list is missing.
    nodes = data.dig("modules", "module_dynamic", "desc", "rich_text_nodes").to_a

    nodes.map do |node|
      case node["type"]
      when "RICH_TEXT_NODE_TYPE_BV"
        "<a href='#{URI.join("https://", node["jump_url"])}'>#{node["text"]}</a>"
      when "RICH_TEXT_NODE_TYPE_EMOJI"
        " #{node.dig("emoji", "icon_url")} "
      else # RICH_TEXT_NODE_TYPE_AT (mentions), RICH_TEXT_NODE_TYPE_TEXT (text), RICH_TEXT_NODE_TYPE_TOPIC (hashtags)
        node["text"]
      end
    end.join
  elsif h_work_id.present?
    data.dig("item", "description")
  elsif article_id.present?
    page&.at("#article-content")&.to_html
  end
end
# The commentary body passed through DText.from_html.
def dtext_artist_commentary_desc
  html = artist_commentary_desc
  DText.from_html(html)
end
# The post's tags as [name, url] pairs.
#
# t.bilibili.com posts take them from the topic (hashtag) nodes of the
# description, with the surrounding "#" removed; h.bilibili.com posts take
# them from the API's tag list. Other post types have none.
#
# @return [Array<Array(String, String)>]
def tags
  names =
    if t_work_id.present?
      nodes = data.dig("modules", "module_dynamic", "desc", "rich_text_nodes").to_a
      topics = nodes.select { |node| node["type"] == "RICH_TEXT_NODE_TYPE_TOPIC" }
      topics.map { |topic| topic["text"].gsub(/(^#|#$)/, "") }
    elsif h_work_id.present?
      data.dig("item", "tags").to_a.pluck(:tag)
    end

  # bilibili.com/read/:id posts have no tags that I could find
  return [] if names.nil?

  names.map { |name| [name, "https://t.bilibili.com/topic/name/#{name}"] }
end
# The artist's display name: taken from the API payload for t.bilibili.com
# and h.bilibili.com posts, scraped from the page for articles.
#
# @return [String, nil]
def artist_name
  return data.dig("modules", "module_author", "name") if t_work_id.present?
  return data.dig("user", "name") if h_work_id.present?
  return unless article_id.present?

  name_node = page&.at(".article-container .up-name")
  name_node&.text&.squish&.strip
end
# A suggested tag name for the artist, "bilibili_<artist id>", or nil when
# the artist ID is unknown.
#
# @return [String, nil]
def tag_name
  "bilibili_#{artist_id}" if artist_id.present?
end
# The artist's Bilibili user ID: from the API payload or article page when
# available, else from the given URL, else from the referer.
def artist_id
  from_data = artist_id_from_data
  return from_data if from_data

  from_url = parsed_url.artist_id
  return from_url if from_url

  parsed_referer&.artist_id
end
# The artist ID as reported by the site itself: the author's `mid` for
# t.bilibili.com posts, the user's `uid` for h.bilibili.com posts, or the ID
# parsed out of the author link on an article page.
#
# Returns nil when the ID can't be determined, including when the article
# page or its author link is unavailable.
def artist_id_from_data
  if t_work_id.present?
    data.dig("modules", "module_author", "mid")
  elsif h_work_id.present?
    data.dig("user", "uid")
  elsif article_id.present?
    author_href = page&.at(".article-container .up-name")&.[]("href")
    # URI.join raises on nil, so bail out when the page couldn't be fetched
    # or has no author link.
    return if author_href.blank?

    # The href may be protocol-relative; make it absolute before parsing.
    Source::URL.parse(URI.join("https://", author_href).to_s)&.artist_id
  end
end
# The artist's space.bilibili.com profile URL, or nil when the artist ID is
# unknown.
#
# @return [String, nil]
def profile_url
  "https://space.bilibili.com/#{artist_id}" unless artist_id.blank?
end
# The ID of a t.bilibili.com post, taken from the given URL or else from the
# referer.
def t_work_id
  # for a repost this will be the ID of the repost, not the original one
  id_from_url = parsed_url.t_work_id
  id_from_url || parsed_referer&.t_work_id
end
# The ID of an h.bilibili.com post, taken from the given URL or else from the
# referer.
def h_work_id
  id_from_url = parsed_url.h_work_id
  id_from_url || parsed_referer&.h_work_id
end
# The ID of a bilibili.com/read article, taken from the given URL or else
# from the referer.
def article_id
  id_from_url = parsed_url.article_id
  id_from_url || parsed_referer&.article_id
end
# Fetches the post's page and returns the parsed response body.
# Returns nil when there is no page URL or the response status isn't 200.
def page
  return unless page_url.present?

  response = http.cache(1.minute).get(page_url)
  response.parse if response.status == 200
end
# Fetches `url` and returns the JSON body as a hash with indifferent access.
# Returns an empty hash when the response status isn't 200 or the body is
# not valid JSON.
#
# @param url [String] the API endpoint to request
# @return [Hash]
def get_json(url)
  response = http.cache(1.minute).get(url)

  if response.status == 200
    JSON.parse(response).with_indifferent_access
  else
    {}
  end
rescue JSON::ParserError
  {}
end
# The API payload describing the post.
#
# * t.bilibili.com posts: the "item" of the dynamic detail response, or its
#   "orig" entry when that carries an ID (i.e. the post is a repost).
# * h.bilibili.com posts: the "data" of the doc detail response.
# * anything else: an empty hash.
#
# @return [Hash]
def data
  if t_work_id.present?
    json = get_json("https://api.bilibili.com/x/polymer/web-dynamic/v1/detail?timezone_offset=-60&id=#{t_work_id}")
    is_repost = json.dig("data", "item", "orig", "id_str").present?

    is_repost ? json.dig("data", "item", "orig") : json.dig("data", "item").to_h
  elsif h_work_id.present?
    json = get_json("https://api.vc.bilibili.com/link_draw/v1/doc/detail?doc_id=#{h_work_id}")
    json["data"].to_h
  else
    {}
  end
end
memoize :data, :page
end
end
end

View File

@@ -59,6 +59,7 @@ module Source
Source::URL::Weibo,
Source::URL::Anifty,
Source::URL::Furaffinity,
Source::URL::Bilibili,
]
# Parse a URL into a subclass of Source::URL, or raise an exception if the URL is not a valid HTTP or HTTPS URL.

View File

@@ -7,10 +7,6 @@ class Source::URL::Anifty < Source::URL
url.domain == "anifty.jp" || url.host == "anifty.imgix.net" || (url.host == "storage.googleapis.com" && url.path.include?("/anifty-media/"))
end
def site_name
"Anifty"
end
def parse
case [host, *path_segments]

View File

@@ -0,0 +1,107 @@
# frozen_string_literal: true
# Unsupported:
# * https://www.bilibili.com/festival/arknights2022?bvid=BV1sr4y1e7gQ
# * https://game.bilibili.com/sssj/#character
# * http://i0.hdslb.com/Wallpaper/bilibili_chun.jpg
# * https://www.bilibili.com/html/bizhi.html
module Source
class URL
class Bilibili < Source::URL
attr_reader :file, :t_work_id, :h_work_id, :video_id, :article_id, :artist_id
# Whether `url` belongs to Bilibili: the main site, its hdslb.com image CDN,
# or the bilibili.tv domain. bilibili.tv has to be accepted here because
# `parse` has patterns for bilibili.tv video URLs, which would otherwise
# never be reached.
#
# @param url [#domain] the URL to test
# @return [Boolean]
def self.match?(url)
  url.domain.in?(["bilibili.com", "bilibili.tv", "hdslb.com"])
end
# Destructures [subdomain, domain, *path_segments] with pattern matching and
# stores whichever identifiers the URL carries in instance variables (@file,
# @artist_id, @t_work_id, @h_work_id, @article_id, @video_id). A URL that
# matches no pattern sets nothing.
#
# Patterns are tried top to bottom. The "new_dyn" image pattern has to stay
# ahead of the generic "bfs" one: a new_dyn filename (32 word characters plus
# an 8+ digit artist ID) can also satisfy the generic 40-character pattern,
# which doesn't capture the artist ID.
#
# The `=> file` bindings and the `subsite`, `subpath` and `rest` captures are
# not read afterwards; they only constrain the shape of the match.
# NOTE(review): $1/$2 are presumably set by the regex of the pattern that
# just matched — confirm before reordering or adding regex patterns.
def parse
case [subdomain, domain, *path_segments]
# https://i0.hdslb.com/bfs/new_dyn/675526fd8baa2f75d7ea0e7ea957bc0811742550.jpg@1036w.webp
# https://i0.hdslb.com/bfs/new_dyn/716a9733fc804d11d823cfacb7a3c78b11742550.jpg@208w_208h_1e_1c.webp
in _, "hdslb.com", "bfs", "new_dyn", /^(\w{32}(\d{8,})\.\w+)(?:@\w+\.\w+)?$/ => file
@file = $1
@artist_id = $2
# https://i0.hdslb.com/bfs/album/37f77871d417c76a08a9467527e9670810c4c442.gif@1036w.webp
# https://i0.hdslb.com/bfs/album/37f77871d417c76a08a9467527e9670810c4c442.gif
# https://i0.hdslb.com/bfs/article/48e75b3871fa5ed62b4e3a16bf60f52f96b1b3b1.jpg@942w_1334h_progressive.webp
in _, "hdslb.com", "bfs", subsite, /^(\w{40}\.\w+)(?:@\w+\.\w+)?$/ => file
@file = $1
# https://i0.hdslb.com/bfs/activity-plat/static/2cf2b9af5d3c5781d611d6e36f405144/E738vcDvd3.png
in _, "hdslb.com", "bfs", subsite, "static", subpath, /^\w+\.\w+$/ => file
# pass
# https://t.bilibili.com/686082748803186697
# https://t.bilibili.com/723052706467414039?spm_id_from=333.999.0.0 (quoted repost)
in "t", "bilibili.com", /^\d+$/ => t_work_id
@t_work_id = t_work_id
# https://m.bilibili.com/dynamic/612214375070704555
in "m", "bilibili.com", "dynamic", /^\d+$/ => t_work_id
@t_work_id = t_work_id
# https://h.bilibili.com/83341894
in "h", "bilibili.com", /^\d+$/ => h_work_id
@h_work_id = h_work_id
# https://www.bilibili.com/p/h5/8773541
in ("www" | ""), "bilibili.com", "p", "h5", /^\d+$/ => h_work_id
@h_work_id = h_work_id
# https://www.bilibili.com/read/cv7360489
in ("www" | ""), "bilibili.com", "read", /^cv(\d+)$/
@article_id = $1
# https://space.bilibili.com/355143
# https://space.bilibili.com/476725595/dynamic
# https://space.bilibili.com/476725595/video
in "space", "bilibili.com", /^\d+$/ => artist_id, *rest
@artist_id = artist_id
# https://www.bilibili.com/video/BV1dY4y1u7Vi/
# http://www.bilibili.tv/video/av439451/
in ("www" | "m" | ""), ("bilibili.com" | "bilibili.tv"), "video", video_id
@video_id = video_id
# https://www.bilibili.com/s/video/BV18b4y1X7av
in ("www" | "m" | ""), ("bilibili.com" | "bilibili.tv"), "s", "video", video_id
@video_id = video_id
else
nil
end
end
# Whether this URL points at an image, i.e. is hosted on the hdslb.com CDN.
#
# @return [Boolean]
def image_url?
  ["hdslb.com"].include?(domain)
end
# The URL of the full-size image: the original URL with any trailing
# "@<params>.<ext>" resize suffix removed. Returns nil when no image
# filename was recognized in the URL.
#
# @return [String, nil]
def full_image_url
  return unless file.present?

  original_url.gsub(/(\.\w+)@\w+\.\w+$/, "\\1")
end
# The canonical page URL for whichever post, article or video ID was parsed
# from this URL, or nil when none was.
#
# @return [String, nil]
def page_url
  return "https://t.bilibili.com/#{t_work_id}" if t_work_id.present?
  return "https://h.bilibili.com/#{h_work_id}" if h_work_id.present?
  return "https://www.bilibili.com/read/cv#{article_id}" if article_id.present?

  "https://www.bilibili.com/video/#{video_id}" if video_id.present?
end
# The artist's space.bilibili.com profile URL, or nil when no artist ID was
# parsed from this URL.
#
# @return [String, nil]
def profile_url
  return unless artist_id.present?

  "https://space.bilibili.com/#{artist_id}"
end
end
end
end

View File

@@ -27,10 +27,6 @@ module Source
url.domain.in?(%w[nicovideo.jp nicoseiga.jp nicomanga.jp nimg.jp nico.ms])
end
def site_name
"Nico Seiga"
end
def parse
case [host, *path_segments]

View File

@@ -93,8 +93,8 @@ class ArtistURL < ApplicationRecord
def priority
sites = %w[
Pixiv Twitter
Anifty ArtStation Baraag BCY Booth Deviant\ Art Hentai\ Foundry Fantia Furaffinity Foundation Lofter Nico\ Seiga Nijie Pawoo Fanbox Pixiv\ Sketch Plurk Tinami Tumblr Weibo
Ask.fm Facebook FC2 Gumroad Instagram Ko-fi Livedoor Mihuashi Mixi.jp Patreon Piapro.jp Picarto Privatter Sakura.ne.jp Stickam Skeb Twitch Youtube
Anifty ArtStation Baraag Bilibili BCY Booth Deviant\ Art Fantia Foundation Furaffinity Hentai\ Foundry Lofter Newgrounds Nico\ Seiga Nijie Pawoo Fanbox Pixiv\ Sketch Plurk Reddit Skeb Tinami Tumblr Weibo
Ask.fm Facebook FC2 Gumroad Instagram Ko-fi Livedoor Mihuashi Mixi.jp Patreon Piapro.jp Picarto Privatter Sakura.ne.jp Stickam Twitch Youtube
Amazon Circle.ms DLSite Doujinshi.org Erogamescape Mangaupdates Melonbooks Toranoana Wikipedia
]

View File

@@ -0,0 +1,147 @@
require 'test_helper'
module Sources
class BilibiliTest < ActiveSupport::TestCase
# A regular t.bilibili.com post with nine images. The expected commentary
# covers both a video link node and an emoji node (rendered as its icon URL).
# NOTE(review): strategy_should_work is defined outside this file; presumably
# it checks each listed attribute against the extractor's output — confirm.
context "A t.bilibili.com/:id post" do
strategy_should_work(
"https://t.bilibili.com/686082748803186697",
image_urls: [
"https://i0.hdslb.com/bfs/new_dyn/675526fd8baa2f75d7ea0e7ea957bc0811742550.jpg",
"https://i0.hdslb.com/bfs/new_dyn/4c6b93d5e85b8ed5b84c3f04909f195711742550.jpg",
"https://i0.hdslb.com/bfs/new_dyn/e1a1e6be01b6c68f6610cdf1d127f38311742550.jpg",
"https://i0.hdslb.com/bfs/new_dyn/9ff31bbe8005aa1b9c438e1b2e6ce81111742550.jpg",
"https://i0.hdslb.com/bfs/new_dyn/716a9733fc804d11d823cfacb7a3c78b11742550.jpg",
"https://i0.hdslb.com/bfs/new_dyn/fa42eaa6ee9cd2a896cadc41e16ab62b11742550.jpg",
"https://i0.hdslb.com/bfs/new_dyn/fc9553ff7e4ad1185e0379b3ccf7e2d911742550.jpg",
"https://i0.hdslb.com/bfs/new_dyn/da95475b858be577fc8c79bd22b7519e11742550.jpg",
"https://i0.hdslb.com/bfs/new_dyn/60a3c652b362c54bc61ea3365258d1d111742550.jpg",
],
page_url: "https://t.bilibili.com/686082748803186697",
artist_name: "哈米伦的弄笛者",
tag_name: "bilibili_11742550",
profile_url: "https://space.bilibili.com/11742550",
tags: [],
artist_commentary_title: nil,
dtext_artist_commentary_desc: "\"【崩坏3】少女泳装夏日时光\":[https://www.bilibili.com/video/BV1fB4y1Y7zt] 新视频的图片分享!大家记得来康 http://i0.hdslb.com/bfs/emote/d8c665db9fdc69b3b90c71de3fe05536ac795409.png "
)
end
# A repost: the expected page_url, artist and images are those of the
# original post (722702993036673113), not of the repost URL given as input.
context "A t.bilibili.com:id repost" do
strategy_should_work(
"https://t.bilibili.com/723052706467414039?spm_id_from=333.999.0.0",
image_urls: [
"https://i0.hdslb.com/bfs/new_dyn/fd40435a0ff15d2eed45da7c0f890bdf15817819.jpg",
"https://i0.hdslb.com/bfs/new_dyn/1beb12760dc8790f7443515307225ad015817819.jpg",
"https://i0.hdslb.com/bfs/new_dyn/113aacf139984f808721f50883e908b815817819.jpg",
"https://i0.hdslb.com/bfs/new_dyn/ad1537c506b87ce2c30e19e4ef54204715817819.jpg",
"https://i0.hdslb.com/bfs/new_dyn/4a098d62f90d17bf516e3edded670d5e15817819.jpg",
"https://i0.hdslb.com/bfs/new_dyn/89397fe05083ee25879962afba60a70515817819.jpg",
],
page_url: "https://t.bilibili.com/722702993036673113",
artist_name: "星尘Official",
tag_name: "bilibili_15817819",
profile_url: "https://space.bilibili.com/15817819",
tags: [],
artist_commentary_desc: " http://i0.hdslb.com/bfs/emote/fd8aa275d5d91cdf71410bc1a738415fd6e2ab86.png "
)
end
# A post without images: image_urls is expected to be empty, and each hashtag
# is expected as a [name, topic URL] pair with the surrounding "#" removed.
context "A text-only t.bilibili.com post with hashtags" do
strategy_should_work(
"https://t.bilibili.com/707554407156285477",
image_urls: [],
profile_url: "https://space.bilibili.com/476720460",
tags: [
["一起用原神痛车", "https://t.bilibili.com/topic/name/一起用原神痛车"],
["凯迪拉克原神联名座驾", "https://t.bilibili.com/topic/name/凯迪拉克原神联名座驾"],
["原神", "https://t.bilibili.com/topic/name/原神"],
["凯迪拉克原神联动", "https://t.bilibili.com/topic/name/凯迪拉克原神联动"],
["风起雷涌特别的旅途", "https://t.bilibili.com/topic/name/风起雷涌特别的旅途"],
["凯迪拉克CT4", "https://t.bilibili.com/topic/name/凯迪拉克CT4"],
["凯迪拉克XT4", "https://t.bilibili.com/topic/name/凯迪拉克XT4"],
]
)
end
# An h.bilibili.com album post. The commentary is expected as the plain-text
# description; the hashtag inside it stays part of the text.
context "A h.bilibili.com/:id post" do
strategy_should_work(
"https://h.bilibili.com/83341894",
image_urls: [
"https://i0.hdslb.com/bfs/album/669c0974a2a7508cbbb60b185eddaa0ccf8c5b7a.jpg",
"https://i0.hdslb.com/bfs/album/de8043c382b9eb022519380bf6b570285ea3bf81.gif",
"https://i0.hdslb.com/bfs/album/0ea658d4a9b2323665b6a5b6df6eff0e23e98c22.gif",
"https://i0.hdslb.com/bfs/album/6448067578847d7006c6a94ffc56d6fde30b8b1e.gif",
"https://i0.hdslb.com/bfs/album/ef2a9939264ff1e98cb4653c5b427c1d32e5ff24.gif",
"https://i0.hdslb.com/bfs/album/6198a9290219be0775d214cfa16afb02e8b357f7.gif",
],
artist_commentary_title: nil,
artist_name: "明日方舟",
profile_url: "https://space.bilibili.com/161775300",
page_url: "https://h.bilibili.com/83341894",
tag_name: "bilibili_161775300",
artist_commentary_desc: "#明日方舟#\n【新增服饰】\n//灿阳朝露 SD01 - 临光\nMARTHE [珊瑚海岸/CoralCoast]灿阳朝露系列泳衣01款。贴身、透气、轻便专为夏日而生。\n\n即使是耀骑士,在海边的太阳前依旧要涂好防晒霜竖起遮阳伞。 "
)
end
# An article: the only post type for which a commentary title is expected.
# No tags are expected.
context "A bilibili.com/read/:id post" do
strategy_should_work(
"https://www.bilibili.com/read/cv7360489",
image_urls: [
"https://i0.hdslb.com/bfs/article/48e75b3871fa5ed62b4e3a16bf60f52f96b1b3b1.jpg",
"https://i0.hdslb.com/bfs/article/72de3b6de4465fcb14c719354d8aeb55e93aa022.jpg",
"https://i0.hdslb.com/bfs/article/f6f56a387517ecf3a721228f8da6b21ffbf92210.jpg",
"https://i0.hdslb.com/bfs/article/7ac6fd23295eab8d3f62254187c34ae4867ea889.jpg",
"https://i0.hdslb.com/bfs/article/f90d0110964e3794aca245b1a4b5d934156d231f.jpg",
"https://i0.hdslb.com/bfs/article/b5a85177d15f3c53d06fae45ba53af3e64f7af14.jpg",
"https://i0.hdslb.com/bfs/article/3ca6ec1056eb8dfb6e9fde732146b8244fd605ad.jpg",
"https://i0.hdslb.com/bfs/article/1e860b392bef10f07e5abb7866e82998419f586a.jpg",
"https://i0.hdslb.com/bfs/article/2d392a5ab0676e153355d850c13a93f16d5eb7a0.jpg",
"https://i0.hdslb.com/bfs/article/e19cb5691afbe77c003b535759cda619b2d813cb.jpg",
],
page_url: "https://www.bilibili.com/read/cv7360489",
artist_name: "时光印记2016",
tag_name: "bilibili_285452636",
profile_url: "https://space.bilibili.com/285452636",
artist_commentary_title: "斗罗大陆 4觉醒后的古月娜第一期",
dtext_artist_commentary_desc: "\n\n超喜欢2345678910\n\n不定时更新,兴趣爱好!\n\n",
tags: []
)
end
# A static CDN image that carries neither a post ID nor an artist ID: the
# URL itself is the only expected image.
# NOTE(review): the expectations use "" where the extractor methods return
# nil for missing values; presumably strategy_should_work treats the two as
# equivalent — confirm.
context "A bilibili image url" do
strategy_should_work(
"https://i0.hdslb.com/bfs/activity-plat/static/2cf2b9af5d3c5781d611d6e36f405144/E738vcDvd3.png",
image_urls: ["https://i0.hdslb.com/bfs/activity-plat/static/2cf2b9af5d3c5781d611d6e36f405144/E738vcDvd3.png"],
profile_url: "",
artist_id: "",
page_url: ""
)
end
# A new_dyn image URL: the "@1036w.webp" resize suffix is expected to be
# stripped, and the artist ID is expected to come from the trailing digits of
# the filename.
context "A bilibili image url with embedded artist ID" do
strategy_should_work(
"https://i0.hdslb.com/bfs/new_dyn/675526fd8baa2f75d7ea0e7ea957bc0811742550.jpg@1036w.webp",
image_urls: ["https://i0.hdslb.com/bfs/new_dyn/675526fd8baa2f75d7ea0e7ea957bc0811742550.jpg"],
artist_id: "11742550",
profile_url: "https://space.bilibili.com/11742550"
)
end
# URL-parser checks that need no network access: alternate page URL forms are
# normalized to their canonical form, and page, image and profile URLs are
# classified as such. A bare space.bilibili.com URL without a user ID must
# not count as a profile URL.
should "Parse Bilibili URLs correctly" do
assert_equal("https://h.bilibili.com/8773541", Source::URL.page_url("https://www.bilibili.com/p/h5/8773541"))
assert_equal("https://t.bilibili.com/612214375070704555", Source::URL.page_url("https://m.bilibili.com/dynamic/612214375070704555"))
assert(Source::URL.page_url?("https://t.bilibili.com/612214375070704555"))
assert(Source::URL.page_url?("https://h.bilibili.com/8773541"))
assert(Source::URL.page_url?("https://www.bilibili.com/read/cv7360489"))
assert(Source::URL.page_url?("https://www.bilibili.com/video/BV1dY4y1u7Vi"))
assert(Source::URL.image_url?("https://i0.hdslb.com/bfs/new_dyn/675526fd8baa2f75d7ea0e7ea957bc0811742550.jpg"))
assert(Source::URL.image_url?("https://i0.hdslb.com/bfs/album/37f77871d417c76a08a9467527e9670810c4c442.gif"))
assert(Source::URL.profile_url?("https://space.bilibili.com/355143"))
assert_not(Source::URL.profile_url?("https://space.bilibili.com"))
end
end
end