sources: rename Sources::Strategies to Source::Extractor.

Rename Sources::Strategies to Source::Extractor. A Source::Extractor
represents a thing that extracts information from a given URL.
This commit is contained in:
evazion
2022-03-24 03:05:10 -05:00
parent 34aa22f90b
commit d9d3c1dfe4
63 changed files with 622 additions and 606 deletions

View File

@@ -0,0 +1,101 @@
# frozen_string_literal: true
# @see Source::URL::ArtStation
class Source::Extractor
  # Extractor for ArtStation (artstation.com) projects and direct image URLs.
  class ArtStation < Source::Extractor
    def match?
      Source::URL::ArtStation === parsed_url
    end

    # If the input URL is already a direct image URL, upgrade it to the best
    # available size; otherwise list the images from the project API.
    def image_urls
      if parsed_url.image_url?
        [asset_url(url)]
      else
        image_urls_from_api
      end
    end

    def page_url
      return nil if project_id.blank?

      if artist_name.present?
        "https://#{artist_name}.artstation.com/projects/#{project_id}"
      else
        "https://www.artstation.com/artwork/#{project_id}"
      end
    end

    def profile_url
      return nil if artist_name.blank?
      "https://www.artstation.com/#{artist_name}"
    end

    def artist_name
      artist_name_from_url || api_response.dig(:user, :username)
    end

    def artist_commentary_title
      api_response[:title]
    end

    def artist_commentary_desc
      api_response[:description]
    end

    def dtext_artist_commentary_desc
      ActionView::Base.full_sanitizer.sanitize(artist_commentary_desc)
    end

    def tags
      api_response[:tags].to_a.map do |tag|
        [tag, "https://www.artstation.com/search?q=" + CGI.escape(tag)]
      end
    end

    def image_urls_from_api
      api_response[:assets].to_a.map do |asset|
        if asset[:asset_type] == "image"
          asset_url(asset[:image_url])
        elsif asset[:asset_type] == "video_clip"
          next # XXX Skip for now; actually downloading these videos requires bypassing a Cloudflare captcha.

          # Dead code below is kept deliberately: it documents how the video
          # URL would be extracted once the captcha issue is solved.
          url = Nokogiri::HTML5.parse(asset[:player_embedded]).at("iframe").attr("src")
          next if url.nil?

          response = http.cache(1.minute).get(url)
          next if response.status != 200
          response.parse.at("video source").attr("src")
        end
      end.compact
    end

    def artist_name_from_url
      parsed_url.username || parsed_referer&.username
    end

    def project_id
      parsed_url.work_id || parsed_referer&.work_id
    end

    # The project's JSON API response, or {} when the project id is unknown
    # or the request fails.
    def api_response
      return {} if project_id.blank?

      resp = http.cache(1.minute).get("https://www.artstation.com/projects/#{project_id}.json")
      return {} if resp.code != 200

      resp.parse.with_indifferent_access
    end
    memoize :api_response

    # Given an image URL, probe each size from largest to smallest and return
    # the first variant that exists, falling back to the URL as given.
    #
    # Renamed the internals: the previous version shadowed both the
    # `parsed_url` method (with a local) and its own `url` parameter (with a
    # block parameter), which was confusing and warning-prone.
    def asset_url(image_url)
      parsed_asset = Source::URL.parse(image_url)
      image_sizes = %w[original 4k large medium small]
      candidates = image_sizes.map { |size| parsed_asset.full_image_url(size) }

      candidates.find { |candidate| http_exists?(candidate) } || image_url
    end
  end
end

View File

@@ -0,0 +1,209 @@
# frozen_string_literal: true
module Source
class Extractor
# Extractor for DeviantArt. Fetches deviation metadata through the official
# OAuth API (DeviantArtApiClient), scraping the HTML page only to obtain the
# deviation UUID the API requires. Falls back to the raw URL when the work is
# private, deleted, or the URL carries no deviation id.
class DeviantArt < Source::Extractor
# Only active when API credentials are configured.
def self.enabled?
Danbooru.config.deviantart_client_id.present? && Danbooru.config.deviantart_client_secret.present?
end
def match?
Source::URL::DeviantArt === parsed_url
end
def image_urls
[image_url]
end
# Chooses the best image source: download link > flash > largest video >
# rewritten content URL, in that priority order.
def image_url
# work is private, deleted, or the url didn't contain a deviation id; use image url as given by user.
if api_deviation.blank?
url
elsif api_deviation[:is_downloadable]
api_download[:src]
elsif api_deviation[:flash].present?
api_deviation.dig(:flash, :src)
elsif api_deviation[:videos].present?
# Pick the highest-quality video variant by file size.
api_deviation[:videos].max_by { |x| x[:filesize] }[:src]
else
src = api_deviation.dig(:content, :src)
# NOTE(review): deviations with id <= 790_677_560 hosted on wixmp appear to
# support full-size /intermediary URLs — confirm this cutoff is still valid.
if deviation_id && deviation_id.to_i <= 790_677_560 && src =~ %r{\Ahttps://images-wixmp-} && src !~ /\.gif\?/
src = src.sub(%r{(/f/[a-f0-9-]+/[a-f0-9-]+)}, '/intermediary\1')
src = src.sub(%r{/v1/(fit|fill)/.*\z}i, "")
end
src = src.sub(%r{\Ahttps?://orig\d+\.deviantart\.net}i, "http://origin-orig.deviantart.net")
# Request maximum JPEG quality from the wixmp CDN.
src = src.gsub(/q_\d+,strp/, "q_100")
src
end
end
def page_url
if stash_page.present?
stash_page
elsif api_deviation.present?
api_deviation[:url]
elsif deviation_id.present?
page_url_from_image_url
else
nil
end
end
def page_url_from_image_url
stash_page || parsed_url.page_url || parsed_referer&.page_url
end
# Sta.sh posts have the same image URLs as DeviantArt but different page URLs. We use the Sta.sh page if we have one.
#
# Image: https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/83d3eb4d-13e5-4aea-a08f-8d4331d033c4/dcmjs1s-389a7505-142d-4b34-a790-ab4ea1ec9eaa.png?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOjdlMGQxODg5ODIyNjQzNzNhNWYwZDQxNWVhMGQyNmUwIiwiaXNzIjoidXJuOmFwcDo3ZTBkMTg4OTgyMjY0MzczYTVmMGQ0MTVlYTBkMjZlMCIsIm9iaiI6W1t7InBhdGgiOiJcL2ZcLzgzZDNlYjRkLTEzZTUtNGFlYS1hMDhmLThkNDMzMWQwMzNjNFwvZGNtanMxcy0zODlhNzUwNS0xNDJkLTRiMzQtYTc5MC1hYjRlYTFlYzllYWEucG5nIn1dXSwiYXVkIjpbInVybjpzZXJ2aWNlOmZpbGUuZG93bmxvYWQiXX0.pIddc32BoLpAJt6D8YcRFonoVy9nC8RgROlYwMp3huo
# Page: https://sta.sh/01pwva4zzf98
def stash_page
if parsed_url.stash_id.present?
parsed_url.page_url
elsif parsed_referer&.stash_id.present?
parsed_referer.page_url
end
end
def profile_url
return nil if artist_name.blank?
"https://www.deviantart.com/#{artist_name.downcase}"
end
# Prefer the name from the url because the api metadata won't be present when
# the input url doesn't contain a deviation id, or the deviation is private or deleted.
def artist_name
if artist_name_from_url.present?
artist_name_from_url
elsif api_metadata.present?
api_metadata.dig(:author, :username)
else
nil
end
end
def artist_commentary_title
api_metadata[:title]
end
def artist_commentary_desc
api_metadata[:description]
end
def tags
if api_metadata.blank?
return []
end
api_metadata[:tags].map do |tag|
[tag[:tag_name], "https://www.deviantart.com/tag/#{tag[:tag_name]}"]
end
end
def dtext_artist_commentary_desc
DText.from_html(artist_commentary_desc) do |element|
# Convert embedded thumbnails of journal posts to 'deviantart #123'
# links. Strip embedded thumbnails of image posts. Example:
# https://sa-dui.deviantart.com/art/Commission-Meinos-Kaen-695905927.
if element.name == "a" && element["data-sigil"] == "thumb"
element.name = "span"
# <a href="https://sa-dui.deviantart.com/journal/About-Commissions-223178193" data-sigil="thumb" class="thumb lit" ...>
if element["class"].split.include?("lit")
deviation_id = element["href"][/-(\d+)\z/, 1].to_i
element.content = "deviantart ##{deviation_id}"
else
element.content = ""
end
end
if element.name == "a" && element["href"].present?
element["href"] = element["href"].gsub(%r{\Ahttps?://www\.deviantart\.com/users/outgoing\?}i, "")
# href may be missing the `http://` bit (ex: `inprnt.com`, `//inprnt.com`). Add it if missing.
uri = Addressable::URI.heuristic_parse(element["href"]) rescue nil
if uri.present? && uri.path.present?
uri.scheme ||= "http"
element["href"] = uri.to_s
end
end
end.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")
end
def deviation_id
parsed_url.work_id || parsed_referer&.work_id
end
def artist_name_from_url
parsed_url.username || parsed_referer&.username
end
# Fetches and parses the deviation's HTML page; nil when the work was deleted.
def page
return nil if page_url_from_image_url.blank?
resp = http.cache(1.minute).get(page_url_from_image_url, follow: {max_hops: 1})
if resp.status.success?
resp.parse
# the work was deleted
elsif resp.code == 404
nil
else
raise "failed to fetch page (got code #{resp.code})"
end
end
memoize :page
# Scrape UUID from <meta property="da:appurl" content="DeviantArt://deviation/12F08C5D-A3A4-338C-2F1A-7E4E268C0E8B">
# For hidden or deleted works the UUID will be nil.
def uuid
return nil if page.nil?
meta = page.at_css('meta[property="da:appurl"]')
return nil if meta.nil?
appurl = meta["content"]
uuid = appurl[%r{\ADeviantArt://deviation/(.*)\z}, 1]
uuid
end
memoize :uuid
# Authenticated API client; the access token is cached for reuse.
def api_client
api_client = DeviantArtApiClient.new(
Danbooru.config.deviantart_client_id,
Danbooru.config.deviantart_client_secret
)
api_client.access_token = Cache.get("da-access-token", 11.weeks) do
api_client.access_token.to_hash
end
api_client
end
memoize :api_client
def api_deviation
return {} if uuid.nil?
api_client.deviation(uuid)
end
memoize :api_deviation
def api_metadata
return {} if uuid.nil?
api_client.metadata(uuid)[:metadata].first
end
memoize :api_metadata
def api_download
return {} unless uuid.present? && api_deviation[:is_downloadable]
api_client.download(uuid)
end
memoize :api_download
def api_response
{
deviation: api_deviation,
metadata: api_metadata,
download: api_download
}
end
end
end
end

View File

@@ -0,0 +1,124 @@
# frozen_string_literal: true
# @see Source::URL::Fanbox
module Source
class Extractor
# Extractor for Pixiv Fanbox (fanbox.cc) posts, cover images, and direct
# image URLs. Uses the public api.fanbox.cc JSON endpoints.
class Fanbox < Source::Extractor
def match?
Source::URL::Fanbox === parsed_url
end
def image_urls
if parsed_url.image_url?
[parsed_url.full_image_url]
elsif api_response.present?
# There's two ways pics are returned via api:
# Pics in proper array: https://yanmi0308.fanbox.cc/posts/1141325
# Embedded pics (imageMap): https://www.fanbox.cc/@tsukiori/posts/1080657
images = api_response.dig("body", "images").to_a + api_response.dig("body", "imageMap").to_a.map { |id| id[1] }
# The following is needed because imageMap is sorted alphabetically rather than by image order
sort_order = api_response.dig("body", "blocks").to_a.map { |b| b["imageId"] if b["type"] == "image" }.compact.uniq
images = images.sort_by { |img| sort_order.index(img["id"]) } if sort_order.present?
images.pluck("originalUrl")
else
[]
end
end
def page_url
if artist_name.present? && illust_id.present?
"https://#{artist_name}.fanbox.cc/posts/#{illust_id}"
elsif parsed_url.image_url? && artist_name.present?
# Cover images
"https://#{artist_name}.fanbox.cc"
end
end
def profile_url
return if artist_name.blank?
"https://#{artist_name}.fanbox.cc"
end
def artist_name
artist_name_from_url || api_response["creatorId"] || artist_api_response["creatorId"]
end
def display_name
api_response.dig("user", "name") || artist_api_response.dig("user", "name")
end
def other_names
[artist_name, display_name].compact.uniq
end
def tags
api_response["tags"].to_a.map { |tag| [tag, "https://fanbox.cc/tags/#{tag}"] }
end
def artist_commentary_title
api_response["title"]
end
def artist_commentary_desc
body = api_response["body"]
return if body.blank?
if body["text"].present?
body["text"]
elsif body["blocks"].present?
# Reference: https://official.fanbox.cc/posts/182757
# Commentary can get pretty complex, but unfortunately it's served in json format so it's a pain to parse it.
# I've left out parsing external embeds because each supported site has its own id mapped to the domain
commentary = body["blocks"].map do |node|
if node["type"] == "image"
body["imageMap"][node["imageId"]]["originalUrl"]
else
node["text"] || "\n"
end
end
commentary.join("\n")
end
end
def illust_id
parsed_url.work_id || parsed_referer&.work_id
end
def artist_id_from_url
parsed_url.user_id || parsed_referer&.user_id
end
def artist_name_from_url
parsed_url.username || parsed_referer&.username
end
# Post metadata from the API; {} when the post id is unknown, the JSON is
# malformed, or the post is restricted with no visible body.
def api_response
return {} if illust_id.blank?
resp = client.get("https://api.fanbox.cc/post.info?postId=#{illust_id}")
json_response = JSON.parse(resp)["body"]
# At some point in 2020 fanbox stopped hiding R18 posts from the api
# This check exists in case they ever start blocking them again
return {} if json_response["restrictedFor"] == 2 && json_response["body"].blank?
json_response
rescue JSON::ParserError
{}
end
def artist_api_response
# Needed to fetch artist from cover pages
return {} if artist_id_from_url.blank?
resp = client.get("https://api.fanbox.cc/creator.get?userId=#{artist_id_from_url}")
JSON.parse(resp)["body"]
rescue JSON::ParserError
{}
end
# The API rejects requests without a fanbox.cc Origin header.
def client
@client ||= http.headers(Origin: "https://fanbox.cc").cache(1.minute)
end
end
end
end

View File

@@ -0,0 +1,163 @@
# frozen_string_literal: true
class Source::Extractor
  # Extractor for Fantia (fantia.jp) posts and products. Posts are fetched via
  # the JSON API; products are scraped from their HTML page.
  class Fantia < Source::Extractor
    # Only enabled when a fantia.jp session cookie is configured, since most
    # content requires a logged-in session.
    def self.enabled?
      Danbooru.config.fantia_session_id.present?
    end

    def match?
      Source::URL::Fantia === parsed_url
    end

    def image_urls
      return [parsed_url.full_image_url] if parsed_url.image_url?
      return [image_from_downloadable(parsed_url)] if parsed_url.downloadable?

      images = images_for_post.presence || images_for_product.presence || []

      full_images = images.compact.map do |image|
        parsed = Source::URL.parse(image)

        if parsed&.image_url?
          parsed.full_image_url
        elsif parsed&.downloadable?
          image_from_downloadable(parsed)
        else
          image
        end
      end

      full_images.compact.uniq
    end

    # Resolve a download URL (via HEAD) to its final redirected location;
    # returns the input URL unchanged if the request fails.
    def image_from_downloadable(url)
      resp = http.head(url)
      return url if resp.status != 200
      resp.uri.to_s
    end

    def images_for_post
      return [] unless api_response.present?

      images = [api_response.dig("post", "thumb_micro")]

      api_response.dig("post", "post_contents").to_a.map do |content|
        next if content["visible_status"] != "visible"

        case content["category"]
        when "photo_gallery"
          content["post_content_photos"].to_a.map { |i| images << i.dig("url", "original") }
        when "file"
          images << image_from_downloadable("https://www.fantia.jp/#{content["download_uri"]}")
        when "blog"
          # Blog-style sections embed their content as a JSON document inside
          # the comment field.
          begin
            sub_json = JSON.parse(content["comment"])
          rescue JSON::ParserError
            # BUGFIX: was `rescue Json::ParserError` (undefined constant), which
            # would have raised NameError instead of handling the parse failure.
            sub_json = {}
          end

          sub_json["ops"].to_a.map { |js| images << js.dig("insert", "fantiaImage", "url") }
        end
      end

      images
    end

    def images_for_product
      html_response&.css(".product-gallery-item .img-fluid").to_a.map do |element|
        element["src"] unless element["src"] =~ %r{/fallback/}
      end.compact
    end

    def page_url
      parsed_url.page_url || parsed_referer&.page_url
    end

    def tags
      case work_type
      when "post"
        api_response&.dig("post", "tags").to_a.map do |tag|
          [tag["name"], "https://fantia.jp/posts?tag=#{tag["name"]}"]
        end
      when "product"
        html_response&.css(".product-category a").to_a.map do |element|
          tag_name = element.text.delete_prefix("#")
          [tag_name, "https://fantia.jp/products?product_category=##{tag_name}"]
        end
      else
        []
      end
    end

    def other_names
      case work_type
      when "post"
        [api_response&.dig("post", "fanclub", "creator_name")].compact
      when "product"
        [html_response&.at(".fanclub-name a")&.text].compact
      end
    end

    def profile_url
      case work_type
      when "post"
        fanclub_id = api_response&.dig("post", "fanclub", "id")
        return unless fanclub_id.present?
        "https://fantia.jp/fanclubs/#{fanclub_id}"
      when "product"
        href = html_response&.at(".fanclub-name a")&.[]("href")
        return unless href.present?
        URI.join("https://fantia.jp/", href).to_s
      end
    end

    def artist_commentary_title
      case work_type
      when "post"
        api_response&.dig("post", "title")
      when "product"
        html_response&.at(".product-title")&.text
      end
    end

    def artist_commentary_desc
      case work_type
      when "post"
        api_response&.dig("post", "comment")
      when "product"
        html_response&.at(".product-description")&.text
      end
    end

    def dtext_artist_commentary_desc
      DText.from_html(artist_commentary_desc)
    end

    def work_type
      parsed_url.work_type || parsed_referer&.work_type
    end

    def work_id
      parsed_url.work_id || parsed_referer&.work_id
    end

    # JSON API response for posts; {} for products or on any failure.
    def api_response
      return {} unless work_type == "post"

      api_url = "https://fantia.jp/api/v1/posts/#{work_id}"
      response = http.cache(1.minute).get(api_url)
      return {} unless response.status == 200

      JSON.parse(response)
    rescue JSON::ParserError
      {}
    end

    # Parsed HTML page for products; nil for posts or on failure.
    def html_response
      return nil unless work_type == "product"

      response = http.cache(1.minute).get("https://fantia.jp/products/#{work_id}")
      return nil unless response.status == 200

      response.parse
    end

    # All requests carry the configured session cookie.
    def http
      Danbooru::Http.new.cookies(_session_id: Danbooru.config.fantia_session_id)
    end
  end
end

View File

@@ -0,0 +1,91 @@
# frozen_string_literal: true
# @see Source::URL::Foundation
module Source
  class Extractor
    # Extractor for Foundation (foundation.app) artwork pages. Metadata comes
    # from the Next.js __NEXT_DATA__ JSON blob embedded in the page.
    class Foundation < Source::Extractor
      def match?
        Source::URL::Foundation === parsed_url
      end

      def image_urls
        return [parsed_url.full_image_url] if parsed_url.full_image_url.present?

        src = page&.at(".fullscreen img, .fullscreen video")&.attr(:src)
        return [] if src.nil?

        [Source::URL.parse(src).full_image_url].compact
      end

      def page_url
        parsed_url.page_url || parsed_referer&.page_url
      end

      def page
        return nil if page_url.blank?

        response = http.cache(1.minute).get(page_url)
        response.parse if response.status == 200
      end

      def tags
        artwork_tags = api_response.dig("props", "pageProps", "artwork", "tags").to_a
        artwork_tags.map { |tag| [tag, "https://foundation.app/tags/#{tag}"] }
      end

      def artist_name
        parsed_url.username || parsed_referer&.username || api_response.dig("props", "pageProps", "artwork", "creator", "username")
      end

      def profile_url
        "https://foundation.app/@#{artist_name}" if artist_name.present?
      end

      def profile_urls
        [profile_url, creator_public_key_url].compact
      end

      def creator_public_key_url
        "https://foundation.app/#{creator_public_key}" unless creator_public_key.nil?
      end

      # The creator's wallet public key, as exposed in the page data.
      def creator_public_key
        api_response.dig("props", "pageProps", "artwork", "creator", "publicKey")
      end

      def artist_commentary_title
        return nil if page.blank?
        page.at("meta[property='og:title']")["content"].gsub(/ \| Foundation$/, "")
      end

      def artist_commentary_desc
        description_heading = page&.xpath("//h2[text()='Description']")&.first
        return nil if description_heading.blank?

        description_heading&.parent&.search("div").first&.to_html
      end

      def dtext_artist_commentary_desc
        DText.from_html(artist_commentary_desc)
      end

      # Parsed __NEXT_DATA__ JSON; {} when the page or blob is unavailable.
      def api_response
        return {} if page.nil?

        json_blob = page.at("#__NEXT_DATA__")&.text
        return {} if json_blob.blank?

        JSON.parse(json_blob).with_indifferent_access
      end
      memoize :api_response
    end
  end
end

View File

@@ -0,0 +1,74 @@
# frozen_string_literal: true
# @see Source::URL::HentaiFoundry
module Source
  class Extractor
    # Extractor for Hentai Foundry (hentai-foundry.com) picture pages.
    class HentaiFoundry < Source::Extractor
      def match?
        Source::URL::HentaiFoundry === parsed_url
      end

      def image_urls
        elements = page&.search("#picBox img")
        return [] if elements.nil?

        elements.to_a.map { |element| URI.join(page_url, element["src"]).to_s }
      end

      def page_url
        return nil if illust_id.blank?
        return "https://www.hentai-foundry.com/pic-#{illust_id}" if artist_name.blank?

        "https://www.hentai-foundry.com/pictures/user/#{artist_name}/#{illust_id}"
      end

      # Fetch and parse the picture page; `enterAgree=1` bypasses the age gate.
      def page
        return nil if page_url.blank?

        response = http.cache(1.minute).get("#{page_url}?enterAgree=1")
        response.parse if response.status == 200
      end

      def tags
        (page&.search(".boxbody [rel='tag']") || []).map do |tag_element|
          [tag_element.text, URI.join(page_url, tag_element.attr("href")).to_s]
        end
      end

      def artist_name
        parsed_url.username || parsed_referer&.username
      end

      def profile_url
        "https://www.hentai-foundry.com/user/#{artist_name}" if artist_name.present?
      end

      def artist_commentary_title
        page&.search("#picBox .imageTitle")&.text
      end

      def artist_commentary_desc
        page&.search("#descriptionBox .picDescript")&.to_html
      end

      def dtext_artist_commentary_desc
        DText.from_html(artist_commentary_desc).gsub(/\A[[:space:]]+|[[:space:]]+\z/, "").gsub(/\n+/, "\n")
      end

      def illust_id
        parsed_url.work_id || parsed_referer&.work_id
      end

      memoize :page
    end
  end
end

View File

@@ -0,0 +1,61 @@
# frozen_string_literal: true
# @see Source::URL::Lofter
module Source
  class Extractor
    # Extractor for Lofter (*.lofter.com) posts and images.
    class Lofter < Source::Extractor
      def match?
        Source::URL::Lofter === parsed_url
      end

      def image_urls
        return [parsed_url.full_image_url] if parsed_url.image_url?

        img_nodes = page&.search(".imgclasstag img")
        img_nodes.to_a.pluck("src").map { |src| Source::URL.parse(src).full_image_url }
      end

      def profile_url
        "https://#{artist_name}.lofter.com" if artist_name.present?
      end

      def page_url
        return nil if illust_id.blank? || profile_url.blank?

        "#{profile_url}/post/#{illust_id}"
      end

      def page
        return nil if page_url.blank?

        response = http.cache(1.minute).get(page_url)
        response.parse if response.status == 200
      end

      def tags
        return [] if artist_name.blank?

        page&.search("[href*='#{artist_name}.lofter.com/tag/']").to_a.map do |anchor|
          href = anchor.attr("href")
          [Source::URL.parse(href).unescaped_tag, href]
        end
      end

      def artist_commentary_desc
        page&.search(".ct .text, .content .text, .posts .photo .text").to_a.compact.first&.to_html
      end

      def illust_id
        parsed_url.work_id || parsed_referer&.work_id
      end

      def artist_name
        parsed_url.username || parsed_referer&.username
      end

      memoize :page
    end
  end
end

View File

@@ -0,0 +1,97 @@
# frozen_string_literal: true
# @see Source::URL::Mastodon
class Source::Extractor
# Extractor for the Mastodon instances Danbooru supports (Pawoo and Baraag).
# Status metadata is fetched through MastodonApiClient.
class Mastodon < Source::Extractor
def match?
Source::URL::Mastodon === parsed_url
end
# Maps the recognized site name to its API/web domain.
def domain
case site_name
when "Pawoo" then "pawoo.net"
when "Baraag" then "baraag.net"
end
end
def image_urls
if parsed_url.image_url?
[parsed_url.full_image_url]
else
api_response.image_urls
end
end
def page_url
artist_name = artist_name_from_url
status_id = status_id_from_url
return if status_id.blank?
if artist_name.present?
"https://#{domain}/@#{artist_name}/#{status_id}"
else
# Fallback web UI URL when the username is unknown.
"https://#{domain}/web/statuses/#{status_id}"
end
end
def profile_url
if artist_name_from_url.present?
"https://#{domain}/@#{artist_name_from_url}"
elsif api_response.present? && api_response.profile_url.present?
api_response.profile_url
end
end
def account_url
return if account_id.blank?
"https://#{domain}/web/accounts/#{account_id}"
end
def profile_urls
[profile_url, account_url].compact
end
def artist_name
api_response.account_name
end
def artist_name_from_url
parsed_url.username || parsed_referer&.username
end
def other_names
[api_response.display_name]
end
def account_id
parsed_url.user_id || parsed_referer&.user_id || api_response.account_id
end
def status_id_from_url
parsed_url.work_id || parsed_referer&.work_id
end
def artist_commentary_desc
api_response.commentary
end
def tags
api_response.tags
end
def dtext_artist_commentary_desc
DText.from_html(artist_commentary_desc) do |element|
if element.name == "a"
# don't include links to the toot itself.
media_urls = api_response.json["media_attachments"].map { |attr| attr["text_url"] }
element["href"] = nil if element["href"].in?(media_urls)
end
end.strip
end
def api_response
MastodonApiClient.new(domain, status_id_from_url)
end
memoize :api_response
end
end

View File

@@ -0,0 +1,96 @@
# frozen_string_literal: true
# @see Source::URL::Moebooru
module Source
class Extractor
# Extractor for Moebooru-based boorus (yande.re, konachan.com). Looks the
# post up via the site's post.json API and, when the post records an
# upstream source, delegates commentary/artist fields to the extractor for
# that source (sub_extractor).
class Moebooru < Source::Extractor
delegate :artist_name, :profile_url, :tag_name, :artist_commentary_title, :artist_commentary_desc, :dtext_artist_commentary_title, :dtext_artist_commentary_desc, to: :sub_extractor, allow_nil: true
delegate :site_name, :domain, to: :parsed_url
def match?
Source::URL::Moebooru === parsed_url
end
def image_urls
return [] if post_md5.blank? || file_ext.blank?
[Source::URL::Moebooru.full_image_url(site_name, post_md5, file_ext, post_id)]
end
def page_url
return nil if post_id.blank?
"https://#{domain}/post/show/#{post_id}"
end
def tags
api_response[:tags].to_s.split.map do |tag|
[tag, "https://#{domain}/post?tags=#{CGI.escape(tag)}"]
end
end
# XXX the base extractor excludes artist tags from the translated tags; we don't want that for moebooru.
def translated_tags
tags.map(&:first).flat_map(&method(:translate_tag)).uniq.sort
end
# Moebooru returns an empty array when doing an md5:<hash> search for a
# deleted post. Because of this, api_response may be empty in some cases.
def api_response
if post_id_from_url.present?
params = { tags: "id:#{post_id_from_url}" }
elsif post_md5_from_url.present?
params = { tags: "md5:#{post_md5_from_url}" }
else
return {}
end
response = http.cache(1.minute).get("https://#{domain}/post.json", params: params)
post = response.parse.first&.with_indifferent_access
post || {}
end
memoize :api_response
concerning :HelperMethods do
# Extractor for the post's recorded upstream source, if any.
def sub_extractor
@sub_extractor ||= Source::Extractor.find(api_response[:source], default: nil)
end
# Determines the file extension, trying progressively weaker sources:
# URL, API fields, then probing the server directly.
def file_ext
if parsed_url.original_file_ext.present?
parsed_url.original_file_ext
# file_ext is not present in konachan's api (only on yande.re)
elsif api_response[:file_ext].present?
api_response[:file_ext]
# file_url is not present in yande.re's api on deleted posts
elsif api_response[:file_url].present?
api_response[:file_url][/\.(jpg|jpeg|png|gif)\z/i, 1]
# the api_response wasn't available because it's a deleted post.
elsif post_md5.present?
%w[jpg png gif].find { |ext| http_exists?("https://#{domain}/image/#{post_md5}.#{ext}") }
else
nil
end
end
def post_id_from_url
parsed_url.work_id || parsed_referer&.work_id
end
def post_md5_from_url
parsed_url.md5 || parsed_referer&.md5
end
def post_id
post_id_from_url || api_response[:id]
end
def post_md5
post_md5_from_url || api_response[:md5]
end
end
end
end
end

View File

@@ -0,0 +1,87 @@
# frozen_string_literal: true
# @see Source::URL::Newgrounds
module Source
class Extractor
# Extractor for Newgrounds (newgrounds.com) art pages. All metadata is
# scraped from the HTML page; there is no API involved.
class Newgrounds < Source::Extractor
def match?
Source::URL::Newgrounds === parsed_url
end
def image_urls
if parsed_url.image_url?
[url]
else
urls = []
urls += page&.css(".image img").to_a.map { |img| img["src"] }
# Images embedded in the author comments are lazy-loaded; prefer the
# data-smartload-src attribute when present.
urls += page&.css("#author_comments img[data-user-image='1']").to_a.map { |img| img["data-smartload-src"] || img["src"] }
urls.compact
end
end
def page_url
return nil if illust_title.blank? || user_name.blank?
"https://www.newgrounds.com/art/view/#{user_name}/#{illust_title}"
end
def page
return nil if page_url.blank?
response = http.cache(1.minute).get(page_url)
return nil if response.status == 404
response.parse
end
memoize :page
def tags
page&.css("#sidestats .tags a").to_a.map do |tag|
[tag.text, "https://www.newgrounds.com/search/conduct/art?match=tags&tags=" + tag.text]
end
end
# Newgrounds tags use hyphens where Danbooru tags use underscores.
def normalize_tag(tag)
tag = tag.tr("-", "_")
super(tag)
end
def artist_name
name = page&.css(".item-user .item-details h4 a")&.text&.strip || user_name
name&.downcase
end
def other_names
[artist_name, user_name].compact.uniq
end
def profile_url
# user names are not mutable, artist names are.
# However we need the latest name for normalization
"https://#{artist_name}.newgrounds.com"
end
def artist_commentary_title
page&.css(".pod-head > [itemprop='name']")&.text
end
def artist_commentary_desc
page&.css("#author_comments")&.to_html
end
def dtext_artist_commentary_desc
DText.from_html(artist_commentary_desc)
end
def user_name
parsed_url.username || parsed_referer&.username
end
def illust_title
parsed_url.work_title || parsed_referer&.work_title
end
end
end
end

View File

@@ -0,0 +1,111 @@
# frozen_string_literal: true
# @see Source::URL::NicoSeiga
module Source
class Extractor
# Extractor for Nico Seiga (seiga.nicovideo.jp) illustrations and manga.
# Metadata comes from NicoSeigaApiClient; full image URLs are discovered by
# following the image/source redirect.
class NicoSeiga < Source::Extractor
# Requires a logged-in user session to access image sources.
def self.enabled?
Danbooru.config.nico_seiga_user_session.present?
end
def match?
Source::URL::NicoSeiga === parsed_url
end
def image_urls
if image_id.present?
[image_url_for("https://seiga.nicovideo.jp/image/source/#{image_id}")]
elsif illust_id.present?
[image_url_for("https://seiga.nicovideo.jp/image/source/#{illust_id}")]
elsif manga_id.present? && api_client.image_ids.present?
api_client.image_ids.map { |id| image_url_for("https://seiga.nicovideo.jp/image/source/#{id}") }
else
[image_url_for(url)]
end
end
# Follows the image/source redirect and rewrites it to the lohas.nicoseiga.jp
# /priv/ URL; returns the input unchanged when the redirect doesn't match.
def image_url_for(url)
return url if api_client.blank?
resp = api_client.head(url)
if resp.uri.to_s =~ %r{https?://.+/(\w+/\d+/\d+)\z}i
"https://lohas.nicoseiga.jp/priv/#{$1}"
else
url
end
end
def page_url
parsed_referer&.page_url || parsed_url.page_url
end
def profile_url
"https://seiga.nicovideo.jp/user/illust/#{api_client.user_id}" if api_client&.user_id.present?
end
def artist_name
return if api_client.blank?
api_client.user_name
end
def artist_commentary_title
return if api_client.blank?
api_client.title
end
def artist_commentary_desc
return if api_client.blank?
api_client.description
end
def dtext_artist_commentary_desc
DText.from_html(artist_commentary_desc) do |element|
# White-on-white text is the site's spoiler convention.
if element.name == "font" && element["color"] == "white"
element.content = "[spoiler]#{element.content}[/spoiler]"
end
end.gsub(/[^\w]im(\d+)/, ' seiga #\1 ').chomp
end
def tag_name
return if api_client&.user_id.blank?
"nicoseiga#{api_client.user_id}"
end
def tags
return [] if api_client.blank?
base_url = "https://seiga.nicovideo.jp/"
base_url += "manga/" if manga_id.present?
base_url += "tag/"
api_client.tags.map do |name|
[name, base_url + CGI.escape(name)]
end
end
def image_id
parsed_url.image_id || parsed_referer&.image_id
end
def illust_id
parsed_url.illust_id || parsed_referer&.illust_id
end
def manga_id
parsed_url.manga_id || parsed_referer&.manga_id
end
def api_client
if illust_id.present?
NicoSeigaApiClient.new(work_id: illust_id, type: "illust", http: http)
elsif manga_id.present?
NicoSeigaApiClient.new(work_id: manga_id, type: "manga", http: http)
elsif image_id.present?
# We default to illust to attempt getting the api anyway
NicoSeigaApiClient.new(work_id: image_id, type: "illust", http: http)
end
end
memoize :api_client
end
end
end

View File

@@ -0,0 +1,174 @@
# frozen_string_literal: true
# @see Source::URL::Nijie
module Source
class Extractor
# Extractor for Nijie (nijie.info). Requires a logged-in session: a session
# cookie is obtained by POSTing the configured credentials and cached, and
# all page fetches use that cookie. Doujin-style posts use different page
# markup than regular illustrations, hence the doujin? branching.
class Nijie < Source::Extractor
def self.enabled?
Danbooru.config.nijie_login.present? && Danbooru.config.nijie_password.present?
end
def match?
Source::URL::Nijie === parsed_url
end
def image_urls
if parsed_url.image_url?
[parsed_url.full_image_url]
else
image_urls_from_page
end
end
def image_urls_from_page
if doujin?
images = page&.search("#dojin_left .left img").to_a.pluck("src")
images += page&.search("#dojin_diff img.mozamoza").to_a.pluck("data-original")
else
images = page&.search("div#gallery a > .mozamoza").to_a.pluck("src")
end
# Page image srcs are protocol-relative (//...); prepend https.
images.map { |img| Source::URL.parse("https:#{img}").full_image_url }
end
def page_url
return nil if illust_id.blank?
"https://nijie.info/view.php?id=#{illust_id}"
end
def profile_url
return nil if artist_id.blank?
"https://nijie.info/members.php?id=#{artist_id}"
end
def artist_name
if doujin?
page&.at("#dojin_left .right a[href*='members.php?id=']")&.text
else
page&.at("a.name")&.text
end
end
def artist_commentary_title
if doujin?
page&.search("#dojin_text p.title")&.text
else
page&.search("h2.illust_title")&.text
end
end
def artist_commentary_desc
if doujin?
page&.search("#dojin_text p:not(.title)")&.to_html
else
page&.search('#illust_text > p')&.to_html
end
end
def tags
links = page&.search("div#view-tag a") || []
search_links = links.select do |node|
node["href"] =~ /search(?:_dojin)?\.php/
end
search_links.map do |node|
[node.inner_text, "https://nijie.info" + node.attr("href")]
end
end
def tag_name
"nijie" + artist_id.to_s
end
# Converts Nijie HTML commentary to DText, unwrapping /jump.php redirect links.
def self.to_dtext(text)
text = text.to_s.gsub(/\r\n|\r/, "<br>")
dtext = DText.from_html(text) do |element|
if element.name == "a" && element["href"]&.start_with?("/jump.php")
element["href"] = element.text
end
end
dtext.strip
end
def illust_id
parsed_url.work_id || parsed_referer&.work_id
end
def artist_id_from_url
parsed_url.user_id || parsed_referer&.user_id
end
def artist_id_from_page
page&.search("a.name")&.first&.attr("href")&.match(/members\.php\?id=(\d+)/) { $1.to_i }
end
def artist_id
artist_id_from_url || artist_id_from_page
end
def doujin?
page&.at("#dojin_left").present?
end
# Fetches the post page. If the fetch fails or the login wall is shown, the
# cached session cookie is invalidated and nil is effectively returned
# (the next call will attempt a fresh login).
def page
return nil if page_url.blank? || client.blank?
response = client.cache(1.minute).get(page_url)
if response.status != 200 || response.parse.search("#login_illust").present?
clear_cached_session_cookie!
else
response.parse
end
end
memoize :page
def client
return nil if cached_session_cookie.nil?
http.cookies(R18: 1, **cached_session_cookie)
end
# Nijie is slow and flaky; use a generous timeout and many retries.
def http
super.timeout(60).use(retriable: { max_retries: 20 })
end
# { "NIJIEIJIEID" => "5ca3f816c0c1f3e647940b08b8ab7a45", "nijie_tok" => <long-base64-string> }
def cached_session_cookie
Cache.get("nijie-session-cookie", 60.minutes, skip_nil: true) do
session_cookie
end
end
def clear_cached_session_cookie!
flush_cache # clear memoized session cookie
Cache.delete("nijie-session-cookie")
end
# Logs in with the configured credentials and returns the session cookies
# as a hash, or nil when login fails.
def session_cookie
login_page = http.get("https://nijie.info/login.php").parse
form = {
email: Danbooru.config.nijie_login,
password: Danbooru.config.nijie_password,
url: login_page.at("input[name='url']")&.fetch("value"),
save: "on",
ticket: ""
}
response = http.post("https://nijie.info/login_int.php", form: form)
if response.status == 200
response.cookies.cookies.map { |cookie| [cookie.name, cookie.value] }.to_h
else
DanbooruLogger.info "Nijie login failed (#{url}, #{response.status})"
nil
end
end
memoize :client, :cached_session_cookie
end
end
end

View File

@@ -0,0 +1,19 @@
# frozen_string_literal: true
module Source
  class Extractor
    # Fallback extractor used when no site-specific extractor matches the URL.
    class Null < Source::Extractor
      # No page URL can be derived from an unrecognized source.
      def page_url
        nil
      end

      # The source URL itself is the only known image URL.
      def image_urls
        [url]
      end

      # Look the artist up by matching the raw URL against artist URL records.
      def artists
        ArtistFinder.find_artists(url)
      end
    end
  end
end

View File

@@ -0,0 +1,165 @@
# frozen_string_literal: true
# @see Source::URL::Pixiv
module Source
  class Extractor
    # Extractor for Pixiv artworks. All metadata comes from Pixiv's ajax API,
    # which requires a logged-in session cookie (PHPSESSID) to be configured.
    class Pixiv < Source::Extractor
      # Only usable when a Pixiv session cookie is configured.
      def self.enabled?
        Danbooru.config.pixiv_phpsessid.present?
      end

      # Convert Pixiv commentary HTML to DText. Rewrites Pixiv's inline
      # illust/NNN and user/NNN shorthand links into DText links, and unwraps
      # their /jump.php redirector around external links.
      def self.to_dtext(text)
        return nil if text.nil?

        text = text.gsub(%r{<a href="https?://www\.pixiv\.net/en/artworks/([0-9]+)">illust/[0-9]+</a>}i) do |_match|
          pixiv_id = $1
          %(pixiv ##{pixiv_id} "»":[#{Routes.posts_path(tags: "pixiv:#{pixiv_id}")}])
        end

        text = text.gsub(%r{<a href="https?://www\.pixiv\.net/en/users/([0-9]+)">user/[0-9]+</a>}i) do |_match|
          member_id = $1
          profile_url = "https://www.pixiv.net/users/#{member_id}"
          artist_search_url = Routes.artists_path(search: { url_matches: profile_url })
          %("user/#{member_id}":[#{profile_url}] "»":[#{artist_search_url}])
        end

        DText.from_html(text) do |element|
          # /jump.php?<escaped-url> wraps external links; replace with the
          # decoded destination (the query string is the URL itself).
          if element.name == "a" && element["href"].match?(%r!\A/jump\.php\?!)
            element["href"] = Addressable::URI.heuristic_parse(element["href"]).normalized_query
          end
        end
      end

      def match?
        Source::URL::Pixiv === parsed_url
      end

      # For ugoira works, the original zip; for a direct image URL with a
      # known page number, just that page; otherwise every page of the work.
      def image_urls
        if is_ugoira?
          [api_ugoira[:originalSrc]]
        elsif parsed_url.image_url? && parsed_url.page && original_urls.present?
          [original_urls[parsed_url.page]]
        elsif parsed_url.image_url?
          [parsed_url.to_s]
        else
          original_urls
        end
      end

      # Original-size URLs of every page of the work, from the ajax API.
      def original_urls
        api_pages.pluck("urls").pluck("original").to_a
      end

      def page_url
        return nil if illust_id.blank?
        "https://www.pixiv.net/artworks/#{illust_id}"
      end

      def profile_url
        if api_illust[:userId].present?
          "https://www.pixiv.net/users/#{api_illust[:userId]}"
        elsif parsed_url.profile_url.present?
          parsed_url.profile_url
        end
      end

      # The artist's stacc URL, based on their account moniker.
      def stacc_url
        return nil if moniker.blank?
        "https://www.pixiv.net/stacc/#{moniker}"
      end

      def profile_urls
        [profile_url, stacc_url].compact
      end

      def artist_name
        api_illust[:userName]
      end

      # Display name plus the account moniker, excluding Pixiv's default
      # auto-generated "user_XXXX" monikers.
      def other_names
        other_names = [artist_name]
        other_names << moniker unless moniker&.starts_with?("user_")
        other_names.compact.uniq
      end

      def artist_commentary_title
        api_illust[:title]
      end

      def artist_commentary_desc
        api_illust[:description]
      end

      def tag_name
        moniker
      end

      def tags
        api_illust.dig(:tags, :tags).to_a.map do |item|
          tag = item[:tag]
          [tag, "https://www.pixiv.net/search.php?s_mode=s_tag_full&#{{word: tag}.to_param}"]
        end
      end

      # Strip "<N>users入り" (bookmark-count) suffixes before translation.
      def normalize_tag(tag)
        tag.gsub(/\d+users入り\z/i, "")
      end

      # Attach the ugoira frame metadata to the downloaded file so it can be
      # processed later.
      def download_file!(url)
        file = super(url)
        file.frame_data = ugoira_frame_data if is_ugoira?
        file
      end

      # Pixiv tags are sometimes slash-separated pairs; if the whole tag has
      # no translation, try translating each slash-separated part.
      def translate_tag(tag)
        translated_tags = super(tag)

        if translated_tags.empty? && tag.include?("/")
          translated_tags = tag.split("/").flat_map { |translated_tag| super(translated_tag) }
        end

        translated_tags
      end

      def related_posts_search_query
        illust_id.present? ? "pixiv:#{illust_id}" : "source:#{url}"
      end

      def is_ugoira?
        original_urls.any? { |url| Source::URL.parse(url).is_ugoira? }
      end

      def illust_id
        parsed_url.work_id || parsed_referer&.work_id
      end

      def api_client
        PixivAjaxClient.new(Danbooru.config.pixiv_phpsessid, http: http)
      end

      def api_illust
        api_client.illust(illust_id)
      end

      def api_pages
        api_client.pages(illust_id)
      end

      def api_ugoira
        api_client.ugoira_meta(illust_id)
      end

      # The artist's account name, from the URL when present, otherwise from
      # the API.
      def moniker
        parsed_url.username || api_illust[:userAccount]
      end

      def ugoira_frame_data
        api_ugoira[:frames]
      end

      memoize :illust_id, :api_client, :api_illust, :api_pages, :api_ugoira
    end
  end
end

View File

@@ -0,0 +1,82 @@
# frozen_string_literal: true
# @see Source::URL::PixivSketch
module Source
  class Extractor
    # Extractor for Pixiv Sketch posts, based on the site's JSON API.
    class PixivSketch < Source::Extractor
      def match?
        Source::URL::PixivSketch === parsed_url
      end

      def image_urls
        if parsed_url.image_url?
          [parsed_url.full_image_url]
        else
          image_urls_from_api
        end
      end

      # Full-size ("url2x") URLs of every image attached to the post.
      def image_urls_from_api
        api_response.dig("data", "media").to_a.pluck("photo").pluck("original").pluck("url2x")
      end

      def profile_url
        "https://sketch.pixiv.net/@#{artist_name}" if artist_name.present?
      end

      # The Sketch account name (unique_name).
      def artist_name
        api_response.dig("data", "user", "unique_name")
      end

      def other_names
        [artist_name, display_name].compact
      end

      # Includes the linked main Pixiv profile when known.
      def profile_urls
        [profile_url, pixiv_profile_url].compact
      end

      def artist_commentary_desc
        api_response.dig("data", "text")
      end

      def tags
        api_response.dig("data", "tags").to_a.map do |tag|
          [tag, "https://sketch.pixiv.net/tags/#{tag}"]
        end
      end

      def display_name
        api_response.dig("data", "user", "name")
      end

      # The artist's main Pixiv (not Sketch) profile, if linked.
      def pixiv_profile_url
        "https://www.pixiv.net/users/#{pixiv_user_id}" if pixiv_user_id.present?
      end

      def pixiv_user_id
        api_response.dig("data", "user", "pixiv_user_id")
      end

      # curl https://sketch.pixiv.net/api/items/5835314698645024323.json | jq
      def api_response
        return {} if api_url.blank?

        response = http.cache(1.minute).get(api_url)
        return {} if response.status == 404
        response.parse
      end

      def page_url
        parsed_url.page_url || parsed_referer&.page_url
      end

      def api_url
        parsed_url.api_url || parsed_referer&.api_url
      end

      memoize :api_response
    end
  end
end

View File

@@ -0,0 +1,116 @@
# frozen_string_literal: true
# @see Source::URL::Plurk
module Source
  class Extractor
    # Extractor for Plurk posts. Works are scraped from the HTML page; adult
    # posts additionally require an API call to get images in the replies.
    class Plurk < Source::Extractor
      def match?
        Source::URL::Plurk === parsed_url
      end

      def image_urls
        # * Posts can have up to 10 images.
        # * Artists commonly post extra images by replying to their own post.
        # * Adult posts are hidden for logged out users. The main images can be found by
        #   scraping a <script> tag, but an API call is needed to get the images in the replies.
        #
        # Examples:
        # * https://www.plurk.com/p/om6zv4 (non-adult, single image)
        # * https://www.plurk.com/p/okxzae (non-adult, multiple images, with replies)
        # * https://www.plurk.com/p/omc64y (adult, multiple images, with replies)
        if parsed_url.image_url?
          [url]
        elsif page_json["porn"]
          # in case of adult posts, we get the main images and the replies separately
          images_from_script_tag + images_from_replies
        else
          images_from_page
        end
      end

      def page_url
        return nil if illust_id.blank?
        "https://plurk.com/p/#{illust_id}"
      end

      # The base-36 post ID from the URL.
      def illust_id
        parsed_url.work_id || parsed_referer&.work_id
      end

      # Fetch and parse the post page; nil on failure.
      def page
        return nil if page_url.blank?

        response = http.cache(1.minute).get(page_url)
        return nil unless response.status == 200
        response.parse
      end

      # For non-adult works, returns both the main images and the images posted by the artist in the replies.
      # For adult works, returns only the main images.
      def images_from_page
        page&.search(".bigplurk .content a img, .response.highlight_owner .content a img").to_a.pluck("alt")
      end

      # Returns only the main images, not the images posted in the replies. Used for adult works.
      def images_from_script_tag
        URI.extract(page_json["content_raw"])
      end

      # Returns images posted by the artist in the replies. Used for adult works.
      def images_from_replies
        artist_responses = api_replies["responses"].to_a.select { _1["user_id"].to_i == artist_id.to_i }
        urls = artist_responses.pluck("content_raw").flat_map { URI.extract(_1) }
        urls.select { Source::URL.parse(_1)&.image_url? }.uniq
      end

      # The `plurk = {...}` object embedded in a <script> tag on the page.
      # `new Date(...)` wrappers are reduced to their argument so the blob
      # parses as plain JSON. Returns {} when not found.
      def page_json
        script_text = page&.search("body script").to_a.map(&:text).grep(/plurk =/).first.to_s
        json = script_text.strip.delete_prefix("plurk = ").delete_suffix(";").gsub(/new Date\((.*?)\)/) { $1 }
        return {} if json.blank?
        JSON.parse(json)
      end

      # Replies to the post, from the API. The API takes the post ID as a
      # decimal integer, so the base-36 ID from the URL is converted first.
      def api_replies
        return {} if illust_id.blank?

        response = http.cache(1.minute).post("https://www.plurk.com/Responses/get", form: { plurk_id: illust_id.to_i(36), from_response_id: 0 })
        return {} unless response.status == 200
        response.parse
      end

      # The artist's account name, taken from the href of their profile link.
      def tag_name
        page&.at(".bigplurk .user a")&.[](:href)&.gsub(%r{^/}, "")
      end

      # The artist's display name, taken from the text of their profile link.
      def artist_name
        page&.at(".bigplurk .user a")&.text
      end

      def artist_id
        page&.at("a[data-uid]")&.attr("data-uid").to_i
      end

      def profile_url
        # Fix: guard on tag_name, the value actually interpolated into the
        # URL. Previously this checked artist_name, so a profile link with
        # text but no usable href produced "https://www.plurk.com/".
        return nil if tag_name.blank?
        "https://www.plurk.com/#{tag_name}"
      end

      def artist_commentary_desc
        page&.search(".bigplurk .content .text_holder, .response.highlight_owner .content .text_holder")&.to_html
      end

      def dtext_artist_commentary_desc
        DText.from_html(artist_commentary_desc) do |element|
          # Strip the text of <a> tags, keeping only the link itself.
          if element.name == "a"
            element.content = ""
          end
        end.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")
      end

      memoize :page, :page_json, :api_replies
    end
  end
end

View File

@@ -0,0 +1,112 @@
# frozen_string_literal: true
# @see Source::URL::Skeb
module Source
  class Extractor
    # Extractor for Skeb commissions, based on Skeb's JSON API.
    class Skeb < Extractor
      def match?
        Source::URL::Skeb === parsed_url
      end

      def image_urls
        if parsed_url.image_url?
          [url]
        elsif unwatermarked_url.present?
          # If the unwatermarked URL is present, then find and replace the watermarked URL
          # with the unwatermarked version (unless the watermarked version is a video or
          # gif, in which case the unwatermarked URL is not used because it's a still image).
          #
          # https://skeb.jp/@goma_feet/works/1: https://skeb.imgix.net/uploads/origins/78ca23dc-a053-4ebe-894f-d5a06e228af8?bg=%23fff&auto=format&w=800&s=3de55b04236059113659f99fd6900d7d
          # https://skeb.jp/@2gi0gi_/works/13: https://skeb.imgix.net/requests/191942_0?bg=%23fff&fm=jpg&q=45&w=696&s=5783ee951cc55d183713395926389453
          # https://skeb.jp/@tontaro_/works/316: https://skeb.imgix.net/uploads/origins/5097b1e1-18ce-418e-82f0-e7e2cdab1cea?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&fm=mp4&w=800&s=fcff06871e114b3dbf505c04f27b5ed1
          sample_urls.map do |sample_url|
            if sample_url.path == unwatermarked_url.path && sample_url.watermarked? && !sample_url.animated?
              unwatermarked_url
            else
              sample_url
            end
          end.map(&:to_s)
        else
          sample_urls.map(&:to_s)
        end
      end

      # The preview URLs from the API, parsed into Source::URL objects.
      def sample_urls
        api_response["previews"].to_a.pluck("url").compact.map { |url| Source::URL.parse(url) }
      end

      # Some posts have an unwatermarked version of the image. Usually it's lower
      # resolution and lower JPEG quality than the watermarked image. Multi-image posts
      # will have only one unwatermarked URL.
      def unwatermarked_url
        return nil if api_response["article_image_url"].nil?
        Source::URL.parse(api_response["article_image_url"])
      end

      def page_url
        return unless artist_name.present? && illust_id.present?
        "https://skeb.jp/@#{artist_name}/works/#{illust_id}"
      end

      def api_url
        return nil unless artist_name.present? && illust_id.present?
        "https://skeb.jp/api/users/#{artist_name}/works/#{illust_id}"
      end

      # Fetch the work from Skeb's API; returns {} on failure.
      # NOTE(review): the literal "Bearer null" token appears deliberate
      # (anonymous access) — confirm against the API before changing.
      def api_response
        return {} unless api_url.present?

        headers = {
          Referer: profile_url,
          Authorization: "Bearer null",
        }
        response = http.cache(1.minute).headers(headers).get(api_url)
        return {} unless response.status == 200
        # The status check is required for private commissions, which return 404
        response.parse
      end

      def profile_url
        return nil if artist_name.blank?
        "https://skeb.jp/@#{artist_name}"
      end

      # The account name from the URL (the "@name" part).
      def artist_name
        parsed_url.username || parsed_referer&.username
      end

      def display_name
        api_response&.dig("creator", "name")
      end

      def illust_id
        parsed_url.work_id || parsed_referer&.work_id
      end

      def other_names
        [display_name].compact.uniq
      end

      def artist_commentary_desc
        api_response&.dig("source_body") || api_response&.dig("body")
        # skeb "titles" are not needed: it's just the first few characters of the description
      end

      # The "thanks" message attached to the commission (rendered below as
      # "Client Response").
      def client_response
        api_response&.dig("source_thanks") || api_response&.dig("thanks")
      end

      def dtext_artist_commentary_desc
        if client_response.present? && artist_commentary_desc.present?
          "h6. Original Request:\n\n#{artist_commentary_desc}\n\nh6. Client Response:\n\n#{client_response}"
        else
          artist_commentary_desc
        end
      end

      memoize :api_response
    end
  end
end

View File

@@ -0,0 +1,113 @@
# frozen_string_literal: true
# @see Source::URL::Tinami
module Source
  class Extractor
    # Extractor for Tinami works. Pages are scraped directly; full-size image
    # URLs require a POST back to the page with its anti-CSRF token.
    class Tinami < Source::Extractor
      def match?
        Source::URL::Tinami === parsed_url
      end

      def image_urls
        if parsed_url.image_url?
          [url]
        # http://www.tinami.com/view/1087268 (single image)
        elsif page&.css("img.captify")&.size.to_i == 1
          [full_image_url].compact
        # http://www.tinami.com/view/1087270 (multiple images)
        elsif image_sub_ids.present?
          image_sub_ids.map { |sub_id| full_image_url(sub_id) }.compact
        # http://www.tinami.com/view/1087271 (multiple images)
        elsif nv_body_image_urls.present?
          nv_body_image_urls
        # http://www.tinami.com/view/1087267 (no images, text only)
        else
          []
        end
      end

      # Images embedded directly in the page body (protocol-relative srcs).
      def nv_body_image_urls
        page&.css(".viewbody .nv_body img").to_a.map do |img|
          "https:#{img[:src]}" # img[:src] == "//img.tinami.com/illust2/img/619/6234b647da609.jpg"
        end
      end

      # The sub_ids of each image in a multi-image post.
      def image_sub_ids
        page&.css(".viewbody #controller_model .thumbnail_list").to_a.map { |td| td.attr("sub_id") }
      end

      def page_url
        parsed_url.page_url || parsed_referer&.page_url
      end

      def tags
        page&.css("#view .tag a[href^='/search/list']").to_a.map do |tag|
          [tag.text, "https://www.tinami.com/search/list?keyword=#{CGI.escape(tag.text)}"]
        end
      end

      def profile_url
        "https://www.tinami.com/creator/profile/#{user_id}" if user_id.present?
      end

      def tag_name
        nil
      end

      def artist_name
        page&.at("#view .prof > p > a > strong")&.text
      end

      def artist_commentary_title
        page&.at("#view .viewdata h1")&.text.to_s.strip
      end

      def artist_commentary_desc
        page&.at("#view .comment .description")&.text.to_s.strip.delete("\t")
      end

      # The creator ID, scraped from the profile link on the page.
      def user_id
        url = page&.at("#view .prof > p > a")&.attr("href")&.prepend("https://www.tinami.com")
        Source::URL.parse(url)&.user_id
      end

      def work_id
        parsed_url.work_id || parsed_referer&.work_id
      end

      # The anti-CSRF token from the "view original" form on the page.
      def ethna_csrf
        page&.at("#open_original_content input[name=ethna_csrf]")&.attr("value")
      end

      # Request the full-size URL for the given sub image (or the main image
      # when sub_id is nil); nil on failure.
      def full_image_url(sub_id = nil)
        return nil unless work_id.present? && ethna_csrf.present?

        # Note that we have to spoof the Referer here.
        response = http.post(page_url, form: { action_view_original: true, cont_id: work_id, sub_id: sub_id, ethna_csrf: ethna_csrf })
        return nil unless response.status == 200
        response.parse.at("body > div > a > img[src^='//img.tinami.com']")&.attr("src")&.prepend("https:")
      end

      # Fetch and parse the work page; nil on failure.
      def page
        return nil if page_url.blank?

        response = http.cache(1.minute).get(page_url)
        return nil unless response.status == 200
        response.parse
      end

      # All requests carry the configured session cookie and a spoofed Referer.
      def http
        super.cookies(Tinami2SESSID: Danbooru.config.tinami_session_id).use(:spoof_referrer)
      end

      memoize :page, :user_id, :work_id, :ethna_csrf, :image_urls, :image_sub_ids, :nv_body_image_urls
    end
  end
end

View File

@@ -0,0 +1,145 @@
# frozen_string_literal: true
# @see Source::URL::Tumblr
class Source::Extractor
  # Extractor for Tumblr posts, based on the Tumblr v2 API (plus scraping to
  # find the largest image variants).
  class Tumblr < Source::Extractor
    # Only usable when a Tumblr API key is configured.
    def self.enabled?
      Danbooru.config.tumblr_consumer_key.present?
    end

    def match?
      Source::URL::Tumblr === parsed_url
    end

    # The largest available version of every image or video in the post,
    # including images embedded inline in the commentary.
    def image_urls
      return [find_largest(parsed_url)].compact if parsed_url.asset_url?

      assets = []

      case post[:type]
      when "photo"
        assets += post[:photos].map do |photo|
          # Pick the largest size by pixel area.
          sizes = [photo[:original_size]] + photo[:alt_sizes]
          biggest = sizes.max_by { |x| x[:width] * x[:height] }
          biggest[:url]
        end
      when "video"
        assets += [post[:video_url]]
      end

      assets += inline_images
      assets.map { |url| find_largest(url) }
    end

    def page_url
      parsed_url.page_url || parsed_referer&.page_url || post_url_from_image_html&.page_url
    end

    def profile_url
      parsed_url.profile_url || parsed_referer&.profile_url || post_url_from_image_html&.profile_url
    end

    def artist_commentary_title
      case post[:type]
      when "text", "link"
        post[:title]
      when "answer"
        "#{post[:asking_name]} asked: #{post[:question]}"
      else
        nil
      end
    end

    def artist_commentary_desc
      case post[:type]
      when "text"
        post[:body]
      when "link"
        post[:description]
      when "photo", "video"
        post[:caption]
      when "answer"
        post[:answer]
      else
        nil
      end
    end

    def tags
      post[:tags].to_a.map do |tag|
        [tag, "https://tumblr.com/tagged/#{CGI.escape(tag)}"]
      end.uniq
    end

    # Convert dashes to underscores before the default normalization.
    def normalize_tag(tag)
      tag = tag.tr("-", "_")
      super(tag)
    end

    def dtext_artist_commentary_desc
      DText.from_html(artist_commentary_desc).strip
    end

    # Find the largest available version of the given image URL.
    def find_largest(image_url)
      parsed_image = Source::URL.parse(image_url)

      if parsed_image.full_image_url.present?
        image_url_html(parsed_image.full_image_url)&.at("img[src*='/#{parsed_image.directory}/']")&.[](:src)
      elsif parsed_image.variants.present?
        # Look for the biggest available version on media.tumblr.com. A bigger
        # version may or may not exist.
        parsed_image.variants.find { |variant| http_exists?(variant) }
      else
        parsed_image.original_url
      end
    end

    # Recover the post URL from a bare image URL by scraping the image page
    # for a /post/ link.
    def post_url_from_image_html
      extracted = image_url_html(parsed_url)&.at("[href*='/post/']")&.[](:href)
      Source::URL.parse(extracted)
    end

    # Fetch an image URL as HTML (rather than the image itself); nil on failure.
    def image_url_html(image_url)
      resp = http.cache(1.minute).headers(accept: "text/html").get(image_url)

      return nil if resp.code != 200
      resp.parse
    end

    # Image URLs embedded in the commentary HTML.
    def inline_images
      html = Nokogiri::HTML5.fragment(artist_commentary_desc)
      html.css("img").map { |node| node["src"] }
    end

    def artist_name
      parsed_url.blog_name || parsed_referer&.blog_name || post_url_from_image_html&.blog_name
    end

    def work_id
      parsed_url.work_id || parsed_referer&.work_id || post_url_from_image_html&.work_id
    end

    # Fetch the post from the Tumblr API. Returns {} when the extractor is
    # disabled, the post can't be identified, or the request fails.
    def api_response
      return {} unless self.class.enabled?
      return {} unless artist_name.present? && work_id.present?

      response = http.cache(1.minute).get(
        "https://api.tumblr.com/v2/blog/#{artist_name}/posts",
        params: { id: work_id, api_key: Danbooru.config.tumblr_consumer_key }
      )

      return {} if response.code != 200
      response.parse.with_indifferent_access
    end
    memoize :api_response

    # The post object from the API response, or {} when unavailable.
    def post
      api_response.dig(:response, :posts)&.first || {}
    end
  end
end

View File

@@ -0,0 +1,150 @@
# frozen_string_literal: true
# @see Source::URL::Twitter
class Source::Extractor
  # Extractor for tweets, based on the Twitter API.
  class Twitter < Source::Extractor
    # List of hashtag suffixes attached to tag other names
    # Ex: 西住みほ生誕祭2019 should be checked as 西住みほ
    # The regexes will not match if there is nothing preceding
    # the pattern to avoid creating empty strings.
    COMMON_TAG_REGEXES = [
      /(?<!\A)生誕祭(?:\d*)\z/,
      /(?<!\A)誕生祭(?:\d*)\z/,
      /(?<!\A)版もうひとつの深夜の真剣お絵描き60分一本勝負(?:_\d+)?\z/,
      /(?<!\A)版深夜の真剣お絵描き60分一本勝負(?:_\d+)?\z/,
      /(?<!\A)版深夜の真剣お絵かき60分一本勝負(?:_\d+)?\z/,
      /(?<!\A)深夜の真剣お絵描き60分一本勝負(?:_\d+)?\z/,
      /(?<!\A)版深夜のお絵描き60分一本勝負(?:_\d+)?\z/,
      /(?<!\A)版真剣お絵描き60分一本勝(?:_\d+)?\z/,
      /(?<!\A)版お絵描き60分一本勝負(?:_\d+)?\z/
    ]

    # Only usable when Twitter API credentials are configured.
    def self.enabled?
      Danbooru.config.twitter_api_key.present? && Danbooru.config.twitter_api_secret.present?
    end

    def match?
      Source::URL::Twitter === parsed_url
    end

    # Full-size photo URLs (":orig") and the highest-bitrate MP4 for videos
    # and animated gifs.
    def image_urls
      # https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb.jpg:orig
      if parsed_url.image_url?
        [parsed_url.full_image_url]
      elsif api_response.present?
        api_response.dig(:extended_entities, :media).to_a.map do |media|
          if media[:type] == "photo"
            media[:media_url_https] + ":orig"
          elsif media[:type].in?(["video", "animated_gif"])
            variants = media.dig(:video_info, :variants)
            videos = variants.select { |variant| variant[:content_type] == "video/mp4" }
            video = videos.max_by { |v| v[:bitrate].to_i }
            video[:url]
          end
        end
      else
        []
      end
    end

    def page_url
      return nil if status_id.blank? || tag_name.blank?
      "https://twitter.com/#{tag_name}/status/#{status_id}"
    end

    def profile_url
      return nil if tag_name.blank?
      "https://twitter.com/#{tag_name}"
    end

    # A profile URL keyed by the numeric user ID rather than the screen name.
    def intent_url
      user_id = api_response.dig(:user, :id_str)
      return nil if user_id.blank?
      "https://twitter.com/intent/user?user_id=#{user_id}"
    end

    def profile_urls
      [profile_url, intent_url].compact
    end

    # The @screen_name, from the URL when available, otherwise from the API.
    def tag_name
      if tag_name_from_url.present?
        tag_name_from_url
      elsif api_response.present?
        api_response.dig(:user, :screen_name)
      else
        ""
      end
    end

    def artist_name
      if api_response.present?
        api_response.dig(:user, :name)
      else
        tag_name
      end
    end

    def artist_commentary_title
      ""
    end

    def artist_commentary_desc
      api_response[:full_text].to_s
    end

    def tags
      api_response.dig(:entities, :hashtags).to_a.map do |hashtag|
        [hashtag[:text], "https://twitter.com/hashtag/#{hashtag[:text]}"]
      end
    end

    # Strip the first matching COMMON_TAG_REGEXES suffix from the tag.
    def normalize_tag(tag)
      COMMON_TAG_REGEXES.each do |rg|
        norm_tag = tag.gsub(rg, "")
        if norm_tag != tag
          return norm_tag
        end
      end

      tag
    end

    # Expand t.co links (dropping media links entirely) and convert #hashtags
    # and @mentions to DText links.
    def dtext_artist_commentary_desc
      return "" if artist_commentary_desc.blank?

      url_replacements = api_response.dig(:entities, :urls).to_a.map do |obj|
        [obj[:url], obj[:expanded_url]]
      end

      url_replacements += api_response.dig(:extended_entities, :media).to_a.map do |obj|
        [obj[:url], ""]
      end

      url_replacements = url_replacements.to_h

      desc = artist_commentary_desc.unicode_normalize(:nfkc)
      desc = CGI.unescapeHTML(desc)
      desc = desc.gsub(%r{https?://t\.co/[a-zA-Z0-9]+}i, url_replacements)
      desc = desc.gsub(/#([^[:space:]]+)/, '"#\\1":[https://twitter.com/hashtag/\\1]')
      desc = desc.gsub(/@([a-zA-Z0-9_]+)/, '"@\\1":[https://twitter.com/\\1]')
      desc.strip
    end

    def api_client
      TwitterApiClient.new(Danbooru.config.twitter_api_key, Danbooru.config.twitter_api_secret)
    end

    # The tweet object from the API, or {} when disabled or unidentifiable.
    def api_response
      return {} unless self.class.enabled? && status_id.present?
      api_client.status(status_id)
    end

    def status_id
      parsed_url.status_id || parsed_referer&.status_id
    end

    def tag_name_from_url
      parsed_url.username || parsed_referer&.username
    end

    memoize :api_response
  end
end

View File

@@ -0,0 +1,102 @@
# frozen_string_literal: true
# @see Source::URL::Weibo
module Source
  class Extractor
    # Extractor for Weibo posts. Metadata is scraped from the $render_data
    # JSON blob embedded in the mobile (m.weibo.cn) version of the page.
    class Weibo < Source::Extractor
      def match?
        Source::URL::Weibo === parsed_url
      end

      # Full-size image URLs, or the largest available video variant for
      # video posts. Falls back to the source URL itself when no API response
      # is available. Always returns an array.
      def image_urls
        if parsed_url.image_url?
          [parsed_url.full_image_url]
        elsif api_response.present?
          if api_response["pics"].present?
            api_response["pics"].pluck("url").map { |url| Source::URL.parse(url).full_image_url }
          elsif api_response.dig("page_info", "type") == "video"
            variants = api_response["page_info"]["media_info"].to_h.values + api_response["page_info"]["urls"].to_h.values
            largest_video = variants.max_by do |variant|
              # Variant URLs embed their resolution as e.g. "template=1280x720".
              if /template=(?<width>\d+)x(?<height>\d+)/ =~ variant.to_s
                width.to_i * height.to_i
              else
                0
              end
            end
            # compact: guard against an empty variant list (previously this
            # could return [nil]).
            [largest_video].compact
          else
            # Fix: a present API response with neither pics nor a video used
            # to fall through and return nil instead of an array.
            []
          end
        else
          [url]
        end
      end

      def page_url
        # Fix: use dig so a response without a "user" key returns nil instead
        # of raising NoMethodError.
        artist_id = api_response.dig("user", "id")
        illust_base62_id = api_response["bid"]
        return nil unless artist_id.present? && illust_base62_id.present?

        "https://www.weibo.com/#{artist_id}/#{illust_base62_id}"
      end

      def tags
        return [] if api_response.blank?

        # Hashtags appear in the post HTML as <span class="surl-text">#tag#</span>.
        matches = api_response["text"]&.scan(/surl-text">#(.*?)#</).to_a.map { |m| m[0] }
        matches.map do |match|
          [match, "https://s.weibo.com/weibo/#{match}"]
        end
      end

      def profile_urls
        [parsed_url.profile_url, parsed_referer&.profile_url].compact.uniq
      end

      def profile_url
        "https://www.weibo.com/u/#{artist_id}" if artist_id.present?
      end

      def artist_name
        api_response&.dig("user", "screen_name")
      end

      def artist_id
        parsed_url.artist_short_id || parsed_referer&.artist_short_id || api_response&.dig("user", "id")
      end

      def artist_commentary_desc
        return if api_response.blank?
        api_response["text"]
      end

      # Absolutize scheme- or host-relative hrefs and srcs in the commentary.
      def dtext_artist_commentary_desc
        DText.from_html(artist_commentary_desc) do |element|
          if element["href"].present?
            href = Addressable::URI.heuristic_parse(element["href"])
            href.site ||= "https://www.weibo.com"
            href.scheme ||= "https"
            element["href"] = href.to_s
          end

          if element["src"].present?
            src = Addressable::URI.heuristic_parse(element["src"])
            src.scheme ||= "https"
            element["src"] = src.to_s
          end
        end
      end

      # Fetch the mobile page and extract the embedded
      # `var $render_data = [...][0]` JSON blob. Returns the "status" object,
      # or {} when no mobile URL or blob is found.
      def api_response
        return {} if (mobile_url = parsed_url.mobile_url || parsed_referer&.mobile_url).blank?

        resp = http.cache(1.minute).get(mobile_url)
        json_string = resp.to_s[/var \$render_data = \[(.*)\]\[0\]/m, 1]
        return {} if json_string.blank?

        JSON.parse(json_string)["status"]
      end

      memoize :api_response
    end
  end
end