Split requests made by Danbooru::Http into either internal or external requests. Internal requests are API calls to internal services run by Danbooru. External requests are requests to external websites, for example fetching sources or downloading files. External requests may use an HTTP proxy if one is configured; internal requests never do. Fixes a few source extractors that weren't using the HTTP proxy for certain API calls.
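A minimal sketch of the split, for illustration only (the extractor code below uses `Danbooru::Http.external`; the `Danbooru::Http.internal` name and the example URLs are assumptions based on this description, not the actual implementation):

    # External request: fetching a source page or downloading a file from another site.
    # Uses the HTTP proxy if one is configured.
    Danbooru::Http.external.get("https://www.pixiv.net/artworks/12345")

    # Internal request: an API call to a service run by Danbooru itself.
    # Never routed through the proxy.
    Danbooru::Http.internal.get("https://internal-service.example/posts/12345/similar")
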
# frozen_string_literal: true

# A source extractor is used to extract information from a given source URL. It
# extracts all the images and videos from the URL, as well as metadata such as
# the tags, commentary, artist name, profile URL, and additional names and URLs
# for new artist entries.
#
# To add a new site, create a subclass of Source::Extractor and implement the following methods:
#
# * match? - True if the extractor should be used for this URL.
# * image_urls - The list of images or videos at this URL. Used during uploads.
# * page_url - The page containing the images. Used for post sources.
# * profile_url - The URL of the artist's profile page. Used for artist finding.
# * profile_urls - Extra profile URLs to add to the artist entry.
# * tag_name - The artist's login name. Used as the default name for new artist tags.
# * artist_name - The artist's display name. Used as an other name in new artist entries.
# * other_names - Extra names used in new artist entries.
# * tags - The artist's tags for the work. Used by translated tags.
# * artist_commentary_title - The artist's title of the work. Used for artist commentaries.
# * artist_commentary_desc - The artist's description of the work. Used for artist commentaries.
#
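# A minimal, hypothetical subclass sketch (the `Example` site, its URLs, and the
# method bodies below are illustrative only, not a real extractor in this file):
#
#   class Source::Extractor::Example < Source::Extractor
#     def match?
#       url.match?(%r{\Ahttps?://(www\.)?example\.com/}i)
#     end
#
#     def image_urls
#       ["https://cdn.example.com/images/12345.png"]
#     end
#
#     def page_url
#       "https://example.com/posts/12345"
#     end
#
#     def profile_url
#       "https://example.com/users/some-artist"
#     end
#
#     def artist_name
#       "some-artist"
#     end
#   end
#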
module Source
  class Extractor
    extend Memoist

    # The http timeout to download a file.
    DOWNLOAD_TIMEOUT = 60

    attr_reader :url, :referer_url, :parsed_url, :parsed_referer

    delegate :site_name, to: :parsed_url

    SUBCLASSES = [
      Source::Extractor::Pixiv,
      Source::Extractor::Twitter,
      Source::Extractor::Tumblr,
      Source::Extractor::NicoSeiga,
      Source::Extractor::DeviantArt,
      Source::Extractor::Moebooru,
      Source::Extractor::Nijie,
      Source::Extractor::ArtStation,
      Source::Extractor::Gelbooru,
      Source::Extractor::HentaiFoundry,
      Source::Extractor::Fanbox,
      Source::Extractor::Mastodon,
      Source::Extractor::PixivSketch,
      Source::Extractor::Weibo,
      Source::Extractor::Newgrounds,
      Source::Extractor::Skeb,
      Source::Extractor::Lofter,
      Source::Extractor::Foundation,
      Source::Extractor::Plurk,
      Source::Extractor::Tinami,
      Source::Extractor::Fantia,
      Source::Extractor::Booth,
      Source::Extractor::Anifty,
      Source::Extractor::Furaffinity,
    ]

    # Should return true if the extractor is configured correctly. Return false
    # if the extractor requires api keys that have not been configured.
    def self.enabled?
      true
    end

    # Return the extractor for the given `url`. The `url` may be either a
    # direct image URL, or the URL of a page containing one or more images.
    #
    # The `referer_url` is optionally provided when uploading direct image URLs
    # with the bookmarklet. This will be the page containing the image. This
    # lets us extract information from sites like Twitter, where the image URL by
    # itself doesn't have enough information to find the page containing the image.
    #
    # @param url [String] The URL to extract information from.
    # @param referer_url [String, nil] The page URL if `url` is an image URL.
    # @return [Source::Extractor]
    def self.find(url, referer_url = nil, default: Extractor::Null)
      extractor = SUBCLASSES.lazy.map { |extractor| extractor.new(url, referer_url) }.find(&:match?)
      extractor || default&.new(url, referer_url)
    end
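
    # Hypothetical usage of the lookup above (the URL is illustrative):
    #
    #   extractor = Source::Extractor.find("https://www.pixiv.net/artworks/12345")
    #   extractor.image_urls   # => direct image URLs for the work
    #   extractor.page_url     # => the page containing the images
    #   extractor.profile_url  # => the artist's profile page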

    # Initialize an extractor. Normally one should call `Source::Extractor.find`
    # instead of instantiating an extractor directly.
    #
    # @param url [String] The URL to extract information from.
    # @param referer_url [String, nil] The page URL if `url` is an image URL.
    def initialize(url, referer_url = nil)
      @url = url.to_s
      @referer_url = referer_url&.to_s

      @parsed_url = Source::URL.parse(url)
      @parsed_referer = Source::URL.parse(referer_url) if referer_url.present?
      @parsed_referer = nil if parsed_url&.site_name != parsed_referer&.site_name
    end

    # Should return true if this extractor should be used for this URL.
    # Normally, this should check if the URL is from the right site.
    #
    # @return [Boolean]
    def match?
      false
    end

    # The list of image (or video) URLs extracted from the target URL.
    #
    # If the target URL is a page, this should be every image on the page. If
    # the target URL is a single image, this should be the image itself.
    #
    # @return [Array<String>]
    def image_urls
      []
    end

    # The URL of the page containing the image, or nil if it can't be found.
    #
    # The source of the post will be set to the page URL if it's not possible
    # to convert the image URL to a page URL for this site.
    #
    # For example, for sites like Twitter and Tumblr, it's not possible to
    # convert image URLs to page URLs, so the page URL will be used as the
    # source for these sites. For sites like Pixiv and DeviantArt, it is
    # possible to convert image URLs to page URLs, so the image URL will be
    # used as the source for these sites. This is determined by whether
    # `Source::URL#page_url` returns a URL or nil.
    #
    # @return [String, nil]
    def page_url
      nil
    end

    # A name to suggest as the artist's tag name when creating a new artist.
    # This should usually be the artist's login name. It should be plain ASCII,
    # hopefully unique, and it should follow the rules for tag names (see
    # TagNameValidator).
    #
    # @return [String, nil]
    def tag_name
      artist_name
    end

    # The artist's primary name. If an artist has both a display name and a
    # login name, this should be the display name. This will be used as an
    # other name for new artist entries.
    #
    # @return [String, nil]
    def artist_name
      nil
    end

    # A list of all names associated with the artist. These names will be suggested
    # as other names when creating a new artist.
    #
    # @return [Array<String>]
    def other_names
      [artist_name, tag_name].compact.uniq
    end

    # A link to the artist's profile page on the site. This will be used for
    # artist finding purposes, so it needs to match the URL in the artist entry.
    #
    # @return [String, nil]
    def profile_url
      nil
    end

    # A list of all profile URLs associated with the artist. These URLs will
    # be suggested when creating a new artist.
    #
    # @return [Array<String>]
    def profile_urls
      [profile_url].compact
    end

    # The artist's title of the work. Used for the artist commentary.
    #
    # @return [String, nil]
    def artist_commentary_title
      nil
    end

    # The artist's description of the work. Used for the artist commentary.
    #
    # @return [String, nil]
    def artist_commentary_desc
      nil
    end

    # Download the file at the given URL. Raises Danbooru::Http::DownloadError if the download fails, or
    # Danbooru::Http::FileTooLargeError if the file is too large.
    #
    # @return [MediaFile] the downloaded file
    def download_file!(download_url)
      response, file = http_downloader.download_media(download_url)
      file
    end

    # An HTTP client for API requests.
    def http
      Danbooru::Http.external
    end

    # An HTTP client for downloading files.
    def http_downloader
      http.timeout(DOWNLOAD_TIMEOUT).max_size(Danbooru.config.max_file_size).use(:spoof_referrer).use(:unpolish_cloudflare)
    end
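
    # Hypothetical usage of the two clients above (URL and variable names are illustrative):
    #
    #   extractor.http.get(api_url)                             # plain API request to the source site
    #   extractor.download_file!("https://example.com/a.png")   # capped at max_file_size, 60 second timeout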

    def artists
      ArtistFinder.find_artists(profile_url)
    end

    # A new artist entry with suggested defaults for when the artist doesn't
    # exist. Used in Artist.new_with_defaults to prefill the new artist form.
    def new_artist
      Artist.new(
        name: tag_name,
        other_names: other_names,
        url_string: profile_urls.join("\n")
      )
    end

    def tags
      (@tags || []).uniq
    end

    def normalized_tags
      tags.map { |tag, _url| normalize_tag(tag) }.sort.uniq
    end

    def normalize_tag(tag)
      WikiPage.normalize_other_name(tag).downcase
    end

    def translated_tags
      translated_tags = normalized_tags.flat_map(&method(:translate_tag)).uniq.sort
      translated_tags.reject(&:artist?).reject(&:is_deprecated?)
    end

    # Given a tag from the source site, should return an array of corresponding Danbooru tags.
    def translate_tag(untranslated_tag)
      return [] if untranslated_tag.blank?

      translated_tag_names = WikiPage.active.other_names_include(untranslated_tag).uniq.pluck(:title)
      translated_tag_names = TagAlias.to_aliased(translated_tag_names)
      translated_tags = Tag.where(name: translated_tag_names)

      if translated_tags.empty?
        normalized_name = TagAlias.to_aliased([Tag.normalize_name(untranslated_tag)])
        translated_tags = Tag.nonempty.where(name: normalized_name)
      end

      translated_tags
    end
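
    # A hypothetical walkthrough of translate_tag above: a source tag is first matched
    # against wiki page other names, the matching titles are mapped through tag aliases,
    # and only if nothing matches is the normalized tag name itself looked up as a
    # non-empty tag. For example (tag names are illustrative):
    #
    #   translate_tag("オリジナル")  # => e.g. the "original" tag, matched via a wiki other name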

    def dtext_artist_commentary_title
      self.class.to_dtext(artist_commentary_title)
    end

    def dtext_artist_commentary_desc
      self.class.to_dtext(artist_commentary_desc)
    end

    # A search query that should return any posts that were previously
    # uploaded from the same source. These may be duplicates, or they may be
    # other posts from the same gallery.
    def related_posts_search_query
      "source:#{url}"
    end
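
    # For example, with url = "https://example.com/posts/1" (an illustrative URL),
    # this produces the search "source:https://example.com/posts/1".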

    def related_posts(limit = 5)
      Post.system_tag_match(related_posts_search_query).paginate(1, limit: limit)
    end

    # A hash containing the results of any API calls made by the extractor. For debugging purposes only.
    def api_response
      nil
    end

    def to_h
      {
        artist: {
          name: artist_name,
          tag_name: tag_name,
          other_names: other_names,
          profile_url: profile_url,
          profile_urls: profile_urls
        },
        artists: artists.as_json(include: :sorted_urls),
        image_urls: image_urls,
        page_url: page_url,
        tags: tags,
        normalized_tags: normalized_tags,
        translated_tags: translated_tags,
        artist_commentary: {
          title: artist_commentary_title,
          description: artist_commentary_desc,
          dtext_title: dtext_artist_commentary_title,
          dtext_description: dtext_artist_commentary_desc
        },
        api_response: api_response.to_h
      }
    end

    def to_json(*_args)
      to_h.to_json
    end

    def http_exists?(url)
      http_downloader.head(url).status.success?
    end

    # Convert commentary to dtext by stripping HTML tags. Sites can override
    # this to customize how their markup is translated to dtext.
    def self.to_dtext(text)
      text = text.to_s
      text = Rails::Html::FullSanitizer.new.sanitize(text, encode_special_chars: false)
      text = CGI.unescapeHTML(text)
      text
    end
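
    # A hypothetical example of the default conversion above (the input is illustrative):
    #
    #   Source::Extractor.to_dtext("Check out <a href='https://example.com'>my site</a> &amp; more!")
    #   # => "Check out my site & more!"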

    memoize :http, :http_downloader, :related_posts
  end
end