Files
danbooru/app/logical/sources/strategies/base.rb
evazion bb7f24d279 Add HTTP proxy support.
Add support for using a proxy for HTTP requests. Only used for external
requests, such as downloading files or talking to source sites such as
Pixiv or Twitter, not for internal requests, such as talking to IQDB or
Reportbooru.
2021-08-28 04:53:33 -05:00

385 lines
12 KiB
Ruby

# This is a collection of strategies for extracting information about a
# resource. At a minimum it tries to extract the artist name and a canonical
# URL to download the image from. But it can also be used to normalize a URL
# for use with the artist finder.
#
# Design Principles
#
# In general you should minimize state. You can safely assume that <tt>url</tt>
# and <tt>referer_url</tt> will not change over the lifetime of an instance,
# so you can safely memoize methods and their results. A common pattern is
# conditionally making an external API call and parsing its response. You should
# make this call on demand and memoize the response.
module Sources
module Strategies
class Base
class DownloadError < StandardError; end
attr_reader :url, :referer_url, :urls, :parsed_url, :parsed_referer, :parsed_urls
extend Memoist
# Should return true if all prerequisites for using the strategy are met.
# Return false if the strategy requires api keys that have not been configured.
def self.enabled?
true
end
# * <tt>url</tt> - Should point to a resource suitable for
# downloading. This may sometimes point to the binary file.
# It may also point to the artist's profile page, in cases
# where this class is being used to normalize artist urls.
# Implementations should be smart enough to detect this and
# behave accordingly.
# * <tt>referer_url</tt> - Sometimes the HTML page cannot be
# determined from <tt>url</tt>. You should generally pass in a
# <tt>referrer_url</tt> so the strategy can discover the HTML
# page and other information.
def initialize(url, referer_url = nil)
@url = url.to_s
@referer_url = referer_url&.to_s
@urls = [@url, @referer_url].select(&:present?)
@parsed_url = Addressable::URI.heuristic_parse(url) rescue nil
@parsed_referer = Addressable::URI.heuristic_parse(referer_url) rescue nil
@parsed_urls = [parsed_url, parsed_referer].select(&:present?)
end
# Should return true if this strategy should be used. By default, checks
# if the main url belongs to any of the domains associated with this site.
def match?
return false if parsed_url.nil?
parsed_url.domain.in?(domains)
end
# The list of base domains belonging to this site. Subdomains are
# automatically included (i.e. "pixiv.net" matches "fanbox.pixiv.net").
def domains
[]
end
def site_name
host = Addressable::URI.heuristic_parse(url)&.host
# XXX should go in dedicated strategies.
case host
when /amazon\.(com|jp|co\.jp)\z/i
"Amazon"
when /ask\.fm\z/i
"Ask.fm"
when /bcy\.net\z/i
"BCY"
when /booth\.pm\z/i
"Booth.pm"
when /circle\.ms\z/i
"Circle.ms"
when /dlsite\.(com|net)\z/i
"DLSite"
when /doujinshi\.mugimugi\.org\z/i, /doujinshi\.org\z/i
"Doujinshi.org"
when /erogamescape\.dyndns\.org\z/i
"Erogamescape"
when /facebook\.com\z/i
"Facebook"
when /fantia\.jp\z/i
"Fantia"
when /fc2\.com\z/i
"FC2"
when /gumroad\.com\z/i
"Gumroad"
when /instagram\.com\z/i
"Instagram"
when /ko-fi\.com\z/i
"Ko-fi"
when /livedoor\.(jp|com)\z/i
"Livedoor"
when /lofter\.com\z/i
"Lofter"
when /mangaupdates\.com\z/i
"Mangaupdates"
when /melonbooks\.co\.jp\z/i
"Melonbooks"
when /mihuashi\.com\z/i
"Mihuashi"
when /mixi\.jp\z/i
"Mixi.jp"
when /patreon\.com\z/i
"Patreon"
when /piapro\.jp\z/i
"Piapro.jp"
when /picarto\.tv\z/i
"Picarto"
when /privatter\.net\z/i
"Privatter"
when /sakura\.ne\.jp\z/i
"Sakura.ne.jp"
when /stickam\.jp\z/i
"Stickam"
when /tinami\.com\z/i
"Tinami"
when /toranoana\.(jp|shop)\z/i
"Toranoana"
when /twitch\.tv\z/i
"Twitch"
when /wikipedia\.org\z/i
"Wikipedia"
when /youtube\.com\z/i
"Youtube"
else
host
end
rescue Addressable::URI::InvalidURIError
nil
end
# Whatever <tt>url</tt> is, this method should return the direct links
# to the canonical binary files. It should not be an HTML page. It should
# be a list of JPEG, PNG, GIF, WEBM, MP4, ZIP, etc. It is what the
# downloader will fetch and save to disk.
def image_urls
raise NotImplementedError
end
def image_url
image_urls.first
end
# A smaller representation of the image that's suitable for
# displaying previews.
def preview_urls
image_urls
end
def preview_url
preview_urls.first
end
# Whatever <tt>url</tt> is, this method should return a link to the HTML
# page containing the resource. It should not be a binary file. It will
# eventually be assigned as the source for the post, but it does not
# represent what the downloader will fetch.
def page_url
nil
end
# This will be the url stored in posts. Typically this is the page
# url, but on some sites it may be preferable to store the image url.
def canonical_url
page_url || image_url
end
# A name to suggest as the artist's tag name when creating a new artist.
# This should usually be the artist's account name.
def tag_name
artist_name
end
# The artists's primary name. If an artist has both a display name and an
# account name, this should be the display name.
def artist_name
nil
end
# A list of all names associated with the artist. These names will be suggested
# as other names when creating a new artist.
def other_names
[artist_name, tag_name].compact.uniq
end
# A link to the artist's profile page on the site.
def profile_url
nil
end
# A list of all profile urls associated with the artist. These urls will
# be suggested when creating a new artist.
def profile_urls
[profile_url].compact
end
def artist_commentary_title
nil
end
def artist_commentary_desc
nil
end
# Subclasses should merge in any required headers needed to access resources
# on the site.
def headers
{}
end
# Returns the size of the image resource without actually downloading the file.
def remote_size
response = http_downloader.head(image_url)
return nil unless response.status == 200 && response.content_length.present?
response.content_length.to_i
end
memoize :remote_size
# Download the file at the given url, or at the main image url by default.
def download_file!(download_url = image_url)
raise DownloadError, "Download failed: couldn't find download url for #{url}" if download_url.blank?
response, file = http_downloader.download_media(download_url)
raise DownloadError, "Download failed: #{download_url} returned error #{response.status}" if response.status != 200
file
end
# A http client for API requests.
def http
Danbooru::Http.new.proxy.public_only
end
memoize :http
# A http client for downloading files.
def http_downloader
http.timeout(30).max_size(Danbooru.config.max_file_size).use(:spoof_referrer).use(:unpolish_cloudflare)
end
memoize :http_downloader
# The url to use for artist finding purposes. This will be stored in the
# artist entry. Normally this will be the profile url.
def normalize_for_artist_finder
profile_url.presence || url
end
# Given a post/image url, this is the normalized url that will be displayed in a post's page in its stead.
# This function should never make any network call, even indirectly. Return nil to never normalize.
def normalize_for_source
nil
end
def artists
ArtistFinder.find_artists(normalize_for_artist_finder.to_s)
end
# A new artist entry with suggested defaults for when the artist doesn't
# exist. Used in Artist.new_with_defaults to prefill the new artist form.
def new_artist
Artist.new(
name: tag_name,
other_names: other_names,
url_string: profile_urls.join("\n")
)
end
def file_url
image_url
end
def data
{}
end
def tags
(@tags || []).uniq
end
def normalized_tags
tags.map { |tag, _url| normalize_tag(tag) }.sort.uniq
end
def normalize_tag(tag)
WikiPage.normalize_other_name(tag).downcase
end
def translated_tags
translated_tags = normalized_tags.flat_map(&method(:translate_tag)).uniq.sort
translated_tags.reject(&:artist?)
end
# Given a tag from the source site, should return an array of corresponding Danbooru tags.
def translate_tag(untranslated_tag)
return [] if untranslated_tag.blank?
translated_tag_names = WikiPage.active.other_names_include(untranslated_tag).uniq.pluck(:title)
translated_tag_names = TagAlias.to_aliased(translated_tag_names)
translated_tags = Tag.where(name: translated_tag_names)
if translated_tags.empty?
normalized_name = TagAlias.to_aliased([Tag.normalize_name(untranslated_tag)])
translated_tags = Tag.nonempty.where(name: normalized_name)
end
translated_tags
end
def dtext_artist_commentary_title
self.class.to_dtext(artist_commentary_title)
end
def dtext_artist_commentary_desc
self.class.to_dtext(artist_commentary_desc)
end
# A search query that should return any posts that were previously
# uploaded from the same source. These may be duplicates, or they may be
# other posts from the same gallery.
def related_posts_search_query
"source:#{canonical_url}"
end
def related_posts(limit = 5)
Post.system_tag_match(related_posts_search_query).paginate(1, limit: limit)
end
memoize :related_posts
# A hash containing the results of any API calls made by the strategy. For debugging purposes only.
def api_response
nil
end
def to_h
{
:artist => {
:name => artist_name,
:tag_name => tag_name,
:other_names => other_names,
:profile_url => profile_url,
:profile_urls => profile_urls
},
:artists => artists.as_json(include: :sorted_urls),
:image_url => image_url,
:image_urls => image_urls,
:preview_url => preview_url,
:preview_urls => preview_urls,
:page_url => page_url,
:canonical_url => canonical_url,
:normalized_for_artist_finder_url => normalize_for_artist_finder,
:tags => tags,
:normalized_tags => normalized_tags,
:translated_tags => translated_tags,
:artist_commentary => {
:title => artist_commentary_title,
:description => artist_commentary_desc,
:dtext_title => dtext_artist_commentary_title,
:dtext_description => dtext_artist_commentary_desc
},
:api_response => api_response.to_h
}
end
def to_json(*_args)
to_h.to_json
end
def http_exists?(url)
http_downloader.head(url).status.success?
end
# Convert commentary to dtext by stripping html tags. Sites can override
# this to customize how their markup is translated to dtext.
def self.to_dtext(text)
text = text.to_s
text = Rails::Html::FullSanitizer.new.sanitize(text, encode_special_chars: false)
text = CGI.unescapeHTML(text)
text
end
end
end
end