Merge branch 'master' into fix-pixiv-profile-url

This commit is contained in:
evazion
2020-06-24 00:06:55 -05:00
committed by GitHub
103 changed files with 1639 additions and 2247 deletions

View File

@@ -147,7 +147,7 @@ module Sources::Strategies
urls = urls.reverse
end
chosen_url = urls.find { |url| http_exists?(url, headers) }
chosen_url = urls.find { |url| http_exists?(url) }
chosen_url || url
end
end

View File

@@ -14,6 +14,8 @@
module Sources
module Strategies
class Base
# Error raised when a file cannot be downloaded from a source
# (see download_file!, which raises it with a "Download failed: ..." message).
class DownloadError < StandardError
end
attr_reader :url, :referer_url, :urls, :parsed_url, :parsed_referer, :parsed_urls
extend Memoist
@@ -35,9 +37,9 @@ module Sources
# <tt>referer_url</tt> so the strategy can discover the HTML
# page and other information.
def initialize(url, referer_url = nil)
@url = url
@referer_url = referer_url
@urls = [url, referer_url].select(&:present?)
@url = url.to_s
@referer_url = referer_url&.to_s
@urls = [@url, @referer_url].select(&:present?)
@parsed_url = Addressable::URI.heuristic_parse(url) rescue nil
@parsed_referer = Addressable::URI.heuristic_parse(referer_url) rescue nil
@@ -139,15 +141,28 @@ module Sources
# Subclasses should merge in any required headers needed to access resources
# on the site.
# Returns the extra HTTP headers for this strategy as a hash.
#
# NOTE(review): two candidate bodies appear here. As written, the value of
# Danbooru.config.http_headers is evaluated and discarded, and the empty hash
# (the last expression) is what gets returned. This reads like the before and
# after lines of a diff hunk shown together -- confirm which single line is
# meant to remain.
def headers
Danbooru.config.http_headers
{}
end
# Returns the size of the image resource without actually downloading the file.
# Reports the size of the resource at image_url.
#
# NOTE(review): two candidate bodies appear here. As written, the result of
# Downloads::File.new(image_url).size is discarded, and the method returns the
# content_length of http.head(image_url) converted with to_i (so a missing
# length becomes 0). This reads like the before and after lines of a diff hunk
# shown together -- confirm which single line is meant to remain.
def size
Downloads::File.new(image_url).size
http.head(image_url).content_length.to_i
end
memoize :size
# Download the file at the given url, or at the main image url by default.
# Downloads the file at +download_url+ (the main image url when omitted).
#
# Raises DownloadError when +download_url+ is blank, or when the response
# status is anything other than 200. On success returns the second element
# handed back by http.download_media.
def download_file!(download_url = image_url)
  if download_url.blank?
    raise DownloadError, "Download failed: couldn't find download url for #{url}"
  end

  response, file = http.download_media(download_url)

  if response.status != 200
    raise DownloadError, "Download failed: #{download_url} returned error #{response.status}"
  end

  file
end
# Builds the HTTP client used by this strategy: Danbooru::Http.public_only
# with a timeout of 30 and a max_size taken from
# Danbooru.config.max_file_size.
def http
  client = Danbooru::Http.public_only.timeout(30)
  client.max_size(Danbooru.config.max_file_size)
end
memoize :http
# The url to use for artist finding purposes. This will be stored in the
# artist entry. Normally this will be the profile url.
def normalize_for_artist_finder
@@ -274,9 +289,8 @@ module Sources
to_h.to_json
end
# Reports whether a HEAD request for +url+ is successful, sending +headers+
# along with the request.
#
# NOTE(review): two signatures appear back to back and share a single +end+:
# one using HTTParty with a required +headers+ argument, and one using +http+
# where +headers+ defaults to {}. This reads like the before and after lines
# of a diff hunk shown together and is not balanced Ruby as displayed --
# confirm which definition is meant to remain.
def http_exists?(url, headers)
res = HTTParty.head(url, Danbooru.config.httparty_options.deep_merge(headers: headers))
res.success?
def http_exists?(url, headers = {})
http.headers(headers).head(url).status.success?
end
# Convert commentary to dtext by stripping html tags. Sites can override

View File

@@ -64,11 +64,10 @@ module Sources
# Fetches page_url with "?enterAgree=1" appended to it.
#
# Returns nil when page_url is blank or when the response status is not 200;
# otherwise the last expression, response.parse, is returned.
#
# NOTE(review): both an HTTParty/Cache/Nokogiri path and a Danbooru::Http path
# are present. As written the page is requested twice and the
# Nokogiri::HTML(doc) result is discarded. This reads like the before and
# after lines of a diff hunk shown together -- confirm which lines remain.
def page
return nil if page_url.blank?
doc = Cache.get("hentai-foundry:#{page_url}", 1.minute) do
HTTParty.get("#{page_url}?enterAgree=1").body
end
response = Danbooru::Http.new.cache(1.minute).get("#{page_url}?enterAgree=1")
return nil unless response.status == 200
Nokogiri::HTML(doc)
response.parse
end
def tags

View File

@@ -73,8 +73,7 @@ module Sources
end
def image_url
return if image_urls.blank?
return url if api_client.blank?
return url if image_urls.blank? || api_client.blank?
img = case url
when DIRECT || CDN_DIRECT then "https://seiga.nicovideo.jp/image/source/#{image_id_from_url(url)}"
@@ -83,7 +82,7 @@ module Sources
end
resp = api_client.get(img)
if resp.headers["Location"] =~ %r{https?://.+/(\w+/\d+/\d+)\z}i
if resp.uri.to_s =~ %r{https?://.+/(\w+/\d+/\d+)\z}i
"https://lohas.nicoseiga.jp/priv/#{$1}"
else
img
@@ -181,12 +180,12 @@ module Sources
# Builds a NicoSeigaApiClient for the first id that is present, checked in
# the order illust_id, manga_id, image_id. The type is "manga" for manga_id
# and "illust" for the other two. Returns nil when none of the ids is present.
#
# NOTE(review): every branch constructs the client twice, once without and
# once with `http: http`; only the second call is the branch's value. This
# reads like the before and after lines of a diff hunk shown together --
# confirm which call is meant to remain.
def api_client
if illust_id.present?
NicoSeigaApiClient.new(work_id: illust_id, type: "illust")
NicoSeigaApiClient.new(work_id: illust_id, type: "illust", http: http)
elsif manga_id.present?
NicoSeigaApiClient.new(work_id: manga_id, type: "manga")
NicoSeigaApiClient.new(work_id: manga_id, type: "manga", http: http)
elsif image_id.present?
# We default to illust to attempt getting the api anyway
NicoSeigaApiClient.new(work_id: image_id, type: "illust")
NicoSeigaApiClient.new(work_id: image_id, type: "illust", http: http)
end
end
memoize :api_client

View File

@@ -178,54 +178,21 @@ module Sources
# Loads page_url from Nijie. Returns nil when page_url is blank.
#
# NOTE(review): this body mixes two approaches. One goes through the Mechanize
# `agent` (agent.get, clearing the "nijie-session" cache entry when the login
# container is found, rescuing Mechanize::ResponseCodeError and returning nil
# on 404). The other goes through Danbooru::Http (POST to login_int.php, then
# a GET of page_url with the R18 cookie). As laid out, the main body returns
# `doc`, and the three statements after the bare `raise` sit inside the rescue
# clause where they can never run. This reads like the before and after lines
# of a diff hunk shown together -- confirm which lines are meant to remain.
def page
return nil if page_url.blank?
doc = agent.get(page_url)
http = Danbooru::Http.new
form = { email: Danbooru.config.nijie_login, password: Danbooru.config.nijie_password }
if doc.search("div#header-login-container").any?
# Session cache is invalid, clear it and log in normally.
Cache.delete("nijie-session")
doc = agent.get(page_url)
end
# XXX `retriable` must come after `cache` so that retries don't return cached error responses.
response = http.cache(1.hour).use(retriable: { max_retries: 20 }).post("https://nijie.info/login_int.php", form: form)
# A failed login is logged, then reported to the caller as nil.
DanbooruLogger.info "Nijie login failed (#{url}, #{response.status})" if response.status != 200
return nil unless response.status == 200
doc
rescue Mechanize::ResponseCodeError => e
return nil if e.response_code.to_i == 404
raise
response = http.cookies(R18: 1).cache(1.minute).get(page_url)
return nil unless response.status == 200
response&.parse
end
memoize :page
# Builds a Mechanize agent carrying the cookies used to browse nijie.info.
#
# When a "nijie-session" value is cached it is installed as the NIJIEIJIEID
# cookie. Otherwise the login form is submitted and, if a NIJIEIJIEID cookie
# comes back, its value is cached for one day. The R18 cookie is always added.
#
# A Mechanize::ResponseCodeError with code 429 is retried after a 5 second
# pause, up to 20 attempts in total. Any other response code, or a 429 on the
# final attempt, re-raises the error rather than retrying forever.
def agent
  max_attempts = 20
  attempts = 0

  begin
    attempts += 1
    mech = Mechanize.new
    session = Cache.get("nijie-session")

    if session
      cookie = Mechanize::Cookie.new("NIJIEIJIEID", session)
      cookie.domain = ".nijie.info"
      cookie.path = "/"
      mech.cookie_jar.add(cookie)
    else
      mech.get("https://nijie.info/login.php") do |page|
        page.form_with(action: "/login_int.php") do |form|
          form["email"] = Danbooru.config.nijie_login
          form["password"] = Danbooru.config.nijie_password
        end.click_button
      end

      session = mech.cookie_jar.cookies.find { |c| c.name == "NIJIEIJIEID" }
      Cache.put("nijie-session", session.value, 1.day) if session
    end

    # This cookie needs to be set to allow viewing of adult works while anonymous
    cookie = Mechanize::Cookie.new("R18", "1")
    cookie.domain = ".nijie.info"
    cookie.path = "/"
    mech.cookie_jar.add(cookie)

    mech
  rescue Mechanize::ResponseCodeError => e
    # Only rate limiting (429) is retried, and only while attempts remain.
    raise unless e.response_code.to_i == 429 && attempts < max_attempts

    sleep(5)
    retry
  end
end
memoize :agent
end
end
end

View File

@@ -47,7 +47,7 @@ module Sources
when %r{\Ahttps?://c(?:s|han|[1-4])\.sankakucomplex\.com/data(?:/sample)?/(?:[a-f0-9]{2}/){2}(?:sample-|preview)?([a-f0-9]{32})}i
"https://chan.sankakucomplex.com/en/post/show?md5=#{$1}"
when %r{\Ahttps?://(?:www|s(?:tatic|[1-4]))\.zerochan\.net/.+(?:\.|\/)(\d+)(?:\.(?:jpe?g?))?\z}i
when %r{\Ahttps?://(?:www|s(?:tatic|[1-4]))\.zerochan\.net/.+(?:\.|\/)(\d+)(?:\.(?:jpe?g?|png))?\z}i
"https://www.zerochan.net/#{$1}#full"
when %r{\Ahttps?://static[1-6]?\.minitokyo\.net/(?:downloads|view)/(?:\d{2}/){2}(\d+)}i

View File

@@ -64,9 +64,6 @@ module Sources
ORIG_IMAGE = %r{#{PXIMG}/img-original/img/#{DATE}/(?<illust_id>\d+)_p(?<page>\d+)\.#{EXT}\z}i
STACC_PAGE = %r{\A#{WEB}/stacc/#{MONIKER}/?\z}i
NOVEL_PAGE = %r{(?:\Ahttps?://www\.pixiv\.net/novel/show\.php\?id=(\d+))}
FANBOX_ACCOUNT = %r{(?:\Ahttps?://www\.pixiv\.net/fanbox/creator/\d+\z)}
FANBOX_IMAGE = %r{(?:\Ahttps?://fanbox\.pixiv\.net/images/post/(\d+))}
FANBOX_PAGE = %r{(?:\Ahttps?://www\.pixiv\.net/fanbox/creator/\d+/post/(\d+))}
def self.to_dtext(text)
if text.nil?
@@ -127,14 +124,6 @@ module Sources
return "https://www.pixiv.net/novel/show.php?id=#{novel_id}&mode=cover"
end
if fanbox_id.present?
return "https://www.pixiv.net/fanbox/creator/#{metadata.user_id}/post/#{fanbox_id}"
end
if fanbox_account_id.present?
return "https://www.pixiv.net/fanbox/creator/#{fanbox_account_id}"
end
if illust_id.present?
return "https://www.pixiv.net/artworks/#{illust_id}"
end
@@ -192,17 +181,7 @@ module Sources
end
# Returns the HTTP headers hash for Pixiv: a "Referer" of
# https://www.pixiv.net. When fanbox_id is present it returns early with a
# fanbox "Referer" plus a "Cookie" built from agent.cookies.
#
# NOTE(review): the fanbox branch, a multi-line Referer hash and a single-line
# Referer hash all appear here; only the last hash is the method's final
# expression. This reads like the before and after lines of a diff hunk shown
# together -- confirm which lines are meant to remain.
def headers
if fanbox_id.present?
# need the session to download fanbox images
return {
"Referer" => "https://www.pixiv.net/fanbox",
"Cookie" => HTTP::Cookie.cookie_value(agent.cookies)
}
end
{
"Referer" => "https://www.pixiv.net"
}
{ "Referer" => "https://www.pixiv.net" }
end
def normalize_for_source
@@ -242,10 +221,6 @@ module Sources
end
def image_urls_sub
if url =~ FANBOX_IMAGE
return [url]
end
# there's too much normalization bullshit we have to deal with for
# raw urls, so just fetch the canonical url from the api every
# time.
@@ -265,7 +240,7 @@ module Sources
# even though it makes sense to reference page_url here, it will only look
# at (url, referer_url).
def illust_id
return nil if novel_id.present? || fanbox_id.present?
return nil if novel_id.present?
parsed_urls.each do |url|
# http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054
@@ -328,46 +303,11 @@ module Sources
end
memoize :novel_id
# Extracts the fanbox post id from url or referer_url (checked in that order).
# Each candidate is tried against FANBOX_PAGE first and FANBOX_IMAGE second;
# the first capture group of the first match is returned, or nil when nothing
# matches.
def fanbox_id
  [url, referer_url].each do |candidate|
    [FANBOX_PAGE, FANBOX_IMAGE].each do |pattern|
      return Regexp.last_match(1) if candidate =~ pattern
    end
  end

  nil
end
memoize :fanbox_id
# Returns whichever of url or referer_url (checked in that order) matches
# FANBOX_ACCOUNT. The whole matching value is returned, not a capture group;
# nil when neither matches.
def fanbox_account_id
  [url, referer_url].find { |candidate| candidate =~ FANBOX_ACCOUNT }
end
memoize :fanbox_account_id
# Returns the object produced by PixivWebAgent.build.
def agent
PixivWebAgent.build
end
memoize :agent
# Looks up this post through PixivApiClient. The lookup used depends on which
# id is present, checked in this order: novel_id -> #novel, fanbox_id ->
# #fanbox, otherwise #work with illust_id.
def metadata
  if novel_id.present?
    PixivApiClient.new.novel(novel_id)
  elsif fanbox_id.present?
    PixivApiClient.new.fanbox(fanbox_id)
  else
    PixivApiClient.new.work(illust_id)
  end
end
memoize :metadata

View File

@@ -23,7 +23,7 @@ module Sources::Strategies
OLD_IMAGE = %r{\Ahttps?://#{DOMAIN}/(?<dir>#{MD5}/)?#{FILENAME}_(?<size>\w+)\.#{EXT}\z}i
IMAGE = %r{\Ahttps?://#{DOMAIN}/}i
VIDEO = %r{\Ahttps?://(?:vtt|ve\.media)\.tumblr\.com/}i
# Tumblr video hosts: vtt.tumblr.com, ve.tumblr.com, and both the ve.media and
# va.media subdomains. The alternation is spelled so that ve.media.tumblr.com
# keeps matching alongside the newer va.media.tumblr.com host.
VIDEO = %r{\Ahttps?://(?:vtt|ve|v[ea]\.media)\.tumblr\.com/}i
POST = %r{\Ahttps?://(?<blog_name>[^.]+)\.tumblr\.com/(?:post|image)/(?<post_id>\d+)}i
def self.enabled?
@@ -168,7 +168,7 @@ module Sources::Strategies
end
candidates.find do |candidate|
http_exists?(candidate, headers)
http_exists?(candidate)
end
end

View File

@@ -200,7 +200,7 @@ module Sources::Strategies
end
def api_response
return {} unless self.class.enabled?
return {} unless self.class.enabled? && status_id.present?
api_client.status(status_id)
end