tumblr: extract info from url when api data is unavailable.

Derive the artist name / profile url / page url from the source URLs when
the API response is unavailable because the Tumblr post was deleted.

This fixes the artist finder to work on bad_tumblr_id posts.
This commit is contained in:
evazion
2018-10-09 11:57:23 -05:00
parent 0c31a5d6a9
commit b0d7d90103
3 changed files with 33 additions and 26 deletions

View File

@@ -221,6 +221,8 @@ module Sources
:profile_url => profile_url,
:image_url => image_url,
:image_urls => image_urls,
:page_url => page_url,
:canonical_url => canonical_url,
:normalized_for_artist_finder_url => normalize_for_artist_finder,
:tags => tags,
:translated_tags => translated_tags,

View File

@@ -1,5 +1,6 @@
module Sources::Strategies
class Tumblr < Base
# Matches, case-insensitively, an http(s) URL whose host begins with tumblr.com
# or any chain of subdomains ending in tumblr.com.
# NOTE(review): only the start of the string is anchored; nothing follows
# `tumblr\.com`, so hosts such as "tumblr.com.example.org" also match --
# confirm whether that is intended.
BASE_URL = %r!\Ahttps?://(?:[^/]+\.)*tumblr\.com!i
# Matches the host names data.tumblr.com, media.tumblr.com, or
# <digits>.media.tumblr.com. Unanchored and case-sensitive.
DOMAIN = %r{(data|(\d+\.)?media)\.tumblr\.com}
# Exactly 32 hexadecimal characters (either case), exposed as the named capture `md5`.
MD5 = %r{(?<md5>[0-9a-f]{32})}i
# An optional "tumblr_" or "tumblr_inline_" prefix, a run of letters/digits, and
# an optional "_r<digits>" suffix, exposed as the named capture `filename`.
FILENAME = %r{(?<filename>(tumblr_(inline_)?)?[a-z0-9]+(_r[0-9]+)?)}i
@@ -13,18 +14,7 @@ module Sources::Strategies
end
def self.match?(*urls)
urls.compact.any? do |url|
blog_name, post_id = parse_info_from_url(url)
url =~ IMAGE || blog_name.present? && post_id.present?
end
end
def self.parse_info_from_url(url)
if url =~ POST
[$~[:blog_name], $~[:post_id]]
else
[]
end
urls.compact.any? { |url| url.match?(BASE_URL) }
end
def site_name
@@ -42,22 +32,21 @@ module Sources::Strategies
end
def page_url
[url, referer_url].each do |x|
if x =~ POST
blog_name, post_id = self.class.parse_info_from_url(x)
return "https://#{blog_name}.tumblr.com/post/#{post_id}"
end
end
return nil unless blog_name.present? && post_id.present?
"https://#{blog_name}.tumblr.com/post/#{post_id}"
end
return super
# The canonical URL is defined to be exactly whatever page_url returns,
# so the two values always agree (including when page_url is nil).
def canonical_url
page_url
end
def profile_url
"https://#{artist_name}.tumblr.com/"
return nil if artist_name.blank?
"https://#{artist_name}.tumblr.com"
end
def artist_name
post[:blog_name]
post[:blog_name] || blog_name
end
def artist_commentary_title
@@ -99,7 +88,6 @@ module Sources::Strategies
[etag, "https://tumblr.com/tagged/#{CGI.escape(etag)}"]
end.uniq
end
memoize :tags
def dtext_artist_commentary_desc
DText.from_html(artist_commentary_desc).strip
@@ -175,11 +163,18 @@ module Sources::Strategies
html = Nokogiri::HTML.fragment(artist_commentary_desc)
html.css("img").map { |node| node["src"] }
end
memoize :inline_images
# Blog name taken from the source URLs themselves.
#
# Every entry of `urls` is matched against POST and its `blog_name` named
# capture is collected; the first non-nil capture wins.
# Returns nil when no URL yields a capture.
def blog_name
  candidates = urls.map { |source| source.slice(POST, :blog_name) }
  candidates.find { |name| !name.nil? }
end
# Post id taken from the source URLs themselves.
#
# Every entry of `urls` is matched against POST and its `post_id` named
# capture is collected; the first non-nil capture wins.
# Returns nil when no URL yields a capture.
def post_id
  candidates = urls.map { |source| source.slice(POST, :post_id) }
  candidates.find { |id| !id.nil? }
end
def api_response
return {} unless self.class.enabled?
blog_name, post_id = self.class.parse_info_from_url(page_url)
return {} unless blog_name.present? && post_id.present?
body, code = HttpartyCache.get("/#{blog_name}/posts",
params: { id: post_id, api_key: Danbooru.config.tumblr_consumer_key },

View File

@@ -16,7 +16,7 @@ module Sources
end
should "get the profile" do
assert_equal("https://noizave.tumblr.com/", @site.profile_url)
assert_equal("https://noizave.tumblr.com", @site.profile_url)
end
should "get the tags" do
@@ -180,10 +180,20 @@ module Sources
end
context "A deleted tumblr post" do
should "work" do
should "extract the info from the url" do
site = Sources::Strategies.find("http://shimetsukage.tumblr.com/post/176805588268/20180809-ssb-coolboy")
data = {
artist_name: "shimetsukage",
profile_url: "https://shimetsukage.tumblr.com",
page_url: "https://shimetsukage.tumblr.com/post/176805588268",
canonical_url: "https://shimetsukage.tumblr.com/post/176805588268",
image_url: nil,
image_urls: [],
tags: [],
}
assert_nothing_raised { site.to_h }
assert_operator(data, :<, site.to_h)
end
end
end