deviantart: replace html scraper with api client (#3260).
This commit is contained in:
@@ -1,7 +1,7 @@
|
|||||||
module Sources
|
module Sources
|
||||||
module Strategies
|
module Strategies
|
||||||
class DeviantArt < Base
|
class DeviantArt < Base
|
||||||
DEVIANTART_SESSION_CACHE_KEY = "deviantart-session"
|
extend Memoist
|
||||||
|
|
||||||
def self.url_match?(url)
|
def self.url_match?(url)
|
||||||
url =~ /^https?:\/\/(?:.+?\.)?deviantart\.(?:com|net)/
|
url =~ /^https?:\/\/(?:.+?\.)?deviantart\.(?:com|net)/
|
||||||
@@ -20,28 +20,15 @@ module Sources
|
|||||||
end
|
end
|
||||||
|
|
||||||
def unique_id
|
def unique_id
|
||||||
profile_url =~ /https?:\/\/(.+?)\.deviantart\.com/
|
artist_name
|
||||||
"deviantart" + $1
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def get
|
def get
|
||||||
agent.get(URI.parse(normalized_url)) do |page|
|
# no-op
|
||||||
page.encoding = "utf-8"
|
|
||||||
@artist_name, @profile_url = get_profile_from_page(page)
|
|
||||||
@image_url = get_image_url_from_page(page)
|
|
||||||
@tags = get_tags_from_page(page)
|
|
||||||
@artist_commentary_title = get_artist_commentary_title_from_page(page)
|
|
||||||
@artist_commentary_desc = get_artist_commentary_desc_from_page(page)
|
|
||||||
end
|
|
||||||
rescue Mechanize::ResponseCodeError
|
|
||||||
# try the normal url
|
|
||||||
if url =~ /\.(jpg|jpeg|png|gif)/
|
|
||||||
@image_url = url
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.to_dtext(text)
|
def dtext_artist_commentary_desc
|
||||||
DText.from_html(text) do |element|
|
DText.from_html(artist_commentary_desc) do |element|
|
||||||
# Convert embedded thumbnails of journal posts to 'deviantart #123'
|
# Convert embedded thumbnails of journal posts to 'deviantart #123'
|
||||||
# links. Strip embedded thumbnails of image posts. Example:
|
# links. Strip embedded thumbnails of image posts. Example:
|
||||||
# https://sa-dui.deviantart.com/art/Commission-Meinos-Kaen-695905927.
|
# https://sa-dui.deviantart.com/art/Commission-Meinos-Kaen-695905927.
|
||||||
@@ -70,64 +57,51 @@ module Sources
|
|||||||
end.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")
|
end.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")
|
||||||
end
|
end
|
||||||
|
|
||||||
protected
|
# Username of the work's author as reported by the API metadata.
# Evaluates to nil when the metadata has no :author entry.
def artist_name
  author = api_metadata[:author]
  author&.dig(:username)
end
|
||||||
|
|
||||||
def get_profile_from_page(page)
|
def profile_url
|
||||||
links = page.search("div.dev-title-container a.username")
|
"https://#{artist_name.downcase}.deviantart.com"
|
||||||
|
end
|
||||||
|
|
||||||
if links.any?
|
def image_url
|
||||||
profile_url = links[0]["href"]
|
# work is deleted, use image url as given by user.
|
||||||
artist_name = links[0].text
|
if uuid.nil?
|
||||||
|
url
|
||||||
|
# work is downloadable
|
||||||
|
elsif api_deviation[:is_downloadable] && api_deviation[:download_filesize] != api_deviation.dig(:content, :filesize)
|
||||||
|
src = api_download[:src]
|
||||||
|
src.gsub!(%r!\Ahttps?://s3\.amazonaws\.com/!i, "https://")
|
||||||
|
src.gsub!(/\?.*\z/, "") # strip s3 query params
|
||||||
|
|
||||||
|
src
|
||||||
|
# work isn't downloadable, or download size is same as regular size.
|
||||||
|
elsif api_deviation.present?
|
||||||
|
api_deviation.dig(:content, :src)
|
||||||
else
|
else
|
||||||
profile_url = nil
|
raise "couldn't find image url"
|
||||||
artist_name = nil
|
|
||||||
end
|
|
||||||
|
|
||||||
return [artist_name, profile_url].compact
|
|
||||||
end
|
|
||||||
|
|
||||||
def get_image_url_from_page(page)
|
|
||||||
download_link = page.link_with(:class => /dev-page-download/)
|
|
||||||
|
|
||||||
if download_link
|
|
||||||
download_link.click.uri.to_s # need to follow the redirect now to get the full size url, following it later seems to not work.
|
|
||||||
else
|
|
||||||
image = page.search("div.dev-view-deviation img.dev-content-full")
|
|
||||||
|
|
||||||
if image.any?
|
|
||||||
image[0]["src"]
|
|
||||||
else
|
|
||||||
nil
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def get_tags_from_page(page)
|
def tags
|
||||||
links = page.search("a.discoverytag")
|
return [] if api_metadata.blank?
|
||||||
|
|
||||||
links.map do |node|
|
api_metadata[:tags].map do |tag|
|
||||||
[node.attr("data-canonical-tag"), node.attr("href")]
|
[tag[:tag_name], "https://www.deviantart.com/tag/#{tag[:tag_name]}"]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def get_artist_commentary_title_from_page(page)
|
def artist_commentary_title
|
||||||
title = page.search("div.dev-title-container a").find_all do |node|
|
api_metadata[:title]
|
||||||
node["data-ga_click_event"] =~ /description_title/
|
|
||||||
end
|
|
||||||
|
|
||||||
if title.any?
|
|
||||||
title[0].inner_text
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def get_artist_commentary_desc_from_page(page)
|
def artist_commentary_desc
|
||||||
desc = page.search("div.dev-description div.text.block")
|
api_metadata[:description]
|
||||||
|
|
||||||
if desc.any?
|
|
||||||
desc[0].children.to_s
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
protected
|
||||||
|
|
||||||
def normalized_url
|
def normalized_url
|
||||||
@normalized_url ||= begin
|
@normalized_url ||= begin
|
||||||
if url =~ %r{\Ahttps?://(?:fc|th|pre|orig|img)\d{2}\.deviantart\.net/.+/[a-z0-9_]*_by_[a-z0-9_]+-d([a-z0-9]+)\.}i
|
if url =~ %r{\Ahttps?://(?:fc|th|pre|orig|img)\d{2}\.deviantart\.net/.+/[a-z0-9_]*_by_[a-z0-9_]+-d([a-z0-9]+)\.}i
|
||||||
@@ -142,70 +116,44 @@ module Sources
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def agent
|
def page
|
||||||
@agent ||= begin
|
resp = HTTParty.get(normalized_url, Danbooru.config.httparty_options)
|
||||||
mech = Mechanize.new
|
Nokogiri::HTML(resp.body)
|
||||||
auth, userinfo, auth_secure = session_cookies(mech)
|
|
||||||
|
|
||||||
if auth
|
|
||||||
# This cookie needs to be set to allow viewing of mature works
|
|
||||||
cookie = Mechanize::Cookie.new("agegate_state", "1")
|
|
||||||
cookie.domain = ".deviantart.com"
|
|
||||||
cookie.path = "/"
|
|
||||||
mech.cookie_jar.add(cookie)
|
|
||||||
|
|
||||||
cookie = Mechanize::Cookie.new("auth", auth)
|
|
||||||
cookie.domain = ".deviantart.com"
|
|
||||||
cookie.path = "/"
|
|
||||||
mech.cookie_jar.add(cookie)
|
|
||||||
|
|
||||||
cookie = Mechanize::Cookie.new("userinfo", userinfo)
|
|
||||||
cookie.domain = ".deviantart.com"
|
|
||||||
cookie.path = "/"
|
|
||||||
mech.cookie_jar.add(cookie)
|
|
||||||
|
|
||||||
if auth_secure
|
|
||||||
cookie = Mechanize::Cookie.new("auth_secure", auth_secure)
|
|
||||||
cookie.domain = ".deviantart.com"
|
|
||||||
cookie.path = "/"
|
|
||||||
mech.cookie_jar.add(cookie)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
mech
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def session_cookies(mech)
|
# Scrape UUID from <meta property="da:appurl" content="DeviantArt://deviation/12F08C5D-A3A4-338C-2F1A-7E4E268C0E8B">
|
||||||
Cache.get(DEVIANTART_SESSION_CACHE_KEY, 2.hours) do
|
# For private works the UUID will be nil.
|
||||||
mech.request_headers = Danbooru.config.http_headers
|
def uuid
|
||||||
|
meta = page.search('meta[property="da:appurl"]').first
|
||||||
|
return nil if meta.nil?
|
||||||
|
|
||||||
page = mech.get("https://www.deviantart.com/users/login")
|
appurl = meta["content"]
|
||||||
|
uuid = appurl[%r!\ADeviantArt://deviation/(.*)\z!, 1]
|
||||||
if page.search('div[class="g-recaptcha"]').any?
|
uuid
|
||||||
# we got captcha'd, have to abort
|
|
||||||
return nil
|
|
||||||
end
|
|
||||||
|
|
||||||
validate_key = page.search('input[name="validate_key"]').attribute("value").value
|
|
||||||
validate_token = page.search('input[name="validate_token"]').attribute("value").value
|
|
||||||
|
|
||||||
mech.post("https://www.deviantart.com/users/login", {
|
|
||||||
username: Danbooru.config.deviantart_login,
|
|
||||||
password: Danbooru.config.deviantart_password,
|
|
||||||
validate_key: validate_key,
|
|
||||||
validate_token: validate_token,
|
|
||||||
remember_me: 1,
|
|
||||||
})
|
|
||||||
|
|
||||||
auth = mech.cookies.find { |cookie| cookie.name == "auth" }.try(:value)
|
|
||||||
userinfo = mech.cookies.find { |cookie| cookie.name == "userinfo" }.try(:value)
|
|
||||||
auth_secure = mech.cookies.find { |cookie| cookie.name == "auth_secure" }.try(:value)
|
|
||||||
mech.cookie_jar.clear
|
|
||||||
|
|
||||||
[auth, userinfo, auth_secure]
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Builds a DeviantArtApiClient from the configured client id, client secret
# and HTTParty options, then assigns it the access token obtained through
# Cache.get under the "da-access-token" key with a 55 minute duration.
# The block handed to Cache.get supplies the client's own token as a hash.
# NOTE(review): Cache.get is defined elsewhere; presumably the block only
# runs on a cache miss -- confirm.
def api_client
  DeviantArtApiClient.new(
    Danbooru.config.deviantart_client_id,
    Danbooru.config.deviantart_client_secret,
    Danbooru.config.httparty_options
  ).tap do |client|
    client.access_token = Cache.get("da-access-token", 55.minutes) { client.access_token.to_hash }
  end
end
|
||||||
|
|
||||||
|
# Deviation record for this work, looked up through the API client by UUID.
# Evaluates to an empty hash when no UUID is available.
def api_deviation
  uuid.nil? ? {} : api_client.deviation(uuid)
end
|
||||||
|
|
||||||
|
# Metadata record for this work, looked up through the API client by UUID.
#
# Always returns a Hash: the first entry of the response's :metadata list,
# or an empty hash when there is no UUID or the response carries no
# metadata entries. Returning {} instead of nil in the latter case keeps
# callers that index or dig into the result from raising NoMethodError.
def api_metadata
  return {} if uuid.nil?

  entries = api_client.metadata(uuid)[:metadata]
  (entries || []).first || {}
end
|
||||||
|
|
||||||
|
# Download record for this work, looked up through the API client by UUID.
# Evaluates to an empty hash when no UUID is available.
def api_download
  if uuid.nil?
    {}
  else
    api_client.download(uuid)
  end
end
|
||||||
|
|
||||||
|
memoize :page, :uuid, :api_client, :api_deviation, :api_metadata, :api_download
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -13,6 +13,13 @@ module Sources
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# A deviation with downloads disabled should still yield its image url.
context "The source for a download-disabled DeviantArt artwork page" do
  should "get the image url" do
    expected_urls = ["https://img00.deviantart.net/56ee/i/2017/219/2/3/test__no_download_by_noizave-dbj81lr.jpg"]
    @site = Sources::Site.new("https://noizave.deviantart.com/art/test-no-download-697415967")

    assert_equal(expected_urls, @site.image_urls)
  end
end
|
||||||
|
|
||||||
context "The source for an DeviantArt artwork page" do
|
context "The source for an DeviantArt artwork page" do
|
||||||
setup do
|
setup do
|
||||||
@site = Sources::Site.new("http://noizave.deviantart.com/art/test-post-please-ignore-685436408")
|
@site = Sources::Site.new("http://noizave.deviantart.com/art/test-post-please-ignore-685436408")
|
||||||
@@ -20,11 +27,11 @@ module Sources
|
|||||||
end
|
end
|
||||||
|
|
||||||
should "get the image url" do
|
should "get the image url" do
|
||||||
assert_match(%r!https://orig\d+.deviantart.net/7b5b/f/2017/160/c/5/test_post_please_ignore_by_noizave-dbc3a48.png!, @site.image_url)
|
assert_match(%r!https://origin-orig.deviantart.net/7b5b/f/2017/160/c/5/test_post_please_ignore_by_noizave-dbc3a48.png!, @site.image_url)
|
||||||
end
|
end
|
||||||
|
|
||||||
should "get the profile" do
|
should "get the profile" do
|
||||||
assert_equal("https://noizave.deviantart.com/", @site.profile_url)
|
assert_equal("https://noizave.deviantart.com", @site.profile_url)
|
||||||
end
|
end
|
||||||
|
|
||||||
should "get the artist name" do
|
should "get the artist name" do
|
||||||
@@ -37,7 +44,7 @@ module Sources
|
|||||||
|
|
||||||
should "get the artist commentary" do
|
should "get the artist commentary" do
|
||||||
title = "test post please ignore"
|
title = "test post please ignore"
|
||||||
desc = "<div align=\"center\"><span>blah blah<br><div align=\"left\">\n<a class=\"external\" href=\"https://www.deviantart.com/users/outgoing?http://www.google.com\">test link</a><br>\n</div></span></div>\n<br><h1>lol</h1>\n<br><br><b>blah</b> <i>blah</i> <u>blah</u> <strike>blah</strike><br>herp derp<br><br><blockquote>this is a quote</blockquote>\n<ol>\n<li>one</li>\n<li>two</li>\n<li>three</li>\n</ol>\n<ul>\n<li>one</li>\n<li>two</li>\n<li>three</li>\n</ul>\n<img src=\"https://e.deviantart.net/emoticons/h/heart.gif\" alt=\"Heart\" style=\"width: 15px; height: 13px;\" data-embed-type=\"emoticon\" data-embed-id=\"357\"> "
|
desc = "<div align=\"center\"><span>blah blah<br /><div align=\"left\"><a class=\"external\" href=\"https://www.deviantart.com/users/outgoing?http://www.google.com\">test link</a><br /></div></span></div><br /><h1>lol</h1><br /><br /><b>blah</b> <i>blah</i> <u>blah</u> <strike>blah</strike><br />herp derp<br /><br /><blockquote>this is a quote</blockquote><ol><li>one</li><li>two</li><li>three</li></ol><ul><li>one</li><li>two</li><li>three</li></ul><img src=\"https://e.deviantart.net/emoticons/h/heart.gif\" alt=\"Heart\" style=\"width: 15px; height: 13px;\" data-embed-type=\"emoticon\" data-embed-id=\"357\"> "
|
||||||
|
|
||||||
assert_equal(title, @site.artist_commentary_title)
|
assert_equal(title, @site.artist_commentary_title)
|
||||||
assert_equal(desc, @site.artist_commentary_desc)
|
assert_equal(desc, @site.artist_commentary_desc)
|
||||||
@@ -79,7 +86,7 @@ module Sources
|
|||||||
end
|
end
|
||||||
|
|
||||||
should "get the image url" do
|
should "get the image url" do
|
||||||
assert_match(%r!https://orig\d+\.deviantart\.net/cb25/f/2017/160/1/9/hidden_work_by_noizave-dbc3r29\.png!, @site.image_url)
|
assert_match(%r!https://origin-orig\.deviantart\.net/cb25/f/2017/160/1/9/hidden_work_by_noizave-dbc3r29\.png!, @site.image_url)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user