From 8f87fb90d95d813303e3987235f6eddaddde7c85 Mon Sep 17 00:00:00 2001 From: evazion <noizave@gmail.com> Date: Sun, 2 Sep 2018 22:28:16 -0500 Subject: [PATCH 1/2] deviantart: handle urls without deviation ids (fix #3864) Some older URL formats don't contain the deviation id: * http://fc08.deviantart.net/files/f/2007/120/c/9/Cool_Like_Me_by_47ness.jpg * http://pre06.deviantart.net/8497/th/pre/f/2009/173/c/c/cc9686111dcffffffb5fcfaf0cf069fb.jpg In these cases we can't make the API call. Fix failures due to not being able to do API calls in this situation. Also fix canonical_url to use the image_url when it contains the deviation id, or the page_url when it doesn't. Finally, fix page_url to use the url from the API instead of the raw url given by the user, so that it's in a consistent form for canonical_url. --- app/logical/sources/strategies/deviant_art.rb | 100 +++++++++++++----- test/unit/sources/deviantart_test.rb | 76 ++++++++++++- 2 files changed, 146 insertions(+), 30 deletions(-) diff --git a/app/logical/sources/strategies/deviant_art.rb b/app/logical/sources/strategies/deviant_art.rb index 6965be8aa..8324a94e1 100644 --- a/app/logical/sources/strategies/deviant_art.rb +++ b/app/logical/sources/strategies/deviant_art.rb @@ -1,13 +1,47 @@ +# Asset URLs: +# +# * http://orig12.deviantart.net/9b69/f/2017/023/7/c/illustration___tokyo_encount_oei__by_melisaongmiqin-dawi58s.png +# * http://pre15.deviantart.net/81de/th/pre/f/2015/063/5/f/inha_by_inhaestudios-d8kfzm5.jpg +# * http://th00.deviantart.net/fs71/PRE/f/2014/065/3/b/goruto_by_xyelkiltrox-d797tit.png +# +# * http://th04.deviantart.net/fs70/300W/f/2009/364/4/d/Alphes_Mimic___Rika_by_Juriesute.png +# * http://fc02.deviantart.net/fs48/f/2009/186/2/c/Animation_by_epe_tohri.swf +# * http://fc08.deviantart.net/files/f/2007/120/c/9/Cool_Like_Me_by_47ness.jpg +# +# * http://fc08.deviantart.net/images3/i/2004/088/8/f/Blackrose_for_MuzicFreq.jpg +# * http://img04.deviantart.net/720b/i/2003/37/9/6/princess_peach.jpg +# +# * 
http://prnt00.deviantart.net/9b74/b/2016/101/4/468a9d89f52a835d4f6f1c8caca0dfb2-pnjfbh.jpg +# * http://fc00.deviantart.net/fs71/f/2013/234/d/8/d84e05f26f0695b1153e9dab3a962f16-d6j8jl9.jpg +# * http://th04.deviantart.net/fs71/PRE/f/2013/337/3/5/35081351f62b432f84eaeddeb4693caf-d6wlrqs.jpg +# +# * http://fc09.deviantart.net/fs22/o/2009/197/3/7/37ac79eaeef9fb32e6ae998e9a77d8dd.jpg +# * http://pre06.deviantart.net/8497/th/pre/f/2009/173/c/c/cc9686111dcffffffb5fcfaf0cf069fb.jpg +# +# Page URLs: +# +# * https://www.deviantart.com/noizave/art/test-post-please-ignore-685436408 +# * https://noizave.deviantart.com/art/test-post-please-ignore-685436408 +# * https://www.deviantart.com/deviation/685436408 +# * https://fav.me/dbc3a48 + module Sources module Strategies class DeviantArt < Base ASSET_SUBDOMAINS = %r{(?:fc|th|pre|img|orig|origin-orig)\d*}i - ATTRIBUTED_ASSET = %r{\Ahttps?://#{ASSET_SUBDOMAINS}\.deviantart\.net/.+/[a-z0-9_]*_by_[a-z0-9_]+-d([a-z0-9]+)\.}i - ASSET = %r{\Ahttps?://#{ASSET_SUBDOMAINS}\.deviantart\.net/.+/[a-f0-9]+-d([a-z0-9]+)\.}i - PATH_ART = %r{\Ahttps?://www\.deviantart\.com/([^/]+)/art/} - RESERVED_SUBDOMAINS = %r{\Ahttps?://(?:#{ASSET_SUBDOMAINS}|www)\.} - SUBDOMAIN_ART = %r{\Ahttps?://(.+?)\.deviantart\.com(.*)} - PROFILE = %r{\Ahttps?://www\.deviantart\.com/([^/]+)/?\z} + RESERVED_SUBDOMAINS = %r{\Ahttps?://(?:#{ASSET_SUBDOMAINS}|www)\.}i + + TITLE = %r{(?<title>[a-z0-9_-]+?)}i + ARTIST = %r{(?<artist>[a-z0-9_-]+?)}i + DEVIATION_ID = %r{(?<deviation_id>[0-9]+)}i + + ASSET = %r{\Ahttps?://#{ASSET_SUBDOMAINS}\.deviantart\.net/.+/#{TITLE}(?:_by_#{ARTIST}(?:-d(?<base36_deviation_id>\w+))?)?\.}i + + PATH_ART = %r{\Ahttps?://www\.deviantart\.com/#{ARTIST}/art/#{TITLE}-#{DEVIATION_ID}\z}i + SUBDOMAIN_ART = %r{\Ahttps?://#{ARTIST}\.deviantart\.com/art/#{TITLE}-#{DEVIATION_ID}\z}i + + PATH_PROFILE = %r{\Ahttps?://www\.deviantart\.com/#{ARTIST}/?\z}i + SUBDOMAIN_PROFILE = %r{\Ahttps?://#{ARTIST}\.deviantart\.com/?\z}i def self.match?(*urls) urls.compact.any? 
{ |x| x.match?(/^https?:\/\/(?:.+?\.)?deviantart\.(?:com|net)/) } @@ -17,8 +51,16 @@ module Sources "Deviant Art" end + def canonical_url + if self.class.deviation_id_from_url(image_url).present? || page_url.blank? + image_url + else + page_url + end + end + def image_urls - # work is private or deleted, use image url as given by user. + # work is private, deleted, or the url didn't contain a deviation id; use image url as given by user. if api_deviation.blank? [url] # work is downloadable @@ -39,29 +81,12 @@ module Sources end def page_url - [url, referer_url].each do |x| - if x =~ ATTRIBUTED_ASSET - return "http://fav.me/d#{$1}" - end - - if x =~ ASSET - return "http://fav.me/d#{$1}" - end - - if x =~ PATH_ART - return x - end - - if x !~ RESERVED_SUBDOMAINS && x =~ SUBDOMAIN_ART - return "http://www.deviantart.com/#{$1}#{$2}" - end - end - - return super + return "" if api_deviation.blank? + api_deviation[:url] end def profile_url - if url =~ PROFILE + if url =~ PATH_PROFILE return url end @@ -85,7 +110,7 @@ module Sources end def normalized_for_artist_finder? - url =~ PROFILE + url =~ PATH_PROFILE end def normalizable_for_artist_finder? @@ -138,12 +163,29 @@ module Sources public + def self.deviation_id_from_url(url) + if url =~ ASSET + $~[:base36_deviation_id].try(:to_i, 36) + elsif url =~ PATH_ART || (url !~ RESERVED_SUBDOMAINS && url =~ SUBDOMAIN_ART) + $~[:deviation_id].to_i + else + nil + end + end + + def deviation_id + self.class.deviation_id_from_url(url) || self.class.deviation_id_from_url(referer_url) + end + def page + return nil if deviation_id.blank? + deviation_url = "https://www.deviantart.com/deviation/#{deviation_id}" + options = Danbooru.config.httparty_options.deep_merge( format: :plain, headers: { "Accept-Encoding" => "gzip" } ) - resp = HTTParty.get(page_url, **options) + resp = HTTParty.get(deviation_url, **options) if resp.success? 
body = Zlib.gunzip(resp.body) diff --git a/test/unit/sources/deviantart_test.rb b/test/unit/sources/deviantart_test.rb index 63b3b52e7..68fd9fb55 100644 --- a/test/unit/sources/deviantart_test.rb +++ b/test/unit/sources/deviantart_test.rb @@ -14,6 +14,7 @@ module Sources should "work" do assert_equal(["http://origin-orig.deviantart.net/d533/f/2014/004/8/d/holiday_elincia_by_aeror404-d70rm0s.jpg"], @site.image_urls) + assert_equal(@site.image_url, @site.canonical_url) end end @@ -21,13 +22,17 @@ module Sources should "work" do @site = Sources::Strategies.find("https://pre00.deviantart.net/423b/th/pre/i/2017/281/e/0/mindflayer_girl01_by_nickbeja-dbpxdt8.png") assert_equal("https://pre00.deviantart.net/423b/th/pre/i/2017/281/e/0/mindflayer_girl01_by_nickbeja-dbpxdt8.png", @site.image_url) + assert_equal(@site.image_url, @site.canonical_url) + assert_nothing_raised { @site.to_h } end end context "The source for a download-disabled DeviantArt artwork page" do should "get the image url" do @site = Sources::Strategies.find("https://noizave.deviantart.com/art/test-no-download-697415967") + assert_equal(["https://img00.deviantart.net/56ee/i/2017/219/2/3/test__no_download_by_noizave-dbj81lr.jpg"], @site.image_urls) + assert_equal(@site.image_url, @site.canonical_url) end end @@ -38,6 +43,7 @@ module Sources assert_equal("hideyoshi", @site.artist_name) assert_equal("https://www.deviantart.com/hideyoshi", @site.profile_url) assert_equal("http://origin-orig.deviantart.net/9e1f/f/2016/265/3/5/legend_of_galactic_heroes_by_hideyoshi-daihpha.jpg", @site.image_url) + assert_equal(@site.image_url, @site.canonical_url) end end @@ -45,9 +51,11 @@ module Sources should "work" do @site = Sources::Strategies.find("http://origin-orig.deviantart.net/7b5b/f/2017/160/c/5/test_post_please_ignore_by_noizave-dbc3a48.png") + assert_equal(@site.url, @site.image_url) + assert_equal("https://www.deviantart.com/noizave/art/test-post-please-ignore-685436408", @site.page_url) + 
assert_equal(@site.image_url, @site.canonical_url) assert_equal("noizave", @site.artist_name) assert_equal("https://www.deviantart.com/noizave", @site.profile_url) - assert_equal("http://origin-orig.deviantart.net/7b5b/f/2017/160/c/5/test_post_please_ignore_by_noizave-dbc3a48.png", @site.image_url) assert_equal(%w[bar baz foo], @site.tags.map(&:first)) assert_nothing_raised { @site.to_h } end @@ -67,6 +75,72 @@ module Sources end end + context "The source for a *.deviantart.net/*/:title_by_:artist.jpg url" do + setup do + @url = "http://fc08.deviantart.net/files/f/2007/120/c/9/cool_like_me_by_47ness.jpg" + @ref = "https://47ness.deviantart.com/art/Cool-Like-Me-54339311" + end + + context "without a referer" do + should "work" do + @site = Sources::Strategies.find(@url) + + assert_equal(@site.url, @site.image_url) + assert_equal("47ness", @site.artist_name) + assert_equal("https://www.deviantart.com/47ness", @site.profile_url) + assert_equal("", @site.page_url) + assert_equal(@site.image_url, @site.canonical_url) + assert_nothing_raised { @site.to_h } + end + end + + context "with a referer" do + should "work" do + @site = Sources::Strategies.find(@url, @ref) + + assert_equal("http://origin-orig.deviantart.net/a418/f/2007/120/c/9/cool_like_me_by_47ness.jpg", @site.image_url) + assert_equal("47ness", @site.artist_name) + assert_equal("https://www.deviantart.com/47ness", @site.profile_url) + assert_equal("https://www.deviantart.com/47ness/art/Cool-Like-Me-54339311", @site.page_url) + assert_equal(@site.page_url, @site.canonical_url) + assert_nothing_raised { @site.to_h } + end + end + end + + context "The source for a *.deviantart.net/*/:hash.jpg url" do + setup do + @url = "http://pre06.deviantart.net/8497/th/pre/f/2009/173/c/c/cc9686111dcffffffb5fcfaf0cf069fb.jpg" + @ref = "https://www.deviantart.com/edsfox/art/Silverhawks-Quicksilver-126872896" + end + + context "without a referer" do + should "work" do + @site = Sources::Strategies.find(@url) + + 
assert_equal(@url, @site.image_url) + assert_equal("", @site.artist_name) + assert_equal("", @site.profile_url) + assert_equal("", @site.page_url) + assert_equal(@site.image_url, @site.canonical_url) + assert_nothing_raised { @site.to_h } + end + end + + context "with a referer" do + should "work" do + @site = Sources::Strategies.find(@url, @ref) + + assert_equal("http://origin-orig.deviantart.net/66c1/f/2009/173/c/c/cc9686111dcffffffb5fcfaf0cf069fb.jpg", @site.image_url) + assert_equal("edsfox", @site.artist_name) + assert_equal("https://www.deviantart.com/edsfox", @site.profile_url) + assert_equal("https://www.deviantart.com/edsfox/art/Silverhawks-Quicksilver-126872896", @site.page_url) + assert_equal(@site.page_url, @site.canonical_url) + assert_nothing_raised { @site.to_h } + end + end + end + context "The source for an DeviantArt artwork page" do setup do @site = Sources::Strategies.find("http://noizave.deviantart.com/art/test-post-please-ignore-685436408") From e37844303d12388f304d8d32760deaa596b5be41 Mon Sep 17 00:00:00 2001 From: evazion <noizave@gmail.com> Date: Sun, 2 Sep 2018 22:53:24 -0500 Subject: [PATCH 2/2] deviantart: take artist name from url when unavailable from API. In some cases we can't get the artist name from the API, either because we can't do the API call because the url doesn't contain a deviation id, or because the work is deleted: * http://fc08.deviantart.net/files/f/2007/120/c/9/cool_like_me_by_47ness.jpg (work: http://fav.me/dwcohb) * https://pre00.deviantart.net/423b/th/pre/i/2017/281/e/0/mindflayer_girl01_by_nickbeja-dbpxdt8.png (work: http://fav.me/dbpxd58) Switch to taking the artist name from the url (when present) to deal with these cases. Fixes the artist finder and the artist url normalizer to work in this situation. 
--- app/logical/sources/strategies/deviant_art.rb | 37 +++++++++++++------ test/unit/sources/deviantart_test.rb | 13 +++++++ 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/app/logical/sources/strategies/deviant_art.rb b/app/logical/sources/strategies/deviant_art.rb index 8324a94e1..f593950b9 100644 --- a/app/logical/sources/strategies/deviant_art.rb +++ b/app/logical/sources/strategies/deviant_art.rb @@ -86,19 +86,20 @@ module Sources end def profile_url - if url =~ PATH_PROFILE - return url - end - - if artist_name.blank? - return nil - end - - return "https://www.deviantart.com/#{artist_name}" + return "" if artist_name.blank? + "https://www.deviantart.com/#{artist_name.downcase}" end + # Prefer the name from the url because the api metadata won't be present when + # the input url doesn't contain a deviation id, or the deviation is private or deleted. def artist_name - api_metadata.dig(:author, :username).try(&:downcase) + if artist_name_from_url.present? + artist_name_from_url + elsif api_metadata.present? + api_metadata.dig(:author, :username) + else + "" + end end def artist_commentary_title @@ -114,7 +115,7 @@ module Sources end def normalizable_for_artist_finder? - url =~ PATH_ART || url =~ SUBDOMAIN_ART + normalize_for_artist_finder.present? end def normalize_for_artist_finder @@ -173,10 +174,24 @@ module Sources end end + def self.artist_name_from_url(url) + if url =~ ASSET || url =~ PATH_ART || url =~ PATH_PROFILE + $~[:artist] + elsif url !~ RESERVED_SUBDOMAINS && (url =~ SUBDOMAIN_ART || url =~ SUBDOMAIN_PROFILE) + $~[:artist] + else + nil + end + end + def deviation_id self.class.deviation_id_from_url(url) || self.class.deviation_id_from_url(referer_url) end + def artist_name_from_url + self.class.artist_name_from_url(url) || self.class.artist_name_from_url(referer_url) + end + def page return nil if deviation_id.blank? 
deviation_url = "https://www.deviantart.com/deviation/#{deviation_id}" diff --git a/test/unit/sources/deviantart_test.rb b/test/unit/sources/deviantart_test.rb index 68fd9fb55..4bf3960c1 100644 --- a/test/unit/sources/deviantart_test.rb +++ b/test/unit/sources/deviantart_test.rb @@ -15,14 +15,21 @@ module Sources should "work" do assert_equal(["http://origin-orig.deviantart.net/d533/f/2014/004/8/d/holiday_elincia_by_aeror404-d70rm0s.jpg"], @site.image_urls) assert_equal(@site.image_url, @site.canonical_url) + assert_equal("aeror404", @site.artist_name) + assert_equal("https://www.deviantart.com/aeror404", @site.profile_url) end end context "The source for a deleted DeviantArt image URL" do should "work" do @site = Sources::Strategies.find("https://pre00.deviantart.net/423b/th/pre/i/2017/281/e/0/mindflayer_girl01_by_nickbeja-dbpxdt8.png") + @artist = FactoryBot.create(:artist, name: "nickbeja", url_string: "https://nickbeja.deviantart.com") + assert_equal("https://pre00.deviantart.net/423b/th/pre/i/2017/281/e/0/mindflayer_girl01_by_nickbeja-dbpxdt8.png", @site.image_url) assert_equal(@site.image_url, @site.canonical_url) + assert_equal("nickbeja", @site.artist_name) + assert_equal("https://www.deviantart.com/nickbeja", @site.profile_url) + assert_equal([@artist], @site.artists) assert_nothing_raised { @site.to_h } end end @@ -79,6 +86,7 @@ module Sources setup do @url = "http://fc08.deviantart.net/files/f/2007/120/c/9/cool_like_me_by_47ness.jpg" @ref = "https://47ness.deviantart.com/art/Cool-Like-Me-54339311" + @artist = FactoryBot.create(:artist, name: "47ness", url_string: "https://www.deviantart.com/47ness") end context "without a referer" do @@ -90,6 +98,7 @@ module Sources assert_equal("https://www.deviantart.com/47ness", @site.profile_url) assert_equal("", @site.page_url) assert_equal(@site.image_url, @site.canonical_url) + assert_equal([@artist], @site.artists) assert_nothing_raised { @site.to_h } end end @@ -103,6 +112,7 @@ module Sources 
assert_equal("https://www.deviantart.com/47ness", @site.profile_url) assert_equal("https://www.deviantart.com/47ness/art/Cool-Like-Me-54339311", @site.page_url) assert_equal(@site.page_url, @site.canonical_url) + assert_equal([@artist], @site.artists) assert_nothing_raised { @site.to_h } end end @@ -112,6 +122,7 @@ module Sources setup do @url = "http://pre06.deviantart.net/8497/th/pre/f/2009/173/c/c/cc9686111dcffffffb5fcfaf0cf069fb.jpg" @ref = "https://www.deviantart.com/edsfox/art/Silverhawks-Quicksilver-126872896" + @artist = FactoryBot.create(:artist, name: "edsfox", url_string: "https://edsfox.deviantart.com") end context "without a referer" do @@ -123,6 +134,7 @@ module Sources assert_equal("", @site.profile_url) assert_equal("", @site.page_url) assert_equal(@site.image_url, @site.canonical_url) + assert_equal([], @site.artists) assert_nothing_raised { @site.to_h } end end @@ -136,6 +148,7 @@ module Sources assert_equal("https://www.deviantart.com/edsfox", @site.profile_url) assert_equal("https://www.deviantart.com/edsfox/art/Silverhawks-Quicksilver-126872896", @site.page_url) assert_equal(@site.page_url, @site.canonical_url) + assert_equal([@artist], @site.artists) assert_nothing_raised { @site.to_h } end end