From 8f87fb90d95d813303e3987235f6eddaddde7c85 Mon Sep 17 00:00:00 2001 From: evazion Date: Sun, 2 Sep 2018 22:28:16 -0500 Subject: [PATCH] deviantart: handle urls without deviation ids (fix #3864) Some older URL formats don't contain the deviation id: * http://fc08.deviantart.net/files/f/2007/120/c/9/Cool_Like_Me_by_47ness.jpg * http://pre06.deviantart.net/8497/th/pre/f/2009/173/c/c/cc9686111dcffffffb5fcfaf0cf069fb.jpg In these cases we can't make the API call. Fix failures due to not being able to do API calls in this situation. Also fix canonical_url to use the image_url when it contains the deviation id, or the page_url when it doesn't. Finally, fix page_url to use the url from the API instead of the raw url given by the user, so that it's in a consistent form for canonical_url. --- app/logical/sources/strategies/deviant_art.rb | 100 +++++++++++++----- test/unit/sources/deviantart_test.rb | 76 ++++++++++++- 2 files changed, 146 insertions(+), 30 deletions(-) diff --git a/app/logical/sources/strategies/deviant_art.rb b/app/logical/sources/strategies/deviant_art.rb index 6965be8aa..8324a94e1 100644 --- a/app/logical/sources/strategies/deviant_art.rb +++ b/app/logical/sources/strategies/deviant_art.rb @@ -1,13 +1,47 @@ +# Asset URLs: +# +# * http://orig12.deviantart.net/9b69/f/2017/023/7/c/illustration___tokyo_encount_oei__by_melisaongmiqin-dawi58s.png +# * http://pre15.deviantart.net/81de/th/pre/f/2015/063/5/f/inha_by_inhaestudios-d8kfzm5.jpg +# * http://th00.deviantart.net/fs71/PRE/f/2014/065/3/b/goruto_by_xyelkiltrox-d797tit.png +# +# * http://th04.deviantart.net/fs70/300W/f/2009/364/4/d/Alphes_Mimic___Rika_by_Juriesute.png +# * http://fc02.deviantart.net/fs48/f/2009/186/2/c/Animation_by_epe_tohri.swf +# * http://fc08.deviantart.net/files/f/2007/120/c/9/Cool_Like_Me_by_47ness.jpg +# +# * http://fc08.deviantart.net/images3/i/2004/088/8/f/Blackrose_for_MuzicFreq.jpg +# * http://img04.deviantart.net/720b/i/2003/37/9/6/princess_peach.jpg +# +# * 
http://prnt00.deviantart.net/9b74/b/2016/101/4/468a9d89f52a835d4f6f1c8caca0dfb2-pnjfbh.jpg +# * http://fc00.deviantart.net/fs71/f/2013/234/d/8/d84e05f26f0695b1153e9dab3a962f16-d6j8jl9.jpg +# * http://th04.deviantart.net/fs71/PRE/f/2013/337/3/5/35081351f62b432f84eaeddeb4693caf-d6wlrqs.jpg +# +# * http://fc09.deviantart.net/fs22/o/2009/197/3/7/37ac79eaeef9fb32e6ae998e9a77d8dd.jpg +# * http://pre06.deviantart.net/8497/th/pre/f/2009/173/c/c/cc9686111dcffffffb5fcfaf0cf069fb.jpg +# +# Page URLs: +# +# * https://www.deviantart.com/noizave/art/test-post-please-ignore-685436408 +# * https://noizave.deviantart.com/art/test-post-please-ignore-685436408 +# * https://www.deviantart.com/deviation/685436408 +# * https://fav.me/dbc3a48 + module Sources module Strategies class DeviantArt < Base ASSET_SUBDOMAINS = %r{(?:fc|th|pre|img|orig|origin-orig)\d*}i - ATTRIBUTED_ASSET = %r{\Ahttps?://#{ASSET_SUBDOMAINS}\.deviantart\.net/.+/[a-z0-9_]*_by_[a-z0-9_]+-d([a-z0-9]+)\.}i - ASSET = %r{\Ahttps?://#{ASSET_SUBDOMAINS}\.deviantart\.net/.+/[a-f0-9]+-d([a-z0-9]+)\.}i - PATH_ART = %r{\Ahttps?://www\.deviantart\.com/([^/]+)/art/} - RESERVED_SUBDOMAINS = %r{\Ahttps?://(?:#{ASSET_SUBDOMAINS}|www)\.} - SUBDOMAIN_ART = %r{\Ahttps?://(.+?)\.deviantart\.com(.*)} - PROFILE = %r{\Ahttps?://www\.deviantart\.com/([^/]+)/?\z} + RESERVED_SUBDOMAINS = %r{\Ahttps?://(?:#{ASSET_SUBDOMAINS}|www)\.}i + + TITLE = %r{(?<title>[a-z0-9_-]+?)}i + ARTIST = %r{(?<artist>[a-z0-9_-]+?)}i + DEVIATION_ID = %r{(?<deviation_id>[0-9]+)}i + + ASSET = %r{\Ahttps?://#{ASSET_SUBDOMAINS}\.deviantart\.net/.+/#{TITLE}(?:_by_#{ARTIST}(?:-d(?<base36_deviation_id>\w+))?)?\.}i + + PATH_ART = %r{\Ahttps?://www\.deviantart\.com/#{ARTIST}/art/#{TITLE}-#{DEVIATION_ID}\z}i + SUBDOMAIN_ART = %r{\Ahttps?://#{ARTIST}\.deviantart\.com/art/#{TITLE}-#{DEVIATION_ID}\z}i + + PATH_PROFILE = %r{\Ahttps?://www\.deviantart\.com/#{ARTIST}/?\z}i + SUBDOMAIN_PROFILE = %r{\Ahttps?://#{ARTIST}\.deviantart\.com/?\z}i def self.match?(*urls) urls.compact.any? 
{ |x| x.match?(/^https?:\/\/(?:.+?\.)?deviantart\.(?:com|net)/) } @@ -17,8 +51,16 @@ module Sources "Deviant Art" end + def canonical_url + if self.class.deviation_id_from_url(image_url).present? || page_url.blank? + image_url + else + page_url + end + end + def image_urls - # work is private or deleted, use image url as given by user. + # work is private, deleted, or the url didn't contain a deviation id; use image url as given by user. if api_deviation.blank? [url] # work is downloadable @@ -39,29 +81,12 @@ module Sources end def page_url - [url, referer_url].each do |x| - if x =~ ATTRIBUTED_ASSET - return "http://fav.me/d#{$1}" - end - - if x =~ ASSET - return "http://fav.me/d#{$1}" - end - - if x =~ PATH_ART - return x - end - - if x !~ RESERVED_SUBDOMAINS && x =~ SUBDOMAIN_ART - return "http://www.deviantart.com/#{$1}#{$2}" - end - end - - return super + return "" if api_deviation.blank? + api_deviation[:url] end def profile_url - if url =~ PROFILE + if url =~ PATH_PROFILE return url end @@ -85,7 +110,7 @@ module Sources end def normalized_for_artist_finder? - url =~ PROFILE + url =~ PATH_PROFILE end def normalizable_for_artist_finder? @@ -138,12 +163,29 @@ module Sources public + def self.deviation_id_from_url(url) + if url =~ ASSET + $~[:base36_deviation_id].try(:to_i, 36) + elsif url =~ PATH_ART || (url !~ RESERVED_SUBDOMAINS && url =~ SUBDOMAIN_ART) + $~[:deviation_id].to_i + else + nil + end + end + + def deviation_id + self.class.deviation_id_from_url(url) || self.class.deviation_id_from_url(referer_url) + end + def page + return nil if deviation_id.blank? + deviation_url = "https://www.deviantart.com/deviation/#{deviation_id}" + options = Danbooru.config.httparty_options.deep_merge( format: :plain, headers: { "Accept-Encoding" => "gzip" } ) - resp = HTTParty.get(page_url, **options) + resp = HTTParty.get(deviation_url, **options) if resp.success? 
body = Zlib.gunzip(resp.body) diff --git a/test/unit/sources/deviantart_test.rb b/test/unit/sources/deviantart_test.rb index 63b3b52e7..68fd9fb55 100644 --- a/test/unit/sources/deviantart_test.rb +++ b/test/unit/sources/deviantart_test.rb @@ -14,6 +14,7 @@ module Sources should "work" do assert_equal(["http://origin-orig.deviantart.net/d533/f/2014/004/8/d/holiday_elincia_by_aeror404-d70rm0s.jpg"], @site.image_urls) + assert_equal(@site.image_url, @site.canonical_url) end end @@ -21,13 +22,17 @@ module Sources should "work" do @site = Sources::Strategies.find("https://pre00.deviantart.net/423b/th/pre/i/2017/281/e/0/mindflayer_girl01_by_nickbeja-dbpxdt8.png") assert_equal("https://pre00.deviantart.net/423b/th/pre/i/2017/281/e/0/mindflayer_girl01_by_nickbeja-dbpxdt8.png", @site.image_url) + assert_equal(@site.image_url, @site.canonical_url) + assert_nothing_raised { @site.to_h } end end context "The source for a download-disabled DeviantArt artwork page" do should "get the image url" do @site = Sources::Strategies.find("https://noizave.deviantart.com/art/test-no-download-697415967") + assert_equal(["https://img00.deviantart.net/56ee/i/2017/219/2/3/test__no_download_by_noizave-dbj81lr.jpg"], @site.image_urls) + assert_equal(@site.image_url, @site.canonical_url) end end @@ -38,6 +43,7 @@ module Sources assert_equal("hideyoshi", @site.artist_name) assert_equal("https://www.deviantart.com/hideyoshi", @site.profile_url) assert_equal("http://origin-orig.deviantart.net/9e1f/f/2016/265/3/5/legend_of_galactic_heroes_by_hideyoshi-daihpha.jpg", @site.image_url) + assert_equal(@site.image_url, @site.canonical_url) end end @@ -45,9 +51,11 @@ module Sources should "work" do @site = Sources::Strategies.find("http://origin-orig.deviantart.net/7b5b/f/2017/160/c/5/test_post_please_ignore_by_noizave-dbc3a48.png") + assert_equal(@site.url, @site.image_url) + assert_equal("https://www.deviantart.com/noizave/art/test-post-please-ignore-685436408", @site.page_url) + 
assert_equal(@site.image_url, @site.canonical_url) assert_equal("noizave", @site.artist_name) assert_equal("https://www.deviantart.com/noizave", @site.profile_url) - assert_equal("http://origin-orig.deviantart.net/7b5b/f/2017/160/c/5/test_post_please_ignore_by_noizave-dbc3a48.png", @site.image_url) assert_equal(%w[bar baz foo], @site.tags.map(&:first)) assert_nothing_raised { @site.to_h } end @@ -67,6 +75,72 @@ module Sources end end + context "The source for a *.deviantart.net/*/:title_by_:artist.jpg url" do + setup do + @url = "http://fc08.deviantart.net/files/f/2007/120/c/9/cool_like_me_by_47ness.jpg" + @ref = "https://47ness.deviantart.com/art/Cool-Like-Me-54339311" + end + + context "without a referer" do + should "work" do + @site = Sources::Strategies.find(@url) + + assert_equal(@site.url, @site.image_url) + assert_equal("47ness", @site.artist_name) + assert_equal("https://www.deviantart.com/47ness", @site.profile_url) + assert_equal("", @site.page_url) + assert_equal(@site.image_url, @site.canonical_url) + assert_nothing_raised { @site.to_h } + end + end + + context "with a referer" do + should "work" do + @site = Sources::Strategies.find(@url, @ref) + + assert_equal("http://origin-orig.deviantart.net/a418/f/2007/120/c/9/cool_like_me_by_47ness.jpg", @site.image_url) + assert_equal("47ness", @site.artist_name) + assert_equal("https://www.deviantart.com/47ness", @site.profile_url) + assert_equal("https://www.deviantart.com/47ness/art/Cool-Like-Me-54339311", @site.page_url) + assert_equal(@site.page_url, @site.canonical_url) + assert_nothing_raised { @site.to_h } + end + end + end + + context "The source for a *.deviantart.net/*/:hash.jpg url" do + setup do + @url = "http://pre06.deviantart.net/8497/th/pre/f/2009/173/c/c/cc9686111dcffffffb5fcfaf0cf069fb.jpg" + @ref = "https://www.deviantart.com/edsfox/art/Silverhawks-Quicksilver-126872896" + end + + context "without a referer" do + should "work" do + @site = Sources::Strategies.find(@url) + + 
assert_equal(@url, @site.image_url) + assert_equal("", @site.artist_name) + assert_equal("", @site.profile_url) + assert_equal("", @site.page_url) + assert_equal(@site.image_url, @site.canonical_url) + assert_nothing_raised { @site.to_h } + end + end + + context "with a referer" do + should "work" do + @site = Sources::Strategies.find(@url, @ref) + + assert_equal("http://origin-orig.deviantart.net/66c1/f/2009/173/c/c/cc9686111dcffffffb5fcfaf0cf069fb.jpg", @site.image_url) + assert_equal("edsfox", @site.artist_name) + assert_equal("https://www.deviantart.com/edsfox", @site.profile_url) + assert_equal("https://www.deviantart.com/edsfox/art/Silverhawks-Quicksilver-126872896", @site.page_url) + assert_equal(@site.page_url, @site.canonical_url) + assert_nothing_raised { @site.to_h } + end + end + end + context "The source for an DeviantArt artwork page" do setup do @site = Sources::Strategies.find("http://noizave.deviantart.com/art/test-post-please-ignore-685436408")