diff --git a/app/logical/source/extractor.rb b/app/logical/source/extractor.rb index fec50f6f0..f758404a0 100644 --- a/app/logical/source/extractor.rb +++ b/app/logical/source/extractor.rb @@ -27,6 +27,7 @@ module Source DOWNLOAD_TIMEOUT = 60 attr_reader :url, :referer_url, :parsed_url, :parsed_referer + delegate :site_name, to: :parsed_url SUBCLASSES = [ @@ -50,6 +51,7 @@ module Source Source::Extractor::Plurk, Source::Extractor::Tinami, Source::Extractor::Fantia, + Source::Extractor::Booth, ] # Should return true if the extractor is configured correctly. Return false diff --git a/app/logical/source/extractor/booth.rb b/app/logical/source/extractor/booth.rb new file mode 100644 index 000000000..70c337e4b --- /dev/null +++ b/app/logical/source/extractor/booth.rb @@ -0,0 +1,83 @@ +# frozen_string_literal: true + +# @see Source::URL::Booth +class Source::Extractor + class Booth < Source::Extractor + def match? + Source::URL::Booth === parsed_url + end + + def image_urls + if parsed_url.image_url? + if parsed_url.full_image_url? + [parsed_url.to_s] + else + [find_right_extension(parsed_url)] + end + else + page&.css(".market-item-detail-item-image")&.pluck("data-origin").to_a.compact + end + end + + def profile_url + if page.present? + page.at(".summary [data-product-list*='shop_index']")&.[]("href")&.chomp("/") + else + parsed_url.profile_url || parsed_referer&.profile_url + end + end + + def artist_name + return nil unless profile_url.present? 
+ Source::URL.parse(profile_url)&.username + end + + def display_name + page&.at(".summary .user-avatar")&.[]("alt") + end + + def other_names + [display_name].compact + end + + def artist_commentary_title + page&.at(".summary .u-tpg-title1")&.text + end + + def artist_commentary_desc + page&.at(".autolink")&.to_html + end + + def dtext_artist_commentary_desc + DText.from_html(artist_commentary_desc) + end + + def tags + page&.css(".item-info-detail [data-product-list*='tag_category_search']").to_a.map do |element| + [element.text.gsub(/ x .*/, ""), element["href"]] + end + end + + def page_url + parsed_url.page_url || parsed_referer&.page_url + end + + def page + return nil if parsed_url.page_url.blank? + + resp = http.cache(1.minute).cookies(adult: "t").get(page_url) + return nil if resp.code != 200 + + resp.parse + end + memoize :page + + def find_right_extension(parsed_url) + extensions = %w[png jpg jpeg] + candidates = extensions.map { |ext| parsed_url.full_image_url_for(ext) } + + chosen_url = candidates.find { |candidate| http_exists?(candidate) } + chosen_url || parsed_url.to_s + end + end +end diff --git a/app/logical/source/url/booth.rb b/app/logical/source/url/booth.rb index a363b696c..d8d272bd0 100644 --- a/app/logical/source/url/booth.rb +++ b/app/logical/source/url/booth.rb @@ -1,15 +1,13 @@ # frozen_string_literal: true -# Unhandled: -# -# https://booth.pximg.net/c/128x128/users/3193929/icon_image/5be9eff4-1d9e-4a79-b097-33c1cd4ad314_base_resized.jpg (profile icon) -# https://s2.booth.pm/8bb9e4e3-d171-4027-88df-84480480f79d/3d70de06-8e7c-444e-b8eb-a8a95bf20638.png (profile cover image) +# Unhandled +# https://booth.pm/downloadables/1376468 (from https://booth.pm/en/items/2425521, requires pixiv login to download) module Source class URL::Booth < Source::URL RESERVED_SUBDOMAINS = ["www", "s", "s2", "asset", "accounts", nil] - attr_reader :work_id, :user_id, :username + attr_reader :work_id, :user_id, :user_uuid, :username def self.match?(url) 
url.domain == "booth.pm" || url.host == "booth.pximg.net" @@ -21,6 +19,7 @@ module Source # https://booth.pximg.net/8bb9e4e3-d171-4027-88df-84480480f79d/i/2864768/00cdfef0-e8d5-454b-8554-4885a7e4827d_base_resized.jpg (full) # https://booth.pximg.net/c/300x300_a2_g5/8bb9e4e3-d171-4027-88df-84480480f79d/i/2864768/00cdfef0-e8d5-454b-8554-4885a7e4827d_base_resized.jpg (thumb) # https://booth.pximg.net/c/72x72_a2_g5/8bb9e4e3-d171-4027-88df-84480480f79d/i/2864768/00cdfef0-e8d5-454b-8554-4885a7e4827d_base_resized.jpg (thumb) + # https://booth.pximg.net/8bb9e4e3-d171-4027-88df-84480480f79d/i/2864768/00cdfef0-e8d5-454b-8554-4885a7e4827d.jpeg (full) # # https://s2.booth.pm/b242a7bd-0747-48c4-891d-9e8552edd5d7/i/3746752/52dbee27-7ad2-4048-9c1d-827eee36625c_base_resized.jpg (sample) # https://booth.pximg.net/b242a7bd-0747-48c4-891d-9e8552edd5d7/i/3746752/52dbee27-7ad2-4048-9c1d-827eee36625c.jpg (full) @@ -32,6 +31,20 @@ module Source @work_id = work_id @file = file + # profile icons + # https://booth.pximg.net/c/128x128/users/3193929/icon_image/5be9eff4-1d9e-4a79-b097-33c1cd4ad314_base_resized.jpg (sample) + # https://booth.pximg.net/users/3193929/icon_image/5be9eff4-1d9e-4a79-b097-33c1cd4ad314.png (full) + in _, _, *, "users", user_id, "icon_image", file + @user_id = user_id + @file = file + + # profile cover images + # https://s2.booth.pm/8bb9e4e3-d171-4027-88df-84480480f79d/3d70de06-8e7c-444e-b8eb-a8a95bf20638_base_resized.jpg (sample) + # https://s2.booth.pm/8bb9e4e3-d171-4027-88df-84480480f79d/3d70de06-8e7c-444e-b8eb-a8a95bf20638.png (full) + in _, _, *, /\h{8}-\h{4}-\h{4}-\h{4}-\h{12}/i => user_uuid, file + @user_uuid = user_uuid + @file = file + # https://booth.pm/en/items/2864768 # https://booth.pm/ja/items/2864768 in _, "booth.pm", _, "items", work_id @@ -53,7 +66,25 @@ module Source end def image_url? - url.host == "booth.pximg.net" + url.host.in?(["booth.pximg.net", "s2.booth.pm"]) + end + + def full_image_url? + image_url? 
&& @file.exclude?("_base_resized") + end + + def full_image_url_for(extension) + return unless @file.present? + full_file = @file.gsub(/_base_resized\.\w+$/, ".#{extension}") + if user_uuid + if work_id + "https://#{host}/#{user_uuid}/i/#{work_id}/#{full_file}" + else + "https://#{host}/#{user_uuid}/#{full_file}" + end + elsif user_id + "https://#{host}/users/#{user_id}/icon_image/#{full_file}" + end end def page_url diff --git a/app/models/artist_url.rb b/app/models/artist_url.rb index a89fbba89..b1301f633 100644 --- a/app/models/artist_url.rb +++ b/app/models/artist_url.rb @@ -95,8 +95,8 @@ class ArtistURL < ApplicationRecord def priority sites = %w[ Pixiv Twitter - ArtStation Baraag BCY Deviant\ Art Hentai\ Foundry Fantia Foundation Lofter Nico\ Seiga Nijie Pawoo Fanbox Pixiv\ Sketch Plurk Tinami Tumblr Weibo - Ask.fm Booth Facebook FC2 Gumroad Instagram Ko-fi Livedoor Mihuashi Mixi.jp Patreon Piapro.jp Picarto Privatter Sakura.ne.jp Stickam Skeb Twitch Youtube + ArtStation Baraag BCY Booth Deviant\ Art Hentai\ Foundry Fantia Foundation Lofter Nico\ Seiga Nijie Pawoo Fanbox Pixiv\ Sketch Plurk Tinami Tumblr Weibo + Ask.fm Facebook FC2 Gumroad Instagram Ko-fi Livedoor Mihuashi Mixi.jp Patreon Piapro.jp Picarto Privatter Sakura.ne.jp Stickam Skeb Twitch Youtube Amazon Circle.ms DLSite Doujinshi.org Erogamescape Mangaupdates Melonbooks Toranoana Wikipedia ] diff --git a/test/test_helper.rb b/test/test_helper.rb index 65dd62766..d96e278fe 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -26,6 +26,7 @@ class ActiveSupport::TestCase include DownloadTestHelper include IqdbTestHelper include UploadTestHelper + extend SourceTestHelper extend StripeTestHelper extend NormalizeAttributeHelper diff --git a/test/test_helpers/source_test_helper.rb b/test/test_helpers/source_test_helper.rb new file mode 100644 index 000000000..e938e4a0c --- /dev/null +++ b/test/test_helpers/source_test_helper.rb @@ -0,0 +1,93 @@ +module SourceTestHelper + # A helper method to 
+ automate all the checks needed to make sure that a strategy does not break. + # + # * If download_size is nil, it tests that the file is downloaded correctly, otherwise it also checks the filesize. + # * If deleted is true, it skips the downloading check, but it still tries everything else and makes sure nothing breaks. + # * Any passed kwargs parameter is tested against the strategy. + def strategy_should_work(url, referer: nil, download_size: nil, deleted: false, **methods_to_test) + context "a strategy for #{url}#{", referer: #{referer}" if referer.present?}".chomp do + strategy = Source::Extractor.find(url, referer) + + should "not raise anything" do + assert_nothing_raised { strategy.to_h } + end + + should "make sure that image_urls is an array of valid elements" do + assert((strategy.image_urls.instance_of? Array)) + assert_not(strategy.image_urls.include?(nil)) + end + + should_download_successfully(strategy, download_size) unless deleted + + # {profile_url: nil}[:profile_url].present? -> false + # Doing it this way instead we can check profile_url even if it's passed as a nil. + if methods_to_test.include? :profile_url + profile_url = methods_to_test.delete(:profile_url) + should_handle_artists_correctly(strategy, profile_url) + end + + tags = methods_to_test.delete(:tags) + should_validate_tags(strategy, tags) + + # check any method that is passed as kwargs, in order to hardcode as few things as possible + methods_to_test.each do |method_name, expected_value| + should "make sure that '#{method_name}' matches" do + if expected_value.instance_of? Regexp + assert_match(expected_value, strategy.try(method_name)) + elsif expected_value.nil? 
+ assert_nil(strategy.try(method_name)) + else + assert_equal(expected_value, strategy.try(method_name)) + end + end + end + end + end + + def should_download_successfully(strategy, download_size = nil) + should "download successfully" do + file = strategy.download_file!(strategy.image_urls.first) + if download_size.present? + assert_equal(download_size, file.size) + else + assert_not_nil(file.size) + end + end + end + + def should_handle_artists_correctly(strategy, profile_url) + if profile_url.present? + should "correctly match a strategy to an artist with the same profile url" do + assert_equal(profile_url, strategy.profile_url) + artist = FactoryBot.create(:artist, name: strategy.artist_name, url_string: profile_url) + assert_equal([artist], strategy.artists) + end + else + should "not incorrectly extract a profile url or artist data when there's none to be found" do + assert_nil(strategy.profile_url) + assert_nil(strategy.artist_name) + assert_equal([], strategy.other_names) + end + end + end + + def should_validate_tags(strategy, tags = nil) + should "make sure that tags return an array of arrays" do + assert((strategy.tags.instance_of? Array)) + if strategy.tags.present? + assert((strategy.tags.first.instance_of? Array)) + end + end + + return unless tags.present? + + should "make sure that tags match" do + if tags&.first.instance_of? Array + assert_equal(tags.sort, strategy.tags.sort) + elsif tags&.first.instance_of? 
String + assert_equal(tags.map(&:downcase).sort, strategy.tags.map(&:first).map(&:downcase).sort) + end + end + end + +end diff --git a/test/unit/sources/booth_test.rb b/test/unit/sources/booth_test.rb new file mode 100644 index 000000000..b99cb1e6a --- /dev/null +++ b/test/unit/sources/booth_test.rb @@ -0,0 +1,60 @@ +require "test_helper" + +module Sources + class BoothTest < ActiveSupport::TestCase + standard_url_images = %w[ + https://booth.pximg.net/a212cd73-75ab-482d-8fce-1ce2965e4d4f/i/3713604/ae0fdbcf-e4c5-4840-8d5c-43e18bddc93e.jpg + https://booth.pximg.net/a212cd73-75ab-482d-8fce-1ce2965e4d4f/i/3713604/d12bce50-a0c7-43f8-a4fb-5ee0ea6855a3.jpg + https://booth.pximg.net/a212cd73-75ab-482d-8fce-1ce2965e4d4f/i/3713604/f5332da3-4097-4d33-bbf6-a9b64c7671b3.jpg + ] + strategy_should_work( + "https://booth.pm/en/items/3713604", + image_urls: standard_url_images, + profile_url: "https://amedamacon.booth.pm", + page_url: "https://booth.pm/en/items/3713604", + artist_name: "amedamacon", + other_names: ["あめうさぎBOOTH"], + tags: [["抱き枕カバー", "https://booth.pm/en/browse/Pillow%20Cover?tags%5B%5D=%E6%8A%B1%E3%81%8D%E6%9E%95%E3%82%AB%E3%83%90%E3%83%BC"]], + artist_commentary_title: "フユちゃん抱き枕カバー", + dtext_artist_commentary_desc: /発送:6月上旬頃(BOOTH倉庫より発送)/ + ) + + strategy_should_work( + "https://booth.pximg.net/a212cd73-75ab-482d-8fce-1ce2965e4d4f/i/3713604/d12bce50-a0c7-43f8-a4fb-5ee0ea6855a3_base_resized.jpg", + image_urls: [standard_url_images.second], + profile_url: "https://amedamacon.booth.pm", + page_url: "https://booth.pm/en/items/3713604", + artist_name: "amedamacon", + other_names: ["あめうさぎBOOTH"], + tags: [["抱き枕カバー", "https://booth.pm/en/browse/Pillow%20Cover?tags%5B%5D=%E6%8A%B1%E3%81%8D%E6%9E%95%E3%82%AB%E3%83%90%E3%83%BC"]], + artist_commentary_title: "フユちゃん抱き枕カバー", + dtext_artist_commentary_desc: /発送:6月上旬頃(BOOTH倉庫より発送)/ + ) + + strategy_should_work( + "https://re-face.booth.pm/items/2423989", + image_urls: 
["https://booth.pximg.net/8bb9e4e3-d171-4027-88df-84480480f79d/i/2423989/a692d4f3-4371-4a86-a337-83fee82d46a4.png"], + profile_url: "https://re-face.booth.pm", + page_url: "https://booth.pm/en/items/2423989", + artist_name: "re-face", + other_names: ["Re:fAce/りふぇいす。"], + tags: ["original"], + artist_commentary_title: "RwithV vol.1 -アイドルはじめます!-", + dtext_artist_commentary_desc: /注文が殺到した際は、発送が遅れてしまう場合もございますので予めご了承ください。/ + ) + + strategy_should_work( + "https://s2.booth.pm/8bb9e4e3-d171-4027-88df-84480480f79d/3d70de06-8e7c-444e-b8eb-a8a95bf20638_base_resized.jpg", + image_urls: ["https://s2.booth.pm/8bb9e4e3-d171-4027-88df-84480480f79d/3d70de06-8e7c-444e-b8eb-a8a95bf20638.png"], + profile_url: nil + ) + + strategy_should_work( + "https://booth.pximg.net/c/128x128/users/3193929/icon_image/5be9eff4-1d9e-4a79-b097-33c1cd4ad314_base_resized.jpg", + image_urls: ["https://booth.pximg.net/users/3193929/icon_image/5be9eff4-1d9e-4a79-b097-33c1cd4ad314.png"], + profile_url: nil + ) + + strategy_should_work("https://booth.pm/en/items/2003079", deleted: true) + end +end