sources: factor out Source::URL::Plurk.
Also fix it so that for adult works, we get the images posted by the artist in the replies. Example: https://www.plurk.com/p/omc64y (nsfw).
This commit is contained in:
@@ -19,6 +19,7 @@ module Source
|
||||
class URL < Danbooru::URL
|
||||
SUBCLASSES = [
|
||||
Source::URL::Twitter,
|
||||
Source::URL::Plurk,
|
||||
Source::URL::Skeb,
|
||||
Source::URL::TwitPic,
|
||||
Source::URL::Foundation,
|
||||
|
||||
68
app/logical/source/url/plurk.rb
Normal file
68
app/logical/source/url/plurk.rb
Normal file
@@ -0,0 +1,68 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Notes
|
||||
#
|
||||
# * Posts can have up to 10 images.
|
||||
# * Artists commonly post extra images by replying to their own post.
|
||||
# * Adult posts are hidden for logged out users. The main images can be found by
|
||||
# scraping a <script> tag, but an API call is needed to get the images in the replies.
|
||||
#
|
||||
# Image URLs
|
||||
#
|
||||
# * https://images.plurk.com/5wj6WD0r6y4rLN0DL3sqag.jpg
|
||||
#
|
||||
# Thumbnail URLs
|
||||
#
|
||||
# * https://images.plurk.com/mx_5wj6WD0r6y4rLN0DL3sqag.jpg
|
||||
#
|
||||
# Page URLs
|
||||
#
|
||||
# * https://www.plurk.com/p/om6zv4 (non-adult, single image)
|
||||
# * https://www.plurk.com/p/okxzae (non-adult, multiple images, with replies)
|
||||
# * https://www.plurk.com/p/omc64y (adult, multiple images, with replies)
|
||||
# * https://www.plurk.com/m/p/omc64y
|
||||
#
|
||||
# Profile URLs
|
||||
#
|
||||
# * https://www.plurk.com/redeyehare
|
||||
# * https://www.plurk.com/m/redeyehare
|
||||
|
||||
# Recognizes plurk.com URLs and splits them into their parts: image/thumbnail
# URLs, post page URLs and profile URLs, each in desktop and mobile (/m/) form.
class Source::URL::Plurk < Source::URL
  attr_reader :username, :work_id

  # @param url [#domain] the URL being dispatched to a Source::URL subclass
  # @return [Boolean] true when the URL's domain is plurk.com
  def self.match?(url)
    url.domain == "plurk.com"
  end

  # Sets @filename/@file_ext, @work_id or @username depending on the shape of
  # the path. `domain` and `path_segments` are expected to come from the parent
  # class.
  #
  # The branches are order-sensitive: the single-segment image pattern has to
  # be tried before the single-segment profile pattern, which accepts any value.
  def parse
    case [domain, *path_segments]

    # https://images.plurk.com/5wj6WD0r6y4rLN0DL3sqag.jpg
    # https://images.plurk.com/mx_5wj6WD0r6y4rLN0DL3sqag.jpg
    # https://images.plurk.com/yfnumBJqqoQt50Em6xKwf.png
    #
    # Image IDs are usually 22 characters long but not always (the last example
    # has 21), so the length is not fixed here. Requiring exactly 22 made the
    # shorter ones fall through to the profile branch below and be stored as a
    # username.
    in "plurk.com", /\A(?:mx_)?(?<filename>\w+)\.(?<file_ext>\w+)\z/
      # A regexp pattern is matched with ===, which sets $~ for this method.
      @filename, @file_ext = $~[:filename], $~[:file_ext]

    # https://www.plurk.com/p/om6zv4
    in "plurk.com", "p", work_id
      @work_id = work_id

    # https://www.plurk.com/m/p/okxzae
    in "plurk.com", "m", "p", work_id
      @work_id = work_id

    # https://www.plurk.com/redeyehare
    in "plurk.com", username
      @username = username

    # https://www.plurk.com/m/redeyehare
    in "plurk.com", "m", username
      @username = username

    else
      # Unrecognized path shape: leave everything unset rather than letting
      # `case/in` raise NoMatchingPatternError.
    end
  end

  # @return [Boolean] true when the URL points at the image host
  def image_url?
    host == "images.plurk.com"
  end
end
|
||||
@@ -1,24 +1,10 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Image URLs
|
||||
#
|
||||
# * https://images.plurk.com/5wj6WD0r6y4rLN0DL3sqag.jpg
|
||||
#
|
||||
# Page URLs
|
||||
#
|
||||
# * https://www.plurk.com/p/om6zv4
|
||||
#
|
||||
# Profile URLs
|
||||
#
|
||||
# * https://www.plurk.com/redeyehare
|
||||
|
||||
# @see Source::URL::Plurk
|
||||
module Sources
|
||||
module Strategies
|
||||
class Plurk < Base
|
||||
BASE_URL = %r{\Ahttps?://(?:www\.)?plurk\.com}i
|
||||
PAGE_URL = %r{#{BASE_URL}(?:/m)?/p/(?<illust_id>\w+)}i
|
||||
PROFILE_URL = %r{#{BASE_URL}/\w+}i
|
||||
IMAGE_URL = %r{https?://images\.plurk\.com/\w+\.\w+}i
|
||||
extend Memoist
|
||||
|
||||
def domains
|
||||
["plurk.com"]
|
||||
@@ -29,26 +15,23 @@ module Sources
|
||||
end
|
||||
|
||||
def image_urls
|
||||
return [url] if url =~ IMAGE_URL
|
||||
images = page&.search(".bigplurk .content a img, .response.highlight_owner .content a img").to_a.map { |img| img["alt"] }
|
||||
# the above returns both the "main" images, and any other art the artist might have posted in the replies
|
||||
|
||||
if images.empty?
|
||||
# in case of adult posts, we fall back to the internal api, which doesn't show replies
|
||||
images = images_from_internal_api
|
||||
if parsed_url.image_url?
|
||||
[url]
|
||||
elsif page_json["porn"]
|
||||
# in case of adult posts, we get the main images and the replies separately
|
||||
images_from_script_tag + images_from_replies
|
||||
else
|
||||
images_from_page
|
||||
end
|
||||
|
||||
images
|
||||
end
|
||||
|
||||
# The canonical page URL for the plurk, or nil when no work ID is known.
def page_url
  "https://plurk.com/p/#{illust_id}" if illust_id.present?
end
|
||||
|
||||
def illust_id
|
||||
urls.map { |u| u[PAGE_URL, :illust_id] }.compact.first
|
||||
parsed_url.work_id || parsed_referer&.work_id
|
||||
end
|
||||
|
||||
def page
|
||||
@@ -60,10 +43,38 @@ module Sources
|
||||
response.parse
|
||||
end
|
||||
|
||||
def images_from_internal_api
|
||||
internal_api = page&.search("body script")&.select {|s| s.text =~ /plurk =/ }.to_a.compact.first&.text
|
||||
return [] unless internal_api.present?
|
||||
internal_api.scan(/(#{IMAGE_URL})/).flatten.compact.uniq.filter { |img| img !~ %r{/mx_\w+}i }
|
||||
# Reads the image URLs out of the page HTML, taking the `alt` attribute of
# every <img> in the main post and in the replies highlighted as the owner's.
#
# On non-adult works this yields the main images plus the ones the artist
# posted in the replies; on adult works it yields only the main images.
def images_from_page
  selector = ".bigplurk .content a img, .response.highlight_owner .content a img"
  image_nodes = page&.search(selector).to_a
  image_nodes.map { |img| img["alt"] }
end
|
||||
|
||||
# Extracts the URLs found in the raw content of the main post, as embedded in
# the page's <script> tag. Used for adult works; images posted in the replies
# are not part of this content.
def images_from_script_tag
  raw_content = page_json["content_raw"]
  URI.extract(raw_content)
end
|
||||
|
||||
# Image URLs the artist posted in the replies, fetched through the replies API.
# Used for adult works. Only replies whose user ID equals the artist's are
# considered, and only links that parse as image URLs are kept.
def images_from_replies
  own_replies = api_replies["responses"].to_a.select do |reply|
    reply["user_id"].to_i == artist_id.to_i
  end

  links = own_replies.flat_map { |reply| URI.extract(reply["content_raw"]) }
  links.select { |link| Source::URL.parse(link)&.image_url? }.uniq
end
|
||||
|
||||
# The `plurk = {...};` object embedded in one of the page's <script> tags,
# parsed as JSON. Returns an empty hash when no such script is found.
def page_json
  script_texts = page&.search("body script").to_a.map(&:text)
  assignment = script_texts.find { |text| text.match?(/plurk =/) }.to_s

  # `new Date("...")` is not valid JSON, so keep only its argument.
  payload = assignment.strip.delete_prefix("plurk = ").delete_suffix(";").gsub(/new Date\((.*?)\)/, '\1')

  payload.blank? ? {} : JSON.parse(payload)
end
|
||||
|
||||
# Fetches the replies to the plurk from the Responses/get endpoint.
# The endpoint takes the plurk ID as an integer, so the work ID from the URL is
# decoded as base 36. Returns an empty hash when there is no work ID or the
# request does not come back with status 200.
def api_replies
  return {} unless illust_id.present?

  params = { plurk_id: illust_id.to_i(36), from_response_id: 0 }
  reply = http.cache(1.minute).post("https://www.plurk.com/Responses/get", form: params)

  reply.status == 200 ? reply.parse : {}
end
|
||||
|
||||
def tag_name
|
||||
@@ -74,6 +85,10 @@ module Sources
|
||||
page&.at(".bigplurk .user a")&.text
|
||||
end
|
||||
|
||||
# The user ID taken from the first link on the page carrying a `data-uid`
# attribute, as an integer. Evaluates to 0 when the page or the link is missing.
def artist_id
  uid_link = page&.at("a[data-uid]")
  uid_link&.attr("data-uid").to_i
end
|
||||
|
||||
def profile_url
|
||||
return nil if artist_name.blank?
|
||||
"https://www.plurk.com/#{tag_name}"
|
||||
@@ -94,6 +109,8 @@ module Sources
|
||||
# The normalized source is simply the page URL (nil when it can't be determined).
def normalize_for_source = page_url
|
||||
|
||||
memoize :page, :page_json, :api_replies
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -342,6 +342,9 @@ class UploadsControllerTest < ActionDispatch::IntegrationTest
|
||||
should_upload_successfully("https://skeb.jp/@kokuzou593/works/45")
|
||||
should_upload_successfully("https://skeb.jp/@LambOic029/works/146")
|
||||
should_upload_successfully("https://skeb.imgix.net/uploads/origins/307941e9-dbe0-4e4b-93d4-94accdaff9a0?bg=%23fff&auto=format&w=800&s=e0ddfb1fa0d9f23797b338598aae78fa")
|
||||
|
||||
should_upload_successfully("https://www.plurk.com/p/omc64y")
|
||||
should_upload_successfully("https://www.plurk.com/p/om6zv4")
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -37,16 +37,56 @@ module Sources
|
||||
end
|
||||
|
||||
should "get the image urls for an adult post" do
|
||||
images = ["https://images.plurk.com/yfnumBJqqoQt50Em6xKwf.png",
|
||||
"https://images.plurk.com/5NaqqO3Yi6bQW1wKXq1Dc2.png",
|
||||
"https://images.plurk.com/3HzNcbMhCozHPk5YY8j9fI.png",
|
||||
"https://images.plurk.com/2e0duwn8BpSW9MGuUvbrim.png",
|
||||
"https://images.plurk.com/1OuiMDp82hYPEUn64CWFFB.png",
|
||||
"https://images.plurk.com/3F3KzZOabeMYkgTeseEZ0r.png",
|
||||
"https://images.plurk.com/7onKKTAIXkY4pASszrBys8.png",
|
||||
"https://images.plurk.com/6aotmjLGbtMLiI3slN7ODv.png",
|
||||
"https://images.plurk.com/6pzn7jE2nkj9EV7H25L0x1.png",
|
||||
"https://images.plurk.com/yA8egjDuhy0eNG9yxRj1d.png",]
|
||||
images = %w[
|
||||
https://images.plurk.com/yfnumBJqqoQt50Em6xKwf.png
|
||||
https://images.plurk.com/5NaqqO3Yi6bQW1wKXq1Dc2.png
|
||||
https://images.plurk.com/3HzNcbMhCozHPk5YY8j9fI.png
|
||||
https://images.plurk.com/2e0duwn8BpSW9MGuUvbrim.png
|
||||
https://images.plurk.com/1OuiMDp82hYPEUn64CWFFB.png
|
||||
https://images.plurk.com/3F3KzZOabeMYkgTeseEZ0r.png
|
||||
https://images.plurk.com/7onKKTAIXkY4pASszrBys8.png
|
||||
https://images.plurk.com/6aotmjLGbtMLiI3slN7ODv.png
|
||||
https://images.plurk.com/6pzn7jE2nkj9EV7H25L0x1.png
|
||||
https://images.plurk.com/yA8egjDuhy0eNG9yxRj1d.png
|
||||
https://images.plurk.com/55tbTkH3cKTTYkZe9fu1Pv.png
|
||||
https://images.plurk.com/5z64F9uUipJ0fMJWXNGHTw.png
|
||||
https://images.plurk.com/6cwurMe6jymEu6INzmyg74.png
|
||||
https://images.plurk.com/7zyTReS8UVyCFYtU1DJRYt.png
|
||||
https://images.plurk.com/1PiRWGzaXozU15Scx1ZC4T.png
|
||||
https://images.plurk.com/2xzB5qacdLVV75GhaFifaY.png
|
||||
https://images.plurk.com/7uQENFmFNtWSKF0AAQKffr.png
|
||||
https://images.plurk.com/7ChGLokdAezvbEjPCLUr8f.png
|
||||
https://images.plurk.com/3AzjLxynamDGxNDTq4wt5x.png
|
||||
https://images.plurk.com/3SYjvKc3IBbz6ZXWeG1pY8.png
|
||||
https://images.plurk.com/7bk2kYN2fEVV0kiT5qoiuO.png
|
||||
https://images.plurk.com/6mgCwWjSqOfi0BtSg6THcZ.png
|
||||
https://images.plurk.com/3BwtMvr6S13gr96r5TLIFd.png
|
||||
https://images.plurk.com/22CPzkRM71frDR5eRMPthC.png
|
||||
https://images.plurk.com/1IFScoxA7m0FXNu6XirBwa.jpg
|
||||
https://images.plurk.com/5v1ZXQxbS7ocV4BybwbCSs.jpg
|
||||
https://images.plurk.com/4n1og7pg4KP3wRYSKpFzF7.png
|
||||
https://images.plurk.com/5gK1PyPTrVYoeZBr10lEYu.png
|
||||
https://images.plurk.com/3m8YZS3D9vaAH8Lw1LDTix.png
|
||||
https://images.plurk.com/3oy7joPrEFm0Wlo7NplXOl.png
|
||||
https://images.plurk.com/2IBA93ghmCJCJT72mQyLUK.png
|
||||
https://images.plurk.com/16jqEhVqtuLJwnRjpIDRCr.png
|
||||
https://images.plurk.com/7cKzaSigAvKc6DKNxeGmnH.png
|
||||
https://images.plurk.com/ypfkOMsC24hIPGSEWjJ8A.png
|
||||
https://images.plurk.com/5qW11yr06e9u3t5Zt9Jxmm.png
|
||||
https://images.plurk.com/4H5st1xsFDSFgLd7gNXgD8.png
|
||||
https://images.plurk.com/4nf49mWygwQyrYriZ453Qx.png
|
||||
https://images.plurk.com/2Y0TXcYZkni94j7yxxosV9.png
|
||||
https://images.plurk.com/5ih71C9XNJDq88wzKbBdNp.png
|
||||
https://images.plurk.com/UmoZjSHx0Y4NYa3mgKffU.png
|
||||
https://images.plurk.com/4IHGG5mQNw95vqClFEBoOM.png
|
||||
https://images.plurk.com/5J3bRPjGBZV8fDxo7cTwGs.png
|
||||
https://images.plurk.com/3uAjR5oBfe4d6MFThFQ0Gt.png
|
||||
https://images.plurk.com/3fFJ8RN3HkmfcuUdn7OpnQ.png
|
||||
https://images.plurk.com/sxkaWnhmDrCSsUEg6Kn9Y.png
|
||||
https://images.plurk.com/1f3W8JnHlwpt3OlT4ZJhiu.gif
|
||||
https://images.plurk.com/5lNGKqPCf6opXu21f5DdbU.gif
|
||||
]
|
||||
|
||||
assert_equal(images, @post4.image_urls)
|
||||
end
|
||||
|
||||
|
||||
Reference in New Issue
Block a user