From 902cd0bbdf90337cc26935270e08ae922ea05d2e Mon Sep 17 00:00:00 2001
From: evazion
Date: Thu, 17 Nov 2022 03:32:08 -0600
Subject: [PATCH] sources: add 4chan.org upload support.

Add upload support for 4chan.org. You can upload either individual posts
or entire threads at a time.
---
 app/logical/danbooru/url.rb                |   2 +-
 app/logical/source/extractor.rb            |   3 +-
 app/logical/source/extractor/four_chan.rb  | 161 +++++++++++++++++++++
 app/logical/source/url.rb                  |   1 +
 app/logical/source/url/four_chan.rb        |  60 ++++++++
 app/logical/source/url/null.rb             |   2 -
 test/functional/uploads_controller_test.rb |   2 +
 test/unit/sources/four_chan_test.rb        |  73 ++++++++++
 8 files changed, 300 insertions(+), 4 deletions(-)
 create mode 100644 app/logical/source/extractor/four_chan.rb
 create mode 100644 app/logical/source/url/four_chan.rb
 create mode 100644 test/unit/sources/four_chan_test.rb

diff --git a/app/logical/danbooru/url.rb b/app/logical/danbooru/url.rb
index d719598e1..7dce194a0 100644
--- a/app/logical/danbooru/url.rb
+++ b/app/logical/danbooru/url.rb
@@ -34,7 +34,7 @@ module Danbooru
     # @return [Addressable:URI] The parsed and normalized URL.
     attr_reader :url
 
-    delegate :domain, :host, :port, :site, :path, :query, :password, to: :url
+    delegate :domain, :host, :port, :site, :path, :query, :fragment, :password, to: :url
 
     # Parse a string into a URL, or raise an exception if the string is not a valid HTTP or HTTPS URL.
     #
diff --git a/app/logical/source/extractor.rb b/app/logical/source/extractor.rb
index b77a2ba61..1d8cd3efa 100644
--- a/app/logical/source/extractor.rb
+++ b/app/logical/source/extractor.rb
@@ -58,6 +58,7 @@ module Source
       Source::Extractor::Reddit,
       Source::Extractor::Bilibili,
       Source::Extractor::Rule34DotUs,
+      Source::Extractor::FourChan,
     ]
 
     # Should return true if the extractor is configured correctly. Return false
@@ -299,7 +300,7 @@ module Source
           :dtext_title => dtext_artist_commentary_title,
           :dtext_description => dtext_artist_commentary_desc
         },
-        :api_response => api_response.to_h
+        :api_response => api_response
       }
     end
 
diff --git a/app/logical/source/extractor/four_chan.rb b/app/logical/source/extractor/four_chan.rb
new file mode 100644
index 000000000..b8bd5362c
--- /dev/null
+++ b/app/logical/source/extractor/four_chan.rb
@@ -0,0 +1,161 @@
+# frozen_string_literal: true
+
+# Source extractor for 4chan.org.
+#
+# Accepts post URLs, thread URLs, and direct image URLs. Post data is read from the thread's JSON API endpoint. For an
+# image URL, the thread is taken from the referer, since the image URL alone doesn't say which thread it belongs to.
+#
+# TODO:
+#
+# * If given only an image URL, scrape the board catalog to find which thread it belongs to.
+# * If the thread is expired, scrape data from archive sites.
+# * If the image or video is a soundpost, remux the file to include the sound (https://github.com/rcc11/4chan-sounds-player#creating-sound-images)
+#
+# @see https://github.com/4chan/4chan-API
+# @see https://github.com/4chan/4chan-API/blob/master/pages/Threads.md
+module Source
+  class Extractor
+    class FourChan < Source::Extractor
+      # @return [Boolean] True if the URL was parsed as a 4chan URL.
+      def match?
+        Source::URL::FourChan === parsed_url
+      end
+
+      # @return [Array<String>] The image URLs to upload: one for a post or image URL, every image for a thread URL.
+      def image_urls
+        if parsed_url.full_image_url.present?
+          [parsed_url.full_image_url]
+        # If this is a post URL, or an image URL for which we can find the post
+        elsif post_image_url(post).present?
+          [post_image_url(post)]
+        # If this is a thread URL
+        elsif thread_id.present? && post_id_from_url.nil?
+          api_response.filter_map { |api_post| post_image_url(api_post) }
+        # If this is a thumbnail image URL and we can't get the full image URL from the API
+        elsif parsed_url.image_url?
+          [url.to_s]
+        else
+          []
+        end
+      end
+
+      # @return [String, nil] The URL of the post if it's known, else the URL of the thread, else nil.
+      def page_url
+        if board.present? && thread_id.present? && post_id.present?
+          "https://#{domain}/#{board}/thread/#{thread_id}#p#{post_id}"
+        elsif board.present? && thread_id.present?
+          "https://#{domain}/#{board}/thread/#{thread_id}"
+        end
+      end
+
+      # @return [String, nil] The post header (name, tripcode, timestamp, and post number), or nil if there's no post.
+      def artist_commentary_title
+        if post.present?
+          "#{post["name"]}#{post["trip"]} #{post["now"]} No.#{post["no"]}"
+        end
+      end
+
+      # @return [String, nil] The post's file info line followed by the comment, as HTML, or nil if there's neither.
+      def artist_commentary_desc
+        commentary = String.new
+
+        if post["filename"].present?
+          # This is HTML (it goes through DText.from_html), so link the file with <a> and end the line with <br>, not "\n".
+          commentary << "<a href=\"#{post_image_url(post)}\">#{post["filename"]}#{post["ext"]}</a> (#{post["fsize"].to_fs(:human_size)}, #{post["w"]}x#{post["h"]})<br>"
+        end
+
+        if post["com"].present?
+          commentary << post["com"]
+        end
+
+        commentary.presence
+      end
+
+      # @return [String] The commentary as DText, with relative quote links rewritten to absolute URLs.
+      def dtext_artist_commentary_desc
+        DText.from_html(artist_commentary_desc) do |element|
+          if element.name == "a" && element["class"] == "quotelink"
+            # A quote link without a href is left alone instead of raising.
+            href = element["href"].to_s
+
+            # `>>1234` (a post in the same thread)
+            if href.starts_with?("#")
+              element["href"] = "https://#{domain}/#{board}/thread/#{thread_id}#{href}"
+            # `>>5678` (a post in another thread)
+            elsif href.starts_with?("/")
+              element["href"] = "https://#{domain}#{href}"
+            end
+          end
+        end
+      end
+
+      # @return [String, nil] The boards.4chan.org or boards.4channel.org host of the URL or the referer, if any.
+      def domain
+        if parsed_url.domain in "4chan.org" | "4channel.org"
+          "boards.#{parsed_url.domain}"
+        elsif parsed_referer&.domain in "4chan.org" | "4channel.org"
+          "boards.#{parsed_referer.domain}"
+        end
+      end
+
+      # @return [String, nil] The board name from the URL, else from the referer.
+      def board
+        parsed_url.board || parsed_referer&.board
+      end
+
+      # @return [Integer, nil] The thread ID from the URL, else from the referer.
+      def thread_id
+        parsed_url.thread_id || parsed_referer&.thread_id
+      end
+
+      # @return [Integer, nil] The numeric image filename from the URL, else from the referer.
+      def image_id
+        parsed_url.image_id || parsed_referer&.image_id
+      end
+
+      # @return [Integer, nil] The post ID from the URL if it has one, else the ID of the post found through the API.
+      def post_id
+        post_id_from_url || post_id_from_api
+      end
+
+      # @return [Integer, nil] The post ID from the URL, else from the referer.
+      def post_id_from_url
+        parsed_url.post_id || parsed_referer&.post_id
+      end
+
+      # @return [Object, nil] The `no` field of the post found through the API, or nil if no post was found.
+      def post_id_from_api
+        post["no"]
+      end
+
+      # @param api_post [Hash] A post from the API response.
+      # @return [String, nil] The URL of the post's full-size image, or nil if the post has no `tim` or `ext` field.
+      def post_image_url(api_post)
+        "https://i.4cdn.org/#{board}/#{api_post["tim"]}#{api_post["ext"]}" if api_post["tim"].present? && api_post["ext"].present?
+      end
+
+      # @return [Hash] The first post in the thread matching the image ID or the post ID, or {} if there isn't one.
+      memoize def post
+        api_response.find do |api_post|
+          (image_id.present? && api_post["tim"] == image_id) || api_post["no"] == post_id_from_url
+        end.to_h
+      end
+
+      # @return [Array<Hash>] The posts in the thread, or [] if the thread is unknown or the API request failed.
+      memoize def api_response
+        return [] unless api_url.present?
+
+        response = http.cache(1.minute).get(api_url)
+        return [] unless response.status == 200
+
+        # `to_a` keeps callers safe if the response has no `posts` key.
+        response.parse["posts"].to_a
+      end
+
+      # @return [String, nil] The JSON API URL of the thread, or nil if the board or thread is unknown.
+      def api_url
+        "https://a.4cdn.org/#{board}/thread/#{thread_id}.json" if board.present? && thread_id.present?
+      end
+    end
+  end
+end
diff --git a/app/logical/source/url.rb b/app/logical/source/url.rb
index 9cc22d98b..7826b04e6 100644
--- a/app/logical/source/url.rb
+++ b/app/logical/source/url.rb
@@ -61,6 +61,7 @@ module Source
       Source::URL::Furaffinity,
       Source::URL::Bilibili,
       Source::URL::Rule34DotUs,
+      Source::URL::FourChan,
     ]
 
     # Parse a URL into a subclass of Source::URL, or raise an exception if the URL is not a valid HTTP or HTTPS URL.
diff --git a/app/logical/source/url/four_chan.rb b/app/logical/source/url/four_chan.rb
new file mode 100644
index 000000000..9f84b8bb4
--- /dev/null
+++ b/app/logical/source/url/four_chan.rb
@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+
+# Parses 4chan thread, post, and image URLs.
+#
+# @see https://github.com/4chan/4chan-API
+# @see https://github.com/4chan/4chan-API/blob/master/pages/User_images_and_static_content.md
+class Source::URL::FourChan < Source::URL
+  attr_reader :board, :thread_id, :post_id, :image_type, :image_id, :full_image_url
+
+  # @return [Boolean] True if the URL's domain is one of the 4chan domains.
+  def self.match?(url)
+    url.domain.in?(%w[4cdn.org 4chan.org 4channel.org])
+  end
+
+  # @return [String] The name of the site.
+  def site_name
+    "4chan"
+  end
+
+  # Set the board, thread, post, and image attributes from the URL. Attributes that aren't in the URL are left nil.
+  def parse
+    case [subdomain, domain, *path_segments]
+
+    # https://boards.4channel.org/vt/thread/37293562#p37294005
+    # https://boards.4channel.org/vt/thread/37293562/some-thread-title (the slug after the thread ID is ignored)
+    in _, ("4channel.org" | "4chan.org"), board, "thread", /\A[0-9]+\z/ => thread_id, *_slug
+      @board = board
+      @thread_id = thread_id.to_i
+      @post_id = fragment.to_s[/\Ap([0-9]+)\z/, 1]&.to_i
+
+    # https://i.4cdn.org/vt/1668729957824814.webm
+    # https://i.4cdn.org/vt/1668729957824814s.jpg
+    in "i", "4cdn.org", board, /\A([0-9]+)(s?)\./
+      @board = board
+      @image_id = $1.to_i
+      @image_type = $2 == "s" ? :preview : :original
+      @full_image_url = url.to_s if @image_type == :original
+
+    else
+      nil
+    end
+  end
+
+  # @return [Boolean] True if the URL is on the image host.
+  def image_url?
+    host == "i.4cdn.org"
+  end
+
+  # @return [String, nil] The URL itself if it's a thread or post URL, else nil.
+  def page_url
+    if thread_id.present?
+      url.to_s
+    end
+  end
+
+  # @return [String, nil] The JSON API URL of the thread, or nil if the URL doesn't have a board and thread.
+  def api_url
+    "https://a.4cdn.org/#{board}/thread/#{thread_id}.json" if board.present? && thread_id.present?
+  end
+end
diff --git a/app/logical/source/url/null.rb b/app/logical/source/url/null.rb
index f65775c3f..d1b61a373 100644
--- a/app/logical/source/url/null.rb
+++ b/app/logical/source/url/null.rb
@@ -9,8 +9,6 @@ class Source::URL::Null < Source::URL
   def site_name
     case [subdomain, domain]
 
-    in _, "4cdn.org"
-      "4chan"
     in _, "myportfolio.com"
       "Adobe Portfolio"
     in _, "adobe.com" if host == "portfolio.adobe.com"
diff --git a/test/functional/uploads_controller_test.rb b/test/functional/uploads_controller_test.rb
index 003d312da..7c27de676 100644
--- a/test/functional/uploads_controller_test.rb
+++ b/test/functional/uploads_controller_test.rb
@@ -518,6 +518,8 @@ class UploadsControllerTest < ActionDispatch::IntegrationTest
       should_upload_successfully("https://rule34.xxx/index.php?page=post&s=view&id=6961597")
       should_upload_successfully("https://rule34.us/index.php?r=posts/view&id=6204967")
 
+      should_upload_successfully("https://boards.4channel.org/vt/thread/1#p1")
+
       should_upload_successfully("http://lohas.nicoseiga.jp/o/910aecf08e542285862954017f8a33a8c32a8aec/1433298801/4937663")
       should_upload_successfully("http://seiga.nicovideo.jp/seiga/im4937663")
       should_upload_successfully("https://seiga.nicovideo.jp/image/source/9146749")
diff --git a/test/unit/sources/four_chan_test.rb b/test/unit/sources/four_chan_test.rb
new file mode 100644
index 000000000..b3d54b26d
--- /dev/null
+++ b/test/unit/sources/four_chan_test.rb
@@ -0,0 +1,73 @@
+require "test_helper"
+
+module Sources
+  class FourChanTest < ActiveSupport::TestCase
+    context "A 4chan source extractor" do
+      context "A 4chan direct image url without a referer" do
+        strategy_should_work(
+          "https://i.4cdn.org/vt/1611919211191.jpg",
+          image_urls: ["https://i.4cdn.org/vt/1611919211191.jpg"],
+          page_url: nil,
+          artist_commentary_title: nil,
+          artist_commentary_desc: nil,
+          dtext_artist_commentary_desc: "",
+          download_size: 145_602,
+        )
+      end
+
+      context "A 4chan direct image url with a referer" do
+        strategy_should_work(
+          "https://i.4cdn.org/vt/1611919211191.jpg",
+          referer: "https://boards.4channel.org/vt/thread/1",
+          image_urls: ["https://i.4cdn.org/vt/1611919211191.jpg"],
+          page_url: "https://boards.4channel.org/vt/thread/1#p1",
+          artist_commentary_title: "Anonymous 01/29/21(Fri)06:20:11 No.1",
+          dtext_artist_commentary_desc: <<~EOS.chomp,
+            "vt.jpg":[https://i.4cdn.org/vt/1611919211191.jpg] (142 KB, 767x677)
+            This board is for the discussion of Virtual YouTubers ("VTubers"), including those streaming in Japanese, English, and other languages. VTubers don't necessarily need to be on Youtube of course, they can be on Twitch, Niconico, Bilibili, or any other platform.
+
+            Please note that discussion should pertain to a VTuber's streams and content, and should not pertain to their real lives, relationships, or appearances ("IRL").
+          EOS
+          download_size: 145_602,
+        )
+      end
+
+      context "A 4chan thumbnail image url without a referer" do
+        strategy_should_work(
+          "https://i.4cdn.org/vt/1611919211191s.jpg",
+          image_urls: ["https://i.4cdn.org/vt/1611919211191s.jpg"],
+          page_url: nil,
+          artist_commentary_title: nil,
+          artist_commentary_desc: nil,
+          dtext_artist_commentary_desc: "",
+          download_size: 7430,
+        )
+      end
+
+      context "A 4chan post url" do
+        strategy_should_work(
+          "https://boards.4channel.org/vt/thread/1#p1",
+          image_urls: ["https://i.4cdn.org/vt/1611919211191.jpg"],
+          page_url: "https://boards.4channel.org/vt/thread/1#p1",
+          artist_commentary_title: "Anonymous 01/29/21(Fri)06:20:11 No.1",
+          dtext_artist_commentary_desc: <<~EOS.chomp,
+            "vt.jpg":[https://i.4cdn.org/vt/1611919211191.jpg] (142 KB, 767x677)
+            This board is for the discussion of Virtual YouTubers ("VTubers"), including those streaming in Japanese, English, and other languages. VTubers don't necessarily need to be on Youtube of course, they can be on Twitch, Niconico, Bilibili, or any other platform.
+
+            Please note that discussion should pertain to a VTuber's streams and content, and should not pertain to their real lives, relationships, or appearances ("IRL").
+          EOS
+          download_size: 145_602,
+        )
+      end
+
+      context "A 4chan thread url" do
+        strategy_should_work(
+          "https://boards.4channel.org/vt/thread/1",
+          image_urls: ["https://i.4cdn.org/vt/1611919211191.jpg"],
+          page_url: "https://boards.4channel.org/vt/thread/1",
+          download_size: 145_602,
+        )
+      end
+    end
+  end
+end