sources: add 4chan.org upload support.

Add upload support for 4chan.org. You can upload either individual posts or entire threads at a time.
2022-11-17 03:32:08 -06:00
parent 94824fb171
commit 902cd0bbdf
8 changed files with 263 additions and 4 deletions
--- a/app/logical/danbooru/url.rb
+++ b/app/logical/danbooru/url.rb
@@ -34,7 +34,7 @@ module Danbooru
    # @return [Addressable:URI] The parsed and normalized URL.
    attr_reader :url

-    delegate :domain, :host, :port, :site, :path, :query, :password, to: :url
+    delegate :domain, :host, :port, :site, :path, :query, :fragment, :password, to: :url

    # Parse a string into a URL, or raise an exception if the string is not a valid HTTP or HTTPS URL.
    #
--- a/app/logical/source/extractor.rb
+++ b/app/logical/source/extractor.rb
@@ -58,6 +58,7 @@ module Source
      Source::Extractor::Reddit,
      Source::Extractor::Bilibili,
      Source::Extractor::Rule34DotUs,
+      Source::Extractor::FourChan,
    ]

    # Should return true if the extractor is configured correctly. Return false
@@ -299,7 +300,7 @@ module Source
          :dtext_title => dtext_artist_commentary_title,
          :dtext_description => dtext_artist_commentary_desc
        },
-        :api_response => api_response.to_h
+        :api_response => api_response
      }
    end

--- a/app/logical/source/extractor/four_chan.rb
+++ b/app/logical/source/extractor/four_chan.rb
@@ -0,0 +1,133 @@
+# frozen_string_literal: true
+
+# Source extractor for 4chan.org.
+#
+# TODO:
+#
+# * If given only an image URL, scrape the board catalog to find which thread it belongs to.
+# * If the thread is expired, scrape data from archive sites.
+# * If the image or video is a soundpost, remux the file to include the sound (https://github.com/rcc11/4chan-sounds-player#creating-sound-images)
+#
+# @see https://github.com/4chan/4chan-API
+# @see https://github.com/4chan/4chan-API/blob/master/pages/Threads.md
+module Source
+  class Extractor
+    class FourChan < Source::Extractor
+      def match?
+        Source::URL::FourChan === parsed_url
+      end
+
+      def image_urls
+        if parsed_url.full_image_url.present?
+          [parsed_url.full_image_url]
+        # If this is a post URL, or an image URL for which we can find the post
+        elsif post.present? && post["tim"].present? && post["ext"].present?
+          ["https://i.4cdn.org/#{board}/#{post["tim"]}#{post["ext"]}"]
+        # If this is a thread URL
+        elsif thread_id.present? && post_id_from_url.nil?
+          api_response.map do |post|
+            "https://i.4cdn.org/#{board}/#{post["tim"]}#{post["ext"]}" if post["tim"].present? && post["ext"].present?
+          end.compact
+        # If this is a thumbnail image URL and we can't get the full image URL from the API
+        elsif parsed_url.image_url?
+          [url.to_s]
+        else
+          []
+        end
+      end
+
+      def page_url
+        if board.present? && thread_id.present? && post_id.present?
+          "https://#{domain}/#{board}/thread/#{thread_id}#p#{post_id}"
+        elsif board.present? && thread_id.present?
+          "https://#{domain}/#{board}/thread/#{thread_id}"
+        end
+      end
+
+      def artist_commentary_title
+        if post.present?
+          "#{post["name"]}#{post["trip"]} #{post["now"]} No.#{post["no"]}"
+        end
+      end
+
+      def artist_commentary_desc
+        commentary = String.new
+
+        if post["filename"].present?
+          commentary << "<a href=\"https://i.4cdn.org/#{board}/#{post["tim"]}#{post["ext"]}\">#{post["filename"]}#{post["ext"]}</a> (#{post["fsize"].to_fs(:human_size)}, #{post["w"]}x#{post["h"]})\n"
+        end
+
+        if post["com"].present?
+          commentary << post["com"]
+        end
+
+        commentary.presence
+      end
+
+      def dtext_artist_commentary_desc
+        DText.from_html(artist_commentary_desc) do |element|
+          if element.name == "a" && element["class"] == "quotelink"
+            # `<a href="#p1234" class="quotelink">&gt;&gt;1234</a>`
+            if element["href"].starts_with?("#")
+              element["href"] = "https://#{domain}/#{board}/thread/#{thread_id}#{element["href"]}"
+            # `<a href="/vt/thread/1234#p5678" class="quotelink">&gt;&gt;5678</a>`
+            elsif element["href"].starts_with?("/")
+              element["href"] = "https://#{domain}#{element["href"]}"
+            end
+          end
+        end
+      end
+
+      def domain
+        if parsed_url.domain in "4chan.org" | "4channel.org"
+          "boards.#{parsed_url.domain}"
+        elsif parsed_referer&.domain in "4chan.org" | "4channel.org"
+          "boards.#{parsed_referer.domain}"
+        end
+      end
+
+      def board
+        parsed_url.board || parsed_referer&.board
+      end
+
+      def thread_id
+        parsed_url.thread_id || parsed_referer&.thread_id
+      end
+
+      def image_id
+        parsed_url.image_id || parsed_referer&.image_id
+      end
+
+      def post_id
+        post_id_from_url || post_id_from_api
+      end
+
+      def post_id_from_url
+        parsed_url.post_id || parsed_referer&.post_id
+      end
+
+      def post_id_from_api
+        post["no"]
+      end
+
+      memoize def post
+        api_response.find do |post|
+          (image_id.present? && post["tim"] == image_id) || post["no"] == post_id_from_url
+        end.to_h
+      end
+
+      memoize def api_response
+        return [] unless api_url.present?
+
+        response = http.cache(1.minute).get(api_url)
+        return [] unless response.status == 200
+
+        response.parse["posts"]
+      end
+
+      def api_url
+        "https://a.4cdn.org/#{board}/thread/#{thread_id}.json" if board.present? && thread_id.present?
+      end
+    end
+  end
+end
--- a/app/logical/source/url.rb
+++ b/app/logical/source/url.rb
@@ -61,6 +61,7 @@ module Source
      Source::URL::Furaffinity,
      Source::URL::Bilibili,
      Source::URL::Rule34DotUs,
+      Source::URL::FourChan,
    ]

    # Parse a URL into a subclass of Source::URL, or raise an exception if the URL is not a valid HTTP or HTTPS URL.
--- a/app/logical/source/url/four_chan.rb
+++ b/app/logical/source/url/four_chan.rb
@@ -0,0 +1,51 @@
+# frozen_string_literal: true
+
+# @see https://github.com/4chan/4chan-API
+# @see https://github.com/4chan/4chan-API/blob/master/pages/User_images_and_static_content.md
+class Source::URL::FourChan < Source::URL
+  attr_reader :board, :thread_id, :post_id, :image_type, :image_id, :full_image_url
+
+  def self.match?(url)
+    url.domain.in?(%w[4cdn.org 4chan.org 4channel.org])
+  end
+
+  def site_name
+    "4chan"
+  end
+
+  def parse
+    case [subdomain, domain, *path_segments]
+
+    # https://boards.4channel.org/vt/thread/37293562#p37294005
+    in _, ("4channel.org" | "4chan.org"), board, "thread", /\A[0-9]+\z/ => thread_id
+      @board = board
+      @thread_id = thread_id.to_i
+      @post_id = fragment.to_s[/^p([0-9]+)$/, 1]&.to_i
+
+    # https://i.4cdn.org/vt/1668729957824814.webm
+    # https://i.4cdn.org/vt/1668729957824814s.jpg
+    in "i", "4cdn.org", board, /\A([0-9]+)(s?)\./
+      @board = board
+      @image_id = $1.to_i
+      @image_type = $2 == "s" ? :preview : :original
+      @full_image_url = url.to_s if @image_type == :original
+
+    else
+      nil
+    end
+  end
+
+  def image_url?
+    host == "i.4cdn.org"
+  end
+
+  def page_url
+    if thread_id.present?
+      url.to_s
+    end
+  end
+
+  def api_url
+    "https://a.4cdn.org/#{board}/thread/#{thread_id}.json" if board.present? && thread_id.present?
+  end
+end
--- a/app/logical/source/url/null.rb
+++ b/app/logical/source/url/null.rb
@@ -9,8 +9,6 @@ class Source::URL::Null < Source::URL

  def site_name
    case [subdomain, domain]
-    in _, "4cdn.org"
-      "4chan"
    in _, "myportfolio.com"
      "Adobe Portfolio"
    in _, "adobe.com" if host == "portfolio.adobe.com"
--- a/test/functional/uploads_controller_test.rb
+++ b/test/functional/uploads_controller_test.rb
@@ -518,6 +518,8 @@ class UploadsControllerTest < ActionDispatch::IntegrationTest
        should_upload_successfully("https://rule34.xxx/index.php?page=post&s=view&id=6961597")
        should_upload_successfully("https://rule34.us/index.php?r=posts/view&id=6204967")

+        should_upload_successfully("https://boards.4channel.org/vt/thread/1#p1")
+
        should_upload_successfully("http://lohas.nicoseiga.jp/o/910aecf08e542285862954017f8a33a8c32a8aec/1433298801/4937663")
        should_upload_successfully("http://seiga.nicovideo.jp/seiga/im4937663")
        should_upload_successfully("https://seiga.nicovideo.jp/image/source/9146749")
--- a/test/unit/sources/four_chan_test.rb
+++ b/test/unit/sources/four_chan_test.rb
@@ -0,0 +1,73 @@
+require "test_helper"
+
+module Sources
+  class FourChanTest < ActiveSupport::TestCase
+    context "A 4chan source extractor" do
+      context "A 4chan direct image url without a referer" do
+        strategy_should_work(
+          "https://i.4cdn.org/vt/1611919211191.jpg",
+          image_urls: ["https://i.4cdn.org/vt/1611919211191.jpg"],
+          page_url: nil,
+          artist_commentary_title: nil,
+          artist_commentary_desc: nil,
+          dtext_artist_commentary_desc: "",
+          download_size: 145_602,
+        )
+      end
+
+      context "A 4chan direct image url with a referer" do
+        strategy_should_work(
+          "https://i.4cdn.org/vt/1611919211191.jpg",
+          referer: "https://boards.4channel.org/vt/thread/1",
+          image_urls: ["https://i.4cdn.org/vt/1611919211191.jpg"],
+          page_url: "https://boards.4channel.org/vt/thread/1#p1",
+          artist_commentary_title: "Anonymous 01/29/21(Fri)06:20:11 No.1",
+          dtext_artist_commentary_desc: <<~EOS.chomp,
+            "vt.jpg":[https://i.4cdn.org/vt/1611919211191.jpg] (142 KB, 767x677)
+            This board is for the discussion of Virtual YouTubers ("VTubers"), including those streaming in Japanese, English, and other languages. VTubers don't necessarily need to be on Youtube of course, they can be on Twitch, Niconico, Bilibili, or any other platform.
+
+            Please note that discussion should pertain to a VTuber's streams and content, and should not pertain to their real lives, relationships, or appearances ("IRL").
+          EOS
+          download_size: 145_602,
+        )
+      end
+
+      context "A 4chan thumbnail image url without a referer" do
+        strategy_should_work(
+          "https://i.4cdn.org/vt/1611919211191s.jpg",
+          image_urls: ["https://i.4cdn.org/vt/1611919211191s.jpg"],
+          page_url: nil,
+          artist_commentary_title: nil,
+          artist_commentary_desc: nil,
+          dtext_artist_commentary_desc: "",
+          download_size: 7430,
+        )
+      end
+
+      context "A 4chan post url" do
+        strategy_should_work(
+          "https://boards.4channel.org/vt/thread/1#p1",
+          image_urls: ["https://i.4cdn.org/vt/1611919211191.jpg"],
+          page_url: "https://boards.4channel.org/vt/thread/1#p1",
+          artist_commentary_title: "Anonymous 01/29/21(Fri)06:20:11 No.1",
+          dtext_artist_commentary_desc: <<~EOS.chomp,
+            "vt.jpg":[https://i.4cdn.org/vt/1611919211191.jpg] (142 KB, 767x677)
+            This board is for the discussion of Virtual YouTubers ("VTubers"), including those streaming in Japanese, English, and other languages. VTubers don't necessarily need to be on Youtube of course, they can be on Twitch, Niconico, Bilibili, or any other platform.
+
+            Please note that discussion should pertain to a VTuber's streams and content, and should not pertain to their real lives, relationships, or appearances ("IRL").
+          EOS
+          download_size: 145_602,
+        )
+      end
+
+      context "A 4chan thread url" do
+        strategy_should_work(
+          "https://boards.4channel.org/vt/thread/1",
+          image_urls: ["https://i.4cdn.org/vt/1611919211191.jpg"],
+          page_url: "https://boards.4channel.org/vt/thread/1",
+          download_size: 145_602,
+        )
+      end
+    end
+  end
+end