sources: add 4chan.org upload support.

Add upload support for 4chan.org. You can upload either individual posts or entire threads at a time.
2022-11-17 03:32:08 -06:00
parent 94824fb171
commit 902cd0bbdf
8 changed files with 263 additions and 4 deletions
--- a/app/logical/danbooru/url.rb
+++ b/app/logical/danbooru/url.rb
@@ -34,7 +34,7 @@ module Danbooru
    # @return [Addressable:URI] The parsed and normalized URL.
    attr_reader :url

-    delegate :domain, :host, :port, :site, :path, :query, :password, to: :url
+    delegate :domain, :host, :port, :site, :path, :query, :fragment, :password, to: :url

    # Parse a string into a URL, or raise an exception if the string is not a valid HTTP or HTTPS URL.
    #
--- a/app/logical/source/extractor.rb
+++ b/app/logical/source/extractor.rb
@@ -58,6 +58,7 @@ module Source
      Source::Extractor::Reddit,
      Source::Extractor::Bilibili,
      Source::Extractor::Rule34DotUs,
+      Source::Extractor::FourChan,
    ]

    # Should return true if the extractor is configured correctly. Return false
@@ -299,7 +300,7 @@ module Source
          :dtext_title => dtext_artist_commentary_title,
          :dtext_description => dtext_artist_commentary_desc
        },
-        :api_response => api_response.to_h
+        :api_response => api_response
      }
    end

--- a/app/logical/source/extractor/four_chan.rb
+++ b/app/logical/source/extractor/four_chan.rb
@@ -0,0 +1,133 @@
+# frozen_string_literal: true
+
+# Source extractor for 4chan.org.
+#
+# TODO:
+#
+# * If given only an image URL, scrape the board catalog to find which thread it belongs to.
+# * If the thread is expired, scrape data from archive sites.
+# * If the image or video is a soundpost, remux the file to include the sound (https://github.com/rcc11/4chan-sounds-player#creating-sound-images)
+#
+# @see https://github.com/4chan/4chan-API
+# @see https://github.com/4chan/4chan-API/blob/master/pages/Threads.md
+module Source
+  class Extractor
+    class FourChan < Source::Extractor
+      def match?
+        Source::URL::FourChan === parsed_url
+      end
+
+      def image_urls
+        if parsed_url.full_image_url.present?
+          [parsed_url.full_image_url]
+        # If this is a post URL, or an image URL for which we can find the post
+        elsif post.present? && post["tim"].present? && post["ext"].present?
+          ["https://i.4cdn.org/#{board}/#{post["tim"]}#{post["ext"]}"]
+        # If this is a thread URL
+        elsif thread_id.present? && post_id_from_url.nil?
+          api_response.map do |post|
+            "https://i.4cdn.org/#{board}/#{post["tim"]}#{post["ext"]}" if post["tim"].present? && post["ext"].present?
+          end.compact
+        # If this is a thumbnail image URL and we can't get the full image URL from the API
+        elsif parsed_url.image_url?
+          [url.to_s]
+        else
+          []
+        end
+      end
+
+      def page_url
+        if board.present? && thread_id.present? && post_id.present?
+          "https://#{domain}/#{board}/thread/#{thread_id}#p#{post_id}"
+        elsif board.present? && thread_id.present?
+          "https://#{domain}/#{board}/thread/#{thread_id}"
+        end
+      end
+
+      def artist_commentary_title
+        if post.present?
+          "#{post["name"]}#{post["trip"]} #{post["now"]} No.#{post["no"]}"
+        end
+      end
+
+      def artist_commentary_desc
+        commentary = String.new
+
+        if post["filename"].present?
+          commentary << "<a href=\"https://i.4cdn.org/#{board}/#{post["tim"]}#{post["ext"]}\">#{post["filename"]}#{post["ext"]}</a> (#{post["fsize"].to_fs(:human_size)}, #{post["w"]}x#{post["h"]})\n"
+        end
+
+        if post["com"].present?
+          commentary << post["com"]
+        end
+
+        commentary.presence
+      end
+
+      def dtext_artist_commentary_desc
+        DText.from_html(artist_commentary_desc) do |element|
+          if element.name == "a" && element["class"] == "quotelink"
+            # `<a href="#p1234" class="quotelink">&gt;&gt;1234</a>`
+            if element["href"].starts_with?("#")
+              element["href"] = "https://#{domain}/#{board}/thread/#{thread_id}#{element["href"]}"
+            # `<a href="/vt/thread/1234#p5678" class="quotelink">&gt;&gt;5678</a>`
+            elsif element["href"].starts_with?("/")
+              element["href"] = "https://#{domain}#{element["href"]}"
+            end
+          end
+        end
+      end
+
+      def domain
+        if parsed_url.domain in "4chan.org" | "4channel.org"
+          "boards.#{parsed_url.domain}"
+        elsif parsed_referer&.domain in "4chan.org" | "4channel.org"
+          "boards.#{parsed_referer.domain}"
+        end
+      end
+
+      def board
+        parsed_url.board || parsed_referer&.board
+      end
+
+      def thread_id
+        parsed_url.thread_id || parsed_referer&.thread_id
+      end
+
+      def image_id
+        parsed_url.image_id || parsed_referer&.image_id
+      end
+
+      def post_id
+        post_id_from_url || post_id_from_api
+      end
+
+      def post_id_from_url
+        parsed_url.post_id || parsed_referer&.post_id
+      end
+
+      def post_id_from_api
+        post["no"]
+      end
+
+      memoize def post
+        api_response.find do |post|
+          (image_id.present? && post["tim"] == image_id) || post["no"] == post_id_from_url
+        end.to_h
+      end
+
+      memoize def api_response
+        return [] unless api_url.present?
+
+        response = http.cache(1.minute).get(api_url)
+        return [] unless response.status == 200
+
+        response.parse["posts"]
+      end
+
+      def api_url
+        "https://a.4cdn.org/#{board}/thread/#{thread_id}.json" if board.present? && thread_id.present?
+      end
+    end
+  end
+end
--- a/app/logical/source/url.rb
+++ b/app/logical/source/url.rb
@@ -61,6 +61,7 @@ module Source
      Source::URL::Furaffinity,
      Source::URL::Bilibili,
      Source::URL::Rule34DotUs,
+      Source::URL::FourChan,
    ]

    # Parse a URL into a subclass of Source::URL, or raise an exception if the URL is not a valid HTTP or HTTPS URL.
--- a/app/logical/source/url/four_chan.rb
+++ b/app/logical/source/url/four_chan.rb
@@ -0,0 +1,51 @@
+# frozen_string_literal: true
+
+# @see https://github.com/4chan/4chan-API
+# @see https://github.com/4chan/4chan-API/blob/master/pages/User_images_and_static_content.md
+class Source::URL::FourChan < Source::URL
+  attr_reader :board, :thread_id, :post_id, :image_type, :image_id, :full_image_url
+
+  def self.match?(url)
+    url.domain.in?(%w[4cdn.org 4chan.org 4channel.org])
+  end
+
+  def site_name
+    "4chan"
+  end
+
+  def parse
+    case [subdomain, domain, *path_segments]
+
+    # https://boards.4channel.org/vt/thread/37293562#p37294005
+    in _, ("4channel.org" | "4chan.org"), board, "thread", /\A[0-9]+\z/ => thread_id
+      @board = board
+      @thread_id = thread_id.to_i
+      @post_id = fragment.to_s[/^p([0-9]+)$/, 1]&.to_i
+
+    # https://i.4cdn.org/vt/1668729957824814.webm
+    # https://i.4cdn.org/vt/1668729957824814s.jpg
+    in "i", "4cdn.org", board, /\A([0-9]+)(s?)\./
+      @board = board
+      @image_id = $1.to_i
+      @image_type = $2 == "s" ? :preview : :original
+      @full_image_url = url.to_s if @image_type == :original
+
+    else
+      nil
+    end
+  end
+
+  def image_url?
+    host == "i.4cdn.org"
+  end
+
+  def page_url
+    if thread_id.present?
+      url.to_s
+    end
+  end
+
+  def api_url
+    "https://a.4cdn.org/#{board}/thread/#{thread_id}.json" if board.present? && thread_id.present?
+  end
+end
--- a/app/logical/source/url/null.rb
+++ b/app/logical/source/url/null.rb
@@ -9,8 +9,6 @@ class Source::URL::Null < Source::URL

  def site_name
    case [subdomain, domain]
-    in _, "4cdn.org"
-      "4chan"
    in _, "myportfolio.com"
      "Adobe Portfolio"
    in _, "adobe.com" if host == "portfolio.adobe.com"