# frozen_string_literal: true

# Source extractor for 4chan.org.
#
# TODO:
#
# * If given only an image URL, scrape the board catalog to find which thread it belongs to.
# * If the thread is expired, scrape data from archive sites.
# * If the image or video is a soundpost, remux the file to include the sound (https://github.com/rcc11/4chan-sounds-player#creating-sound-images)
#
# @see https://github.com/4chan/4chan-API
# @see https://github.com/4chan/4chan-API/blob/master/pages/Threads.md
module Source
  class Extractor
    class FourChan < Source::Extractor
      # @return [Boolean] true when the parsed URL is a Source::URL::FourChan
      def match?
        Source::URL::FourChan === parsed_url
      end

      # Image URLs for this source. The branches are tried in order; the first
      # one that applies wins.
      #
      # @return [Array] image URLs, or an empty array when none can be determined
      def image_urls
        if parsed_url.full_image_url.present?
          [parsed_url.full_image_url]
        # If this is a post URL, or an image URL for which we can find the post
        elsif post.present? && post["tim"].present? && post["ext"].present?
          ["https://i.4cdn.org/#{board}/#{post["tim"]}#{post["ext"]}"]
        # If this is a thread URL
        elsif thread_id.present? && post_id_from_url.nil?
          # Entries lacking "tim" or "ext" yield nil and are dropped by filter_map.
          api_response.filter_map do |reply|
            "https://i.4cdn.org/#{board}/#{reply["tim"]}#{reply["ext"]}" if reply["tim"].present? && reply["ext"].present?
          end
        # If this is a thumbnail image URL and we can't get the full image URL from the API
        elsif parsed_url.image_url?
          [url.to_s]
        else
          []
        end
      end

      # URL of the post (when a post ID is known) or of the thread.
      #
      # @return [String, nil] nil when the board or thread ID is unknown
      def page_url
        if board.present? && thread_id.present? && post_id.present?
          "https://#{domain}/#{board}/thread/#{thread_id}#p#{post_id}"
        elsif board.present? && thread_id.present?
          "https://#{domain}/#{board}/thread/#{thread_id}"
        end
      end

      # Commentary title built from the post's "name", "trip", "now" and "no" fields.
      #
      # @return [String, nil] nil when no post was found
      def artist_commentary_title
        if post.present?
          "#{post["name"]}#{post["trip"]} #{post["now"]} No.#{post["no"]}"
        end
      end

      # Commentary body: an optional file-info line (filename, human-readable
      # size, dimensions) followed by the post's "com" field.
      #
      # NOTE(review): the file-info line ends with a plain newline, and this text
      # is later passed to DText.from_html; confirm a newline (rather than markup)
      # is the intended separator.
      #
      # @return [String, nil] nil when the post has neither a filename nor a comment
      def artist_commentary_desc
        commentary = String.new

        if post["filename"].present?
          commentary << "#{post["filename"]}#{post["ext"]} (#{post["fsize"].to_fs(:human_size)}, #{post["w"]}x#{post["h"]})\n"
        end

        commentary << post["com"] if post["com"].present?

        commentary.presence
      end

      # Converts the commentary to DText, rewriting the href of `quotelink`
      # anchors into absolute URLs.
      def dtext_artist_commentary_desc
        DText.from_html(artist_commentary_desc) do |element|
          if element.name == "a" && element["class"] == "quotelink"
            # Read once; safe navigation below tolerates an anchor without an href.
            href = element["href"]

            # `>>1234` - href is a bare fragment; prefix it with this thread's URL
            if href&.starts_with?("#")
              element["href"] = "https://#{domain}/#{board}/thread/#{thread_id}#{href}"
            # `>>5678` - href is a path starting with "/"; prefix it with the site
            elsif href&.starts_with?("/")
              element["href"] = "https://#{domain}#{href}"
            end
          end
        end
      end

      # Boards hostname derived from the URL's domain, falling back to the
      # referer's domain.
      #
      # @return [String, nil] nil when neither domain is 4chan.org or 4channel.org
      def domain
        if parsed_url.domain in "4chan.org" | "4channel.org"
          "boards.#{parsed_url.domain}"
        elsif parsed_referer&.domain in "4chan.org" | "4channel.org"
          "boards.#{parsed_referer.domain}"
        end
      end

      # Board taken from the URL, else from the referer.
      def board
        parsed_url.board || parsed_referer&.board
      end

      # Thread ID taken from the URL, else from the referer.
      def thread_id
        parsed_url.thread_id || parsed_referer&.thread_id
      end

      # Image ID taken from the URL, else from the referer.
      def image_id
        parsed_url.image_id || parsed_referer&.image_id
      end

      # Post ID taken from the URL/referer, else from the post found via the API.
      def post_id
        post_id_from_url || post_id_from_api
      end

      # Post ID taken from the URL, else from the referer.
      def post_id_from_url
        parsed_url.post_id || parsed_referer&.post_id
      end

      # The "no" field of the post found via the API (nil when no post was found).
      def post_id_from_api
        post["no"]
      end

      # The first API post whose "tim" equals the image ID or whose "no" equals
      # the post ID from the URL. Wrapped in `memoize` (defined outside this file).
      #
      # @return [Hash] the matching post, or an empty hash when there is no match
      memoize def post
        api_response.find do |candidate|
          (image_id.present? && candidate["tim"] == image_id) || candidate["no"] == post_id_from_url
        end.to_h
      end

      # The "posts" entry of the thread's JSON, requested through
      # `http.cache(1.minute)`. Wrapped in `memoize` (defined outside this file).
      #
      # @return [Array] the posts; empty when there is no API URL, the response
      #   status is not 200, or the payload has no "posts" entry
      memoize def api_response
        return [] unless api_url.present?

        response = http.cache(1.minute).get(api_url)
        return [] unless response.status == 200

        # Callers call .find / .filter_map on the result, so never hand back nil.
        response.parse["posts"] || []
      end

      # JSON API URL for the thread.
      #
      # @return [String, nil] nil when the board or thread ID is unknown
      def api_url
        "https://a.4cdn.org/#{board}/thread/#{thread_id}.json" if board.present? && thread_id.present?
      end
    end
  end
end