diff --git a/app/logical/danbooru/url.rb b/app/logical/danbooru/url.rb index f17614077..d9f379906 100644 --- a/app/logical/danbooru/url.rb +++ b/app/logical/danbooru/url.rb @@ -4,29 +4,42 @@ module Danbooru class URL class Error < StandardError; end + # @return [String] The original URL as a string. attr_reader :original_url, :url - delegate_missing_to :url - # Parse a string into an URL, or raise an exception if the string is not a valid HTTPS or HTTPS URL. + # @return [Addressable::URI] The parsed and normalized URL. + attr_reader :url + + delegate :domain, :host, :site, :path, to: :url + + # Parse a string into a URL, or raise an exception if the string is not a valid HTTP or HTTPS URL. # - # @param string [String] - # @return [Danbooru::URL] - def initialize(string) - @original_url = string - @url = Addressable::URI.heuristic_parse(string).display_uri + # @param url [String, Danbooru::URL] + def initialize(url) + @original_url = url.to_s + @url = Addressable::URI.heuristic_parse(original_url).display_uri @url.path = nil if @url.path == "/" - raise Error, "#{string} is not an http:// URL" if !@url.normalized_scheme.in?(["http", "https"]) + + raise Error, "#{original_url} is not an http:// URL" if !@url.normalized_scheme.in?(["http", "https"]) rescue Addressable::URI::InvalidURIError => e raise Error, e end - # Parse a string into an URL, or return nil if the string is not a valid HTTP or HTTPS URL. + # Parse a string into a URL, or raise an exception if the string is not a valid HTTP or HTTPS URL. # - # @param string [String] + # @param url [String, Danbooru::URL] # @return [Danbooru::URL] - def self.parse(string) - new(string) - rescue StandardError => e + def self.parse!(url) + new(url) + end + + # Parse a string into a URL, or return nil if the string is not a valid HTTP or HTTPS URL. 
+ # + # @param url [String, Danbooru::URL] + # @return [Danbooru::URL, nil] + def self.parse(url) + parse!(url) + rescue Error nil end @@ -42,7 +55,7 @@ module Danbooru # @return [Hash] the URL's query parameters def params - url.query_values.with_indifferent_access + url.query_values.to_h.with_indifferent_access end end end diff --git a/app/logical/source/url.rb b/app/logical/source/url.rb new file mode 100644 index 000000000..d72d3f696 --- /dev/null +++ b/app/logical/source/url.rb @@ -0,0 +1,66 @@ +# frozen_string_literal: true + +# A Source::URL is a URL from a source site, such as Twitter, Pixiv, etc. Each site has a +# subclass responsible for parsing and extracting information from URLs for that site. +# +# To add a new site, create a subclass of Source::URL, add it to SUBCLASSES, and implement `.match?` to define +# which URLs belong to the site, and `#parse` to parse and extract information from the URL. +# +# Source::URL is a subclass of Danbooru::URL, so it inherits some common utility methods +# from there. +# +# @example +# url = Source::URL.parse("https://twitter.com/yasunavert/status/1496123903290314755") +# url.site_name # => "Twitter" +# url.status_id # => "1496123903290314755" +# url.twitter_username # => "yasunavert" +# +module Source + class URL < Danbooru::URL + SUBCLASSES = [ + Source::URL::Twitter, + ] + + # Parse a URL into a subclass of Source::URL, or raise an exception if the URL is not a valid HTTP or HTTPS URL. + # + # @param url [String, Danbooru::URL] + # @return [Source::URL] + def self.parse!(url) + url = Danbooru::URL.new(url) + subclass = SUBCLASSES.find { |c| c.match?(url) } || Source::URL + subclass.new(url) + end + + # Parse a string into a URL, or return nil if the string is not a valid HTTP or HTTPS URL. + # + # @param url [String, Danbooru::URL] + # @return [Source::URL, nil] + def self.parse(url) + parse!(url) + rescue Error + nil + end + + # Subclasses should implement this to return true for URLs that should be handled by the subclass. 
+ # + # @param url [Danbooru::URL] The source URL. + def self.match?(url) + raise NotImplementedError + end + + # @return [String, nil] The name of the site this URL belongs to, or possibly nil if unknown. + def site_name + self.class.name.demodulize + end + + protected def initialize(...) + super(...) + parse + end + + # Subclasses should implement this to parse and extract any useful information from + # the URL. This is called when the URL is initialized. + protected def parse + end + end +end diff --git a/app/logical/source/url/twitter.rb b/app/logical/source/url/twitter.rb new file mode 100644 index 000000000..7fc683091 --- /dev/null +++ b/app/logical/source/url/twitter.rb @@ -0,0 +1,118 @@ +# frozen_string_literal: true + +# Page URLs: +# +# * https://twitter.com/motty08111213 +# * https://twitter.com/motty08111213/status/943446161586733056 +# * https://twitter.com/motty08111213/status/943446161586733056?s=19 +# * https://twitter.com/i/web/status/943446161586733056 +# +# * https://mobile.twitter.com/motty08111213 +# * https://mobile.twitter.com/motty08111213/status/943446161586733056 +# * https://mobile.twitter.com/i/web/status/943446161586733056 +# +# * https://twitter.com/Kekeflipnote/status/1496555599718498319/video/1 +# * https://twitter.com/sato_1_11/status/1496489742791475201/photo/2 +# +# Sample image URLs: +# +# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD.jpg +# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD.jpg?name=large +# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD?format=jpg&name=large +# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD?format=jpg&name=small +# +# Full image URLs: +# +# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD.jpg:orig +# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD.jpg?name=orig +# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD?format=jpg&name=orig +# +# Video URLs: +# +# * https://video.twimg.com/tweet_video/E_8lAMJUYAIyenr.mp4 +# * 
https://video.twimg.com/ext_tw_video/1496554514312269828/pu/pl/Srzcr2EsBK5Mwlvf.m3u8?tag=12&container=fmp4 +# * https://video.twimg.com/ext_tw_video/1496554514312269828/pu/vid/360x270/SygSrUcDpCr1AnOf.mp4?tag=12 +# * https://video.twimg.com/ext_tw_video/1496554514312269828/pu/vid/960x720/wiC1XIw8QehhL5JL.mp4?tag=12 +# * https://video.twimg.com/ext_tw_video/1496554514312269828/pu/vid/480x360/amWjOw0MmLdnPMPB.mp4?tag=12 +# +# Video thumbnail URLs: +# +# * https://pbs.twimg.com/tweet_video_thumb/ETkN_L3X0AMy1aT.jpg +# * https://pbs.twimg.com/ext_tw_video_thumb/1496554514312269828/pu/img/Asrdh3Ji-EqYOYHv.jpg +# * https://pbs.twimg.com/amplify_video_thumb/1215590775364259840/img/lolCkEEioFZTb5dl.jpg +# +# Profile image URLs: +# +# * https://pbs.twimg.com/profile_banners/780804311529906176/1475001696 +# * https://pbs.twimg.com/profile_images/1493345400929112064/lF1mY1i2_normal.jpg +# +# Shortened URLs: +# +# * https://t.co/Dxn7CuVErW => https://twitter.com/Kekeflipnote/status/1496555599718498319/video/1 +# * https://pic.twitter.com/Dxn7CuVErW => https://twitter.com/Kekeflipnote/status/1496555599718498319/video/1 +# +class Source::URL::Twitter < Source::URL + # Twitter provides a list of reserved usernames but it's inaccurate; some names ('intent') aren't + # included and other names in the list aren't actually reserved. 
+ # https://developer.twitter.com/en/docs/developer-utilities/configuration/api-reference/get-help-configuration + RESERVED_USERNAMES = %w[home i intent search] + + attr_reader :status_id, :twitter_username + + def self.match?(url) + url.host.in?(%w[twitter.com mobile.twitter.com pic.twitter.com pbs.twimg.com video.twimg.com t.co]) + end + + def parse + case [domain, *path_segments] + + # https://twitter.com/i/web/status/943446161586733056 + in "twitter.com", "i", "web", "status", status_id + @status_id = status_id + + # https://twitter.com/motty08111213/status/943446161586733056 + # https://twitter.com/Kekeflipnote/status/1496555599718498319/video/1 + # https://twitter.com/sato_1_11/status/1496489742791475201/photo/2 + in "twitter.com", username, "status", status_id, *rest + @twitter_username = username + @status_id = status_id + + # https://twitter.com/motty08111213 + in "twitter.com", username, *rest + @twitter_username = username unless username.in?(RESERVED_USERNAMES) + + # https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb.jpg + # https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb.jpg:small + # https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb?format=jpg&name=900x900 + # https://pbs.twimg.com/tweet_video_thumb/ETkN_L3X0AMy1aT.jpg + # https://pbs.twimg.com/ext_tw_video_thumb/1243725361986375680/pu/img/JDA7g7lcw7wK-PIv.jpg + # https://pbs.twimg.com/amplify_video_thumb/1215590775364259840/img/lolCkEEioFZTb5dl.jpg + in "twimg.com", ("media" | "tweet_video_thumb" | "ext_tw_video_thumb" | "amplify_video_thumb") => media_type, *subdirs, filename + # EBGbJe_U8AA4Ekb.jpg:small + @filename, @file_size = filename.split(":") + @filename, @file_ext = @filename.split(".") + + # EBGbJe_U8AA4Ekb?format=jpg&name=900x900 + @file_size = params[:name] if params[:name].present? + @file_ext = params[:format] if params[:format].present? 
+ + # /media/EBGbJe_U8AA4Ekb.jpg + # /ext_tw_video_thumb/1243725361986375680/pu/img/JDA7g7lcw7wK-PIv.jpg + @file_path = File.join(media_type, subdirs.join("/"), "#{@filename}.#{@file_ext}") + else + end + end + + def image_url? + orig_image_url.present? + end + + # https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb.jpg:orig + # https://pbs.twimg.com/tweet_video_thumb/ETkN_L3X0AMy1aT.jpg:orig + # https://pbs.twimg.com/ext_tw_video_thumb/1243725361986375680/pu/img/JDA7g7lcw7wK-PIv.jpg:orig + # https://pbs.twimg.com/amplify_video_thumb/1215590775364259840/img/lolCkEEioFZTb5dl.jpg:orig + def orig_image_url + return nil unless @file_path.present? + "#{site}/#{@file_path}:orig" + end +end diff --git a/app/logical/sources/strategies/base.rb b/app/logical/sources/strategies/base.rb index d28da7e1b..1851ddc0d 100644 --- a/app/logical/sources/strategies/base.rb +++ b/app/logical/sources/strategies/base.rb @@ -46,8 +46,8 @@ module Sources @referer_url = referer_url&.to_s @urls = [@url, @referer_url].select(&:present?) - @parsed_url = Danbooru::URL.parse(url) - @parsed_referer = Danbooru::URL.parse(referer_url) + @parsed_url = Source::URL.parse(url) + @parsed_referer = Source::URL.parse(referer_url) if referer_url.present? @parsed_urls = [parsed_url, parsed_referer].select(&:present?) 
end diff --git a/app/logical/sources/strategies/pixiv.rb b/app/logical/sources/strategies/pixiv.rb index 64b3a99f5..7cc714ed2 100644 --- a/app/logical/sources/strategies/pixiv.rb +++ b/app/logical/sources/strategies/pixiv.rb @@ -228,8 +228,8 @@ module Sources # http://www.pixiv.net/member_illust.php?mode=big&illust_id=18557054 # http://www.pixiv.net/member_illust.php?mode=manga&illust_id=18557054 # http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=18557054&page=1 - if url.host == "www.pixiv.net" && url.path == "/member_illust.php" && url.query_values&.has_key?("illust_id") - return url.query_values["illust_id"].to_i + if url.host == "www.pixiv.net" && url.path == "/member_illust.php" && url.params.has_key?("illust_id") + return url.params[:illust_id].to_i # http://www.pixiv.net/en/artworks/46324488 elsif url.host == "www.pixiv.net" && url.path =~ %r{\A/(?:en/)?artworks/(?<illust_id>\d+)}i diff --git a/app/logical/sources/strategies/twitter.rb b/app/logical/sources/strategies/twitter.rb index ef9ab265b..0234add57 100644 --- a/app/logical/sources/strategies/twitter.rb +++ b/app/logical/sources/strategies/twitter.rb @@ -1,28 +1,8 @@ # frozen_string_literal: true +# @see Source::URL::Twitter module Sources::Strategies class Twitter < Base - PAGE = %r{\Ahttps?://(?:mobile\.)?twitter\.com}i - PROFILE = %r{\Ahttps?://(?:mobile\.)?twitter.com/(?<username>[a-z0-9_]+)}i - - # https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb.jpg - # https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb?format=jpg&name=900x900 - # https://pbs.twimg.com/tweet_video_thumb/ETkN_L3X0AMy1aT.jpg - # https://pbs.twimg.com/ext_tw_video_thumb/1243725361986375680/pu/img/JDA7g7lcw7wK-PIv.jpg - # https://pbs.twimg.com/amplify_video_thumb/1215590775364259840/img/lolCkEEioFZTb5dl.jpg - BASE_IMAGE_URL = %r{\Ahttps?://pbs\.twimg\.com/(?<media_type>media|tweet_video_thumb|ext_tw_video_thumb|amplify_video_thumb)}i - FILENAME1 = /(?<file_name>[a-zA-Z0-9_-]+)\.(?<file_ext>\w+)/i - FILENAME2 = /(?<file_name>[a-zA-Z0-9_-]+)\?.*format=(?<file_ext>\w+)/i - FILEPATH1 = %r{(?<file_path>\d+/[\w_-]+/img)}i 
- FILEPATH2 = %r{(?<file_path>\d+/img)}i - IMAGE_URL1 = %r{#{BASE_IMAGE_URL}/#{Regexp.union(FILENAME1, FILENAME2)}}i - IMAGE_URL2 = %r{#{BASE_IMAGE_URL}/#{Regexp.union(FILEPATH1, FILEPATH2)}/#{FILENAME1}}i - - # Twitter provides a list but it's inaccurate; some names ('intent') aren't - # included and other names in the list aren't actually reserved. - # https://developer.twitter.com/en/docs/developer-utilities/configuration/api-reference/get-help-configuration - RESERVED_USERNAMES = %w[home i intent search] - # List of hashtag suffixes attached to tag other names # Ex: 西住みほ生誕祭2019 should be checked as 西住みほ # The regexes will not match if there is nothing preceding @@ -43,24 +23,6 @@ module Sources::Strategies Danbooru.config.twitter_api_key.present? && Danbooru.config.twitter_api_secret.present? end - # https://twitter.com/i/web/status/943446161586733056 - # https://twitter.com/motty08111213/status/943446161586733056 - def self.status_id_from_url(url) - if url =~ %r{\Ahttps?://(?:(?:www|mobile)\.)?twitter\.com/(?:i/web|\w+)/status/(\d+)}i - return $1 - end - - nil - end - - def self.tag_name_from_url(url) - if url =~ PROFILE && !$~[:username].in?(RESERVED_USERNAMES) - $~[:username] - else - nil - end - end - def domains ["twitter.com", "twimg.com"] end @@ -70,10 +32,9 @@ module Sources::Strategies end def image_urls - if url =~ IMAGE_URL1 - ["https://pbs.twimg.com/#{$~[:media_type]}/#{$~[:file_name]}.#{$~[:file_ext]}:orig"] - elsif url =~ IMAGE_URL2 - ["https://pbs.twimg.com/#{$~[:media_type]}/#{$~[:file_path]}/#{$~[:file_name]}.#{$~[:file_ext]}:orig"] + # https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb.jpg:orig + if parsed_url.image_url? + [parsed_url.orig_image_url] elsif api_response.present? 
api_response.dig(:extended_entities, :media).to_a.map do |media| if media[:type] == "photo" @@ -217,11 +178,11 @@ module Sources::Strategies end def status_id - [url, referer_url].map {|x| self.class.status_id_from_url(x)}.compact.first + parsed_url.status_id || parsed_referer&.status_id end def tag_name_from_url - [url, referer_url].map {|x| self.class.tag_name_from_url(x)}.compact.first + parsed_url.twitter_username || parsed_referer&.twitter_username end memoize :api_response