sources: add Source::URL class; factor out Source::URL::Twitter.

Introduce a Source::URL class for parsing URLs from source sites. Refactor the Twitter
source strategy to use it.

This is the first step towards factoring all the URL parsing logic out of source
strategies and moving it to subclasses of Source::URL. Each site will have a subclass
of Source::URL dedicated to parsing URLs from that site. Source strategies will use
these classes to extract information from URLs.

This is to simplify source strategies. Most sites have many different URL formats we have
to parse or rewrite, and handling all these different cases tends to make source
strategies very complex. Isolating the URL parsing logic from the site scraping logic
should make source strategies easier to maintain.
This commit is contained in:
evazion
2022-02-23 17:35:39 -06:00
parent 6f5aef1cef
commit 7ed8f95a8e
6 changed files with 221 additions and 63 deletions

View File

@@ -4,29 +4,42 @@ module Danbooru
class URL
class Error < StandardError; end
# @return [String] The original URL as a string.
attr_reader :original_url, :url
delegate_missing_to :url
# Parse a string into an URL, or raise an exception if the string is not a valid HTTPS or HTTPS URL.
# @return [Addressable::URI] The parsed and normalized URL.
attr_reader :url
delegate :domain, :host, :site, :path, to: :url
# Parse a string into a URL, or raise an exception if the string is not a valid HTTP or HTTPS URL.
#
# @param string [String]
# @return [Danbooru::URL]
def initialize(string)
@original_url = string
@url = Addressable::URI.heuristic_parse(string).display_uri
# @param url [String, Danbooru::URL]
def initialize(url)
@original_url = url.to_s
@url = Addressable::URI.heuristic_parse(original_url).display_uri
@url.path = nil if @url.path == "/"
raise Error, "#{string} is not an http:// URL" if !@url.normalized_scheme.in?(["http", "https"])
raise Error, "#{original_url} is not an http:// URL" if !@url.normalized_scheme.in?(["http", "https"])
rescue Addressable::URI::InvalidURIError => e
raise Error, e
end
# Parse a string into an URL, or return nil if the string is not a valid HTTP or HTTPS URL.
# Parse a string into a URL, or raise an exception if the string is not a valid HTTP or HTTPS URL.
#
# @param string [String]
# @param url [String, Danbooru::URL]
# @return [Danbooru::URL]
def self.parse(string)
new(string)
rescue StandardError => e
def self.parse!(url)
new(url)
end
# Parse a string into a URL, or return nil if the string is not a valid HTTP or HTTPS URL.
#
# @param url [String, Danbooru::URL]
# @return [Danbooru::URL, nil]
def self.parse(url)
parse!(url)
rescue Error
nil
end
@@ -42,7 +55,7 @@ module Danbooru
# @return [Hash] the URL's query parameters
def params
url.query_values.with_indifferent_access
url.query_values.to_h.with_indifferent_access
end
end
end

66
app/logical/source/url.rb Normal file
View File

@@ -0,0 +1,66 @@
# frozen_string_literal: true
# A Source::URL is a URL from a source site, such as Twitter, Pixiv, etc. Each site has a
# subclass responsible for parsing and extracting information from URLs for that site.
#
# To add a new site, create a subclass of Source::URL and implement `#match?` to define
# which URLs belong to the site, and `#parse` to parse and extract information from the URL.
#
# Source::URL is a subclass of Danbooru::URL, so it inherits some common utility methods
# from there.
#
# @example
# url = Source::URL.parse("https://twitter.com/yasunavert/status/1496123903290314755")
# url.site_name # => "Twitter"
# url.status_id # => "1496123903290314755"
# url.twitter_username # => "yasunavert"
#
module Source
  class URL < Danbooru::URL
    # Site-specific subclasses, tried in order by `.parse!`. The first subclass whose
    # `.match?` returns true handles the URL. Frozen so the list can't be mutated at runtime.
    SUBCLASSES = [
      Source::URL::Twitter,
    ].freeze

    # Parse a URL into a subclass of Source::URL, or raise an exception if the URL is not a valid HTTP or HTTPS URL.
    #
    # The URL is first parsed as a plain Danbooru::URL so that each subclass's `.match?`
    # can inspect it. If no subclass claims the URL, a generic Source::URL is returned.
    #
    # @param url [String, Danbooru::URL]
    # @return [Source::URL]
    # @raise [Danbooru::URL::Error] if the URL is not a valid HTTP or HTTPS URL
    def self.parse!(url)
      url = Danbooru::URL.new(url)
      subclass = SUBCLASSES.find { |c| c.match?(url) } || Source::URL
      subclass.new(url)
    end

    # Parse a string into a URL, or return nil if the string is not a valid HTTP or HTTPS URL.
    #
    # @param url [String, Danbooru::URL]
    # @return [Source::URL, nil]
    def self.parse(url)
      parse!(url)
    rescue Error
      nil
    end

    # Subclasses should implement this to return true for URLs that should be handled by the subclass.
    #
    # @param url [Danbooru::URL] The source URL.
    # @return [Boolean]
    # @raise [NotImplementedError] when called on a class that doesn't override it
    def self.match?(url)
      raise NotImplementedError
    end

    # The site name is derived from the subclass name (e.g. Source::URL::Twitter => "Twitter").
    # A generic Source::URL is what `.parse!` returns when no subclass matched, so the
    # site is unknown and nil is returned instead of the meaningless class name "URL".
    #
    # @return [String, nil] The name of the site this URL belongs to, or nil if unknown.
    def site_name
      return nil if instance_of?(Source::URL)

      self.class.name.demodulize
    end

    # Build the URL, then immediately run the subclass's `#parse` hook so that any
    # extracted attributes are available as soon as the object is constructed.
    protected def initialize(...)
      super(...)
      parse
    end

    # Subclasses should implement this to parse and extract any useful information from
    # the URL. This is called when the URL is initialized. The default does nothing.
    protected def parse
    end
  end
end

View File

@@ -0,0 +1,118 @@
# frozen_string_literal: true
# Page URLs:
#
# * https://twitter.com/motty08111213
# * https://twitter.com/motty08111213/status/943446161586733056
# * https://twitter.com/motty08111213/status/943446161586733056?s=19
# * https://twitter.com/i/web/status/943446161586733056
#
# * https://mobile.twitter.com/motty08111213
# * https://mobile.twitter.com/motty08111213/status/943446161586733056
# * https://mobile.twitter.com/i/web/status/943446161586733056
#
# * https://twitter.com/Kekeflipnote/status/1496555599718498319/video/1
# * https://twitter.com/sato_1_11/status/1496489742791475201/photo/2
#
# Sample image URLs:
#
# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD.jpg
# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD.jpg?name=large
# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD?format=jpg&name=large
# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD?format=jpg&name=small
#
# Full image URLs:
#
# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD.jpg:orig
# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD.jpg?name=orig
# * https://pbs.twimg.com/media/FMSZOa6aQAMIuRD?format=jpg&name=orig
#
# Video URLs:
#
# * https://video.twimg.com/tweet_video/E_8lAMJUYAIyenr.mp4
# * https://video.twimg.com/ext_tw_video/1496554514312269828/pu/pl/Srzcr2EsBK5Mwlvf.m3u8?tag=12&container=fmp4
# * https://video.twimg.com/ext_tw_video/1496554514312269828/pu/vid/360x270/SygSrUcDpCr1AnOf.mp4?tag=12
# * https://video.twimg.com/ext_tw_video/1496554514312269828/pu/vid/960x720/wiC1XIw8QehhL5JL.mp4?tag=12
# * https://video.twimg.com/ext_tw_video/1496554514312269828/pu/vid/480x360/amWjOw0MmLdnPMPB.mp4?tag=12
#
# Video thumbnail URLs:
#
# * https://pbs.twimg.com/tweet_video_thumb/ETkN_L3X0AMy1aT.jpg
# * https://pbs.twimg.com/ext_tw_video_thumb/1496554514312269828/pu/img/Asrdh3Ji-EqYOYHv.jpg
# * https://pbs.twimg.com/amplify_video_thumb/1215590775364259840/img/lolCkEEioFZTb5dl.jpg
#
# Profile image URLs:
#
# * https://pbs.twimg.com/profile_banners/780804311529906176/1475001696
# * https://pbs.twimg.com/profile_images/1493345400929112064/lF1mY1i2_normal.jpg
#
# Shortened URLs:
#
# * https://t.co/Dxn7CuVErW => https://twitter.com/Kekeflipnote/status/1496555599718498319/video/1
# * https://pic.twitter.com/Dxn7CuVErW => https://twitter.com/Kekeflipnote/status/1496555599718498319/video/1
#
class Source::URL::Twitter < Source::URL
# Twitter provides a list of reserved usernames but it's inaccurate; some names ('intent') aren't
# included and other names in the list aren't actually reserved.
# https://developer.twitter.com/en/docs/developer-utilities/configuration/api-reference/get-help-configuration
RESERVED_USERNAMES = %w[home i intent search]
attr_reader :status_id, :twitter_username
def self.match?(url)
url.host.in?(%w[twitter.com mobile.twitter.com pic.twitter.com pbs.twimg.com video.twimg.com t.co])
end
def parse
case [domain, *path_segments]
# https://twitter.com/i/web/status/943446161586733056
in "twitter.com", "i", "web", "status", status_id
@status_id = status_id
# https://twitter.com/motty08111213/status/943446161586733056
# https://twitter.com/Kekeflipnote/status/1496555599718498319/video/1
# https://twitter.com/sato_1_11/status/1496489742791475201/photo/2
in "twitter.com", username, "status", status_id, *rest
@twitter_username = username
@status_id = status_id
# https://twitter.com/motty08111213
in "twitter.com", username, *rest
@twitter_username = username unless username.in?(RESERVED_USERNAMES)
# https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb.jpg
# https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb.jpg:small
# https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb?format=jpg&name=900x900
# https://pbs.twimg.com/tweet_video_thumb/ETkN_L3X0AMy1aT.jpg
# https://pbs.twimg.com/ext_tw_video_thumb/1243725361986375680/pu/img/JDA7g7lcw7wK-PIv.jpg
# https://pbs.twimg.com/amplify_video_thumb/1215590775364259840/img/lolCkEEioFZTb5dl.jpg
in "twimg.com", ("media" | "tweet_video_thumb" | "ext_tw_video_thumb" | "amplify_video_thumb") => media_type, *subdirs, filename
# EBGbJe_U8AA4Ekb.jpg:small
@filename, @file_size = filename.split(":")
@filename, @file_ext = @filename.split(".")
# EBGbJe_U8AA4Ekb?format=jpg&name=900x900
@file_size = params[:name] if params[:name].present?
@file_ext = params[:format] if params[:format].present?
# /media/EBGbJe_U8AA4Ekb.jpg
# /ext_tw_video_thumb/1243725361986375680/pu/img/JDA7g7lcw7wK-PIv.jpg
@file_path = File.join(media_type, subdirs.join("/"), "#{@filename}.#{@file_ext}")
else
end
end
def image_url?
orig_image_url.present?
end
# https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb.jpg:orig
# https://pbs.twimg.com/tweet_video_thumb/ETkN_L3X0AMy1aT.jpg:orig
# https://pbs.twimg.com/ext_tw_video_thumb/1243725361986375680/pu/img/JDA7g7lcw7wK-PIv.jpg:orig
# https://pbs.twimg.com/amplify_video_thumb/1215590775364259840/img/lolCkEEioFZTb5dl.jpg:orig
def orig_image_url
return nil unless @file_path.present?
"#{site}/#{@file_path}:orig"
end
end

View File

@@ -46,8 +46,8 @@ module Sources
@referer_url = referer_url&.to_s
@urls = [@url, @referer_url].select(&:present?)
@parsed_url = Danbooru::URL.parse(url)
@parsed_referer = Danbooru::URL.parse(referer_url)
@parsed_url = Source::URL.parse(url)
@parsed_referer = Source::URL.parse(referer_url) if referer_url.present?
@parsed_urls = [parsed_url, parsed_referer].select(&:present?)
end

View File

@@ -228,8 +228,8 @@ module Sources
# http://www.pixiv.net/member_illust.php?mode=big&illust_id=18557054
# http://www.pixiv.net/member_illust.php?mode=manga&illust_id=18557054
# http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=18557054&page=1
if url.host == "www.pixiv.net" && url.path == "/member_illust.php" && url.query_values&.has_key?("illust_id")
return url.query_values["illust_id"].to_i
if url.host == "www.pixiv.net" && url.path == "/member_illust.php" && url.params.has_key?("illust_id")
return url.params[:illust_id].to_i
# http://www.pixiv.net/en/artworks/46324488
elsif url.host == "www.pixiv.net" && url.path =~ %r{\A/(?:en/)?artworks/(?<illust_id>\d+)}i

View File

@@ -1,28 +1,8 @@
# frozen_string_literal: true
# @see Source::URL::Twitter
module Sources::Strategies
class Twitter < Base
PAGE = %r{\Ahttps?://(?:mobile\.)?twitter\.com}i
PROFILE = %r{\Ahttps?://(?:mobile\.)?twitter.com/(?<username>[a-z0-9_]+)}i
# https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb.jpg
# https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb?format=jpg&name=900x900
# https://pbs.twimg.com/tweet_video_thumb/ETkN_L3X0AMy1aT.jpg
# https://pbs.twimg.com/ext_tw_video_thumb/1243725361986375680/pu/img/JDA7g7lcw7wK-PIv.jpg
# https://pbs.twimg.com/amplify_video_thumb/1215590775364259840/img/lolCkEEioFZTb5dl.jpg
BASE_IMAGE_URL = %r{\Ahttps?://pbs\.twimg\.com/(?<media_type>media|tweet_video_thumb|ext_tw_video_thumb|amplify_video_thumb)}i
FILENAME1 = /(?<file_name>[a-zA-Z0-9_-]+)\.(?<file_ext>\w+)/i
FILENAME2 = /(?<file_name>[a-zA-Z0-9_-]+)\?.*format=(?<file_ext>\w+)/i
FILEPATH1 = %r{(?<file_path>\d+/[\w_-]+/img)}i
FILEPATH2 = %r{(?<file_path>\d+/img)}i
IMAGE_URL1 = %r{#{BASE_IMAGE_URL}/#{Regexp.union(FILENAME1, FILENAME2)}}i
IMAGE_URL2 = %r{#{BASE_IMAGE_URL}/#{Regexp.union(FILEPATH1, FILEPATH2)}/#{FILENAME1}}i
# Twitter provides a list but it's inaccurate; some names ('intent') aren't
# included and other names in the list aren't actually reserved.
# https://developer.twitter.com/en/docs/developer-utilities/configuration/api-reference/get-help-configuration
RESERVED_USERNAMES = %w[home i intent search]
# List of hashtag suffixes attached to tag other names
# Ex: 西住みほ生誕祭2019 should be checked as 西住みほ
# The regexes will not match if there is nothing preceding
@@ -43,24 +23,6 @@ module Sources::Strategies
Danbooru.config.twitter_api_key.present? && Danbooru.config.twitter_api_secret.present?
end
# https://twitter.com/i/web/status/943446161586733056
# https://twitter.com/motty08111213/status/943446161586733056
def self.status_id_from_url(url)
if url =~ %r{\Ahttps?://(?:(?:www|mobile)\.)?twitter\.com/(?:i/web|\w+)/status/(\d+)}i
return $1
end
nil
end
def self.tag_name_from_url(url)
if url =~ PROFILE && !$~[:username].in?(RESERVED_USERNAMES)
$~[:username]
else
nil
end
end
def domains
["twitter.com", "twimg.com"]
end
@@ -70,10 +32,9 @@ module Sources::Strategies
end
def image_urls
if url =~ IMAGE_URL1
["https://pbs.twimg.com/#{$~[:media_type]}/#{$~[:file_name]}.#{$~[:file_ext]}:orig"]
elsif url =~ IMAGE_URL2
["https://pbs.twimg.com/#{$~[:media_type]}/#{$~[:file_path]}/#{$~[:file_name]}.#{$~[:file_ext]}:orig"]
# https://pbs.twimg.com/media/EBGbJe_U8AA4Ekb.jpg:orig
if parsed_url.image_url?
[parsed_url.orig_image_url]
elsif api_response.present?
api_response.dig(:extended_entities, :media).to_a.map do |media|
if media[:type] == "photo"
@@ -217,11 +178,11 @@ module Sources::Strategies
end
def status_id
[url, referer_url].map {|x| self.class.status_id_from_url(x)}.compact.first
parsed_url.status_id || parsed_referer&.status_id
end
def tag_name_from_url
[url, referer_url].map {|x| self.class.tag_name_from_url(x)}.compact.first
parsed_url.twitter_username || parsed_referer&.twitter_username
end
memoize :api_response