tumblr: fix exception when fetching data for video urls.

Fix an exception when trying to fetch source data for URLs like
https://va.media.tumblr.com/tumblr_pgohk0TjhS1u7mrsl.mp4.

For these URLs it's not possible to use the trick where we try to open
the URL as an HTML page and scrape the post id from the HTML. Instead we
get the raw video if we try to do this.
This commit is contained in:
evazion
2022-09-05 16:12:25 -05:00
parent f55951ab58
commit d2147eca80
3 changed files with 27 additions and 1 deletions

View File

@@ -101,13 +101,16 @@ class Source::Extractor
end
# Try to recover the post URL by requesting the image URL as an HTML page and
# scraping a link to the post out of it.
#
# Only attempted when `parsed_url` is an image URL whose file extension is one
# of jpg, png, pnj or gif; for anything else this returns nil without making a
# request.
#
# @return whatever `Source::URL.parse` yields for the href of the element
#   matched by "[href*='/post/']" (the href is nil when no page or no such
#   element was found)
def post_url_from_image_html
  scrapable = parsed_url.image_url? && parsed_url.file_ext&.in?(%w[jpg png pnj gif])
  return nil unless scrapable

  post_link = image_url_html(parsed_url)&.at("[href*='/post/']")
  href = post_link&.[](:href)
  Source::URL.parse(href)
end
memoize :post_url_from_image_html
# Request `image_url` with an "Accept: text/html" header and return the parsed
# response body.
#
# Direct media URLs (e.g. https://va.media.tumblr.com/tumblr_pgohk0TjhS1u7mrsl.mp4)
# answer with the raw file instead of an HTML page, so the response is only
# parsed when it is a 200 whose MIME type is "text/html".
#
# @param image_url the URL to request
# @return the result of `resp.parse`, or nil when the status is not 200 or the
#   response is not "text/html"
def image_url_html(image_url)
  resp = http.cache(1.minute).headers(accept: "text/html").get(image_url)
  # Single guard: a non-200 status or a non-HTML body both mean there is no page to scrape.
  return nil unless resp.code == 200 && resp.mime_type == "text/html"

  resp.parse
end

View File

@@ -24,6 +24,7 @@ class Source::URL::Tumblr < Source::URL
# https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif
# https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif
# https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png
# https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj
in _, directory, file if image_url?
@directory = directory
parse_filename
@@ -87,6 +88,7 @@ class Source::URL::Tumblr < Source::URL
# https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif
# https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif
# https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png
# https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj
# https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png
# https://media.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_500.jpg
in *words, /\A\d+h?|raw\z/ => size

View File

@@ -245,6 +245,26 @@ module Sources
)
end
# A direct .pnj media URL is expected to come back unchanged as the only image
# URL, with no page URL and no artist name.
context "A *.media.tumblr.com/tumblr_$id_$size.pnj URL" do
  pnj_url = "https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj"

  strategy_should_work(
    pnj_url,
    image_urls: [pnj_url],
    page_url: nil,
    artist_name: nil,
    download_size: 296_595,
  )
end
# A direct video URL serves the raw video rather than an HTML page, so no post
# can be scraped from it: the URL is expected to come back unchanged as the
# only image URL, with no page URL and no artist name.
context "A va.media.tumblr.com/tumblr_$id.mp4 URL" do
  video_url = "https://va.media.tumblr.com/tumblr_pgohk0TjhS1u7mrsl.mp4"

  strategy_should_work(
    video_url,
    image_urls: [video_url],
    page_url: nil,
    artist_name: nil,
    download_size: 7_960_082,
  )
end
context "generating page urls" do
should "work" do
source1 = "https://octrain1020.tumblr.com/post/190713122589"
@@ -272,6 +292,7 @@ module Sources
assert(Source::URL.image_url?("https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif"))
assert(Source::URL.image_url?("https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif"))
assert(Source::URL.image_url?("https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png"))
assert(Source::URL.image_url?("https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj"))
assert(Source::URL.image_url?("https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png"))
assert(Source::URL.image_url?("https://media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_1280.png"))