tumblr: fix exception when fetching data for video urls.
Fix an exception when trying to fetch source data for URLs like https://va.media.tumblr.com/tumblr_pgohk0TjhS1u7mrsl.mp4. For these URLs it's not possible to use the trick where we try to open the URL as an HTML page and scrape the post id from the HTML. Instead we get the raw video if we try to do this.
This commit is contained in:
@@ -101,13 +101,16 @@ class Source::Extractor
|
|||||||
end
|
end
|
||||||
|
|
||||||
def post_url_from_image_html
|
def post_url_from_image_html
|
||||||
|
return nil unless parsed_url.image_url? && parsed_url.file_ext&.in?(%w[jpg png pnj gif])
|
||||||
|
|
||||||
extracted = image_url_html(parsed_url)&.at("[href*='/post/']")&.[](:href)
|
extracted = image_url_html(parsed_url)&.at("[href*='/post/']")&.[](:href)
|
||||||
Source::URL.parse(extracted)
|
Source::URL.parse(extracted)
|
||||||
end
|
end
|
||||||
|
memoize :post_url_from_image_html
|
||||||
|
|
||||||
def image_url_html(image_url)
|
def image_url_html(image_url)
|
||||||
resp = http.cache(1.minute).headers(accept: "text/html").get(image_url)
|
resp = http.cache(1.minute).headers(accept: "text/html").get(image_url)
|
||||||
return nil if resp.code != 200
|
return nil if resp.code != 200 || resp.mime_type != "text/html"
|
||||||
resp.parse
|
resp.parse
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ class Source::URL::Tumblr < Source::URL
|
|||||||
# https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif
|
# https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif
|
||||||
# https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif
|
# https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif
|
||||||
# https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png
|
# https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png
|
||||||
|
# https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj
|
||||||
in _, directory, file if image_url?
|
in _, directory, file if image_url?
|
||||||
@directory = directory
|
@directory = directory
|
||||||
parse_filename
|
parse_filename
|
||||||
@@ -87,6 +88,7 @@ class Source::URL::Tumblr < Source::URL
|
|||||||
# https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif
|
# https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif
|
||||||
# https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif
|
# https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif
|
||||||
# https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png
|
# https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png
|
||||||
|
# https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj
|
||||||
# https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png
|
# https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png
|
||||||
# https://media.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_500.jpg
|
# https://media.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_500.jpg
|
||||||
in *words, /\A\d+h?|raw\z/ => size
|
in *words, /\A\d+h?|raw\z/ => size
|
||||||
|
|||||||
@@ -245,6 +245,26 @@ module Sources
|
|||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context "A *.media.tumblr.com/tumblr_$id_$size.pnj URL" do
|
||||||
|
strategy_should_work(
|
||||||
|
"https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj",
|
||||||
|
image_urls: ["https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj"],
|
||||||
|
page_url: nil,
|
||||||
|
artist_name: nil,
|
||||||
|
download_size: 296_595,
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
context "A va.media.tumblr.com/tumblr_$id.mp4 URL" do
|
||||||
|
strategy_should_work(
|
||||||
|
"https://va.media.tumblr.com/tumblr_pgohk0TjhS1u7mrsl.mp4",
|
||||||
|
image_urls: ["https://va.media.tumblr.com/tumblr_pgohk0TjhS1u7mrsl.mp4"],
|
||||||
|
page_url: nil,
|
||||||
|
artist_name: nil,
|
||||||
|
download_size: 7_960_082,
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
context "generating page urls" do
|
context "generating page urls" do
|
||||||
should "work" do
|
should "work" do
|
||||||
source1 = "https://octrain1020.tumblr.com/post/190713122589"
|
source1 = "https://octrain1020.tumblr.com/post/190713122589"
|
||||||
@@ -272,6 +292,7 @@ module Sources
|
|||||||
assert(Source::URL.image_url?("https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif"))
|
assert(Source::URL.image_url?("https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif"))
|
||||||
assert(Source::URL.image_url?("https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif"))
|
assert(Source::URL.image_url?("https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif"))
|
||||||
assert(Source::URL.image_url?("https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png"))
|
assert(Source::URL.image_url?("https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png"))
|
||||||
|
assert(Source::URL.image_url?("https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj"))
|
||||||
|
|
||||||
assert(Source::URL.image_url?("https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png"))
|
assert(Source::URL.image_url?("https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png"))
|
||||||
assert(Source::URL.image_url?("https://media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_1280.png"))
|
assert(Source::URL.image_url?("https://media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_1280.png"))
|
||||||
|
|||||||
Reference in New Issue
Block a user