From d2147eca80c69b456a2dc3f4b5d9aa2448ded3f6 Mon Sep 17 00:00:00 2001 From: evazion Date: Mon, 5 Sep 2022 16:12:25 -0500 Subject: [PATCH] tumblr: fix exception when fetching data for video urls. Fix an exception when trying to fetch source data for URLs like https://va.media.tumblr.com/tumblr_pgohk0TjhS1u7mrsl.mp4. For these URLs it's not possible to use the trick where we try to open the URL as a HTML page and scrape the post id from the HTML. Instead we get the raw video if we try to do this. --- app/logical/source/extractor/tumblr.rb | 5 ++++- app/logical/source/url/tumblr.rb | 2 ++ test/unit/sources/tumblr_test.rb | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/app/logical/source/extractor/tumblr.rb b/app/logical/source/extractor/tumblr.rb index fc3faee9b..8889e25e3 100644 --- a/app/logical/source/extractor/tumblr.rb +++ b/app/logical/source/extractor/tumblr.rb @@ -101,13 +101,16 @@ class Source::Extractor end def post_url_from_image_html + return nil unless parsed_url.image_url? 
&& parsed_url.file_ext&.in?(%w[jpg png pnj gif]) + extracted = image_url_html(parsed_url)&.at("[href*='/post/']")&.[](:href) Source::URL.parse(extracted) end + memoize :post_url_from_image_html def image_url_html(image_url) resp = http.cache(1.minute).headers(accept: "text/html").get(image_url) - return nil if resp.code != 200 + return nil if resp.code != 200 || resp.mime_type != "text/html" resp.parse end diff --git a/app/logical/source/url/tumblr.rb b/app/logical/source/url/tumblr.rb index 77cd7049e..e65701e34 100644 --- a/app/logical/source/url/tumblr.rb +++ b/app/logical/source/url/tumblr.rb @@ -24,6 +24,7 @@ class Source::URL::Tumblr < Source::URL # https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif # https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif # https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png + # https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj in _, directory, file if image_url? 
@directory = directory parse_filename @@ -87,6 +88,7 @@ class Source::URL::Tumblr < Source::URL # https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif # https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif # https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png + # https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj # https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png # https://media.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_500.jpg in *words, /\A\d+h?|raw\z/ => size diff --git a/test/unit/sources/tumblr_test.rb b/test/unit/sources/tumblr_test.rb index 7beb232f5..30b9cb097 100644 --- a/test/unit/sources/tumblr_test.rb +++ b/test/unit/sources/tumblr_test.rb @@ -245,6 +245,26 @@ module Sources ) end + context "A *.media.tumblr.com/tumblr_$id_$size.pnj URL" do + strategy_should_work( + "https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj", + image_urls: ["https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj"], + page_url: nil, + artist_name: nil, + download_size: 296_595, + ) + end + + context "A va.media.tumblr.com/tumblr_$id.mp4 URL" do + strategy_should_work( + "https://va.media.tumblr.com/tumblr_pgohk0TjhS1u7mrsl.mp4", + image_urls: ["https://va.media.tumblr.com/tumblr_pgohk0TjhS1u7mrsl.mp4"], + page_url: nil, + artist_name: nil, + download_size: 7_960_082, + ) + end + context "generating page urls" do should "work" do source1 = "https://octrain1020.tumblr.com/post/190713122589" @@ -272,6 +292,7 @@ module Sources assert(Source::URL.image_url?("https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif")) assert(Source::URL.image_url?("https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif")) 
assert(Source::URL.image_url?("https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png")) + assert(Source::URL.image_url?("https://media.tumblr.com/701a535af224f89684d2cfcc097575ef/tumblr_pjsx70RakC1y0gqjko1_1280.pnj")) assert(Source::URL.image_url?("https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png")) assert(Source::URL.image_url?("https://media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_1280.png"))