From b880b07387af9c9465ca005b36978e56903fc5eb Mon Sep 17 00:00:00 2001 From: evazion Date: Tue, 27 Jun 2017 15:08:16 -0500 Subject: [PATCH 1/2] sources: factor out html-to-dtext code to DText.from_html. --- app/logical/d_text.rb | 47 +++++++++++++++++++ app/logical/sources/strategies/deviant_art.rb | 44 ++--------------- app/logical/sources/strategies/nijie.rb | 21 +-------- app/logical/sources/strategies/pawoo.rb | 27 ++--------- test/unit/sources/nijie_test.rb | 4 +- 5 files changed, 59 insertions(+), 84 deletions(-) diff --git a/app/logical/d_text.rb b/app/logical/d_text.rb index 0ac557ee3..2e7ef8714 100644 --- a/app/logical/d_text.rb +++ b/app/logical/d_text.rb @@ -369,6 +369,53 @@ class DText s end + def self.from_html(text, &block) + html = Nokogiri::HTML.fragment(text) + + dtext = html.children.map do |element| + block.call(element) if block.present? + + case element.name + when "text" + element.content + when "br" + "\n" + when "p" + from_html(element.inner_html, &block) + "\n\n" + when "blockquote" + "[quote]#{from_html(element.inner_html, &block)}[/quote]" if element.inner_html.present? + when "small", "sub" + "[tn]#{from_html(element.inner_html, &block)}[/tn]" if element.inner_html.present? + when "b", "strong" + "[b]#{from_html(element.inner_html, &block)}[/b]" if element.inner_html.present? + when "i", "em" + "[i]#{from_html(element.inner_html, &block)}[/i]" if element.inner_html.present? + when "u" + "[u]#{from_html(element.inner_html, &block)}[/u]" if element.inner_html.present? + when "s", "strike" + "[s]#{from_html(element.inner_html, &block)}[/s]" if element.inner_html.present? + when "li" + "* #{from_html(element.inner_html, &block)}" if element.inner_html.present? + when "h1", "h2", "h3", "h4", "h5", "h6" + hN = element.name + title = from_html(element.inner_html, &block) + "#{hN}. #{title}\n" + when "a" + title = from_html(element.inner_html, &block) + url = element["href"] + %("#{title}":[#{url}]) if title.present? && url.present? 
+ when "img" + element.attributes["title"] || element.attributes["alt"] || "" + when "comment" + # ignored + else + from_html(element.inner_html, &block) + end + end.join + + dtext + end + # extract the first paragraph `needle` occurs in. def self.excerpt(dtext, needle) dtext = dtext.gsub(/\r\n|\r|\n/, "\n") diff --git a/app/logical/sources/strategies/deviant_art.rb b/app/logical/sources/strategies/deviant_art.rb index 2e9ea50b0..27e7f10f1 100644 --- a/app/logical/sources/strategies/deviant_art.rb +++ b/app/logical/sources/strategies/deviant_art.rb @@ -36,47 +36,11 @@ module Sources end def self.to_dtext(text) - html = Nokogiri::HTML.fragment(text) - - dtext = html.children.map do |element| - case element.name - when "text" - element.content - when "br" - "\n" - when "blockquote" - "[quote]#{to_dtext(element.inner_html)}[/quote]" if element.inner_html.present? - when "small", "sub" - "[tn]#{to_dtext(element.inner_html)}[/tn]" if element.inner_html.present? - when "b" - "[b]#{to_dtext(element.inner_html)}[/b]" if element.inner_html.present? - when "i" - "[i]#{to_dtext(element.inner_html)}[/i]" if element.inner_html.present? - when "u" - "[u]#{to_dtext(element.inner_html)}[/u]" if element.inner_html.present? - when "strike" - "[s]#{to_dtext(element.inner_html)}[/s]" if element.inner_html.present? - when "li" - "* #{to_dtext(element.inner_html)}" if element.inner_html.present? - when "h1", "h2", "h3", "h4", "h5", "h6" - hN = element.name - title = to_dtext(element.inner_html) - "#{hN}. #{title}\n" - when "a" - title = to_dtext(element.inner_html) - url = element.attributes["href"].value - url = url.gsub(%r!\Ahttps?://www\.deviantart\.com/users/outgoing\?!i, "") - %("#{title}":[#{url}]) if title.present? - when "img" - element.attributes["title"] || element.attributes["alt"] || "" - when "comment" - # ignored - else - to_dtext(element.inner_html) + DText.from_html(text) do |element| + if element.name == "a" && element["href"].present? 
+ element["href"] = element["href"].gsub(%r!\Ahttps?://www\.deviantart\.com/users/outgoing\?!i, "") end - end.join - - dtext + end end protected diff --git a/app/logical/sources/strategies/nijie.rb b/app/logical/sources/strategies/nijie.rb index 71de9869d..53e32d21e 100644 --- a/app/logical/sources/strategies/nijie.rb +++ b/app/logical/sources/strategies/nijie.rb @@ -50,26 +50,9 @@ module Sources protected - # XXX: duplicated from strategies/deviant_art.rb. def self.to_dtext(text) - html = Nokogiri::HTML.fragment(text) - - dtext = html.children.map do |element| - case element.name - when "text" - element.content - when "strong" - "[b]#{to_dtext(element.inner_html)}[/b]" if element.inner_html.present? - when "i" - "[i]#{to_dtext(element.inner_html)}[/i]" if element.inner_html.present? - when "s" - "[s]#{to_dtext(element.inner_html)}[/s]" if element.inner_html.present? - else - to_dtext(element.inner_html) - end - end.join - - dtext + text = text.gsub(/\r\n|\r/, "<br>") + DText.from_html(text).strip end def get_commentary_from_page(page) diff --git a/app/logical/sources/strategies/pawoo.rb b/app/logical/sources/strategies/pawoo.rb index 1605332e0..6c27eea80 100644 --- a/app/logical/sources/strategies/pawoo.rb +++ b/app/logical/sources/strategies/pawoo.rb @@ -55,32 +55,13 @@ module Sources::Strategies end def dtext_artist_commentary_desc - to_dtext(artist_commentary_desc) - end - - def to_dtext(text) - html = Nokogiri::HTML.fragment(text) - - dtext = html.children.map do |element| - case element.name - when "text" - element.content - when "p" - to_dtext(element.inner_html) + "\n\n" - when "a" + DText.from_html(artist_commentary_desc) do |element| + if element.name == "a" # don't include links to the toot itself. media_urls = api_response.json["media_attachments"].map { |attr| attr["text_url"] } - next if element.attribute("href").value.in?(media_urls) - - title = to_dtext(element.inner_html) - url = element.attributes["href"].value - %("#{title}":[#{url}]) - else - to_dtext(element.inner_html) + element["href"] = nil if element["href"].in?(media_urls) end - end.join.strip - - dtext + end.strip end end end diff --git a/test/unit/sources/nijie_test.rb b/test/unit/sources/nijie_test.rb index 39dafb344..6e1528116 100644 --- a/test/unit/sources/nijie_test.rb +++ b/test/unit/sources/nijie_test.rb @@ -92,8 +92,8 @@ module Sources should "get the dtext-ified commentary" do desc = <<-EOS.strip_heredoc.chomp - foo [b]bold[/b] [i]italics[/i] [s]strike[/s] red\r - \r + foo [b]bold[/b] [i]italics[/i] [s]strike[/s] red + http://nijie.info/view.php?id=218944 EOS From 71f84b10af8571fe28c477988f04409499cfa1c7 Mon Sep 17 00:00:00 2001 From: evazion Date: Tue, 27 Jun 2017 16:01:34 -0500 Subject: [PATCH 2/2] tumblr: convert commentary to dtext. * Convert Tumblr commentary to DText. * Strip extraneous whitespace in links and blockquotes. * Add newlines after block elements to ensure they're separated from subsequent blocks.
--- app/logical/d_text.rb | 14 +++++++------- app/logical/sources/strategies/tumblr.rb | 4 ++++ test/unit/sources/deviantart_test.rb | 6 +----- test/unit/sources/tumblr_test.rb | 20 ++++++++++++++++++++ 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/app/logical/d_text.rb b/app/logical/d_text.rb index 2e7ef8714..68c0990bd 100644 --- a/app/logical/d_text.rb +++ b/app/logical/d_text.rb @@ -377,13 +377,13 @@ class DText case element.name when "text" - element.content + element.content.gsub(/(?:\r|\n)+$/, "") when "br" "\n" - when "p" - from_html(element.inner_html, &block) + "\n\n" + when "p", "ul", "ol" + from_html(element.inner_html, &block).strip + "\n\n" when "blockquote" - "[quote]#{from_html(element.inner_html, &block)}[/quote]" if element.inner_html.present? + "[quote]#{from_html(element.inner_html, &block).strip}[/quote]\n\n" if element.inner_html.present? when "small", "sub" "[tn]#{from_html(element.inner_html, &block)}[/tn]" if element.inner_html.present? when "b", "strong" @@ -395,13 +395,13 @@ class DText when "s", "strike" "[s]#{from_html(element.inner_html, &block)}[/s]" if element.inner_html.present? when "li" - "* #{from_html(element.inner_html, &block)}" if element.inner_html.present? + "* #{from_html(element.inner_html, &block)}\n" if element.inner_html.present? when "h1", "h2", "h3", "h4", "h5", "h6" hN = element.name title = from_html(element.inner_html, &block) - "#{hN}. #{title}\n" + "#{hN}. #{title}\n\n" when "a" - title = from_html(element.inner_html, &block) + title = from_html(element.inner_html, &block).strip url = element["href"] %("#{title}":[#{url}]) if title.present? && url.present? 
when "img" diff --git a/app/logical/sources/strategies/tumblr.rb b/app/logical/sources/strategies/tumblr.rb index d9d7062a7..d8ae683fe 100644 --- a/app/logical/sources/strategies/tumblr.rb +++ b/app/logical/sources/strategies/tumblr.rb @@ -53,6 +53,10 @@ module Sources::Strategies end end + def dtext_artist_commentary_desc + DText.from_html(artist_commentary_desc).strip + end + def image_url image_urls.first end diff --git a/test/unit/sources/deviantart_test.rb b/test/unit/sources/deviantart_test.rb index ba5aedc68..a28305057 100644 --- a/test/unit/sources/deviantart_test.rb +++ b/test/unit/sources/deviantart_test.rb @@ -35,14 +35,11 @@ module Sources should "get the dtext-ified commentary" do desc = <<-EOS.strip_heredoc.chomp blah blah - "test link":[http://www.google.com] - - h1. lol - + [b]blah[/b] [i]blah[/i] [u]blah[/u] [s]blah[/s] herp derp @@ -53,7 +50,6 @@ module Sources * two * three - * one * two * three diff --git a/test/unit/sources/tumblr_test.rb b/test/unit/sources/tumblr_test.rb index 4f89e4a93..a117ed3e8 100644 --- a/test/unit/sources/tumblr_test.rb +++ b/test/unit/sources/tumblr_test.rb @@ -42,6 +42,26 @@ module Sources assert_equal(desc, @site.artist_commentary_desc) end + should "get the dtext-ified commentary" do + desc = <<-EOS.strip_heredoc.chomp + h2. header + + plain [b]bold[/b] [i]italics[/i] [s]strike[/s] + + * one + * two + + * one + * two + + [quote]quote[/quote] + + "link":[http://www.google.com] + EOS + + assert_equal(desc, @site.dtext_artist_commentary_desc) + end + should "get the image url" do assert_equal("http://data.tumblr.com/3bbfcbf075ddf969c996641b264086fd/tumblr_os2buiIOt51wsfqepo1_raw.png", @site.image_url) end