Fix #4016: Translated tags failing to find some tags.

* Normalize spaces to underscores when saving other names. Preserve case
  since case can be significant.

* Fix WikiPage#other_names_include to search case-insensitively (note:
  this prevents using the index).

* Fix sources to return the raw tags in `#tags` and the normalized tags
  in `#normalized_tags`. The normalized tags are the tags that will be
  matched against other names.
This commit is contained in:
evazion
2018-12-16 11:27:04 -06:00
parent a1df1abf0b
commit c700ea4b5f
9 changed files with 49 additions and 21 deletions

View File

@@ -72,7 +72,7 @@ module Sources::Strategies
# Builds a [tag, ArtStation search URL] pair for every entry of api_response[:tags].
# `to_a` turns a nil tag list into an empty array, so the result is then [].
def tags
api_response[:tags].to_a.map do |tag|
# NOTE(review): two pair expressions follow and only the last one is the block's
# value. Presumably the first (downcased, spaces -> underscores) is the superseded
# form and the second (raw tag) is its replacement — confirm.
[tag.downcase.tr(" ", "_"), "https://www.artstation.com/search?q=" + CGI.escape(tag)]
[tag, "https://www.artstation.com/search?q=" + CGI.escape(tag)]
end
end

View File

@@ -178,8 +178,16 @@ module Sources
(@tags || []).uniq
end
# Returns the normalized tag names for this source, sorted and de-duplicated.
# Each entry of `tags` is taken apart as a (tag, url) pair; only the tag part
# is passed to normalize_tag.
def normalized_tags
  names = tags.map { |name, _url| normalize_tag(name) }
  names.sort.uniq
end
# Normalizes a raw source tag: delegates to WikiPage.normalize_other_name and
# lowercases the result.
def normalize_tag(tag)
  other_name = WikiPage.normalize_other_name(tag)
  other_name.downcase
end
# Translates this source's tags through translate_tag (results flattened,
# de-duplicated and sorted) and drops those whose category is Tag.categories.artist.
def translated_tags
# NOTE(review): translated_tags is assigned twice in a row; the second assignment
# (built from normalized_tags) overwrites the first (built from the raw tag names).
# Presumably the first line is the superseded version — confirm.
translated_tags = tags.map(&:first).flat_map(&method(:translate_tag)).uniq.sort
translated_tags = normalized_tags.flat_map(&method(:translate_tag)).uniq.sort
translated_tags.reject { |tag| tag.category == Tag.categories.artist }
end
@@ -240,6 +248,7 @@ module Sources
:canonical_url => canonical_url,
:normalized_for_artist_finder_url => normalize_for_artist_finder,
:tags => tags,
:normalized_tags => normalized_tags,
:translated_tags => translated_tags,
:unique_id => unique_id,
:artist_commentary => {

View File

@@ -77,7 +77,7 @@ module Sources
# Splits api_response[:tags] on whitespace and pairs each tag with its post-search
# URL on site_name. `to_s` turns a nil tag string into "", giving an empty result.
def tags
api_response[:tags].to_s.split.map do |tag|
# NOTE(review): two pair expressions follow and only the last one is the block's
# value. Presumably the first (underscores -> spaces) is the superseded form and
# the second (raw tag) is its replacement — confirm.
[tag.tr("_", " "), "https://#{site_name}/post?tags=#{CGI.escape(tag)}"]
[tag, "https://#{site_name}/post?tags=#{CGI.escape(tag)}"]
end
end

View File

@@ -160,14 +160,16 @@ module Sources
rescue PixivApiClient::BadIDError
[]
end
memoize :tags
# Removes a trailing "<digits>users入り" suffix (matched case-insensitively) from
# the tag. Tags without that suffix come back unchanged, as a new string.
def normalize_tag(tag)
  # The pattern is anchored at the very end of the string, so at most one match exists.
  users_suffix = /\d+users入り\z/i
  tag.sub(users_suffix, "")
end
def translate_tag(tag)
normalized_tag = tag.gsub(/\d+users入り\z/i, "")
translated_tags = super(normalized_tag)
translated_tags = super(tag)
if translated_tags.empty? && normalized_tag.include?("/")
translated_tags = normalized_tag.split("/").flat_map { |tag| super(tag) }
if translated_tags.empty? && tag.include?("/")
translated_tags = tag.split("/").flat_map { |tag| super(tag) }
end
translated_tags

View File

@@ -105,12 +105,15 @@ module Sources::Strategies
# Pairs each entry of post[:tags] with its tumblr.com/tagged URL and removes
# duplicate pairs. `to_a` turns a nil tag list into an empty array.
def tags
post[:tags].to_a.map do |tag|
# normalize tags: space, underscore, and hyphen are equivalent in tumblr tags.
etag = tag.gsub(/[ _-]/, "_")
# NOTE(review): two pair expressions follow and only the last one (raw tag) is the
# block's value, which leaves etag unused by the result. Presumably the etag lines
# are the superseded form — confirm.
[etag, "https://tumblr.com/tagged/#{CGI.escape(etag)}"]
[tag, "https://tumblr.com/tagged/#{CGI.escape(tag)}"]
end.uniq
end
# Replaces hyphens with underscores, then hands the tag to the inherited
# normalize_tag.
def normalize_tag(tag)
  super(tag.tr("-", "_"))
end
# Passes artist_commentary_desc through DText.from_html and calls strip on the result.
def dtext_artist_commentary_desc
  dtext = DText.from_html(artist_commentary_desc)
  dtext.strip
end

View File

@@ -32,7 +32,9 @@ class WikiPage < ApplicationRecord
end
# Restricts the relation to wiki pages that have an other name matching `name`.
def other_names_include(name)
# NOTE(review): the result of this first `where` is not used by the lines below.
# Presumably it is the superseded exact-match (array containment) query — confirm.
where("wiki_pages.other_names @> ARRAY[?]", name.unicode_normalize(:nfkc))
# Case-insensitive match: normalize and lowercase the input, then compare it with
# lower() of every unnested other name. Per the commit message, this form of the
# query prevents using the index.
name = normalize_other_name(name).downcase
subquery = WikiPage.from("unnest(other_names) AS other_name").where("lower(other_name) = ?", name)
where(id: subquery)
end
def other_names_match(name)
@@ -146,7 +148,11 @@ class WikiPage < ApplicationRecord
end
# Rewrites other_names with every name normalized and duplicates removed.
def normalize_other_names
# NOTE(review): other_names is assigned twice in a row; the second assignment
# (WikiPage.normalize_other_name) runs over the result of the first (NFKC only).
# Presumably the first line is the superseded version — confirm.
self.other_names = other_names.map { |name| name.unicode_normalize(:nfkc) }.uniq
self.other_names = other_names.map { |name| WikiPage.normalize_other_name(name) }.uniq
end
# Canonical form of an other name: NFKC-normalized, each run of whitespace
# collapsed to one space, surrounding whitespace stripped, and the remaining
# spaces turned into underscores. Letter case is left as given.
def self.normalize_other_name(name)
  nfkc = name.unicode_normalize(:nfkc)
  single_spaced = nfkc.gsub(/[[:space:]]+/, " ").strip
  single_spaced.tr(" ", "_")
end
def skip_secondary_validations=(value)

View File

@@ -56,7 +56,8 @@ module Sources
end
# Checks the tag names reported by @site.tags and by @site.normalized_tags.
should "get the tags" do
# NOTE(review): the next two assertions expect different casing ("reika" vs "Reika")
# from the same expression and cannot both hold. Presumably the first is the
# superseded assertion and the second its replacement — confirm.
assert_equal(%w[gantz reika], @site.tags.map(&:first))
assert_equal(%w[gantz Reika], @site.tags.map(&:first))
assert_equal(%w[gantz reika], @site.normalized_tags)
end
should "get the artist commentary" do
@@ -74,6 +75,11 @@ module Sources
url = "https://cdna.artstation.com/p/assets/images/images/000/144/922/large/cassio-yoshiyaki-cody2backup2-yoshiyaki.jpg?1406314198"
assert_equal(url, @site.image_url)
end
# The raw tags are expected with their original case and spaces; the normalized
# tags are expected lowercased with underscores. Both sides are sorted before
# comparing, so the order of the tags does not matter.
should "get the tags" do
  raw_names = @site.tags.map { |pair| pair.first }
  assert_equal(["Street Fighter", "Cody", "SF"].sort, raw_names.sort)

  normalized = @site.normalized_tags
  assert_equal(["street_fighter", "cody", "sf"].sort, normalized.sort)
end
end
context "The source site for a http://cdna.artstation.com/p/assets/... url" do

View File

@@ -34,7 +34,7 @@ module Sources
@samp = "https://files.yande.re/sample/7ecfdead705d7b956b26b1d37b98d089/yande.re%20482880%20sample%20bayashiko%20journey_to_the_west%20sun_wukong.jpg"
@full = "https://files.yande.re/image/7ecfdead705d7b956b26b1d37b98d089/yande.re%20482880.jpg"
@page = "https://yande.re/post/show/482880"
@tags = ["bayashiko", "journey to the west", "sun wukong"]
@tags = ["bayashiko", "journey_to_the_west", "sun_wukong"]
@size = 362_554
@profile_url = "https://twitter.com/apononori"
@data = { site_name: "yande.re", preview_url: @prev, image_url: @full, page_url: @page, size: @size, tags: @tags, profile_url: @profile_url }
@@ -52,7 +52,7 @@ module Sources
@jpeg = "https://files.yande.re/sample/fb27a7ea6c48b2ef76fe915e378b9098/yande.re%20398018%20detexted%20misaki_kurehito%20saenai_heroine_no_sodatekata%20sawamura_spencer_eriri%20thighhighs.jpg"
@full = "https://files.yande.re/image/fb27a7ea6c48b2ef76fe915e378b9098/yande.re%20398018.png"
@page = "https://yande.re/post/show/398018"
@tags = ["misaki kurehito", "saenai heroine no sodatekata", "sawamura spencer eriri", "detexted", "thighhighs"]
@tags = ["misaki_kurehito", "saenai_heroine_no_sodatekata", "sawamura_spencer_eriri", "detexted", "thighhighs"]
@size = 9_118_998
@data = { site_name: "yande.re", preview_url: @prev, image_url: @full, page_url: @page, size: @size, tags: @tags, profile_url: nil }
@@ -93,7 +93,7 @@ module Sources
anthropomorphism bed blonde_hair bow brown_eyes doll
girls_frontline hara_shoutarou hoodie long_hair pantyhose scar skirt
twintails ump-45_(girls_frontline) ump-9_(girls_frontline)
].map { |tag| tag.tr("_", " ") }
]
@profile_url = "https://www.pixiv.net/member.php?id=22528152"
@data = { site_name: "konachan.com", preview_url: @prev, image_url: @full, page_url: @page, size: @size, tags: @tags, profile_url: @profile_url }

View File

@@ -20,8 +20,9 @@ module Sources
end
# Checks the tags reported by @site.tags and the sorted, de-duplicated names from
# @site.normalized_tags.
should "get the tags" do
# NOTE(review): `tags` is assigned and asserted twice with different expectations
# ([tag, url] pairs vs. raw tag names). Presumably the first assignment/assertion
# pair is the superseded version — confirm.
tags = [["tag", "https://tumblr.com/tagged/tag"], ["red_hair", "https://tumblr.com/tagged/red_hair"]]
assert_equal(tags, @site.tags)
tags = ["tag", "red hair", "red-hair", "red_hair"]
assert_equal(tags, @site.tags.map(&:first))
# The three "red hair" spellings are expected to collapse into a single normalized tag.
assert_equal(["red_hair", "tag"], @site.normalized_tags)
end
should "get the commentary" do
@@ -100,8 +101,9 @@ module Sources
end
# Checks the tags reported by @site.tags and the sorted, de-duplicated names from
# @site.normalized_tags.
should "get the tags" do
# NOTE(review): `tags` is assigned and asserted twice with different expectations
# ([tag, url] pairs vs. raw tag names). Presumably the first assignment/assertion
# pair is the superseded version — confirm.
tags = [["tag", "https://tumblr.com/tagged/tag"], ["red_hair", "https://tumblr.com/tagged/red_hair"]]
assert_equal(tags, @site.tags)
tags = ["tag", "red hair", "red-hair", "red_hair"]
assert_equal(tags, @site.tags.map(&:first))
# The three "red hair" spellings are expected to collapse into a single normalized tag.
assert_equal(["red_hair", "tag"], @site.normalized_tags)
end
end