Merge pull request #3429 from evazion/fix-bogus-find-artist

Fix #2696: Bogus results on non-matching URL searches for Artists
This commit is contained in:
Albert Yi
2017-12-18 10:28:32 -08:00
committed by GitHub
2 changed files with 145 additions and 9 deletions

View File

@@ -31,6 +31,118 @@ class Artist < ApplicationRecord
extend ActiveSupport::Concern
module ClassMethods
# Subdomains are automatically included. e.g., "twitter.com" matches "www.twitter.com",
# "mobile.twitter.com" and any other subdomain of "twitter.com".
SITE_BLACKLIST = [
"artstation.com/artist", # http://www.artstation.com/artist/serafleur/
"www.artstation.com", # http://www.artstation.com/serafleur/
/(?:cdn[ab]?)\.artstation\.com/i, # http://serafleur.artstation.com
"ask.fm", # http://ask.fm/mikuroko_396
"bcyimg.com",
"bcyimg.com/drawer", # https://img9.bcyimg.com/drawer/32360/post/178vu/46229ec06e8111e79558c1b725ebc9e6.jpg
"bcy.net",
"bcy.net/illust/detail", # https://bcy.net/illust/detail/32360/1374683
"bcy.net/u", # http://bcy.net/u/1390261
"behance.net", # "https://www.behance.net/webang111
"booru.org",
"booru.org/drawfriends", # http://img.booru.org/drawfriends//images/36/de65da5f588b76bc1d9de8af976b540e2dff17e2.jpg
"donmai.us",
"donmai.us/users", # http://danbooru.donmai.us/users/507162/
"derpibooru.org",
"derpibooru.org/tags", # https://derpibooru.org/tags/artist-colon-checkerboardazn
"deviantart.net",
"dlsite.com",
"doujinshi.org",
"doujinshi.org/browse/circle", # http://www.doujinshi.org/browse/circle/65368/
"doujinshi.org/browse/author", # http://www.doujinshi.org/browse/author/979/23/
"doujinshi.mugimugi.org",
"doujinshi.mugimugi.org/browse/author", # http://doujinshi.mugimugi.org/browse/author/3029/
"doujinshi.mugimugi.org/browse/circle", # http://doujinshi.mugimugi.org/browse/circle/7210/
"drawcrowd.net", # https://drawcrowd.com/agussw
"drawr.net", # http://drawr.net/matsu310
"dropbox.com",
"dropbox.com/sh", # https://www.dropbox.com/sh/gz9okupqycr2vj2/GHt_oHDKsR
"dropbox.com/u", # http://dl.dropbox.com/u/76682289/daitoHP-WP/pict/
"e-hentai.org", # https://e-hentai.org/tag/artist:spirale
"e621.net",
"e621.net/post/index/1", # https://e621.net/post/index/1/spirale
"enty.jp", # https://enty.jp/aizawachihiro888
"enty.jp/users", # https://enty.jp/users/3766
"facebook.com", # https://www.facebook.com/LuutenantsLoot
"fantia.jp", # http://fantia.jp/no100
"fantia.jp/fanclubs", # https://fantia.jp/fanclubs/1711
"fav.me", # http://fav.me/d9y1njg
/blog-imgs-\d+(?:-origin)?\.fc2\.com/i,
"furaffinity.net",
"furaffinity.net/user", # http://www.furaffinity.net/user/achthenuts
"gelbooru.com", # http://gelbooru.com/index.php?page=account&s=profile&uname=junou
"inkbunny.net", # https://inkbunny.net/achthenuts
"plus.google.com", # https://plus.google.com/111509637967078773143/posts
"hentai-foundry.com",
"hentai-foundry.com/pictures/user", # http://www.hentai-foundry.com/pictures/user/aaaninja/
"hentai-foundry.com/user", # http://www.hentai-foundry.com/user/aaaninja/profile
%r!pictures\.hentai-foundry\.com(?:/\w)?!i, # http://pictures.hentai-foundry.com/a/aaaninja/
"i.imgur.com", # http://i.imgur.com/Ic9q3.jpg
"instagram.com", # http://www.instagram.com/serafleur.art/
"iwara.tv",
"iwara.tv/users", # http://ecchi.iwara.tv/users/marumega
"kym-cdn.com",
"livedoor.blogimg.jp",
"monappy.jp",
"monappy.jp/u", # https://monappy.jp/u/abara_bone
"mstdn.jp", # https://mstdn.jp/@oneb
"nicoseiga.jp",
"nicoseiga.jp/priv", # http://lohas.nicoseiga.jp/priv/2017365fb6cfbdf47ad26c7b6039feb218c5e2d4/1498430264/6820259
"nicovideo.jp",
"nicovideo.jp/user/illust", # http://seiga.nicovideo.jp/user/illust/29075429
"nijie.info", # http://nijie.info/members.php?id=15235
"patreon.com", # http://patreon.com/serafleur
"pawoo.net", # https://pawoo.net/@148nasuka
"pawoo.net/web/accounts", # https://pawoo.net/web/accounts/228341
"picarto.tv", # https://picarto.tv/CheckerBoardAZN
"picarto.tv/live", # https://www.picarto.tv/live/channel.php?watch=aaaninja
"pictaram.com", # http://www.pictaram.com/user/5ish/3048385011/1350040096769940245_3048385011
"pinterest.com", # http://www.pinterest.com/alexandernanitc/
"pixiv.cc", # http://pixiv.cc/0123456789/
"pixiv.net", # https://www.pixiv.net/member.php?id=10442390
"pixiv.net/stacc", # https://www.pixiv.net/stacc/aaaninja2013
"i.pximg.net",
"plurk.com", # http://www.plurk.com/a1amorea1a1
"privatter.net",
"privatter.net/u", # http://privatter.net/u/saaaatonaaaa
"rule34.paheal.net",
"rule34.paheal.net/post/list", # http://rule34.paheal.net/post/list/Reach025/
"sankakucomplex.com", # https://chan.sankakucomplex.com/?tags=user%3ASubridet
"society6.com", # http://society6.com/serafleur/
"tinami.com",
"tinami.com/creator/profile", # http://www.tinami.com/creator/profile/29024
"data.tumblr.com",
/\d+\.media\.tumblr\.com/i,
"twipple.jp",
"twipple.jp/user", # http://p.twipple.jp/user/Type10TK
"twitch.tv", # https://www.twitch.tv/5ish
"twitpic.com",
"twitpic.com/photos", # http://twitpic.com/photos/Type10TK
"twitter.com", # https://twitter.com/akkij0358
"ustream.tv",
"ustream.tv/channel", # http://www.ustream.tv/channel/633b
"ustream.tv/user", # http://www.ustream.tv/user/kazaputi
"vk.com", # https://vk.com/id425850679
"weibo.com", # http://www.weibo.com/5536681649
"wp.com",
"yande.re",
"youtube.com",
"youtube.com/c", # https://www.youtube.com/c/serafleurArt
"youtube.com/channel", # https://www.youtube.com/channel/UCfrCa2Y6VulwHD3eNd3HBRA
"youtube.com/user", # https://www.youtube.com/user/148nasuka
"youtu.be", # http://youtu.be/gibeLKKRT-0
]
SITE_BLACKLIST_REGEXP = Regexp.union(SITE_BLACKLIST.map do |domain|
domain = Regexp.escape(domain) if domain.is_a?(String)
%r!\Ahttps?://(?:[a-zA-Z0-9_-]+\.)*#{domain}/\z!i
end)
def find_all_by_url(url)
url = ArtistUrl.normalize(url)
artists = []
@@ -42,14 +154,8 @@ class Artist < ApplicationRecord
u = u.to_escaped_for_sql_like.gsub(/\*/, '%') + '%'
artists += Artist.joins(:urls).where(["artists.is_active = TRUE AND artist_urls.normalized_url LIKE ? ESCAPE E'\\\\'", u]).limit(10).order("artists.name").all
url = File.dirname(url) + "/"
break if url =~ /pixiv\.net\/(?:img\/)?$/i
break if url =~ /lohas\.nicoseiga\.jp\/priv\/$/i
break if url =~ /nicovideo\.jp\/user\/illust/
break if url =~ /(?:data|media)\.tumblr\.com\/[a-z0-9]+\/$/i
break if url =~ /deviantart\.net\//i
break if url =~ %r!\Ahttps?://(?:mobile\.)?twitter\.com/\Z!i
break if url =~ %r!pawoo\.net/(?:web/)?$!i
break if url =~ %r!\Ahttps?://(pic\d+\.)?nijie\.info/!i
break if url =~ SITE_BLACKLIST_REGEXP
end
artists.inject({}) {|h, x| h[x.name] = x; h}.values.slice(0, 20)

View File

@@ -15,6 +15,8 @@ class ArtistTest < ActiveSupport::TestCase
context "An artist" do
setup do
User.any_instance.stubs(:validate_sock_puppets).returns(true)
user = Timecop.travel(1.month.ago) {FactoryGirl.create(:user)}
CurrentUser.user = user
CurrentUser.ip_addr = "127.0.0.1"
@@ -203,7 +205,7 @@ class ArtistTest < ActiveSupport::TestCase
assert_artist_found("trixia", "http://trixdraws.deviantart.com/gallery/#/d722mrt")
end
should_eventually "find the correct artist for image URLs" do
should "find the correct artist for image URLs" do
assert_artist_found("artgerm", "http://th05.deviantart.net/fs71/200H/f/2014/150/d/c/peachy_princess_by_artgerm-d7k7tmu.jpg")
assert_artist_found("artgerm", "http://th05.deviantart.net/fs71/PRE/f/2014/150/d/c/peachy_princess_by_artgerm-d7k7tmu.jpg")
assert_artist_found("artgerm", "http://fc06.deviantart.net/fs71/f/2014/150/d/c/peachy_princess_by_artgerm-d7k7tmu.jpg")
@@ -212,6 +214,11 @@ class ArtistTest < ActiveSupport::TestCase
assert_artist_found("trixia", "http://th01.deviantart.net/fs71/200H/i/2014/050/d/e/my_queen_by_trixdraws-d722mrt.jpg")
assert_artist_found("trixia", "http://th09.deviantart.net/fs71/PRE/i/2014/050/d/e/my_queen_by_trixdraws-d722mrt.jpg")
end
should "return nothing for unknown deviantart artists" do
assert_artist_not_found("http://guweiz.deviantart.com/art/Battleship-551905391")
assert_artist_not_found("https://orig00.deviantart.net/7585/f/2015/219/a/5/battleship__by_guweiz-d94l8xb.png")
end
end
context "when finding pixiv artists" do
@@ -263,10 +270,17 @@ class ArtistTest < ActiveSupport::TestCase
context "when finding nico seiga artists" do
setup do
FactoryGirl.create(:artist, :name => "osamari", :url_string => "http://seiga.nicovideo.jp/user/illust/7017777")
FactoryGirl.create(:artist, :name => "hakuro109", :url_string => "http://seiga.nicovideo.jp/user/illust/16265470")
end
should "find the artist by the profile" do
assert_artist_found("osamari", "http://seiga.nicovideo.jp/seiga/im4937663")
assert_artist_found("hakuro109", "http://lohas.nicoseiga.jp/priv/b9ea863e691f3a648dee5582fd6911c30dc8acab/1510092103/6424205")
end
should "return nothing for unknown nico seiga artists" do
assert_artist_not_found("http://seiga.nicovideo.jp/seiga/im6605221")
assert_artist_not_found("http://lohas.nicoseiga.jp/priv/fd195b3405b19874c825eb4d81c9196086562c6b/1509089019/6605221")
end
end
@@ -338,6 +352,22 @@ class ArtistTest < ActiveSupport::TestCase
end
end
context "when finding tumblr artists" do
setup do
FactoryGirl.create(:artist, :name => "ilya_kuvshinov", :url_string => "http://kuvshinov-ilya.tumblr.com")
FactoryGirl.create(:artist, :name => "j.k.", :url_string => "https://jdotkdot5.tumblr.com")
end
should "find the artist" do
assert_artist_found("ilya_kuvshinov", "http://kuvshinov-ilya.tumblr.com/post/168641755845")
assert_artist_found("j.k.", "https://jdotkdot5.tumblr.com/post/168276640697")
end
should "return nothing for unknown tumblr artists" do
assert_artist_not_found("https://peptosis.tumblr.com/post/168162082005")
end
end
should "normalize its other names" do
artist = FactoryGirl.create(:artist, :name => "a1", :other_names_comma => "aaa, bbb, ccc ddd")
assert_equal("aaa, bbb, ccc_ddd", artist.other_names_comma)