Files
danbooru/app/logical/artist_finder.rb
evazion 88d9fc4e5e sources: simplify artist finder url normalization.
Get rid of `normalized_for_artist_finder?` and `normalizable_for_artist_finder?`.
This was legacy bullshit that was originally designed to avoid API calls
when saving artist entries containing old Pixiv direct image urls that
had already been normalized, or that couldn't be normalized because they
were bad id.

Nowadays we store profile urls in artist entries instead of direct image
urls, so we don't normally need to do any API calls to normalize the
profile url. Strategies should take care to avoid triggering API calls
inside `profile_url` when possible.
2020-05-29 15:35:15 -05:00

142 lines
6.8 KiB
Ruby

module ArtistFinder
module_function
# Subdomains are automatically included. e.g., "twitter.com" matches "www.twitter.com",
# "mobile.twitter.com" and any other subdomain of "twitter.com".
SITE_BLACKLIST = [
"artstation.com/artist", # http://www.artstation.com/artist/serafleur/
"www.artstation.com", # http://www.artstation.com/serafleur/
%r!cdn[ab]?\.artstation\.com/p/assets/images/images!i, # https://cdna.artstation.com/p/assets/images/images/001/658/068/large/yang-waterkuma-b402.jpg?1450269769
"ask.fm", # http://ask.fm/mikuroko_396
"bcyimg.com",
"bcyimg.com/drawer", # https://img9.bcyimg.com/drawer/32360/post/178vu/46229ec06e8111e79558c1b725ebc9e6.jpg
"bcy.net",
"bcy.net/illust/detail", # https://bcy.net/illust/detail/32360/1374683
"bcy.net/u", # http://bcy.net/u/1390261
"behance.net", # "https://www.behance.net/webang111
"booru.org",
"booru.org/drawfriends", # http://img.booru.org/drawfriends//images/36/de65da5f588b76bc1d9de8af976b540e2dff17e2.jpg
"donmai.us",
"donmai.us/users", # http://danbooru.donmai.us/users/507162/
"derpibooru.org",
"derpibooru.org/tags", # https://derpibooru.org/tags/artist-colon-checkerboardazn
"deviantart.com",
"deviantart.net",
"dlsite.com",
"doujinshi.org",
"doujinshi.org/browse/circle", # http://www.doujinshi.org/browse/circle/65368/
"doujinshi.org/browse/author", # http://www.doujinshi.org/browse/author/979/23/
"doujinshi.mugimugi.org",
"doujinshi.mugimugi.org/browse/author", # http://doujinshi.mugimugi.org/browse/author/3029/
"doujinshi.mugimugi.org/browse/circle", # http://doujinshi.mugimugi.org/browse/circle/7210/
"drawcrowd.net", # https://drawcrowd.com/agussw
"drawr.net", # http://drawr.net/matsu310
"dropbox.com",
"dropbox.com/sh", # https://www.dropbox.com/sh/gz9okupqycr2vj2/GHt_oHDKsR
"dropbox.com/u", # http://dl.dropbox.com/u/76682289/daitoHP-WP/pict/
"e-hentai.org", # https://e-hentai.org/tag/artist:spirale
"e621.net",
"e621.net/post/index/1", # https://e621.net/post/index/1/spirale
"enty.jp", # https://enty.jp/aizawachihiro888
"enty.jp/users", # https://enty.jp/users/3766
"facebook.com", # https://www.facebook.com/LuutenantsLoot
"fantia.jp", # http://fantia.jp/no100
"fantia.jp/fanclubs", # https://fantia.jp/fanclubs/1711
"fav.me", # http://fav.me/d9y1njg
/blog-imgs-\d+(?:-origin)?\.fc2\.com/i,
"furaffinity.net",
"furaffinity.net/user", # http://www.furaffinity.net/user/achthenuts
"gelbooru.com", # http://gelbooru.com/index.php?page=account&s=profile&uname=junou
"inkbunny.net", # https://inkbunny.net/achthenuts
"plus.google.com", # https://plus.google.com/111509637967078773143/posts
"hentai-foundry.com",
"hentai-foundry.com/pictures/user", # http://www.hentai-foundry.com/pictures/user/aaaninja/
"hentai-foundry.com/user", # http://www.hentai-foundry.com/user/aaaninja/profile
%r!pictures\.hentai-foundry\.com(?:/\w)?!i, # http://pictures.hentai-foundry.com/a/aaaninja/
"i.imgur.com", # http://i.imgur.com/Ic9q3.jpg
"instagram.com", # http://www.instagram.com/serafleur.art/
"iwara.tv",
"iwara.tv/users", # http://ecchi.iwara.tv/users/marumega
"kym-cdn.com",
"livedoor.blogimg.jp",
"monappy.jp",
"monappy.jp/u", # https://monappy.jp/u/abara_bone
"mstdn.jp", # https://mstdn.jp/@oneb
"nicoseiga.jp",
"nicoseiga.jp/priv", # http://lohas.nicoseiga.jp/priv/2017365fb6cfbdf47ad26c7b6039feb218c5e2d4/1498430264/6820259
"nicovideo.jp",
"nicovideo.jp/user", # http://www.nicovideo.jp/user/317609
"nicovideo.jp/user/illust", # http://seiga.nicovideo.jp/user/illust/29075429
"nijie.info", # http://nijie.info/members.php?id=15235
%r!nijie\.info/nijie_picture!i, # http://pic03.nijie.info/nijie_picture/32243_20150609224803_0.png
"patreon.com", # http://patreon.com/serafleur
"pawoo.net", # https://pawoo.net/@148nasuka
"pawoo.net/web/accounts", # https://pawoo.net/web/accounts/228341
"picarto.tv", # https://picarto.tv/CheckerBoardAZN
"picarto.tv/live", # https://www.picarto.tv/live/channel.php?watch=aaaninja
"pictaram.com", # http://www.pictaram.com/user/5ish/3048385011/1350040096769940245_3048385011
"pinterest.com", # http://www.pinterest.com/alexandernanitc/
"pixiv.cc", # http://pixiv.cc/0123456789/
"pixiv.net", # https://www.pixiv.net/member.php?id=10442390
"pixiv.net/stacc", # https://www.pixiv.net/stacc/aaaninja2013
"pixiv.net/fanbox/creator", # https://www.pixiv.net/fanbox/creator/310630
"pixiv.net/users", # https://www.pixiv.net/users/555603
"pixiv.net/en/users", # https://www.pixiv.net/en/users/555603
"i.pximg.net",
"plurk.com", # http://www.plurk.com/a1amorea1a1
"privatter.net",
"privatter.net/u", # http://privatter.net/u/saaaatonaaaa
"rule34.paheal.net",
"rule34.paheal.net/post/list", # http://rule34.paheal.net/post/list/Reach025/
"sankakucomplex.com", # https://chan.sankakucomplex.com/?tags=user%3ASubridet
"society6.com", # http://society6.com/serafleur/
"tinami.com",
"tinami.com/creator/profile", # http://www.tinami.com/creator/profile/29024
"data.tumblr.com",
/\d+\.media\.tumblr\.com/i,
"twipple.jp",
"twipple.jp/user", # http://p.twipple.jp/user/Type10TK
"twitch.tv", # https://www.twitch.tv/5ish
"twitpic.com",
"twitpic.com/photos", # http://twitpic.com/photos/Type10TK
"twitter.com", # https://twitter.com/akkij0358
"twitter.com/i/web/status", # https://twitter.com/i/web/status/943446161586733056
"twimg.com/media", # https://pbs.twimg.com/media/DUUUdD5VMAEuURz.jpg:orig
"ustream.tv",
"ustream.tv/channel", # http://www.ustream.tv/channel/633b
"ustream.tv/user", # http://www.ustream.tv/user/kazaputi
"vk.com", # https://vk.com/id425850679
"weibo.com", # http://www.weibo.com/5536681649
"weibo.com/u",
"weibo.com/p",
"wp.com",
"yande.re",
"youtube.com",
"youtube.com/c", # https://www.youtube.com/c/serafleurArt
"youtube.com/channel", # https://www.youtube.com/channel/UCfrCa2Y6VulwHD3eNd3HBRA
"youtube.com/user", # https://www.youtube.com/user/148nasuka
"youtu.be" # http://youtu.be/gibeLKKRT-0
]
SITE_BLACKLIST_REGEXP = Regexp.union(SITE_BLACKLIST.map do |domain|
domain = Regexp.escape(domain) if domain.is_a?(String)
%r!\Ahttps?://(?:[a-zA-Z0-9_-]+\.)*#{domain}/\z!i
end)
def find_artists(url)
url = ArtistUrl.normalize(url)
artists = []
while artists.empty? && url.size > 10
u = url.sub(/\/+$/, "") + "/"
u = u.to_escaped_for_sql_like.gsub(/\*/, '%') + '%'
artists += Artist.joins(:urls).where(["artists.is_deleted = FALSE AND artist_urls.normalized_url LIKE ? ESCAPE E'\\\\'", u]).limit(10).order("artists.name").all
url = File.dirname(url) + "/"
break if url =~ SITE_BLACKLIST_REGEXP
end
Artist.where(id: artists.uniq(&:name).take(20))
end
end