diff --git a/app/logical/artist_finder.rb b/app/logical/artist_finder.rb
new file mode 100644
index 000000000..1853ce35f
--- /dev/null
+++ b/app/logical/artist_finder.rb
@@ -0,0 +1,162 @@
+module ArtistFinder
+  module_function
+
+  # Sites that host many artists under one domain or path prefix. While
+  # find_artists walks a URL up towards its root, it stops as soon as the URL
+  # has been shortened to one of these, so it never runs a prefix search that
+  # would match every artist on the site.
+  #
+  # Subdomains are automatically included. e.g., "twitter.com" matches "www.twitter.com",
+  # "mobile.twitter.com" and any other subdomain of "twitter.com".
+  SITE_BLACKLIST = [
+    "artstation.com/artist", # http://www.artstation.com/artist/serafleur/
+    "www.artstation.com", # http://www.artstation.com/serafleur/
+    %r!cdn[ab]?\.artstation\.com/p/assets/images/images!i, # https://cdna.artstation.com/p/assets/images/images/001/658/068/large/yang-waterkuma-b402.jpg?1450269769
+    "ask.fm", # http://ask.fm/mikuroko_396
+    "bcyimg.com",
+    "bcyimg.com/drawer", # https://img9.bcyimg.com/drawer/32360/post/178vu/46229ec06e8111e79558c1b725ebc9e6.jpg
+    "bcy.net",
+    "bcy.net/illust/detail", # https://bcy.net/illust/detail/32360/1374683
+    "bcy.net/u", # http://bcy.net/u/1390261
+    "behance.net", # https://www.behance.net/webang111
+    "booru.org",
+    "booru.org/drawfriends", # http://img.booru.org/drawfriends//images/36/de65da5f588b76bc1d9de8af976b540e2dff17e2.jpg
+    "donmai.us",
+    "donmai.us/users", # http://danbooru.donmai.us/users/507162/
+    "derpibooru.org",
+    "derpibooru.org/tags", # https://derpibooru.org/tags/artist-colon-checkerboardazn
+    "deviantart.com",
+    "deviantart.net",
+    "dlsite.com",
+    "doujinshi.org",
+    "doujinshi.org/browse/circle", # http://www.doujinshi.org/browse/circle/65368/
+    "doujinshi.org/browse/author", # http://www.doujinshi.org/browse/author/979/23/
+    "doujinshi.mugimugi.org",
+    "doujinshi.mugimugi.org/browse/author", # http://doujinshi.mugimugi.org/browse/author/3029/
+    "doujinshi.mugimugi.org/browse/circle", # http://doujinshi.mugimugi.org/browse/circle/7210/
+    "drawcrowd.com", # https://drawcrowd.com/agussw
+    "drawcrowd.net",
+    "drawr.net", # http://drawr.net/matsu310
+    "dropbox.com",
+    "dropbox.com/sh", # https://www.dropbox.com/sh/gz9okupqycr2vj2/GHt_oHDKsR
+    "dropbox.com/u", # http://dl.dropbox.com/u/76682289/daitoHP-WP/pict/
+    "e-hentai.org", # https://e-hentai.org/tag/artist:spirale
+    "e621.net",
+    "e621.net/post/index/1", # https://e621.net/post/index/1/spirale
+    "enty.jp", # https://enty.jp/aizawachihiro888
+    "enty.jp/users", # https://enty.jp/users/3766
+    "facebook.com", # https://www.facebook.com/LuutenantsLoot
+    "fantia.jp", # http://fantia.jp/no100
+    "fantia.jp/fanclubs", # https://fantia.jp/fanclubs/1711
+    "fav.me", # http://fav.me/d9y1njg
+    /blog-imgs-\d+(?:-origin)?\.fc2\.com/i,
+    "furaffinity.net",
+    "furaffinity.net/user", # http://www.furaffinity.net/user/achthenuts
+    "gelbooru.com", # http://gelbooru.com/index.php?page=account&s=profile&uname=junou
+    "inkbunny.net", # https://inkbunny.net/achthenuts
+    "plus.google.com", # https://plus.google.com/111509637967078773143/posts
+    "hentai-foundry.com",
+    "hentai-foundry.com/pictures/user", # http://www.hentai-foundry.com/pictures/user/aaaninja/
+    "hentai-foundry.com/user", # http://www.hentai-foundry.com/user/aaaninja/profile
+    %r!pictures\.hentai-foundry\.com(?:/\w)?!i, # http://pictures.hentai-foundry.com/a/aaaninja/
+    "i.imgur.com", # http://i.imgur.com/Ic9q3.jpg
+    "instagram.com", # http://www.instagram.com/serafleur.art/
+    "iwara.tv",
+    "iwara.tv/users", # http://ecchi.iwara.tv/users/marumega
+    "kym-cdn.com",
+    "livedoor.blogimg.jp",
+    "monappy.jp",
+    "monappy.jp/u", # https://monappy.jp/u/abara_bone
+    "mstdn.jp", # https://mstdn.jp/@oneb
+    "nicoseiga.jp",
+    "nicoseiga.jp/priv", # http://lohas.nicoseiga.jp/priv/2017365fb6cfbdf47ad26c7b6039feb218c5e2d4/1498430264/6820259
+    "nicovideo.jp",
+    "nicovideo.jp/user", # http://www.nicovideo.jp/user/317609
+    "nicovideo.jp/user/illust", # http://seiga.nicovideo.jp/user/illust/29075429
+    "nijie.info", # http://nijie.info/members.php?id=15235
+    %r!nijie\.info/nijie_picture!i, # http://pic03.nijie.info/nijie_picture/32243_20150609224803_0.png
+    "patreon.com", # http://patreon.com/serafleur
+    "pawoo.net", # https://pawoo.net/@148nasuka
+    "pawoo.net/web/accounts", # https://pawoo.net/web/accounts/228341
+    "picarto.tv", # https://picarto.tv/CheckerBoardAZN
+    "picarto.tv/live", # https://www.picarto.tv/live/channel.php?watch=aaaninja
+    "pictaram.com", # http://www.pictaram.com/user/5ish/3048385011/1350040096769940245_3048385011
+    "pinterest.com", # http://www.pinterest.com/alexandernanitc/
+    "pixiv.cc", # http://pixiv.cc/0123456789/
+    "pixiv.net", # https://www.pixiv.net/member.php?id=10442390
+    "pixiv.net/stacc", # https://www.pixiv.net/stacc/aaaninja2013
+    "pixiv.net/fanbox/creator", # https://www.pixiv.net/fanbox/creator/310630
+    "pixiv.net/users", # https://www.pixiv.net/users/555603
+    "pixiv.net/en/users", # https://www.pixiv.net/en/users/555603
+    "i.pximg.net",
+    "plurk.com", # http://www.plurk.com/a1amorea1a1
+    "privatter.net",
+    "privatter.net/u", # http://privatter.net/u/saaaatonaaaa
+    "rule34.paheal.net",
+    "rule34.paheal.net/post/list", # http://rule34.paheal.net/post/list/Reach025/
+    "sankakucomplex.com", # https://chan.sankakucomplex.com/?tags=user%3ASubridet
+    "society6.com", # http://society6.com/serafleur/
+    "tinami.com",
+    "tinami.com/creator/profile", # http://www.tinami.com/creator/profile/29024
+    "data.tumblr.com",
+    /\d+\.media\.tumblr\.com/i,
+    "twipple.jp",
+    "twipple.jp/user", # http://p.twipple.jp/user/Type10TK
+    "twitch.tv", # https://www.twitch.tv/5ish
+    "twitpic.com",
+    "twitpic.com/photos", # http://twitpic.com/photos/Type10TK
+    "twitter.com", # https://twitter.com/akkij0358
+    "twitter.com/i/web/status", # https://twitter.com/i/web/status/943446161586733056
+    "twimg.com/media", # https://pbs.twimg.com/media/DUUUdD5VMAEuURz.jpg:orig
+    "ustream.tv",
+    "ustream.tv/channel", # http://www.ustream.tv/channel/633b
+    "ustream.tv/user", # http://www.ustream.tv/user/kazaputi
+    "vk.com", # https://vk.com/id425850679
+    "weibo.com", # http://www.weibo.com/5536681649
+    "wp.com",
+    "yande.re",
+    "youtube.com",
+    "youtube.com/c", # https://www.youtube.com/c/serafleurArt
+    "youtube.com/channel", # https://www.youtube.com/channel/UCfrCa2Y6VulwHD3eNd3HBRA
+    "youtube.com/user", # https://www.youtube.com/user/148nasuka
+    "youtu.be" # http://youtu.be/gibeLKKRT-0
+  ].freeze
+
+  # One anchored, case-insensitive pattern per blacklist entry, combined into a
+  # single regexp. Each alternative matches "http(s)://<any subdomains><entry>/"
+  # and nothing else, so only a URL that is exactly a blacklisted prefix matches.
+  SITE_BLACKLIST_REGEXP = Regexp.union(SITE_BLACKLIST.map do |domain|
+    domain = Regexp.escape(domain) if domain.is_a?(String)
+    %r!\Ahttps?://(?:[a-zA-Z0-9_-]+\.)*#{domain}/\z!i
+  end)
+
+  # Finds the artists whose URLs match the given URL.
+  #
+  # The URL is normalized and used as a prefix search against the normalized
+  # URLs of non-deleted artists. While nothing matches, the last path
+  # component is dropped and the search is repeated, until the URL is 10
+  # characters or shorter or has been reduced to a SITE_BLACKLIST entry.
+  #
+  # @param url [String] the URL to look up
+  # @return [ActiveRecord::Relation] the matching artists
+  def find_artists(url)
+    url = ArtistUrl.normalize(url)
+    artists = []
+
+    # return [] unless Sources::Strategies.find(url).normalized_for_artist_finder?
+
+    while artists.empty? && url.size > 10
+      # Prefix pattern for LIKE; any "*" in the URL becomes the "%" wildcard.
+      u = url.sub(/\/+$/, "") + "/"
+      u = u.to_escaped_for_sql_like.gsub(/\*/, '%') + '%'
+      artists += Artist.joins(:urls).where(["artists.is_deleted = FALSE AND artist_urls.normalized_url LIKE ? ESCAPE E'\\\\'", u]).limit(10).order("artists.name").all
+
+      # Drop the last path component for the next, broader attempt.
+      url = File.dirname(url) + "/"
+
+      break if SITE_BLACKLIST_REGEXP.match?(url)
+    end
+
+    Artist.where(id: artists.uniq(&:name).take(20).map(&:id))
+  end
+end
diff --git a/app/logical/sources/strategies/base.rb b/app/logical/sources/strategies/base.rb
index 9c7d7e322..d21e2dc33 100644
--- a/app/logical/sources/strategies/base.rb
+++ b/app/logical/sources/strategies/base.rb
@@ -178,7 +178,7 @@ module Sources
       end
 
       def artists
-        Artist.find_artists(normalize_for_artist_finder.to_s)
+        ArtistFinder.find_artists(normalize_for_artist_finder.to_s)
       end
 
       # A new artist entry with suggested defaults for when the artist doesn't
diff --git a/app/models/artist.rb b/app/models/artist.rb
index 801e4c63d..fbc565c9a 100644
--- a/app/models/artist.rb
+++ b/app/models/artist.rb
@@ -27,146 +27,6 @@ class Artist < ApplicationRecord
   module UrlMethods
     extend ActiveSupport::Concern
 
-    module ClassMethods
-      # Subdomains are automatically included. e.g., "twitter.com" matches "www.twitter.com",
-      # "mobile.twitter.com" and any other subdomain of "twitter.com".
-      SITE_BLACKLIST = [
-        "artstation.com/artist", # http://www.artstation.com/artist/serafleur/
-        "www.artstation.com", # http://www.artstation.com/serafleur/
-        %r!cdn[ab]?\.artstation\.com/p/assets/images/images!i, # https://cdna.artstation.com/p/assets/images/images/001/658/068/large/yang-waterkuma-b402.jpg?1450269769
-        "ask.fm", # http://ask.fm/mikuroko_396
-        "bcyimg.com",
-        "bcyimg.com/drawer", # https://img9.bcyimg.com/drawer/32360/post/178vu/46229ec06e8111e79558c1b725ebc9e6.jpg
-        "bcy.net",
-        "bcy.net/illust/detail", # https://bcy.net/illust/detail/32360/1374683
-        "bcy.net/u", # http://bcy.net/u/1390261
-        "behance.net", # "https://www.behance.net/webang111
-        "booru.org",
-        "booru.org/drawfriends", # http://img.booru.org/drawfriends//images/36/de65da5f588b76bc1d9de8af976b540e2dff17e2.jpg
-        "donmai.us",
-        "donmai.us/users", # http://danbooru.donmai.us/users/507162/
-        "derpibooru.org",
-        "derpibooru.org/tags", # https://derpibooru.org/tags/artist-colon-checkerboardazn
-        "deviantart.com",
-        "deviantart.net",
-        "dlsite.com",
-        "doujinshi.org",
-        "doujinshi.org/browse/circle", # http://www.doujinshi.org/browse/circle/65368/
-        "doujinshi.org/browse/author", # http://www.doujinshi.org/browse/author/979/23/
-        "doujinshi.mugimugi.org",
-        "doujinshi.mugimugi.org/browse/author", # http://doujinshi.mugimugi.org/browse/author/3029/
-        "doujinshi.mugimugi.org/browse/circle", # http://doujinshi.mugimugi.org/browse/circle/7210/
-        "drawcrowd.net", # https://drawcrowd.com/agussw
-        "drawr.net", # http://drawr.net/matsu310
-        "dropbox.com",
-        "dropbox.com/sh", # https://www.dropbox.com/sh/gz9okupqycr2vj2/GHt_oHDKsR
-        "dropbox.com/u", # http://dl.dropbox.com/u/76682289/daitoHP-WP/pict/
-        "e-hentai.org", # https://e-hentai.org/tag/artist:spirale
-        "e621.net",
-        "e621.net/post/index/1", # https://e621.net/post/index/1/spirale
-        "enty.jp", # https://enty.jp/aizawachihiro888
-        "enty.jp/users", # https://enty.jp/users/3766
-        "facebook.com", # https://www.facebook.com/LuutenantsLoot
-        "fantia.jp", # http://fantia.jp/no100
-        "fantia.jp/fanclubs", # https://fantia.jp/fanclubs/1711
-        "fav.me", # http://fav.me/d9y1njg
-        /blog-imgs-\d+(?:-origin)?\.fc2\.com/i,
-        "furaffinity.net",
-        "furaffinity.net/user", # http://www.furaffinity.net/user/achthenuts
-        "gelbooru.com", # http://gelbooru.com/index.php?page=account&s=profile&uname=junou
-        "inkbunny.net", # https://inkbunny.net/achthenuts
-        "plus.google.com", # https://plus.google.com/111509637967078773143/posts
-        "hentai-foundry.com",
-        "hentai-foundry.com/pictures/user", # http://www.hentai-foundry.com/pictures/user/aaaninja/
-        "hentai-foundry.com/user", # http://www.hentai-foundry.com/user/aaaninja/profile
-        %r!pictures\.hentai-foundry\.com(?:/\w)?!i, # http://pictures.hentai-foundry.com/a/aaaninja/
-        "i.imgur.com", # http://i.imgur.com/Ic9q3.jpg
-        "instagram.com", # http://www.instagram.com/serafleur.art/
-        "iwara.tv",
-        "iwara.tv/users", # http://ecchi.iwara.tv/users/marumega
-        "kym-cdn.com",
-        "livedoor.blogimg.jp",
-        "monappy.jp",
-        "monappy.jp/u", # https://monappy.jp/u/abara_bone
-        "mstdn.jp", # https://mstdn.jp/@oneb
-        "nicoseiga.jp",
-        "nicoseiga.jp/priv", # http://lohas.nicoseiga.jp/priv/2017365fb6cfbdf47ad26c7b6039feb218c5e2d4/1498430264/6820259
-        "nicovideo.jp",
-        "nicovideo.jp/user", # http://www.nicovideo.jp/user/317609
-        "nicovideo.jp/user/illust", # http://seiga.nicovideo.jp/user/illust/29075429
-        "nijie.info", # http://nijie.info/members.php?id=15235
-        %r!nijie\.info/nijie_picture!i, # http://pic03.nijie.info/nijie_picture/32243_20150609224803_0.png
-        "patreon.com", # http://patreon.com/serafleur
-        "pawoo.net", # https://pawoo.net/@148nasuka
-        "pawoo.net/web/accounts", # https://pawoo.net/web/accounts/228341
-        "picarto.tv", # https://picarto.tv/CheckerBoardAZN
-        "picarto.tv/live", # https://www.picarto.tv/live/channel.php?watch=aaaninja
-        "pictaram.com", # http://www.pictaram.com/user/5ish/3048385011/1350040096769940245_3048385011
-        "pinterest.com", # http://www.pinterest.com/alexandernanitc/
-        "pixiv.cc", # http://pixiv.cc/0123456789/
-        "pixiv.net", # https://www.pixiv.net/member.php?id=10442390
-        "pixiv.net/stacc", # https://www.pixiv.net/stacc/aaaninja2013
-        "pixiv.net/fanbox/creator", # https://www.pixiv.net/fanbox/creator/310630
-        "pixiv.net/users", # https://www.pixiv.net/users/555603
-        "pixiv.net/en/users", # https://www.pixiv.net/en/users/555603
-        "i.pximg.net",
-        "plurk.com", # http://www.plurk.com/a1amorea1a1
-        "privatter.net",
-        "privatter.net/u", # http://privatter.net/u/saaaatonaaaa
-        "rule34.paheal.net",
-        "rule34.paheal.net/post/list", # http://rule34.paheal.net/post/list/Reach025/
-        "sankakucomplex.com", # https://chan.sankakucomplex.com/?tags=user%3ASubridet
-        "society6.com", # http://society6.com/serafleur/
-        "tinami.com",
-        "tinami.com/creator/profile", # http://www.tinami.com/creator/profile/29024
-        "data.tumblr.com",
-        /\d+\.media\.tumblr\.com/i,
-        "twipple.jp",
-        "twipple.jp/user", # http://p.twipple.jp/user/Type10TK
-        "twitch.tv", # https://www.twitch.tv/5ish
-        "twitpic.com",
-        "twitpic.com/photos", # http://twitpic.com/photos/Type10TK
-        "twitter.com", # https://twitter.com/akkij0358
-        "twitter.com/i/web/status", # https://twitter.com/i/web/status/943446161586733056
-        "twimg.com/media", # https://pbs.twimg.com/media/DUUUdD5VMAEuURz.jpg:orig
-        "ustream.tv",
-        "ustream.tv/channel", # http://www.ustream.tv/channel/633b
-        "ustream.tv/user", # http://www.ustream.tv/user/kazaputi
-        "vk.com", # https://vk.com/id425850679
-        "weibo.com", # http://www.weibo.com/5536681649
-        "wp.com",
-        "yande.re",
-        "youtube.com",
-        "youtube.com/c", # https://www.youtube.com/c/serafleurArt
-        "youtube.com/channel", # https://www.youtube.com/channel/UCfrCa2Y6VulwHD3eNd3HBRA
-        "youtube.com/user", # https://www.youtube.com/user/148nasuka
-        "youtu.be" # http://youtu.be/gibeLKKRT-0
-      ]
-
-      SITE_BLACKLIST_REGEXP = Regexp.union(SITE_BLACKLIST.map do |domain|
-        domain = Regexp.escape(domain) if domain.is_a?(String)
-        %r!\Ahttps?://(?:[a-zA-Z0-9_-]+\.)*#{domain}/\z!i
-      end)
-
-      def find_artists(url)
-        url = ArtistUrl.normalize(url)
-        artists = []
-
-        # return [] unless Sources::Strategies.find(url).normalized_for_artist_finder?
-
-        while artists.empty? && url.size > 10
-          u = url.sub(/\/+$/, "") + "/"
-          u = u.to_escaped_for_sql_like.gsub(/\*/, '%') + '%'
-          artists += Artist.joins(:urls).where(["artists.is_deleted = FALSE AND artist_urls.normalized_url LIKE ? ESCAPE E'\\\\'", u]).limit(10).order("artists.name").all
-          url = File.dirname(url) + "/"
-
-          break if url =~ SITE_BLACKLIST_REGEXP
-        end
-
-        where(id: artists.uniq(&:name).take(20))
-      end
-    end
-
     def sorted_urls
       urls.sort {|a, b| a.priority <=> b.priority}
     end
@@ -367,7 +227,7 @@ class Artist < ApplicationRecord
         elsif query.include?("*")
           where(id: ArtistUrl.where_like(:url, query).select(:artist_id))
         elsif query =~ %r!\Ahttps?://!i
-          find_artists(query)
+          ArtistFinder.find_artists(query)
         else
           where(id: ArtistUrl.where_like(:url, "*#{query}*").select(:artist_id))
         end
diff --git a/test/unit/artist_test.rb b/test/unit/artist_test.rb
index 16aa3eafe..1f9e447d7 100644
--- a/test/unit/artist_test.rb
+++ b/test/unit/artist_test.rb
@@ -2,7 +2,7 @@ require 'test_helper'
 
 class ArtistTest < ActiveSupport::TestCase
   def assert_artist_found(expected_name, source_url)
-    artists = Artist.find_artists(source_url).to_a
+    artists = ArtistFinder.find_artists(source_url).to_a
 
     assert_equal(1, artists.size)
     assert_equal(expected_name, artists.first.name, "Testing URL: #{source_url}")
@@ -11,7 +11,7 @@ class ArtistTest < ActiveSupport::TestCase
   end
 
   def assert_artist_not_found(source_url)
-    artists = Artist.find_artists(source_url).to_a
+    artists = ArtistFinder.find_artists(source_url).to_a
     assert_equal(0, artists.size, "Testing URL: #{source_url}")
   rescue Net::OpenTimeout
     skip "Remote connection failed for #{source_url}"