# Parses a user agent string and tries to determine whether it's a known bot or a known web browser.
#
# The classification is intended for request logging: the result of {#is_bot?} is reported as the
# `bot` attribute of a request so that known bots and search crawlers can be filtered out of search
# traffic analytics.
#
# @example Classifying the current request
#   UserAgent.new(request.headers["HTTP_USER_AGENT"]).is_bot?
#
# @see https://developers.whatismybrowser.com/
# @see https://developers.whatismybrowser.com/api/features/user-agent-checks/weird/
class UserAgent
  # Names returned by {#name} that are treated as web browsers rather than bots.
  BROWSER_NAMES = %w[chrome firefox safari opera edge msie unknown-browser].freeze

  attr_reader :user_agent

  # Initialize a user agent
  #
  # @param user_agent [String, nil] the user agent string; nil is treated as an empty string
  def initialize(user_agent)
    @user_agent = user_agent.to_s
  end

  # Classifies the user agent string.
  #
  # The branches below are order-dependent: specific, well-known bots are checked first, then generic
  # "product/version" and URL patterns, and finally the known web browsers. Each branch is preceded by
  # examples of the user agents it is meant to catch.
  #
  # @return [String] the name of the user agent. This is a fixed label for recognized bots and browsers
  #   ("googlebot", "chrome", ...), the downcased product name or the embedded URL for generic bots,
  #   "blank" for an empty user agent, and "unknown-bot" or "unknown-browser" as fallbacks.
  def name
    # Empty or whitespace-only (same definition of "blank" as ActiveSupport's String#blank?).
    if user_agent.match?(/\A[[:space:]]*\z/)
      "blank"

    # Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
    # Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.90 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html
    # Googlebot-Image/1.0
    elsif user_agent.match?(%r{http://www\.google\.com/bot\.html}) || user_agent.match?(%r{Googlebot})
      "googlebot"

    # Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
    # Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)
    elsif user_agent.match?(%r{http://yandex\.com/bots})
      "yandexbot"

    # Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)
    elsif user_agent.match?(%r{http://www\.bing\.com/bingbot\.htm})
      "bingbot"

    # Mozilla/5.0 (Linux; Android 7.0;) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; PetalBot;+https://webmaster.petalsearch.com/site/petalbot)
    elsif user_agent.match?(%r{PetalBot})
      "petalbot"

    # Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0 Grabber/7.5.1
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0 Grabber/7.3.2
    # (see https://github.com/Bionus/imgbrd-grabber)
    elsif user_agent.match?(%r{Firefox/52\.0 Grabber/[^ ]+\z})
      "imgbrd-grabber"

    # Mozilla/5.0 (Linux; U; Android 11; en_GB; M2101K7AG; Build/RKQ1.201022.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 onlymash.flexbooru.play/2.7.5.c1180 Mobile Safari/537.36
    # (see https://github.com/flexbooru/flexbooru)
    elsif user_agent.match?(%r{flexbooru})
      "flexbooru"

    # Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.4 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.4 facebookexternalhit/1.1 Facebot Twitterbot/1.0
    # (spoofed by iMessage; see https://stackoverflow.com/questions/41499402 and https://www.reddit.com/r/iOSProgramming/comments/4wcake)
    # This must come before the Facebook check because the spoofed string also names facebookexternalhit.
    elsif user_agent.match?(%r{\AMozilla/5\.0.*facebookexternalhit.*Twitterbot})
      "imessage"

    # http.rb/3.2.0 (Mastodon/2.4.4; +https://queer.af/)
    # http.rb/4.4.1 (Mastodon/3.4.0; +https://mastodon.social/) Bot
    # http.rb/4.4.1 (Mastodon/3.4.1; +https://raru.re/) Bot
    elsif user_agent.match?(%r{Mastodon})
      "mastodon"

    # facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)
    # facebookexternalhit/1.1;line-poker/1.0
    elsif user_agent.match?(%r{\Afacebookexternalhit})
      "facebook"

    # Slackbot 1.0 (+https://api.slack.com/robots)
    elsif user_agent.match?(%r{Slackbot})
      "slackbot"

    # TelegramBot (like TwitterBot)
    elsif user_agent.match?(%r{\ATelegramBot})
      "telegrambot"

    # Python/3.9 aiohttp/3.6.3
    # Python/3.6 aiohttp/3.7.4.post0
    elsif user_agent.match?(%r{aiohttp})
      "aiohttp"

    # Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)
    # Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 10.0; WOW64; Trident/7.0; Sleipnir6/6.4.12; SleipnirSiteUpdates/6.4.12)
    # Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/3.0),gzip(gfe)
    # Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/3.0),gzip(gfe)
    elsif user_agent.match?(%r{\AMozilla/[^ ]+ \(compatible; MSIE})
      "msie"

    # Mozilla/5.0 (compatible; Hydrus Client)
    # Mozilla/5.0 (compatible; SauceNAO Booru Aggregator 1.0)
    # Mozilla/5.0 (compatible; inoreader.com; 1 subscribers)
    # Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)
    # Mozilla/5.0 (compatible; coccocbot-web/1.0; +http://help.coccoc.com/searchengine)
    # Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html)
    # Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)
    # Mozilla/5.0 (compatible; DotBot/1.2; +https://opensiteexplorer.org/dotbot; help@moz.com)
    # Mozilla/5.0 (compatible; Pinterestbot/1.0; +http://www.pinterest.com/bot.html)
    # Mozilla/5.0 (compatible; MegaIndex.ru/2.0; +http://megaindex.com/crawler)
    # Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http://mj12bot.com/)
    elsif user_agent =~ %r{\AMozilla/5\.0 \(compatible; ([^/;)]+)}
      # The captured product name (everything up to the first "/", ";" or ")") becomes the agent name.
      Regexp.last_match(1).downcase

    # Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.17
    # Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14
    # Opera/9.80 (Android; Opera Mini/57.0.2254/191.236; U; en) Presto/2.12.423 Version/12.16
    # Opera/9.80 (X11; Fedora; Linux x64) Presto/2.12.388 Version/12.18
    elsif user_agent.match?(%r{\AOpera/9\.80})
      "opera"

    # Blogtrottr/2.0
    # BooruNav.Android/1.0.0.296b
    # com.apple.Safari.SearchHelper/16611.2.7.1.4 CFNetwork/1240.0.4 Darwin/20.5.0
    # CUMZONATORBOT/1.0 kotb
    # curl/7.61.1
    # feedparser/6.0.2 +https://github.com/kurtmckee/feedparser/
    # FeedDemon/4.5 (http://www.feeddemon.com/; Microsoft Windows)
    # FreshRSS/1.18.0 (Linux; https://freshrss.org)
    # Go-http-client/2.0
    # Illustail/38 CFNetwork/1240.0.4 Darwin/20.5.0
    # ImageBoardAPI/https://github.com/Kodehawa/imageboard-api
    # Irvine/1.3.0
    # iqdb/0.1 (+http://iqdb.org/)
    # LoliSnatcher_Droid/1.8.1
    # Mantaro/6.2.7/JDA-DiscordBot (https://github.com/Mantaro/MantaroBot)
    # MetalBrowser/5.0.0.0
    # MobileSafari/604.1 CFNetwork/1240.0.4 Darwin/20.5.0
    # multibooru/0.1beta (Linux)
    # node-fetch/1.0 (+https://github.com/bitinn/node-fetch)
    # Node/RssFeedEmitter (https://github.com/filipedeschamps/rss-feed-emitter)
    # nori/3.5.0
    # okhttp/4.9.1
    # PassiveCrawler.green-hill/1.0.0.1
    # Pybooru/4.2.2
    # python-requests/2.22.0
    # Python-urllib/3.8
    # RSSOwlnix/2.8.0.202006031646 (Windows; U; en)
    # Twitterbot/1.0
    # Valve/Steam HTTP Client 1.0
    # Wget/1.21.1
    elsif !user_agent.match?(%r{\AMozilla/5\.0}) && user_agent =~ %r{\A([^ /]*)/([^ ]+)}
      # Anything of the form "product/version" that doesn't claim to be Mozilla/5.0 is named after
      # its product token.
      Regexp.last_match(1).downcase

    # booru (https://github.com/AtlasTheBot/booru)
    # got (https://github.com/sindresorhus/got)
    # Kaori, a npm module for boorus. v2 (https://github.com/iCrawl/kaori/)
    # Tiny Tiny RSS/21.06-7bd9572aa (http://tt-rss.org/)
    elsif user_agent =~ %r{(https?://[^ )]+)}
      # No usable product token; fall back to the first URL embedded in the string.
      Regexp.last_match(1)

    # Booru v1.2.0, a node package for booru searching (by AtlasTheBot)
    # Danbooru-Aggregator
    # EcchiBot / PrivateGER Discord Bot / Contact:
    # Ktor client
    # Universal Booru Wrapper (Alejandro Akbal)
    # Mozilla/5.0
    # Mozilla/5.0 BooruSharp
    elsif !user_agent.match?(%r{\AMozilla/5\.0 \([^)]+\)})
      # Everything that reaches the browser checks below starts with "Mozilla/5.0 (<platform>)".
      "unknown-bot"

    # Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0
    # Mozilla/5.0 (Windows NT 6.1; WOW64; rv:89.0) Gecko/20100101 Firefox/89.0
    # Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0
    # Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0
    # Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0
    # Mozilla/5.0 (Android 11; Mobile; rv:89.0) Gecko/89.0 Firefox/89.0
    # Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:82.0) Gecko/20100101 Firefox/82.0
    # Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0
    elsif user_agent.match?(%r{Gecko/[^ ]+ Firefox/[^ ]+})
      "firefox"

    # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36
    # Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36
    # The \z anchor keeps Chrome-derived browsers (Edge, Opera) from matching here.
    elsif user_agent.match?(%r{AppleWebKit/[^ ]+ \(KHTML, like Gecko\) Chrome/[^ ]+ Safari/[^ ]+\z})
      "chrome"

    # Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1
    # Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/91.0.4472.80 Mobile/15E148 Safari/604.1
    # Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1
    # Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15
    # Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/91.0.4472.80 Mobile/15E148 Safari/604.1
    # Mozilla/5.0 (PlayStation; PlayStation 4/8.52) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15
    elsif user_agent.match?(%r{AppleWebKit/[^ ]+ \(KHTML, like Gecko\).*Safari/[^ ]+\z}) && !user_agent.match?(%r{Chrome})
      "safari"

    # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.54
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64; Xbox; Xbox One) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19041
    # Mozilla/5.0 (Linux; Android 10; SM-A102U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.116 Mobile Safari/537.36 EdgA/46.04.2.5157
    elsif user_agent.match?(%r{AppleWebKit/[^ ]+ \(KHTML, like Gecko\).*Edg[eA]?/[^ ]+\z})
      "edge"

    # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 OPR/75.0.3969.285
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 OPR/77.0.4054.90
    # Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 OPR/77.0.4054.90
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 OPR/77.0.4054.90 (Edition Yx 05)
    elsif user_agent.match?(%r{AppleWebKit/[^ ]+ \(KHTML, like Gecko\) Chrome/[^ ]+ Safari/[^ ]+ OPR/[^ ]+})
      "opera"

    # Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko
    # Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
    # Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko
    # Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko
    # Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Geck; GreenBrowser)
    # Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko
    elsif user_agent.match?(%r{Trident/[^ ;)]+})
      "msie"

    # Mozilla/5.0 (Nintendo WiiU) AppleWebKit/536.30 (KHTML, like Gecko) NX/3.0.4.2.13 NintendoBrowser/4.3.2.11274.US
    # Mozilla/5.0 (PLAYSTATION 3 4.87) AppleWebKit/531.22.8 (KHTML, like Gecko)
    # Mozilla/5.0 (PlayStation Vita 3.73) AppleWebKit/537.73 (KHTML, like Gecko) Silk/3.2
    # Mozilla/5.0 (Linux; NetCast; U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36 SmartTV/10.0 Colt/2.0
    # Mozilla/5.0 (Nintendo 3DS; U; ; en) Version/1.7639.EU
    # Mozilla/5.0 (Nintendo Switch; WifiWebAuthApplet) AppleWebKit/606.4 (KHTML, like Gecko) NF/6.0.1.16.11 NintendoBrowser/5.1.0.20935
    else
      "unknown-browser"
    end
  end

  # Returns true if the agent is a known bot (or a human pretending to be a bot), or false if the agent
  # is a known web browser (or a bot pretending to be a known browser).
  #
  # @return [Boolean] false when {#name} is one of {BROWSER_NAMES}, true otherwise (including for a
  #   blank user agent)
  def is_bot?
    !BROWSER_NAMES.include?(name)
  end
end