<?PHP
#
#   FILE:  BotDetector.php
#
#   Part of the Collection Workflow Integration System (CWIS)
#   Copyright 2002-2013 Edward Almasy and Internet Scout Research Group
#   http://scout.wisc.edu/cwis/
#

/**
* Provides support for detecting whether a page was loaded by a person or by an
* automated program, e.g., a web crawler or spider.
*/
class BotDetector extends Plugin
{

    /**
    * Set the descriptive attributes of this plugin (name, version,
    * description, author, contact details, requirements) and declare the
    * configuration settings it offers.
    */
    public function Register()
    {
        # descriptive attributes
        $this->Name = "Bot Detector";
        $this->Version = "1.2.2";
        $this->Description = "Provides support for detecting whether the"
                ." current page load is by an actual person or by an automated"
                ." <a href=\"http://en.wikipedia.org/wiki/Web_crawler\""
                ." target=\"_blank\">web crawler or spider</a>.";
        $this->Author = "Internet Scout";
        $this->Url = "http://scout.wisc.edu/cwis/";
        $this->Email = "scout@scout.wisc.edu";
        $this->Requires = ["CWISCore" => "2.1.0"];
        $this->EnabledByDefault = TRUE;

        # setting: optional access key for the Project Honeypot http:BL service
        $AccessKeyHelp = "(Optional) Your http:BL Access Key "
                ." from <a href=\"http://www.projecthoneypot.org/\">Project Honeypot</a>"
                .", used to identify web robots by IP address. "
                ."Keys are 12 lowercase letters (e.g., <i>abcdefghjkmn</i>).";
        $this->CfgSetup["HttpBLAccessKey"] = [
            "Type" => "Text",
            "Label" => "http:BL Access Key",
            "Help" => $AccessKeyHelp,
            "Size" => 16,
        ];

        # setting: whether to prune metrics data recorded for detected bots
        $PruningHelp = "When a bot is detected, should all data for that bot's IP "
                ."be pruned from data collected by MetricsRecorder?";
        $this->CfgSetup["BotPruning"] = [
            "Type" => "Flag",
            "Label" => "Bot Pruning",
            "Help" => $PruningHelp,
            "OnLabel" => "Yes",
            "OffLabel" => "No",
            "Default" => TRUE,
        ];
    }

    /**
    * Create the database tables listed in $this->SqlTables.  Called when
    * the plugin is first installed.
    * @return NULL on success, string containing an error message otherwise.
    */
    public function Install()
    {
        $ErrorMessage = $this->CreateTables($this->SqlTables);
        return $ErrorMessage;
    }

    /**
    * Bring the database schema up to date when upgrading from an older
    * version of the plugin.  Each step below runs only when the previously
    * installed version predates the version that introduced the change.
    * @param string $PreviousVersion The version number of the plugin
    * that was previously installed.
    * @return NULL if upgrade succeeded, string with an error message
    * otherwise.
    */
    public function Upgrade($PreviousVersion)
    {
        # helper: was the previously-installed version older than $Version?
        $UpgradingFromBefore = function ($Version) use ($PreviousVersion)
        {
            return version_compare($PreviousVersion, $Version, "<");
        };

        # 1.1.0: create the plugin's database tables
        if ($UpgradingFromBefore("1.1.0"))
        {
            $Error = $this->CreateTables($this->SqlTables);
            if ($Error !== NULL)
            {
                return $Error;
            }
        }

        # 1.2.0: DNS cache column "IP" renamed to "IPAddress", plus any
        # tables that do not exist yet
        if ($UpgradingFromBefore("1.2.0"))
        {
            $DB = new Database();

            $HasOldColumn = $DB->FieldExists("BotDetector_DNSCache", "IP");
            if ($HasOldColumn)
            {
                $Renamed = $DB->Query(
                    "ALTER TABLE BotDetector_DNSCache "
                    ."CHANGE IP IPAddress INT UNSIGNED ");
                if ($Renamed === FALSE)
                {
                    return "Could not update the IP Column";
                }
            }

            $Error = $this->CreateMissingTables($this->SqlTables);
            if ($Error !== NULL)
            {
                return $Error;
            }
        }

        # 1.2.2: DNS cache gained an indexed LastUsed column
        if ($UpgradingFromBefore("1.2.2"))
        {
            $DB = new Database();

            $HasLastUsed = $DB->FieldExists("BotDetector_DNSCache", "LastUsed");
            if (!$HasLastUsed)
            {
                $Added = $DB->Query(
                    "ALTER TABLE BotDetector_DNSCache "
                    ."ADD COLUMN LastUsed TIMESTAMP, ADD INDEX (LastUsed)");
                if ($Added === FALSE)
                {
                    return "Could not add LastUsed column to DNSCache";
                }
            }
        }

        # every applicable step succeeded
        return NULL;
    }

    /**
    * Drop the database tables listed in $this->SqlTables.  Called when the
    * plugin is uninstalled.
    * @return NULL on success, string containing error message on failure.
    */
    public function Uninstall()
    {
        $ErrorMessage = $this->DropTables($this->SqlTables);
        return $ErrorMessage;
    }

    /**
    * Initialize the plugin: validate the configured http:BL access key (if
    * any) and register the clean URLs used to serve the JS and CSS
    * canaries.  This method is called after all plugins are loaded but
    * before any other plugin methods (except Register) are called.
    * @return NULL on success, error string otherwise.
    */
    public function Initialize()
    {
        # a key is optional, but when one is given it has to look valid
        $AccessKey = $this->ConfigSetting("HttpBLAccessKey");
        $KeyProvided = (strlen($AccessKey) != 0);
        if ($KeyProvided && !self::BlacklistAccessKeyLooksValid($AccessKey))
        {
            return "Incorrect Http:BL key format.  Keys are 12 lowercase letters.";
        }

        $AF = $GLOBALS["AF"];

        # clean URL for the JavaScript canary (the canary page gets JS=1)
        $AF->AddCleanUrl(
            "%^canary/[0-9]+/canary.js%",
            "P_BotDetector_Canary",
            ["JS" => 1]);

        # clean URL for the CSS canary
        $AF->AddCleanUrl(
            "%^canary/[0-9]+/canary.css%",
            "P_BotDetector_Canary");

        return NULL;
    }

    /**
    * List the events that this plugin makes available to the application
    * framework.
    * @return Returns an array of events this plugin provides, with event
    *      names for the index and event types for the values.
    */
    public function DeclareEvents()
    {
        $Events = [];
        $Events["BotDetector_EVENT_CHECK_FOR_BOT"] =
                ApplicationFramework::EVENTTYPE_FIRST;
        return $Events;
    }

    /**
    * List the events this plugin wants to be hooked to, along with the
    * plugin method that handles each of them.
    * @return Returns an array of events to be hooked into the application
    *      framework, with event names for the index and the names of the
    *      handling methods for the values.
    */
    public function HookEvents()
    {
        $Hooks = [
            # our own bot check event
            "BotDetector_EVENT_CHECK_FOR_BOT" => "CheckForBot",
            # canary tags are emitted in the HTML header
            "EVENT_IN_HTML_HEADER" => "GenerateHTMLForCanary",
            # hourly housekeeping
            "EVENT_HOURLY" => "CleanCacheData",
        ];
        return $Hooks;
    }

    /**
    * Generate HTML elements to display a CSS and JS Canary used to test for
    * bots, and record in the database that the canary was shown to the
    * client's IP address.  Nothing is printed or recorded unless the client
    * address is a well-formed IPv4 address, because the canary table stores
    * addresses with INET_ATON(), which only understands IPv4 dotted quads.
    */
    public function GenerateHTMLForCanary()
    {
        # (REMOTE_ADDR may be absent, e.g. when not running under a web server)
        $RemoteIP = isset($_SERVER["REMOTE_ADDR"])
                ? $_SERVER["REMOTE_ADDR"] : NULL;

        # bail out unless the address is a valid IPv4 dotted quad
        # (a loose, unanchored pattern match would also accept strings such
        # as "::ffff:1.2.3.4", which INET_ATON() cannot convert)
        if (!is_string($RemoteIP)
                || filter_var($RemoteIP, FILTER_VALIDATE_IP,
                        FILTER_FLAG_IPV4) === FALSE)
        {
            return;
        }

        # build the canary URLs, with a random number in each so that
        # every page load requests a distinct URL
        if ($GLOBALS["G_PluginManager"]->PluginEnabled("CleanURLs"))
        {
            $CssUrl = "canary/".rand()."/canary.css";
            $JsUrl = "canary/".rand()."/canary.js";
        }
        else
        {
            $CssUrl = "index.php?P=P_BotDetector_Canary&amp;RN=".rand();
            $JsUrl = "index.php?P=P_BotDetector_Canary&amp;JS=1"
                    ."&amp;RN=".rand();
        }

        print "\n<link rel=\"stylesheet\" type=\"text/css\" "
            ."href=\"".$CssUrl."\" />\n"
            ."<script type=\"text/javascript\" "
            ."src=\"".$JsUrl."\"></script>\n";

        # record in the database that the canary was shown
        $DB = new Database();
        $DB->Query(
            "INSERT INTO BotDetector_CanaryData (IPAddress, CanaryLastShown) "
            ."VALUES (INET_ATON('".addslashes($RemoteIP)
                    ."'), NOW()) "
            ." ON DUPLICATE KEY UPDATE CanaryLastShown=NOW()");
    }

    /**
    * Determine whether the page was loaded by a person or an automated
    * program.  Three tests are tried in order until one gives an answer:
    * the user agent is matched against the list of known robots, the
    * client IP is looked up via http:BL, and finally the canary records
    * for the client IP are consulted.  The verdict is remembered for the
    * rest of the page load.  When a bot is detected and pruning is enabled,
    * MetricsRecorder is asked to remove events for the client IP.
    * @return Returns TRUE if the page was loaded by an automated program.
    */
    public function CheckForBot()
    {
        static $BotCheckValue;

        if (!isset($BotCheckValue))
        {
            # (NULL means "no test has given an answer yet")
            $Verdict = NULL;

            # test 1: compare the user agent against known crawler patterns
            if (isset($_SERVER['HTTP_USER_AGENT']))
            {
                $UserAgent = $_SERVER['HTTP_USER_AGENT'];
                foreach ($this->Robots as $Pattern)
                {
                    if (preg_match('/'.$Pattern.'/i', $UserAgent))
                    {
                        $Verdict = TRUE;
                        break;
                    }
                }
            }

            # test 2: ask httpBL about this host
            if ($Verdict === NULL)
            {
                $Listing = $this->CheckHttpBL();

                # (NULL = httpBL not configured or not working,
                #  FALSE = no blacklist entry for this IP)
                if ($Listing !== NULL && $Listing !== FALSE)
                {
                    # an IP marked as "Suspicious" with no other
                    # annotations does not count as a bot, while any
                    # other listing does
                    $Verdict = ($Listing["BotType"] != self::BT_SUSPICIOUS);
                }
            }

            # test 3: look at the canary records for this IP
            if ($Verdict === NULL)
            {
                $DB = new Database();
                $DB->Query("SELECT CanaryLastShown, CanaryLastLoaded"
                  ." FROM BotDetector_CanaryData WHERE"
                  ." IPAddress=INET_ATON('".addslashes($_SERVER["REMOTE_ADDR"])."')");
                $Row = $DB->FetchRow();

                # presume a person when
                #  - we have never shown them the canary, or
                #  - they have loaded the canary, or
                #  - less than 60s have passed since they were last shown
                #    the canary
                $LooksHuman = ($Row === FALSE)
                        || ($Row["CanaryLastLoaded"] !== NULL)
                        || ((time() - strtotime($Row["CanaryLastShown"])) < 60);

                # if we *have* shown them the canary, they have not loaded
                # it, and the 60s have passed, presume a bot
                $Verdict = !$LooksHuman;
            }

            $BotCheckValue = $Verdict;
        }

        # prune metrics data for detected bots if configured to do so
        $PruneMetrics = $BotCheckValue
                && $this->ConfigSetting("BotPruning")
                && $GLOBALS["G_PluginManager"]->PluginEnabled("MetricsRecorder");
        if ($PruneMetrics)
        {
            $Recorder = $GLOBALS["G_PluginManager"]->GetPlugin("MetricsRecorder");
            $Recorder->RemoveEventsForIPAddress($_SERVER["REMOTE_ADDR"]);
        }

        return $BotCheckValue;
    }

    /**
    * Use Project Honeypot's Http:BL service to determine if the current
    * client is likely to be some flavor of spam robot.  When a spam bot is
    * found and pruning is enabled, MetricsRecorder is asked to remove events
    * for the client IP.
    * @return TRUE for spambots, FALSE otherwise.
    */
    public function CheckForSpamBot()
    {
        $Listing = $this->CheckHttpBL();

        # this is a spam bot only if httpBL is correctly configured, lists
        # the client IP, and has the comment spammer flag set for it
        $IsSpamBot = is_array($Listing)
                && (($Listing["BotType"] & self::BT_COMMENTSPAMMER) != 0);
        if (!$IsSpamBot)
        {
            return FALSE;
        }

        # prune metrics entries if configured to do so
        $PluginMgr = $GLOBALS["G_PluginManager"];
        if ($this->ConfigSetting("BotPruning")
                && $PluginMgr->PluginEnabled("MetricsRecorder"))
        {
            $PluginMgr->GetPlugin("MetricsRecorder")
                    ->RemoveEventsForIPAddress($_SERVER["REMOTE_ADDR"]);
        }

        # report spamminess back to the caller
        return TRUE;
    }

    /**
    * Perform periodic housekeeping on the plugin's database tables (hooked
    * to EVENT_HOURLY):
    *  - delete DNS cache entries that have not been used for 2 hours,
    *  - queue background refreshes for the remaining DNS cache entries that
    *    were retrieved more than 2 hours ago,
    *  - when MetricsRecorder is enabled, queue background tasks to remove
    *    metrics data for IPs that were shown the canary more than 120
    *    seconds ago and have never loaded it,
    *  - delete canary data for IPs that have neither been shown nor loaded
    *    the canary for 2 hours.
    */
    public function CleanCacheData()
    {
        $DB = new Database();

        # clean out DNS cache data that was last used > 2 hours ago
        $DB->Query("DELETE FROM BotDetector_DNSCache "
                   ."WHERE LastUsed < (NOW() - INTERVAL 2 HOUR)");

        # queue background tasks to refresh DNS cache data for IPs
        # that are still being used
        $DB->Query(
            "SELECT INET_NTOA(IPAddress) AS IP FROM BotDetector_DNSCache "
            ."WHERE Retrieved < (NOW() - INTERVAL 2 HOUR)");
        foreach ($DB->FetchColumn("IP") as $IP)
        {
            $GLOBALS["AF"]->QueueUniqueTask(
                ["BotDetector", "UpdateDnsCacheForIP"], [$IP],
                ApplicationFramework::PRIORITY_LOW,
                "Update HttpBL DNS cache data for ".$IP);
        }

        # if we're recording metrics, we'll want to clean out metrics data
        #  recorded in the window between showing the canary and deciding
        #  that a particular IP is likely a bot because they didn't load it
        # (the cleaning itself is done by the queued CleanBotFromMetrics()
        #  tasks, so the MetricsRecorder plugin is not needed here)
        if ($GLOBALS["G_PluginManager"]->PluginEnabled("MetricsRecorder"))
        {
            $DB->Query(
                "SELECT CanaryLastShown, INET_NTOA(IPAddress) AS IP "
                ." FROM BotDetector_CanaryData"
                ." WHERE CanaryLastShown < (NOW() - INTERVAL 120 SECOND) "
                ." AND CanaryLastLoaded IS NULL");

            foreach ($DB->FetchRows() as $Row)
            {
                $GLOBALS["AF"]->QueueUniqueTask(
                    ["BotDetector", "CleanBotFromMetrics"],
                    [ $Row["IP"], $Row["CanaryLastShown"] ],
                    ApplicationFramework::PRIORITY_LOW,
                    "Clean out metrics data for a bot at ".$Row["IP"]);
            }
        }

        # clean out the canary data for IPs where we've not showed the
        #  canary in a long time AND they've never loaded it or haven't
        #    loaded it for a long time
        $DB->Query(
            "DELETE FROM BotDetector_CanaryData "
            ."WHERE CanaryLastShown < (NOW() - INTERVAL 2 HOUR) "
            ."  AND (CanaryLastLoaded IS NULL OR "
            ."       CanaryLastLoaded < (NOW() - INTERVAL 2 HOUR)) ");
    }

    /**
    * Ask MetricsRecorder to remove the events it has logged for a bot.
    * Does nothing when the MetricsRecorder plugin is not enabled.
    * @param string $TargetIP IP address to clean up.
    * @param string $StartTime Oldest date/time to remove.
    */
    public static function CleanBotFromMetrics($TargetIP, $StartTime)
    {
        $PluginMgr = $GLOBALS["G_PluginManager"];

        # nothing to clean when metrics are not being recorded
        if (!$PluginMgr->PluginEnabled("MetricsRecorder"))
        {
            return;
        }

        $PluginMgr->GetPlugin("MetricsRecorder")
                ->RemoveEventsForIPAddress($TargetIP, $StartTime);
    }

    /**
    * Perform background update of cached HttpBL result for a given IP
    * address.  Does nothing when the configured access key does not look
    * valid.  When the lookup returns no listing, the cached result is set
    * to NULL.
    * @param string $RemoteIP Remote address to update.
    */
    public static function UpdateDnsCacheForIP($RemoteIP)
    {
        $Plugin = $GLOBALS["G_PluginManager"]->GetPlugin("BotDetector");
        $AccessKey = $Plugin->ConfigSetting("HttpBLAccessKey");

        # nothing can be looked up without a valid access key
        if (!self::BlacklistAccessKeyLooksValid($AccessKey))
        {
            return;
        }

        # do dns query
        $Result = self::DoHttpBLDNSLookup($AccessKey, $RemoteIP);

        # the lookup gives NULL when there is no listing, so store that as
        # an SQL NULL explicitly, rather than passing NULL to addslashes()
        # and relying on INET_ATON('') to produce it
        $ResultSql = ($Result === NULL) ? "NULL"
                : "INET_ATON('".addslashes($Result)."')";

        # and update database cache
        $DB = new Database();
        $DB->Query(
            "UPDATE BotDetector_DNSCache "
            ."SET Result=".$ResultSql.", Retrieved=NOW() "
            ."WHERE IPAddress=INET_ATON('".addslashes($RemoteIP)."')");
    }

    # ---- PRIVATE INTERFACE ---------------------------------------------------

    /**
    * Check the client IP against Project Honeypot's Http:BL service.
    * Results are cached in the BotDetector_DNSCache table so that a DNS
    * lookup is not needed on every page load, and the outcome (including a
    * NULL outcome) is remembered for the rest of the current page load.
    * Only IPv4 client addresses are checked, because both the lookup
    * hostname and the cache table are built from dotted-quad addresses.
    * @return NULL when no Access Key is configured, when the client address
    * is local or not a valid IPv4 address, or on errors; FALSE for
    * non-bots; and an array of host information for bots.
    * For all bots, this array contains a BotType.  For search bots,
    * it will also have a SearchEngine.  For other kinds of bot, it
    * will have LastActivity and ThreatScore.  Meaning for these
    * elements is described in the Http:BL API documentation:
    * http://www.projecthoneypot.org/httpbl_api.php
    */
    private function CheckHttpBL()
    {
        static $HttpBLValue = NULL;
        # (separate flag because isset() cannot tell a cached NULL outcome
        #  apart from "not checked yet")
        static $AlreadyChecked = FALSE;

        if (!$AlreadyChecked)
        {
            # assume that nothing can be determined until shown otherwise
            $HttpBLValue = NULL;

            $RemoteIP = isset($_SERVER["REMOTE_ADDR"])
                    ? $_SERVER["REMOTE_ADDR"] : NULL;
            $AccessKey = $this->ConfigSetting("HttpBLAccessKey");

            # if the client address is a valid IPv4 address (which also
            # rules out the IPv6 localhost "::1"), is not the IPv4
            # localhost, and a key is set and of the right format
            if (is_string($RemoteIP)
                    && ($RemoteIP !== "127.0.0.1")
                    && (filter_var($RemoteIP, FILTER_VALIDATE_IP,
                            FILTER_FLAG_IPV4) !== FALSE)
                    && self::BlacklistAccessKeyLooksValid($AccessKey))
            {
                # grab an AF Lock named for this IP
                $LockName = "BotDetector_IP_".$RemoteIP;
                $GLOBALS["AF"]->GetLock($LockName);

                # check to see if we have a cached status for this IP
                # so that we're not doing a dnslookup on every pageload
                $DB = new Database();
                $DB->Query("SELECT INET_NTOA(Result) as Rx FROM BotDetector_DNSCache "
                           ."WHERE IPAddress=INET_ATON('".addslashes($RemoteIP)."')");

                # if a cached HttpBL result was found
                if ($DB->NumRowsSelected()>0)
                {
                    # use it and update the LastUsed time for this cache row
                    $Row = $DB->FetchRow();
                    $Result = $Row["Rx"];
                    $DB->Query(
                        "UPDATE BotDetector_DNSCache SET LastUsed=NOW()"
                        ." WHERE IPAddress=INET_ATON('".addslashes($RemoteIP)."')");
                }
                else
                {
                    # if nothing was in the cache, do the DNS lookup in the foreground
                    $Result = self::DoHttpBLDNSLookup($AccessKey, $RemoteIP);

                    # and store the result in the cache (a lookup with no
                    # listing gives NULL, which is stored as an SQL NULL
                    # explicitly rather than via INET_ATON(''))
                    $ResultSql = ($Result === NULL) ? "NULL"
                            : "INET_ATON('".addslashes($Result)."')";
                    $DB->Query("INSERT INTO BotDetector_DNSCache"
                            ." (IPAddress, Result, LastUsed) VALUES "
                            ."(INET_ATON('".addslashes($RemoteIP)."'), "
                            ." ".$ResultSql.", NOW())");
                }
                $GLOBALS["AF"]->ReleaseLock($LockName);

                if ($Result === NULL)
                {
                    # no blacklist entry found = not a bot
                    $HttpBLValue = FALSE;
                }
                else
                {
                    # found blacklist entry; parse the reply to figure out what it said
                    $Data = explode('.', $Result);

                    # reply should have four octets, with the first octet
                    # being 127 for correctly formed queries
                    if ((count($Data) == 4) && ($Data[0] == 127))
                    {
                        # pull the Bot Type information out of the fourth octet
                        $HttpBLValue = [ "BotType" => $Data[3] ];

                        if ($Data[3] == 0)
                        {
                            # if the bot was a search engine, then the engine type can be
                            # extracted from the third octet
                            $HttpBLValue["SearchEngine"] = $Data[2];
                        }
                        else
                        {
                            # for other bot types, the number of days since last activity
                            # is in the second octet, and a Threat Score is in the third
                            $HttpBLValue["LastActivity"] = $Data[1];
                            $HttpBLValue["ThreatScore"]  = $Data[2];
                        }
                    }
                    # (otherwise the value stays NULL to indicate an error;
                    #  the API documentation suggests that the most common
                    #  problem is an incorrect access key)
                }
            }

            $AlreadyChecked = TRUE;
        }

        return $HttpBLValue;
    }

    /**
    * Look up an address in HttpBL by resolving the synthetic hostname
    * their API specifies: the access key, then the octets of the address
    * in reverse order, then the dnsbl.httpbl.org zone.
    * @param string $AccessKey HttpBL access key.
    * @param string $IpAddress Address to query in dotted-quad notation.
    * @return string Synthetic IP address returned from dnsbl or NULL
    *     when nothing is returned.
    */
    private static function DoHttpBLDNSLookup($AccessKey, $IpAddress)
    {
        # assemble the hostname to resolve
        $Octets = explode('.', $IpAddress);
        $Hostname = $AccessKey."."
                .implode('.', array_reverse($Octets))
                .".dnsbl.httpbl.org.";

        $Answer = gethostbyname($Hostname);

        # (gethostbyname() hands back its argument when resolution fails)
        return ($Answer == $Hostname) ? NULL : $Answer;
    }

    /**
    * Determine if the configured HttpBL access key is in the correct
    * format, i.e. consists of exactly 12 lowercase letters and nothing
    * else.  The pattern is anchored at both ends because the key becomes
    * part of the hostname used for DNS lookups, so a string that merely
    * contains 12 lowercase letters somewhere must not be accepted.
    * @param string $AccessKey Access key value to test.
    * @return bool TRUE for valid-looking keys, FALSE otherwise (including
    *     for values that are not strings, such as an unset setting).
    */
    private static function BlacklistAccessKeyLooksValid($AccessKey)
    {
        # (\A and \z rather than ^ and $ so that a trailing newline is
        #  not tolerated either)
        return is_string($AccessKey)
                && (preg_match('/\A[a-z]{12}\z/', $AccessKey) === 1);
    }

    # constants describing BotType bitset returned by Http:BL
    # (this is the value taken from the fourth octet of the Http:BL reply
    #  and returned as "BotType" by CheckHttpBL(); BT_SEARCHENGINE is the
    #  value with no bits set, while the others are individual flags that
    #  are tested with a bitwise AND, e.g. in CheckForSpamBot())
    const BT_SEARCHENGINE   = 0;
    const BT_SUSPICIOUS     = 1;
    const BT_HARVESTER      = 2;
    const BT_COMMENTSPAMMER = 4;

    # constants describing the Search Engines returned by Http:BL
    # (this is the value taken from the third octet of the Http:BL reply
    #  for search engines and returned as "SearchEngine" by CheckHttpBL())
    # NOTE(review): SE_ALTAVIST presumably stands for AltaVista and looks
    #  like a truncated name -- confirm against the Http:BL API
    #  documentation; renaming it would break any code that refers to it
    const SE_UNDOCUMENTED =  0;
    const SE_ALTAVIST     =  1;
    const SE_ASK          =  2;
    const SE_BAIDU        =  3;
    const SE_EXCITE       =  4;
    const SE_GOOGLE       =  5;
    const SE_LOOKSMART    =  6;
    const SE_LYCOS        =  7;
    const SE_MSN          =  8;
    const SE_YAHOO        =  9;
    const SE_CUIL         = 10;
    const SE_INFOSEEK     = 11;
    const SE_MISC         = 12;

    ## Borrow patterns for known bots from awstats-7.3, lib/robots.pm
    ## Here, we're taking all three of their bots lists (common, uncommon, and generic)
    // @codingStandardsIgnoreStart
    private $Robots = array(
        ## From RobotsSearchIdOrder_list1
        'appie', 'architext', 'bingpreview', 'bjaaland', 'contentmatch',
        'ferret', 'googlebot\-image', 'googlebot', 'google\-sitemaps',
        'google[_+ ]web[_+ ]preview', 'grabber', 'gulliver',
        'virus[_+ ]detector', 'harvest', 'htdig', 'jeeves', 'linkwalker',
        'lilina', 'lycos[_+ ]', 'moget', 'muscatferret', 'myweb', 'nomad',
        'scooter', 'slurp', '^voyager\/', 'weblayers',
        'antibot', 'bruinbot', 'digout4u', 'echo!', 'fast\-webcrawler',
        'ia_archiver\-web\.archive\.org', 'ia_archiver', 'jennybot', 'mercator',
        'netcraft', 'msnbot\-media', 'msnbot', 'petersnews', 'relevantnoise\.com',
        'unlost_web_crawler', 'voila', 'webbase', 'webcollage', 'cfetch', 'zyborg',
        'wisenutbot',
        ## From RobotsSearchIdOrder_list2
        '[^a]fish', 'abcdatos', 'abonti\.com', 'acme\.spider', 'ahoythehomepagefinder',
        'ahrefsbot', 'alkaline', 'anthill', 'arachnophilia', 'arale', 'araneo',
        'aretha', 'ariadne', 'powermarks', 'arks', 'aspider', 'atn\.txt',
        'atomz', 'auresys', 'backrub', 'bbot', 'bigbrother', 'blackwidow',
        'blindekuh', 'bloodhound', 'borg\-bot', 'brightnet', 'bspider', 'cactvschemistryspider',
        'calif[^r]', 'cassandra', 'cgireader', 'checkbot', 'christcrawler',
        'churl', 'cienciaficcion', 'collective', 'combine', 'conceptbot',
        'coolbot', 'core', 'cosmos', 'cruiser', 'cusco',
        'cyberspyder', 'desertrealm', 'deweb', 'dienstspider', 'digger',
        'diibot', 'direct_hit', 'dnabot', 'download_express', 'dragonbot',
        'dwcp', 'e\-collector', 'ebiness', 'elfinbot', 'emacs',
        'emcspider', 'esther', 'evliyacelebi', 'fastcrawler', 'feedcrawl',
        'fdse', 'felix', 'fetchrover', 'fido', 'finnish',
        'fireball', 'fouineur', 'francoroute', 'freecrawl', 'funnelweb',
        'gama', 'gazz', 'gcreep', 'getbot', 'geturl',
        'golem', 'gougou', 'grapnel', 'griffon', 'gromit',
        'gulperbot', 'hambot', 'havindex', 'hometown', 'htmlgobble',
        'hyperdecontextualizer', 'iajabot', 'iaskspider', 'hl_ftien_spider', 'sogou',
        'icjobs\.de', 'iconoclast', 'ilse', 'imagelock', 'incywincy',
        'informant', 'infoseek', 'infoseeksidewinder', 'infospider', 'inspectorwww',
        'intelliagent', 'irobot', 'iron33', 'israelisearch', 'javabee',
        'jbot', 'jcrawler', 'jobo', 'jobot', 'joebot',
        'jubii', 'jumpstation', 'kapsi', 'katipo', 'kilroy',
        'ko[_+ ]yappo[_+ ]robot', 'kummhttp', 'labelgrabber\.txt', 'larbin', 'legs',
        'linkidator', 'linkscan', 'lockon', 'logo_gif', 'macworm',
        'magpie', 'marvin', 'mattie', 'mediafox', 'merzscope',
        'meshexplorer', 'mindcrawler', 'mnogosearch', 'momspider', 'monster',
        'motor', 'muncher', 'mwdsearch', 'ndspider', 'nederland\.zoek',
        'netcarta', 'netmechanic', 'netscoop', 'newscan\-online', 'nhse',
        'northstar', 'nzexplorer', 'objectssearch', 'occam', 'octopus',
        'openfind', 'orb_search', 'packrat', 'pageboy', 'parasite',
        'patric', 'pegasus', 'perignator', 'perlcrawler', 'phantom',
        'phpdig', 'piltdownman', 'pimptrain', 'pioneer', 'pitkow',
        'pjspider', 'plumtreewebaccessor', 'poppi', 'portalb', 'psbot',
        'python', 'raven', 'rbse', 'resumerobot', 'rhcs',
        'road_runner', 'robbie', 'robi', 'robocrawl', 'robofox',
        'robozilla', 'roverbot', 'rules', 'safetynetrobot', 'search\-info',
        'search_au', 'searchprocess', 'senrigan', 'sgscout', 'shaggy',
        'shaihulud', 'sift', 'simbot', 'site\-valet', 'sitetech',
        'skymob', 'slcrawler', 'smartspider', 'snooper', 'solbot',
        'speedy', 'spider[_+ ]monkey', 'spiderbot', 'spiderline', 'spiderman',
        'spiderview', 'spry', 'sqworm', 'ssearcher', 'suke',
        'sunrise', 'suntek', 'sven', 'tach_bw', 'tagyu_agent',
        'tailrank', 'tarantula', 'tarspider', 'techbot', 'templeton',
        'titan', 'titin', 'tkwww', 'tlspider', 'ucsd',
        'udmsearch', 'universalfeedparser', 'urlck', 'valkyrie', 'verticrawl',
        'victoria', 'visionsearch', 'voidbot', 'vwbot', 'w3index',
        'w3m2', 'wallpaper', 'wanderer', 'wapspIRLider', 'webbandit',
        'webcatcher', 'webcopy', 'webfetcher', 'webfoot', 'webinator',
        'weblinker', 'webmirror', 'webmoose', 'webquest', 'webreader',
        'webreaper', 'websnarf', 'webspider', 'webvac', 'webwalk',
        'webwalker', 'webwatch', 'whatuseek', 'whowhere', 'wired\-digital',
        'wmir', 'wolp', 'wombat', 'wordpress', 'worm',
        'woozweb', 'wwwc', 'wz101', 'xget',
        '1\-more_scanner', '360spider', 'a6-indexer', 'accoona\-ai\-agent', 'activebookmark',
        'adamm_bot', 'adsbot-google', 'almaden', 'aipbot', 'aleadsoftbot',
        'alpha_search_agent', 'allrati', 'aport', 'archive\.org_bot', 'argus',
        'arianna\.libero\.it', 'aspseek', 'asterias', 'awbot', 'backlinktest\.com',
        'baiduspider', 'becomebot', 'bender', 'betabot', 'biglotron',
        'bittorrent_bot', 'biz360[_+ ]spider', 'blogbridge[_+ ]service', 'bloglines', 'blogpulse',
        'blogsearch', 'blogshares', 'blogslive', 'blogssay', 'bncf\.firenze\.sbn\.it\/raccolta\.txt',
        'bobby', 'boitho\.com\-dc', 'bookmark\-manager', 'boris', 'bubing',
        'bumblebee', 'candlelight[_+ ]favorites[_+ ]inspector', 'careerbot', 'cbn00glebot', 'cerberian_drtrs',
        'cfnetwork', 'cipinetbot', 'checkweb_link_validator', 'commons\-httpclient',
        'computer_and_automation_research_institute_crawler',
        'converamultimediacrawler', 'converacrawler', 'copubbot', 'cscrawler',
        'cse_html_validator_lite_online', 'cuasarbot', 'cursor', 'custo', 'datafountains\/dmoz_downloader',
        'dataprovider\.com', 'daumoa', 'daviesbot', 'daypopbot', 'deepindex',
        'dipsie\.bot', 'dnsgroup', 'domainchecker', 'domainsdb\.net', 'dulance',
        'dumbot', 'dumm\.de\-bot', 'earthcom\.info', 'easydl', 'eccp',
        'edgeio\-retriever', 'ets_v', 'exactseek', 'extreme[_+ ]picture[_+ ]finder', 'eventax',
        'everbeecrawler', 'everest\-vulcan', 'ezresult', 'enteprise', 'facebook',
        'fast_enterprise_crawler.*crawleradmin\.t\-info@telekom\.de',
        'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de',
        'matrix_s\.p\.a\._\-_fast_enterprise_crawler',
        'fast_enterprise_crawler', 'fast\-search\-engine', 'favicon', 'favorg', 'favorites_sweeper',
        'feedburner', 'feedfetcher\-google', 'feedflow', 'feedster', 'feedsky',
        'feedvalidator', 'filmkamerabot', 'filterdb\.iss\.net', 'findlinks', 'findexa_crawler',
        'firmilybot', 'foaf-search\.net', 'fooky\.com\/ScorpionBot', 'g2crawler', 'gaisbot',
        'geniebot', 'gigabot', 'girafabot', 'global_fetch', 'gnodspider',
        'goforit\.com', 'goforitbot', 'gonzo', 'grapeshot', 'grub',
        'gpu_p2p_crawler', 'henrythemiragorobot', 'heritrix', 'holmes', 'hoowwwer',
        'hpprint', 'htmlparser', 'html[_+ ]link[_+ ]validator', 'httrack', 'hundesuche\.com\-bot',
        'i-bot', 'ichiro', 'iltrovatore\-setaccio', 'infobot', 'infociousbot',
        'infohelfer', 'infomine', 'insurancobot', 'integromedb\.org', 'internet[_+ ]ninja',
        'internetarchive', 'internetseer', 'internetsupervision', 'ips\-agent', 'irlbot',
        'isearch2006', 'istellabot', 'iupui_research_bot',
        'jrtwine[_+ ]software[_+ ]check[_+ ]favorites[_+ ]utility', 'justview',
        'kalambot', 'kamano\.de_newsfeedverzeichnis', 'kazoombot', 'kevin', 'keyoshid',
        'kinjabot', 'kinja\-imagebot', 'knowitall', 'knowledge\.com', 'kouaa_krawler',
        'krugle', 'ksibot', 'kurzor', 'lanshanbot', 'letscrawl\.com',
        'libcrawl', 'linkbot', 'linkdex\.com', 'link_valet_online', 'metager\-linkchecker',
        'linkchecker', 'livejournal\.com', 'lmspider', 'ltbot', 'lwp\-request',
        'lwp\-trivial', 'magpierss', 'mail\.ru', 'mapoftheinternet\.com', 'mediapartners\-google',
        'megite', 'metaspinner', 'miadev', 'microsoft bits', 'microsoft.*discovery',
        'microsoft[_+ ]url[_+ ]control', 'mini\-reptile', 'minirank', 'missigua_locator', 'misterbot',
        'miva', 'mizzu_labs', 'mj12bot', 'mojeekbot', 'msiecrawler',
        'ms_search_4\.0_robot', 'msrabot', 'msrbot', 'mt::telegraph::agent', 'mydoyouhike',
        'nagios', 'nasa_search', 'netestate ne crawler', 'netluchs', 'netsprint',
        'newsgatoronline', 'nicebot', 'nimblecrawler', 'noxtrumbot', 'npbot',
        'nutchcvs', 'nutchosu\-vlib', 'nutch', 'ocelli', 'octora_beta_bot',
        'omniexplorer[_+ ]bot', 'onet\.pl[_+ ]sa', 'onfolio', 'opentaggerbot', 'openwebspider',
        'oracle_ultra_search', 'orbiter', 'yodaobot', 'qihoobot', 'passwordmaker\.org',
        'pear_http_request_class', 'peerbot', 'perman', 'php[_+ ]version[_+ ]tracker', 'pictureofinternet',
        'ping\.blo\.gs', 'plinki', 'pluckfeedcrawler', 'pogodak', 'pompos',
        'popdexter', 'port_huron_labs', 'postfavorites', 'projectwf\-java\-test\-crawler', 'proodlebot',
        'pyquery', 'rambler', 'redalert', 'rojo', 'rssimagesbot',
        'ruffle', 'rufusbot', 'sandcrawler', 'sbider', 'schizozilla',
        'scumbot', 'searchguild[_+ ]dmoz[_+ ]experiment', 'searchmetricsbot', 'seekbot', 'semrushbot',
        'sensis_web_crawler', 'seokicks\.de', 'seznambot', 'shim\-crawler', 'shoutcast',
        'siteexplorer\.info', 'slysearch', 'snap\.com_beta_crawler', 'sohu\-search', 'sohu',
        'snappy', 'spbot', 'sphere_scout', 'spiderlytics', 'spip',
        'sproose_crawler', 'ssearch_bot', 'steeler', 'steroid__download', 'suchfin\-bot',
        'superbot', 'surveybot', 'susie', 'syndic8', 'syndicapi',
        'synoobot', 'tcl_http_client_package', 'technoratibot', 'teragramcrawlersurf', 'test_crawler',
        'testbot', 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e', 'topicblogs', 'turnitinbot', 'turtlescanner',
        'turtle', 'tutorgigbot', 'twiceler', 'ubicrawler', 'ultraseek',
        'unchaos_bot_hybrid_web_search_engine', 'unido\-bot', 'unisterbot', 'updated', 'ustc\-semantic\-group',
        'vagabondo\-wap', 'vagabondo', 'vermut', 'versus_crawler_from_eda\.baykan@epfl\.ch', 'vespa_crawler',
        'vortex', 'vse\/', 'w3c\-checklink', 'w3c[_+ ]css[_+ ]validator[_+ ]jfouffa', 'w3c_validator',
        'watchmouse', 'wavefire', 'waybackarchive\.org', 'webclipping\.com', 'webcompass',
        'webcrawl\.net', 'web_downloader', 'webdup', 'webfilter', 'webindexer',
        'webminer', 'website[_+ ]monitoring[_+ ]bot', 'webvulncrawl', 'wells_search', 'wesee:search',
        'wonderer', 'wume_crawler', 'wwweasel', 'xenu\'s_link_sleuth', 'xenu_link_sleuth',
        'xirq', 'y!j', 'yacy', 'yahoo\-blogs', 'yahoo\-verticalcrawler',
        'yahoofeedseeker', 'yahooseeker\-testing', 'yahooseeker', 'yahoo\-mmcrawler', 'yahoo!_mindset',
        'yandex', 'flexum', 'yanga', 'yet-another-spider', 'yooglifetchagent',
        'z\-add_link_checker', 'zealbot', 'zhuaxia', 'zspider', 'zeus',
        'ng\/1\.', 'ng\/2\.', 'exabot',
        'alltop', 'applesyndication', 'asynchttpclient', 'bingbot', 'blogged_crawl',
        'bloglovin', 'butterfly', 'buzztracker', 'carpathia', 'catbot',
        'chattertrap', 'check_http', 'coldfusion', 'covario', 'daylifefeedfetcher',
        'discobot', 'dlvr\.it', 'dreamwidth', 'drupal', 'ezoom',
        'feedmyinbox', 'feedroll\.com', 'feedzira', 'fever\/', 'freenews',
        'geohasher', 'hanrss', 'inagist', 'jacobin club', 'jakarta',
        'js\-kit', 'largesmall crawler', 'linkedinbot', 'longurl', 'metauri',
        'microsoft\-webdav\-miniredir', '^motorola$', 'movabletype', 'netnewswire', ' netseer ',
        'netvibes', 'newrelicpinger', 'newsfox', 'nextgensearchbot', 'ning',
        'pingdom', 'pita', 'postpost', 'postrank', 'printfulbot',
        'protopage', 'proximic', 'quipply', 'r6\_', 'ratingburner',
        'regator', 'rome client', 'rpt\-httpclient', 'rssgraffiti', 'sage\+\+',
        'scoutjet', 'simplepie', 'sitebot', 'summify\.com', 'superfeedr',
        'synthesio', 'teoma', 'topblogsinfo', 'topix\.net', 'trapit',
        'trileet', 'tweetedtimes', 'twisted pagegetter', 'twitterbot', 'twitterfeed',
        'unwindfetchor', 'wazzup', 'windows\-rss\-platform', 'wiumi', 'xydo',
        'yahoo! slurp', 'yahoo pipes', 'yahoo\-newscrawler', 'yahoocachesystem', 'yahooexternalcache',
        'yahoo! searchmonkey', 'yahooysmcm', 'yammer', 'yeti', 'yie8',
        'youdao', 'yourls', 'zemanta', 'zend_http_client', 'zumbot',
        'wget', 'libwww', '^java\/[0-9]',
        ## From RobotsSearchIdOrder_listgen
        'robot', 'checker', 'crawl', 'discovery', 'hunter',
        'scanner', 'spider', 'sucker', 'bot[\s_+:,\.\;\/\\\-]', '[\s_+:,\.\;\/\\\-]bot',
        'curl', 'php', 'ruby\/', 'no_user_agent'
        );
    // @codingStandardsIgnoreEnd

    # SQL statements to create the database tables used by this plugin,
    # indexed by table name without the "BotDetector_" prefix
    # (passed to CreateTables(), CreateMissingTables(), and DropTables()
    #  when installing, upgrading, and uninstalling the plugin)
    #  - DNSCache holds Http:BL lookup results per client IP, with the time
    #    when each result was retrieved and when it was last used
    #  - CanaryData holds, per client IP, when the canary was last shown
    #    and when it was last loaded (NULL when that has not happened)
    # (IP addresses and lookup results are stored as unsigned integers
    #  and converted with INET_ATON() / INET_NTOA() in the queries)
    private $SqlTables = [
        "DNSCache" => "CREATE TABLE BotDetector_DNSCache (
               IPAddress INT UNSIGNED,
               Result INT UNSIGNED,
               Retrieved TIMESTAMP DEFAULT NOW(),
               LastUsed TIMESTAMP,
               INDEX (IPAddress),
               INDEX (LastUsed),
               INDEX (Retrieved) )",
        "CanaryData" => "CREATE TABLE BotDetector_CanaryData (
               IPAddress INT UNSIGNED,
               CanaryLastShown TIMESTAMP NULL DEFAULT NULL,
               CanaryLastLoaded TIMESTAMP NULL DEFAULT NULL,
               PRIMARY KEY (IPAddress),
               INDEX (CanaryLastShown),
               INDEX (CanaryLastLoaded) )",
    ];
}
