/**** user variables ****/

/* Path of the spider log. The file has to exist already and must be */
/* writable by the web server (the original note suggests chmod 777). */
$log_file = '/home/joe/log/spiderlog';

/* Timestamp of the current request, formatted like 07.Mar.2001 14:05:09 */
$date = date('d.M.Y H:i:s');

/* Detection strategy: a request counts as a major search engine when it */
/* matches $spider_footprint (user agent) or $spider_ip (remote address). */
/* Requests matching neither are checked against $browser_footprint; */
/* anything that does not look like a popular browser is assumed to be */
/* a smaller search engine. */

/* User-agent fragments of known spiders. */
$spider_footprint = array(
    'cooter', 'lurp', 'rawler', 'pider', 'obot', 'eek',
    'oogle', 'canner', 'rachnoidea', 'ulliver', 'arvest', 'ireball', 'idewinder'
);

/* Remote addresses (mostly network prefixes, some single hosts) of known spiders. */
$spider_ip = array(
    '204.123.', '204.74.103.', '203.108.10.', '195.4.183.',
    '195.242.46.', '198.3.97.', '204.62.245.', '193.189.227.', '209.1.12.',
    '204.162.96.', '204.162.98.', '194.121.108.', '128.182.72.', '207.77.91.',
    '206.79.171.', '207.77.90.', '208.213.76.', '194.124.202.', '193.114.89.',
    '193.131.74.', '131.84.1.', '208.219.77.', '206.64.113.', '195.186.1.',
    '195.3.97.', '194.191.121.', '139.175.250.', '209.73.233.', '194.191.121.',
    '198.49.220.', '204.62.245.', '198.3.99.', '198.2.101.', '204.192.112.',
    '206.181.238', '208.215.47.', '171.64.75.', '204.162.98.', '204.162.96.',
    '204.123.9.52', '204.123.2.44', '204.74.103.39', '204.123.9.53', '204.62.245.',
    '206.64.113.', '194.100.28.20', '204.138.115.', '94.22.130.', '164.195.64.1',
    '205.181.75.169', '129.170.24.57', '204.162.96.', '204.162.96.', '204.162.98.',
    '204.162.96.', '207.77.90.', '207.77.91.', '208.200.146.', '204.123.9.20',
    '204.138.115.', '209.1.32.', '209.1.12.', '192.216.46.49', '192.216.46.31',
    '192.216.46.30', '203.9.252.2'
);

/* User-agent fragments that identify an ordinary browser. */
/* 'Mozilla' is deliberately missing: many spiders claim to be Mozilla, */
/* while genuine Mozilla browsers almost always carry one of the */
/* fragments below as well. */
$browser_footprint = array(
    '95', '98', 'MSIE', 'NT', 'Opera', '16', '32',
    'MAC', 'Mac', 'X11', 'WebTV', 'OS', 'Lynx', 'IBrowse', 'IWENG', 'PRODIGY',
    'Mosaic', 'InterGO', 'Gold', 'zzZ', 'Mozzarella', 'Vampire', 'Cache', 'libwww',
    'TURBO', 'WebCompass', 'Konqueror', 'Wget', 'Amiga'
);
/**** spider or not? ********/
/* Classifies the current request and leaves the verdict in $is_spider */
/* (1 = treat as spider, 0 = ordinary browser). Three checks run in order: */
/*   1. the user agent contains a known spider footprint, */
/*   2. the remote address belongs to a known spider network/host, */
/*   3. otherwise: spider unless the user agent carries a browser footprint. */
/* getenv() returns false for a missing variable; the casts turn that into */
/* an empty string so the string functions below always receive strings. */
$agent = (string) getenv('HTTP_USER_AGENT');
$host_ip = (string) getenv('REMOTE_ADDR');
$is_spider = 0;

/* 1. Spider footprint anywhere in the user agent (case-sensitive). */
foreach ($spider_footprint as $footprint) {
    if (strpos($agent, $footprint) !== false) {
        $is_spider = 1;
        break;
    }
}

/* 2. Remote address against the spider address list. */
if ($is_spider != 1 && $host_ip !== '') {
    /* Unwrap IPv4-mapped IPv6 addresses (::ffff:a.b.c.d) so they are */
    /* compared by their IPv4 part. */
    $ipv4 = $host_ip;
    if (strncasecmp($ipv4, '::ffff:', 7) === 0) {
        $ipv4 = substr($ipv4, 7);
    }
    /* Both sides get exactly one trailing dot, so an entry only matches on */
    /* whole octets from the start of the address: "94.22.130." no longer */
    /* matches 194.22.130.x, and the single host "164.195.64.1" no longer */
    /* matches 164.195.64.100. Entries with or without a trailing dot */
    /* ("206.181.238") behave the same. */
    $ip_dotted = $ipv4 . '.';
    foreach ($spider_ip as $entry) {
        $prefix = rtrim($entry, '.') . '.';
        if (strncmp($ip_dotted, $prefix, strlen($prefix)) === 0) {
            $is_spider = 1;
            break;
        }
    }
}

/* 3. No match so far: assume a (smaller) spider unless the user agent */
/*    looks like a popular browser. */
if ($is_spider != 1) {
    $is_spider = 1; /* when in doubt, it's logged */
    foreach ($browser_footprint as $footprint) {
        if (strpos($agent, $footprint) !== false) {
            $is_spider = 0;
            break;
        }
    }
}
/*** log the spider access ***/
/* Appends one entry to $log_file when the request was classified as a */
/* spider. The test has to be a comparison (==): a single = would assign 1 */
/* to $is_spider and log every request regardless of the detection result. */
if ($is_spider == 1) {
    /* Reverse lookup is best effort; @ hides the warning for malformed */
    /* addresses, in which case the host name is simply left empty. */
    $host_name = @gethostbyaddr($host_ip);

    /* $PHP_SELF only exists as a global with register_globals; fall back */
    /* to the $_SERVER entry when it is not set. */
    if (isset($PHP_SELF)) {
        $script = $PHP_SELF;
    } elseif (isset($_SERVER['PHP_SELF'])) {
        $script = $_SERVER['PHP_SELF'];
    } else {
        $script = '';
    }

    /* NOTE(review): the entry contains a line break before the address, */
    /* exactly as the original multi-line literal did - possibly a wrapping */
    /* artifact; confirm before changing the log format. */
    $newentry = "URL: $script, Date: $date, RemoteHost: $host_name\n($host_ip), UserAgent: $agent\n";

    /* Append instead of reading the whole log and rewriting it: rewriting */
    /* loses entries (or the entire log) when two requests overlap and gets */
    /* slower as the log grows. */
    $fp = fopen($log_file, "a");
    if ($fp) {
        /* Exclusive lock keeps concurrent entries from interleaving. */
        flock($fp, LOCK_EX);
        fputs($fp, $newentry);
        flock($fp, LOCK_UN);
        fclose($fp);
    }
}
?>