Primera parte de Spider
Código PHP:
<?php
// Bootstrap: load the shared configuration. The rest of this script reads
// $spiderhost and $spiderday without defining them, so they are presumably
// set by this include (TODO confirm), as is the MySQL connection used below.
require('../includes/config.inc.php');

// Identify the spider to remote servers. The ini key must be a quoted string:
// a bare `user_agent` is an undefined constant (notice on PHP 5, Error on PHP 8).
ini_set('user_agent', "$spiderhost");

// $spiderday is multiplied by the number of seconds in a day (86400) so it can
// later be subtracted from a Unix timestamp.
$spiderday = $spiderday * 86400;

// A crawl can run for a long time; lift the execution time limit.
set_time_limit(0);

// Remember when the run began so it can be reported again at the end.
$starttime = date("H:i m/d/y");
echo "##### The Spider has started at $starttime, Do Not Close This Console #####\n\n";
// Main crawl loop.
//
// Each pass selects every row of `search` with flag='0', fetches the page,
// validates it, stores title/meta/body/checksum back into `search`, then
// extracts same-host links and hands them to checkandupdatetoindexer().
// Links collected in `indexer` are copied into `search` as new rows. The outer
// loop repeats while `search` still contains rows whose checksum is '0'.
//
// NOTE(review): checkmetarobots(), hardcleanup(), checkforgarbage(),
// checkrobotstxt() and checkandupdatetoindexer() are defined outside this
// section and are called without the data they need, so they presumably read
// and write the script's globals ($robots, $read, $body, $link, $robotay, ...).
// The variable names below are therefore left unchanged on purpose.

// $ourl holds the URL of the previously completed row; make sure it exists
// before the first comparison.
if (!isset($ourl)) {
    $ourl = null;
}

do {
    // Open the database and start looking at URLs
    $sql = mysql_query("SELECT * FROM search WHERE flag='0' ORDER BY date");
    while ($rslt = mysql_fetch_array($sql)) {
        $url_id = $rslt["url_id"];
        $url = $rslt["url"];
        $crc = $rslt["checksum"];
        $date = $rslt["date"];
        // Read the flag from the row; previously $flag was never assigned.
        $flag = $rslt["flag"];

        // Skip a URL identical to the one that was just processed.
        if ($url === $ourl) {
            echo "<br>WARNING: Repeditive URL got through: $ourl\n";
            continue;
        }

        // Make an announcement
        echo "<br>\nNow Processing: $url\n";

        // Don't go there if you don't have to
        if ($flag == 1) {
            echo "<br>This url is already up to date.";
            continue;
        }

        // Open URL for parsing
        $read = @file_get_contents($url);
        // NOTE(review): this prints the entire fetched page to the output;
        // it looks like leftover debugging - confirm before removing.
        echo $read;
        if (!$read || $read == '') {
            echo "<br>Killing off dead URL: $url\n";
            $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
            continue;
        }

        // Check for binaries: any byte in the range chr(14)..chr(26) causes
        // the URL to be dropped.
        $ckbin = 14;
        while ($ckbin <= 26) {
            $ck = chr($ckbin);
            $cbin = substr_count($read, $ck);
            if ($cbin > 0) {
                echo "<br>Killing off binary file URL: $url\n";
                $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
                // Leave the byte scan AND move on to the next URL.
                continue 2;
            }
            ++$ckbin;
        }

        // Set date and checksum info
        $checksum = crc32($read);
        // time() replaces date(U), which relied on an undefined constant.
        $daycheck = time() - $spiderday;
        $date = strtotime($date);
        // Unchanged content that was indexed within the refresh window needs no work.
        if ($date > $daycheck && $crc == $checksum) {
            echo "<br>This url is already up to date\n";
            continue;
        }

        // Drop pages whose first characters are a <script tag.
        $firstcheck = trim(strtolower(substr($read, 0, 8)));
        if ($firstcheck == "<script" || $firstcheck == "< script") {
            echo "<br>Killing off incompatible file at: $url\n";
            $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
            continue;
        }

        // Get meta tags (single quotes become backticks so the values can sit
        // inside a single-quoted SQL literal; description is capped at 255 bytes).
        $meta = @get_meta_tags($url);
        $robots = $meta["robots"];
        $keywords = $meta["keywords"];
        $keywords = str_replace("'", "`", $keywords);
        $description = $meta["description"];
        $description = str_replace("'", "`", $description);
        if (strlen($description) > 255) {
            $description = substr($description, 0, 255);
        }

        // Check robots meta tags: "noindex" or "none" removes the URL.
        $metarobots = "noindex";
        if (checkmetarobots($metarobots)) {
            echo "<br>Indexing disallowed by robots meta tag: $url\n";
            $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
            continue;
        }
        $metarobots = "none";
        if (checkmetarobots($metarobots)) {
            echo "<br>Indexing disallowed by robots meta tag: $url\n";
            $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
            continue;
        }

        // Get the page title: split on "title>" (case-insensitive) so element 1
        // is the text between the opening tag and "</", then strip that "</".
        // preg_split replaces the deprecated spliti with the same semantics.
        $temp = preg_split('/title>/i', $read, 3);
        $title = substr($temp[1], 0, -2);
        $title = str_replace("'", "`", $title);
        if (strlen($title) > 128) {
            $title = substr($title, 0, 128);
        }
        if ($title == "") {
            $title = "No Title";
        }

        // Run the cleanup function to parse all the garbage and whitespace out of the code
        if (!hardcleanup()) {
            // Report the page being processed; $link is not set yet at this point.
            echo "<br>Clean up error on $url\n";
            continue;
        }

        // Decode entities and put the page in the search database
        echo "<br>Updating: $title\n$url\n";
        $title = html_entity_decode($title);
        $description = html_entity_decode($description);
        $body = html_entity_decode($body);
        // The URL comes from crawled pages, so escape it before embedding it in SQL.
        // NOTE(review): $title, $keywords, $description and $body are still
        // interpolated as-is; whether hardcleanup() already escapes them is
        // not visible here - confirm, then escape them as well.
        $url_sql = mysql_real_escape_string($url);
        $renew = @mysql_unbuffered_query("UPDATE search SET url='$url_sql', title='$title', metak='$keywords', metad='$description', checksum='$checksum', date=CURDATE(), flag=1, body='$body' WHERE url_id='$url_id'");
        if (!$renew || $renew == "") {
            echo "<br>NOT UPDATED: $url\n";
            $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
            continue;
        } else {
            // Second statement re-asserts flag=1; a failure here also drops the row.
            $renew = @mysql_unbuffered_query("UPDATE search SET flag=1 WHERE url_id='$url_id'");
            if (!$renew || $renew == "") {
                echo "<br>NOT UPDATED: $url\n";
                $kill = mysql_unbuffered_query("DELETE FROM search WHERE url_id='$url_id'");
            }
        }

        // Check robots meta tags: "nofollow" or "none" keeps the page but
        // skips link extraction.
        $metarobots = "nofollow";
        if (checkmetarobots($metarobots)) {
            echo "<br>Following disallowed by robots meta tag: $url\n";
            continue;
        }
        $metarobots = "none";
        if (checkmetarobots($metarobots)) {
            echo "<br>Following disallowed by robots meta tag: $url\n";
            continue;
        }

        // Parse the main URL
        $top = parse_url($url);
        $tschm = $top["scheme"];
        $thost = $top["host"];
        $tpath = $top["path"];
        $tqury = $top["query"];
        $tfrag = $top["fragment"];
        $currentdomain = $tschm . "://" . $thost;
        $getbot = $currentdomain . "/robots.txt";
        // Load the host's robots.txt as an array of lines (false if unavailable).
        $robotay = @file($getbot);

        // Parse all the links on the page. $temp is the unread remainder of the
        // page; $rtemp becomes false once no further href is found, which ends the loop.
        $rtemp = stristr($read, "<");
        $temp = stristr($rtemp, "a");
        while ($rtemp) {
            // Parse the href out of the string: take the text between the next
            // pair of double quotes following "href".
            $rtemp = stristr($temp, "href");
            $rtemp = stristr($rtemp, '"');
            $rtemp = substr($rtemp, 1);
            $lpos = strpos($rtemp, '"');
            $link = substr($rtemp, 0, $lpos);
            // Advance the cursor to the next tag before any `continue` below.
            $temp = stristr($rtemp, "<");
            $link = trim($link);

            // Kill any trailing slashes
            if (substr($link, -1) == "/") {
                $link = substr($link, 0, -1);
            }

            // If it just won't fit.
            if (strlen($link) > 255) {
                continue;
            }
            if (checkforgarbage()) {
                continue;
            }

            // Parse the current link
            $bot = @parse_url($link);
            if (!$bot || $bot == "") {
                continue;
            }
            $bschm = $bot["scheme"];
            $bhost = trim(urldecode($bot["host"]));
            $bpath = trim(urldecode($bot["path"]));
            $bqury = $bot["query"];
            $bfrag = $bot["fragment"];

            // Get rid of outside links
            if ($bhost != "" && $bhost != $thost) {
                continue;
            }

            // Kill off any fragment based URLs
            if (strlen($bfrag) > 0) {
                continue;
            }

            // Kill off any dot dots ../../ and dots ././
            // (the segments are simply removed, not resolved against the parent)
            $ddotcheck = substr_count($bpath, "../");
            if ($ddotcheck > 0) {
                $bpath = str_replace("/../", "/", $bpath);
                $bpath = str_replace("../", "/", $bpath);
            }
            $dotcheck = substr_count($bpath, "./");
            if ($dotcheck > 0) {
                $bpath = str_replace("/./", "/", $bpath);
                $bpath = str_replace("./", "/", $bpath);
            }

            // Comparative analysis: make a relative path absolute using the
            // current page's path.
            if ($bpath != "" && substr($bpath, 0, 1) != "/") {
                // Page path has no dot: treat it as a directory.
                if (strrpos($tpath, ".") === false) {
                    $bpath = $tpath . "/" . $bpath;
                }
                // Page path has a dot: treat it as a file and use its directory.
                if (strrpos($tpath, ".")) {
                    $ttmp = substr($tpath, 0, (strrpos($tpath, "/") + 1));
                    $bpath = $ttmp . $bpath;
                    if (substr($bpath, 0, 1) != "/") {
                        $bpath = "/" . $bpath;
                    }
                }
            }

            // Kill any trailing slashes
            $link = trim($link);
            if (substr($link, -1) == "/") {
                $link = substr($link, 0, -1);
            }

            // Check to see if the scheme and domain are in the url; if not,
            // rebuild the link from the current page's scheme and host.
            if ($bhost == "") {
                $link = $thost . $bpath;
                $link = str_replace(" ", "", $link);
                $link = str_replace("//", "/", $link);
                $link = $tschm . "://" . $link;
            }
            $link = urldecode($link);

            // Kill off any remaining query strings
            $kilqu = strpos($link, "?");
            if ($kilqu > 0 || $kilqu != "") {
                $link = substr($link, 0, $kilqu);
                $link = trim(str_replace("?", "", $link));
            }

            // Format the link for inclusion and to avoid stupid looping
            $link = trim(strtolower($link));

            // Kill any trailing slashes
            if (substr($link, -1) == "/") {
                $link = substr($link, 0, -1);
            }

            // Don't be overly recursive
            if ($link == $currentdomain) {
                continue;
            }

            // If it's a useless link, kill it
            if ($link == "") {
                continue;
            }

            // Execute robots exclusion standard via robots.txt
            if (checkrobotstxt()) {
                echo "\nDisallowed by robots.txt: $link\n\n";
                continue;
            }

            // Finish it off and prep for the next loop
            if (!checkandupdatetoindexer()) {
                continue;
            }
        }

        // Take the new URLs and put them in the search database, or finish if there are no more
        $movem = mysql_query("SELECT url FROM indexer");
        while ($mvrslt = mysql_fetch_array($movem)) {
            $murl = $mvrslt["url"];
            // Links originate from crawled pages: escape before embedding in SQL.
            $murl_sql = mysql_real_escape_string($murl);
            $putem = mysql_unbuffered_query("INSERT INTO search SET url='$murl_sql'");
        }
        // The result set above is buffered, so the staging table can be emptied
        // once here instead of once per fetched row.
        $kill = mysql_unbuffered_query("DELETE FROM indexer");

        // Remember this URL for the duplicate check at the top of the loop.
        $ourl = $url;
    }

    // Keep looping while rows with checksum '0' remain in `search`.
    $preloop = mysql_fetch_row(mysql_unbuffered_query("SELECT COUNT(checksum) AS count FROM search WHERE checksum='0'"));
    $loopcount = $preloop[0];
} while ($loopcount > 0);
// Wrap-up: every row that was marked flag='1' during this run is set back to
// flag='0', which is the value the crawl loop selects on.
$done = mysql_unbuffered_query("UPDATE search SET flag='0' WHERE flag='1'");

// Ask MySQL to optimize both tables used by the spider.
print "\n\nOptimizing Database...";
$cleans = mysql_query("OPTIMIZE TABLE search");
$cleani = mysql_query("OPTIMIZE TABLE indexer");
print " Done.\n\n";

// Report the start and end time of the run.
$endtime = date("H:i m/d/y");
print "\n\n##### Spider started at " . $starttime . ", finished at " . $endtime . ". #####\n"
    . "##### You Can Now Close This Console #####\n";