Código PHP:
////// Spider Functions //////
/**
 * Queue the global $link for indexing if it is new and inside a known directory.
 *
 * The link is trimmed and a single trailing "/" is removed (the global $link
 * is updated in place). The link is rejected when it already appears in the
 * `search` table (whose `url` column is url-decoded before comparing) or in
 * the `indexer` table. Otherwise it is inserted into `indexer` when it starts
 * with any `link` value from the `directory` table.
 *
 * Uses the default mysql connection; the connection itself is opened
 * elsewhere (not visible in this function).
 *
 * @return bool TRUE when the link was inserted into `indexer`, FALSE when it
 *              is empty, already known, outside every directory prefix, or a
 *              query failed.
 */
function checkandupdatetoindexer(){
    global $link;

    // Normalise: trim and drop one trailing slash so equal URLs compare equal.
    $link = trim((string)$link);
    if(substr($link, -1) === "/"){
        $link = (string)substr($link, 0, -1);
    }
    if($link === ""){
        // Nothing to index; also avoids matching every directory prefix.
        return FALSE;
    }

    // Duplicate checks. The flag says whether the stored url must be
    // url-decoded before comparing (the original did so for `search` only).
    $lookups = array(
        "SELECT url FROM search" => TRUE,
        "SELECT url FROM indexer" => FALSE,
    );
    foreach($lookups as $sql => $decode){
        // Buffered queries: an unbuffered result must be fully fetched before
        // the next query on the same connection, which the early exits here
        // would violate.
        $chk = mysql_query($sql);
        if($chk === FALSE){
            // Cannot verify duplicates, so do not add the link.
            return FALSE;
        }
        $known = FALSE;
        while($curec = mysql_fetch_array($chk)){
            $stored = $decode ? urldecode($curec["url"]) : $curec["url"];
            if(trim($stored) === $link){
                $known = TRUE;
                break;
            }
        }
        mysql_free_result($chk);
        if($known){
            return FALSE;
        }
    }

    // The link must begin with one of the directory prefixes.
    $chk = mysql_query("SELECT link FROM directory");
    if($chk === FALSE){
        return FALSE;
    }
    $allowed = FALSE;
    while($curec = mysql_fetch_array($chk)){
        $prefix = (string)$curec["link"];
        if(strncmp($link, $prefix, strlen($prefix)) === 0){
            $allowed = TRUE;
            break;
        }
    }
    // Release the result set before issuing the INSERT.
    mysql_free_result($chk);
    if(!$allowed){
        return FALSE;
    }

    // The link comes from crawled pages: escape it for HTML and for SQL.
    echo "<br><b>Adding: " . htmlspecialchars($link, ENT_QUOTES) . " \n</b>";
    $putup = mysql_query("INSERT INTO indexer SET url='" . mysql_real_escape_string($link) . "'");
    return $putup !== FALSE;
}
/**
 * Decide whether the global $link should be skipped by the spider.
 *
 * A link is garbage when it contains one of a few marker strings anywhere
 * ("&?", "@", "javascript", "mailto"), when it cannot be parsed as a URL, or
 * when the file extension of its path is on the skip list below.
 *
 * The extension is taken from the URL path only. The earlier substring test
 * on the whole URL treated ".c" as a match for every ".com" host (and ".au"
 * for every ".com.au" host), so almost every link was rejected.
 *
 * @return bool TRUE when the link should be skipped, FALSE otherwise.
 */
function checkforgarbage(){
    global $link;

    $lower = strtolower(trim((string)$link));

    // Markers that disqualify the link wherever they appear.
    $markers = array("&?", "@", "javascript", "mailto");
    foreach($markers as $marker){
        if(strpos($lower, $marker) !== FALSE){
            return TRUE;
        }
    }

    // Only the path decides the file type; host and query string are ignored.
    $path = parse_url($lower, PHP_URL_PATH);
    if($path === FALSE){
        // parse_url() gives FALSE for seriously malformed URLs.
        return TRUE;
    }
    if($path === NULL || $path === ""){
        return FALSE;
    }

    // Skip list. "docx" and "xlsx" are listed explicitly because the old
    // substring match on ".doc" / ".xls" used to catch them as well.
    $skip = array(
        "jpg", "gif", "pdf", "pnf", "png",
        "mpg", "mov", "mpeg", "avi", "mp3", "wav", "rm", "au",
        "zip", "tar", "gz", "tgz", "exe", "bin", "iso", "dll", "sys",
        "css", "xls", "xlsx", "doc", "docx", "dbf", "c",
    );

    $extension = pathinfo($path, PATHINFO_EXTENSION);
    return in_array($extension, $skip, TRUE);
}
/**
 * Report whether the global $metarobots string occurs inside the global
 * $robots string.
 *
 * An empty $metarobots never matches. Passing it to substr_count() as the
 * needle raised a warning on older PHP and throws a ValueError on PHP 8.
 * NOTE(review): which of the two globals holds the page's meta tag and which
 * the directive being looked for is not visible here; confirm with the caller.
 *
 * @return bool TRUE when $metarobots is non-empty and found in $robots.
 */
function checkmetarobots(){
    global $robots, $metarobots;

    $needle = (string)$metarobots;
    if($needle === ""){
        return FALSE;
    }
    return strpos((string)$robots, $needle) !== FALSE;
}
/**
 * Check the global $link against the robots.txt lines held in $robotay.
 *
 * $robotay is expected to be an array with one robots.txt line per element.
 * "Disallow:" rules are honoured when the current record names "*" or the
 * global $spiderhost as its User-agent. Rules that appear before any
 * User-agent line are honoured too, as they were before this rewrite.
 *
 * Changes from the earlier implementation:
 *  - records for other crawlers are skipped instead of ending the scan, so a
 *    later "User-agent: *" record is still applied;
 *  - blank lines and "#" comments are skipped instead of ending the scan;
 *  - field names are matched case-insensitively;
 *  - a rule is matched as a prefix of the link's path (plus query), not as a
 *    substring of the whole URL, so "Disallow: /www" no longer blocks every
 *    "http://www..." link;
 *  - the loop no longer reads one element past the end of the array.
 *
 * NOTE(review): assumes $link is an absolute URL or a site-relative path;
 * confirm with the caller.
 *
 * @return bool TRUE when the link is disallowed, FALSE otherwise (including
 *              when no robots.txt data is available).
 */
function checkrobotstxt(){
    global $link, $robotay, $spiderhost;

    if(!is_array($robotay) || count($robotay) == 0){
        return FALSE;
    }

    // Build the part of the link that robots.txt rules are compared with.
    $target = "/";
    $parts = parse_url(trim((string)$link));
    if(is_array($parts)){
        if(isset($parts["path"]) && $parts["path"] !== ""){
            $target = $parts["path"];
        }
        if(isset($parts["query"])){
            $target .= "?" . $parts["query"];
        }
    }
    if(substr($target, 0, 1) !== "/"){
        $target = "/" . $target;
    }

    $applies = TRUE;        // does the current record address this spider?
    $prevwasagent = FALSE;  // consecutive User-agent lines share one record

    foreach($robotay as $curele){
        $curele = (string)$curele;

        // Drop trailing comments, then surrounding whitespace and newlines.
        $hash = strpos($curele, "#");
        if($hash !== FALSE){
            $curele = substr($curele, 0, $hash);
        }
        $curele = trim($curele);

        $thecolon = strpos($curele, ":");
        if($curele === "" || $thecolon === FALSE){
            $prevwasagent = FALSE;
            continue;
        }

        $field = strtolower(trim(substr($curele, 0, $thecolon)));
        $value = trim((string)substr($curele, $thecolon + 1));

        if($field === "user-agent"){
            $match = ($value === "*" || strcasecmp($value, (string)$spiderhost) === 0);
            // Several User-agent lines in a row belong to one record.
            $applies = $prevwasagent ? ($applies || $match) : $match;
            $prevwasagent = TRUE;
            continue;
        }
        $prevwasagent = FALSE;

        // An empty Disallow value means "nothing is disallowed".
        if($field === "disallow" && $applies && $value !== ""){
            if(strpos($target, $value) === 0){
                return TRUE;
            }
        }
    }
    return FALSE;
}
/**
 * Reduce the global $read (raw page HTML) to plain text in the global $body.
 *
 * Each <style>...</style> and <script>...</script> block is removed from
 * $read, whatever the letter case of the tag, and replaced by a space. The
 * remaining markup is then stripped and single quotes are turned into
 * backticks. If nothing is left, the row for the global $url is deleted from
 * the `search` table.
 *
 * Changes from the earlier implementation:
 *  - it cut $read off at each closing tag, which discarded all page content
 *    in front of the block and not only the block itself;
 *  - it lower-cased "Script>" / "Style>" only when the upper-case form was
 *    also present in the page.
 *
 * @return bool TRUE when $body holds text, FALSE when the page was empty and
 *              its `search` row was deleted.
 */
function hardcleanup(){
    global $body, $read, $url;

    $read = (string)$read;

    // Remove whole style and script blocks, case-insensitively.
    $patterns = array(
        '#<style\b[^>]*>.*?</style\s*>#is',
        '#<script\b[^>]*>.*?</script\s*>#is',
    );
    foreach($patterns as $pattern){
        $cleaned = preg_replace($pattern, " ", $read);
        // preg_replace() yields NULL on a PCRE error; keep the text we have.
        if($cleaned !== NULL){
            $read = $cleaned;
        }
    }

    // Get the page body
    $body = trim(strip_tags($read));
    $body = str_replace("'", "`", $body);

    // Make sure there's something left to work with
    if($body === ""){
        echo "Killing off empty file.";
        // $url originates from crawled data, so escape it for the query.
        $kill = mysql_query("DELETE FROM search WHERE url='" . mysql_real_escape_string((string)$url) . "'");
        return FALSE;
    }
    return TRUE;
}
?>