¿Qué debería cambiarle?
Código PHP:
<?php
// A crawl can run far longer than the default execution limit, so lift it.
set_time_limit( 0 );
/**
 * Small recursive link crawler.
 *
 * Starting from one URL it downloads the page, extracts the href of every
 * <a> tag and follows each link that does not end in a banned extension and
 * has not been seen before. Every followed link is printed as it is visited
 * and collected in $crawled.
 *
 * Links are followed exactly as written in the page: relative hrefs are NOT
 * resolved against the page they were found on.
 */
class spider_man
{
    /** Number of links that may still be examined before the crawl stops. */
    public $limit;

    /** Not written by this class; kept so existing code that reads it keeps working. */
    public $cache;

    /** URLs visited so far, in visiting order, without duplicates. */
    public $crawled = array();

    /** Extensions (leading dot included) of links that are never followed. */
    public $banned_ext = array();

    /** URL the crawl was started from. */
    public $start;

    /**
     * Stores the settings and immediately crawls from $url.
     *
     * If the start page cannot be fetched, $crawled simply stays empty.
     *
     * @param string $url        Page to start crawling from.
     * @param array  $banned_ext Extensions such as ".css" that must not be followed.
     * @param int    $limit      Link budget; the crawl stops once it is used up.
     */
    public function __construct( $url, $banned_ext, $limit )
    {
        $this->start      = $url;
        $this->banned_ext = (array) $banned_ext;
        $this->limit      = (int) $limit;

        // _spider() already reports an unreachable page, so no separate
        // fopen() probe is needed (it fetched the page twice and leaked the handle).
        $this->_spider( $url );
    }

    /**
     * Fetches one page and recursively follows the links found in it.
     *
     * @param string $url Page to fetch; it is URL-decoded before use.
     * @return false|null false when the page could not be fetched or is empty,
     *                    null otherwise.
     */
    public function _spider( $url )
    {
        $url  = urldecode( $url );
        $page = @file_get_contents( $url );
        if ( $page === false || $page === '' ) {
            return false;
        }

        // The caller may already have recorded this URL before recursing.
        if ( !$this->is_crawled( $url ) ) {
            $this->crawled[] = $url;
        }

        // Capture group 1 is the quoted href value of each <a ...> tag.
        $found = preg_match_all(
            '/<a\s(?:[^>]*?\s)?href\s*=\s*["\']([^"\'>]+)["\']/i',
            $page,
            $links
        );
        unset( $page ); // the markup is not needed while recursing
        if ( !$found ) {
            return;
        }

        foreach ( $links[1] as $hyperlink ) {
            // "<= 0" rather than "== 0": the outer recursion levels keep
            // decrementing after an inner level has exhausted the budget.
            $this->limit--;
            if ( $this->limit <= 0 ) {
                return;
            }

            // hrefs are HTML-encoded in the markup (&amp; etc.).
            $link = trim( html_entity_decode( $hyperlink, ENT_QUOTES ) );
            $key  = urldecode( $link ); // same form _spider() records
            if ( $link === '' || !$this->is_valid_ext( $link ) || $this->is_crawled( $key ) ) {
                continue;
            }

            // Record before fetching so an unreachable link is not retried.
            $this->crawled[] = $key;
            // The link comes from a remote page: escape it before printing.
            echo 'Crawling ' . htmlspecialchars( $link, ENT_QUOTES ) . "<br />\n";
            $this->_spider( $link );
        }
    }

    /**
     * Tells whether a link may be followed, judged by its extension.
     *
     * The query string and fragment are ignored and the comparison is
     * case-insensitive, so "a.CSS" and "a.css?v=2" are both rejected when
     * ".css" is banned.
     *
     * @param string $url Link to check.
     * @return bool false when the link ends in a banned extension.
     */
    public function is_valid_ext( $url )
    {
        $path = strtolower( (string) preg_replace( '/[?#].*$/s', '', $url ) );
        foreach ( (array) $this->banned_ext as $ext ) {
            $ext = strtolower( (string) $ext );
            if ( $ext === '' ) {
                continue; // an empty extension would match every link
            }
            if ( substr( $path, -strlen( $ext ) ) === $ext ) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether a URL has already been recorded in $crawled.
     *
     * @param string $url URL in the same (URL-decoded) form it was recorded in.
     * @return bool
     */
    public function is_crawled( $url )
    {
        return in_array( $url, $this->crawled, true );
    }
}
// Extensions of resources that are not HTML pages and are therefore never followed.
$banned_ext = array(
".dtd",
".css",
".xml",
".js",
".gif",
".jpg",
".jpeg",
".bmp",
".ico",
".rdf",
".rss");
// Placeholder: replace this value with the URL of the page to extract links from.
$start_url = 'pagina a extraer link';
// Crawl from the start page, examining at most 100 links, then dump what was visited.
$spider = new spider_man( $start_url, $banned_ext, 100 );
print_r( $spider->crawled );
?>