Foros del Web - Ver Mensaje Individual

xarmagedonx · #1 (**permalink**) 27/10/2012, 17:01

Hola compañeros de FDW!, estuve desarrollando un webcrawler y cuando pensé que lo había terminado surgió un error

Utilizo un loop recursivo para visitar una web, y si se cumplen ciertas condiciones comenzar la extracción, luego se vuelve a llamar una y otra ves a la función para que se repita hasta que ya no se cumpla alguna condición.

Pero eso no es lo peor, tenía una fecha para presentar este webcrawler en mi escuela, y yo creía que lo había terminado

Cuando pongo en funcionamiento el código solo extrae la pág principal y después no puede continuar debido al máximo de conecciones disponibles. ¿Qué puedo hacer para evitar repetir todo el tiempo y lograr consumir menos conecciones?

Les muestro el código completo:

Código PHP:

  <?php

error_reporting(E_ALL);

?>

<?php

$url = "http://www.web.com";

$patron = "http://www.web.com";

$prof = 1;

 
function storeLink($titulo,$descripcion,$url,$keywords) {

    $query = "INSERT INTO webs (webTitulo, webDescripcion, weburl, webkeywords) VALUES ('$titulo', '$descripcion', '$url', '$keywords')";

    mysql_query($query) or die('Error, falló la inserción de datos');

}

 
function extraer($url, $prof){

$server_link = mysql_connect("Serv", "User", "Pass"); 

 
if(!$server_link){ die("Fall&oacute; la Conexi&oacute;n ". mysql_error()); 

} 

$db_selected = mysql_select_db("DB", $server_link);

if(!$db_selected){ die("No se pudo seleccionar la Base de Datos ". mysql_error()); 

} 

 
$url = "http://www.web.com";

$patron = "http://www.web.com";

$prof = 1;

$userAgent = 'Interredu';

 
$ch = curl_init();

curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);

curl_setopt($ch, CURLOPT_URL,$url);

curl_setopt($ch, CURLOPT_HTTPHEADER, array(("Accept-Language: es-es,en")));

curl_setopt($ch, CURLOPT_FAILONERROR, true);

curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);

curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);

curl_setopt($ch, CURLOPT_MAXREDIRS, 2);

curl_setopt($ch, CURLOPT_AUTOREFERER, true);

curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);

$html= curl_exec($ch);

if (!$html) {

    echo "<br />cURL error number:" .curl_errno($ch);

    echo "<br />cURL error:" . curl_error($ch);

    exit;

}

 
$dom = new DOMDocument();

@$dom->loadHTML($html);

 
$xpath = new DOMXPath($dom);

$hrefs = $xpath->evaluate("/html/body//a");

$busqueda = mysql_query("SELECT weburl FROM webs WHERE weburl='$url'");

 
if($prof < 1 and mysql_num_rows($busqueda)==0 and strpos($url, $patron)!==FALSE){

    preg_match_all ("(<title>(.*)<\/title>)siU", $html, $title);

    preg_match_all ("(<meta name=\"description\" content=\"(.*)\"\/>)siU", $html, $description);

    preg_match_all ("(<meta name=\"keywords\" content=\"(.*)\"\/>)siU", $html, $keys);

    $titulo = $title[1][0];

    $descripcion = $description[1][0];

    $keywords = $keys[1][0];

    storeLink($titulo,$descripcion,$url,$keywords);

    }

for ($i = 0; $i < $hrefs->length; $i++) {

    $href = $hrefs->item($i);

    $url2 = $href->getAttribute('href');

    extraer($url2, $prof ++);

}

}

extraer($url, $prof);

?>

Saludos y gracias por su ayuda!!!!