Hola, hice la siguiente clase para poder obtener los metas, el título y el texto sin tags HTML de una web.
¡Espero que les sirva!
Más explicaciones en mi blog:
http://eugeniofage.wordpress.com/2008/08/28/clase-spider-para-obtener-metas-y-contenido-sin-html-tags-de-una-web/
Código PHP:
//@author Eugenio Fage
/**
 * Stateless helpers for downloading a web page and extracting its title,
 * its <meta name="..." content="..."> pairs and its visible text.
 *
 * The class is abstract and every method is static: it is meant to be used
 * as Spider::getWebFull($url), never instantiated. Calling a static method
 * through an instance of a subclass ($obj->getTitle(...)) keeps working.
 */
abstract class Spider
{
    /**
     * Downloads $url and returns its title, metas and plain text.
     *
     * @param string $url Address of the page to fetch.
     *
     * @return array Empty array when the download failed or the body was
     *               empty; otherwise an array with the keys 'title'
     *               (string), 'metas' (array) and 'text' (string).
     */
    public static function getWebFull($url)
    {
        $htmlCode = self::getWebCode($url);

        // getWebCode() yields false on failure; treat that like an empty page.
        if ($htmlCode === false || $htmlCode === null || $htmlCode === '') {
            return array();
        }

        return array(
            'title' => self::getTitle($htmlCode),
            'metas' => self::getMetas($htmlCode),
            'text'  => self::justText($htmlCode),
        );
    }

    /**
     * Fetches the raw body of $url with cURL, following redirects.
     *
     * @todo Fall back to fsockopen when the cURL functions are unavailable.
     *
     * @param string $url Address of the page to fetch.
     *
     * @return string|false The response body, or false when cURL is missing,
     *                      cannot be initialised, or the transfer fails.
     */
    public static function getWebCode($url)
    {
        if (!function_exists('curl_init')) {
            return false;
        }

        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");

        $data = curl_exec($ch);
        curl_close($ch);

        return $data;
    }

    /**
     * Extracts the text of the first <title> element.
     *
     * @todo Handle accented characters without the multibyte functions.
     *
     * @param string      $html    HTML source to inspect.
     * @param string|null $charset Currently unused; kept for interface
     *                             compatibility.
     *
     * @return string The title with inner tags stripped, cut to at most 110
     *                bytes (byte-wise, so a multibyte character at the cut
     *                point may be split); '' when there is no <title>.
     */
    public static function getTitle($html, $charset = null)
    {
        $found = array();

        // Lazy quantifier + "s" modifier: stop at the first </title> and
        // accept titles that span several lines.
        if (preg_match('@<title(?:\s[^>]*)?>(.*?)</title>@is', (string) $html, $found) !== 1) {
            return '';
        }

        return (string) substr(strip_tags($found[1]), 0, 110);
    }

    /**
     * Collects <meta> name/content pairs, in either attribute order.
     *
     * Only double-quoted attributes separated by a single whitespace
     * character are recognised. Names are lower-cased. When a name is
     * repeated, the first occurrence in the document wins within one
     * attribute order, and content-first tags override name-first tags.
     *
     * @todo Handle accented characters without the multibyte functions.
     *
     * @param string      $html    HTML source to inspect.
     * @param string|null $charset Currently unused; kept for interface
     *                             compatibility.
     *
     * @return array Map of lower-cased meta name => content; empty array
     *               when nothing matched.
     */
    public static function getMetas($html, $charset = null)
    {
        $metas = array();

        // pattern => array(index of the name group, index of the content group).
        // [^"]* keeps each capture inside its own attribute value, so several
        // <meta> tags on one line are not swallowed by a single match.
        $patterns = array(
            '@meta\sname="([^"]*)"\scontent="([^"]*)"[ /]*>@i' => array(1, 2),
            '@meta\scontent="([^"]*)"\sname="([^"]*)"[ /]*>@i' => array(2, 1),
        );

        foreach ($patterns as $pattern => $groups) {
            $found = array();
            if (!preg_match_all($pattern, (string) $html, $found, PREG_SET_ORDER)) {
                continue;
            }

            // Walk backwards so that the earliest tag is assigned last and
            // therefore wins when a name is repeated.
            for ($i = count($found) - 1; $i >= 0; $i--) {
                $name = strtolower($found[$i][$groups[0]]);
                if ($name === '') {
                    continue;
                }
                $metas[$name] = $found[$i][$groups[1]];
            }
        }

        return $metas;
    }

    /**
     * Reduces an HTML document to its visible text.
     *
     * Comments, <script> and <style> blocks and all tags are removed,
     * entities are decoded, and tabs, line breaks, non-breaking spaces and
     * runs of spaces are collapsed into single spaces. The result is not
     * trimmed.
     *
     * @todo Handle accented characters without the multibyte functions.
     *
     * @param string      $html    HTML source to clean.
     * @param string|null $charset Currently unused; kept for interface
     *                             compatibility.
     *
     * @return string At most the first 1500 bytes of the extracted text
     *                (byte-wise cut; a multibyte character at the boundary
     *                may be split).
     */
    public static function justText($html, $charset = null)
    {
        $text = (string) $html;

        // Pad every '>' so words separated only by a tag do not get glued.
        $text = str_replace('>', '> ', $text);

        $invisible = array(
            '@<!--.*?-->@si',
            '@<script[^>]*?>.*?</script>@si',
            '@<style[^>]*?>.*?</style>@si',
        );
        $text = self::replaceOrKeep($invisible, ' ', $text);
        $text = self::replaceOrKeep('@<.*?>@si', ' ', $text);

        // Unbalanced angle brackets that survived the tag removal.
        $text = str_replace(array('<', '>'), ' ', $text);

        $text = html_entity_decode(strip_tags($text));

        // Decoding may have produced new angle brackets (&lt; / &gt;).
        $blanks = array('<', '>', "\t", chr(13), chr(10));

        // Non-breaking space: in valid UTF-8 it is the pair C2 A0; replacing
        // a bare A0 byte there would corrupt characters such as "à" (C3 A0).
        // Only for non-UTF-8 (single-byte) input is the lone byte 160 used.
        if (preg_match('//u', $text) === 1) {
            $blanks[] = "\xC2\xA0";
        } else {
            $blanks[] = chr(160);
        }
        $text = str_replace($blanks, ' ', $text);

        // Collapse runs of spaces in one pass.
        $text = self::replaceOrKeep('@ {2,}@', ' ', $text);

        return (string) substr($text, 0, 1500);
    }

    /**
     * preg_replace() wrapper that returns the subject untouched when PCRE
     * reports an error (preg_replace() returns null in that case).
     *
     * @param string|array $pattern     Pattern or list of patterns.
     * @param string       $replacement Replacement text.
     * @param string       $subject     Text to process.
     *
     * @return string
     */
    private static function replaceOrKeep($pattern, $replacement, $subject)
    {
        $result = preg_replace($pattern, $replacement, $subject);

        return $result === null ? $subject : $result;
    }
}