Hola, encontré una clase que se supone que calcula el tamaño de las páginas. Aquí tenéis el código:
(Fichero class.webpagesize.php)
<?
/**
 * Estimates the download weight of a web page.
 *
 * The page is fetched, its HTML is scanned for referenced resources
 * (src/background attributes, stylesheets, url() references in CSS rules
 * whose class is used on the page, frames), and the size of every distinct
 * resource is measured.
 *
 * Typical use:
 *     $page = new WebpageSize();
 *     $page->setURL('http://example.com/');
 *     $page->printResult();
 *
 * All scanning is regex based and therefore best effort; it is not a full
 * HTML/CSS parser. Requires PHP 5.4 or newer.
 */
class WebpageSize
{
    /** @var string Page URL as normalised by parseURL(). */
    public $url = '';

    /** @var string URL that relative references are resolved against. */
    public $baseurl = '';

    /** @var string Last path segment of the page URL ('' for a directory). */
    public $tailfile = '';

    /** @var string Proxy host; empty string means "connect directly". */
    public $proxy = '';

    /** @var int Proxy port. */
    public $proxyport = 3128;

    /** @var array Unused by this class; kept for backward compatibility. */
    public $pages = [];

    /** @var array Map of resource URL => number of times the page references it. */
    public $freqpages = [];

    /** @var int Sum in bytes of all measured resources (filled by getResult()). */
    public $totalsize = 0;

    /** @var array CSS class names found in the page markup. */
    public $CSSclasses = [];

    /** @var array Internal: URLs of enclosing pages, used to stop frame cycles. */
    public $visited = [];

    /**
     * Sets the page to analyse.
     *
     * @param string $url Absolute page URL.
     */
    public function setURL($url)
    {
        $this->url = $this->parseURL($url);
    }

    /**
     * Normalises a URL and records its last path segment in $tailfile.
     *
     * The tail is taken from the path component only, so ports and slashes
     * inside the query string no longer confuse it. A URL without any path
     * (for example "http://host" or "http://host:8080") gets a trailing "/".
     *
     * @param string $url URL to normalise.
     * @return string The URL, with a trailing slash added when it has no path.
     */
    public function parseURL($url)
    {
        $parsed = parse_url($url);
        $path = (is_array($parsed) && isset($parsed['path'])) ? $parsed['path'] : '';

        $slash = strrpos($path, '/');
        $this->tailfile = ($slash === false) ? '' : (string) substr($path, $slash + 1);

        if ($this->tailfile !== '' || substr($path, -1) === '/') {
            return $url;
        }
        // Appending "/" after a query string or fragment would corrupt the URL.
        if (is_array($parsed) && (isset($parsed['query']) || isset($parsed['fragment']))) {
            return $url;
        }

        return $url . '/';
    }

    /**
     * Determines the base URL from a <base href="..."> tag, falling back to
     * the page URL when the markup has none.
     *
     * A relative base href is resolved against the page URL. $tailfile keeps
     * describing the page URL rather than being overwritten by the base.
     *
     * @param string $str HTML source of the page.
     */
    public function setBaseURL($str)
    {
        $this->baseurl = $this->url;

        if (preg_match('/<base\b[^>]*?\shref\s*=\s*["\']?([^"\'\s>]+)/i', (string) $str, $match)) {
            $pageTail = $this->tailfile;
            $this->baseurl = $this->parseURL($this->makeAbsolutePath($match[1], $this->url));
            $this->tailfile = $pageTail;
        }
    }

    /**
     * Sets the page to analyse and routes every request through a proxy.
     *
     * @param string $url   Absolute page URL.
     * @param string $proxy Proxy host name or address.
     * @param int    $port  Proxy port.
     */
    public function setURLviaProxy($url, $proxy, $port)
    {
        $this->setURL($url);
        $this->proxy = $proxy;
        $this->proxyport = $port;
    }

    /**
     * Measures the page and every resource it references.
     *
     * Totals and frequencies are reset on each call, so calling this method
     * (or printResult()) repeatedly no longer inflates the numbers. Sizes come
     * from a cURL HEAD request when cURL is available; when the server does
     * not report a length (cURL yields -1) the resource is downloaded and
     * measured instead. Resources that cannot be measured are skipped.
     *
     * @return array Map of resource URL => size in bytes, sorted by size.
     */
    public function getResult()
    {
        $this->totalsize = 0;
        $this->freqpages = [];

        $paths = $this->grabPageSources();
        array_unshift($paths, $this->url);

        $ch = function_exists('curl_init') ? curl_init() : false;
        if ($ch) {
            if ($this->proxy) {
                curl_setopt($ch, CURLOPT_PROXY, $this->proxy . ':' . $this->proxyport);
            }
            curl_setopt($ch, CURLOPT_HEADER, 1);
            curl_setopt($ch, CURLOPT_NOBODY, 1);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        }

        $pages = [];
        $failed = [];
        foreach ($paths as $path) {
            if (isset($pages[$path])) {
                $this->freqpages[$path] += 1;
                continue;
            }
            if (isset($failed[$path])) {
                continue; // already known to be unreachable; do not refetch
            }

            $filesize = 0;
            if ($ch) {
                curl_setopt($ch, CURLOPT_URL, $path);
                curl_exec($ch);
                $filesize = (int) curl_getinfo($ch, CURLINFO_CONTENT_LENGTH_DOWNLOAD);
            }
            if ($filesize <= 0) {
                // No usable Content-Length: download the body and measure it.
                $filesize = strlen($this->getContent($path));
            }
            if ($filesize <= 0) {
                $failed[$path] = true;
                continue;
            }

            $this->freqpages[$path] = 1;
            $pages[$path] = $filesize;
            $this->totalsize += $filesize;
        }

        // curl_close() is a no-op since PHP 8.0; only older versions need it.
        if ($ch && PHP_VERSION_ID < 80000) {
            curl_close($ch);
        }

        natsort($pages);

        return $pages;
    }

    /**
     * @return int Total size in bytes computed by the last getResult() call.
     */
    public function totalPageSize()
    {
        return $this->totalsize;
    }

    /**
     * Formats a byte count as kilobytes with two decimals.
     *
     * @param int|float $size Size in bytes.
     * @return string For example "2.00 KB".
     */
    public function readableSize($size)
    {
        return number_format($size / 1024, 2) . " KB";
    }

    /**
     * Runs the measurement and echoes the result as an HTML table.
     *
     * URLs come from the remote page, so they are HTML-escaped before output.
     */
    public function printResult()
    {
        $pages = $this->getResult();
        $total = $this->readableSize($this->totalPageSize());

        $strtable = '<table width="700" border="1">' .
            '<tr bgcolor=#F3F3F3><td width="360" colspan=2>' .
            '<div align="center">Webpage`s URL : ' .
            htmlspecialchars($this->url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '</div></td>' .
            '<td width="140" colspan=2>Size : ' . $total . '</td></tr>' .
            '<tr bgcolor=#F3F3F3><td width="24"><div align="center">#</div></td>' .
            '<td width="210">URL of Elements of Webpage</td>' .
            '<td width="86">Filesize</td>' .
            '<td width="32">Freq</td>' .
            '</tr>';

        $n = 0;
        foreach ($pages as $url => $size) {
            $strtable .= '<tr><td width=20>' . ++$n . '</td><td width=440>' .
                htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '</td>' .
                '<td width=100>' . $this->readableSize($size) . '</td>' .
                '<td width=40>' . $this->freqpages[$url] . '</td></tr>';
        }

        $strtable .= '<tr bgcolor=#F3F3F3><td> </td><td> Total Webpage Size</td><td colspan=2>' .
            $total . '</td></tr>';
        $strtable .= '</table>';

        echo $strtable;
    }

    /**
     * Downloads a resource, directly or through the configured proxy.
     *
     * @param string $url Absolute URL of the resource.
     * @return string Response body, or '' when it cannot be fetched.
     */
    public function getContent($url)
    {
        if ($this->proxy) {
            return $this->getContentProxy($url);
        }

        // Failure is reported through the '' return value, hence the "@".
        $file = @fopen($url, 'rb');
        if (!$file) {
            return '';
        }
        $buffer = stream_get_contents($file);
        fclose($file);

        return ($buffer === false) ? '' : $buffer;
    }

    /**
     * Downloads a resource through the configured HTTP proxy.
     *
     * @param string $url Absolute URL of the resource.
     * @return string Response body (headers stripped), or '' on failure.
     */
    public function getContentProxy($url)
    {
        $file = fsockopen($this->proxy, $this->proxyport);
        if (!$file) {
            return '';
        }

        // The Host header must name the target server, not the proxy.
        $parts = parse_url($url);
        $host = (is_array($parts) && isset($parts['host'])) ? $parts['host'] : '';
        if ($host !== '' && isset($parts['port'])) {
            $host .= ':' . $parts['port'];
        }

        fwrite($file, "GET $url HTTP/1.0\r\nHost: $host\r\nConnection: close\r\n\r\n");
        $buffer = stream_get_contents($file);
        fclose($file);

        if ($buffer === false) {
            return '';
        }
        $headerEnd = strpos($buffer, "\r\n\r\n");
        if ($headerEnd === false) {
            return ''; // malformed response: no header/body separator
        }

        return (string) substr($buffer, $headerEnd + 4);
    }

    /**
     * Fetches the page and collects the absolute URL of everything it loads.
     *
     * Side effects: sets $baseurl and $CSSclasses from the fetched markup.
     *
     * @return array List of absolute resource URLs (duplicates preserved so
     *               that getResult() can count frequencies).
     */
    public function grabPageSources()
    {
        $content = $this->getContent($this->url);
        $this->setBaseURL($content);

        // Must be known before any CSS is scanned: url() references are only
        // kept for rules whose class is actually used on the page.
        $this->CSSclasses = $this->searchCSSClasses($content);

        $pageSources = $this->searchSources($content);
        $inlineCssSources = $this->searchSourcesOnCSS($content);

        $cssLinks = $this->searchCSSLinks($content);
        $cssFileSources = empty($cssLinks) ? [] : $this->searchSourcesOnCSSFiles($cssLinks);

        $frames = $this->searchFrames($content);
        $frameSources = empty($frames) ? [] : $this->searchSourcesOnFrames($frames);

        return $this->resolvePathSources(array_merge(
            $pageSources,
            $inlineCssSources,
            $cssLinks,
            $cssFileSources,
            $frames,
            $frameSources
        ));
    }

    /**
     * Resolves collected references against the base URL.
     *
     * NOTE(review): the original source called this method without defining
     * it; this implementation is reconstructed from how the result is used
     * (getResult() hands every entry straight to cURL/fopen). Empty values,
     * fragment-only references and non-HTTP schemes (data:, javascript:,
     * file:, ...) are dropped because they are not separate HTTP downloads.
     *
     * @param array $sources Raw references as found in markup/CSS.
     * @return array List of absolute URLs.
     */
    public function resolvePathSources($sources)
    {
        $resolved = [];
        foreach ((array) $sources as $src) {
            $src = trim((string) $src);
            if ($src === '' || $src[0] === '#' || $this->hasUnsupportedScheme($src)) {
                continue;
            }
            $resolved[] = $this->makeAbsolutePath($src, $this->baseurl);
        }

        return $resolved;
    }

    /**
     * Tells whether a reference carries an explicit scheme other than http(s).
     *
     * @param string $src Reference to inspect.
     * @return bool
     */
    private function hasUnsupportedScheme($src)
    {
        if (!preg_match('/^([a-z][a-z0-9+.\-]*):/i', $src, $m)) {
            return false;
        }

        return !in_array(strtolower($m[1]), ['http', 'https'], true);
    }

    /**
     * Finds every src="..." / background="..." attribute value in the markup.
     *
     * @param string $str HTML source.
     * @return array List of attribute values, in document order.
     */
    public function searchSources($str)
    {
        preg_match_all(
            '/<[a-z][^>]*?\s(src|background)\s*=\s*["\']?([^"\'\s>]+)/i',
            (string) $str,
            $arr_source
        );

        return isset($arr_source[2]) ? $arr_source[2] : [];
    }

    /**
     * Collects the CSS class names used in class="..." attributes.
     *
     * Handles double-quoted, single-quoted and unquoted values, and splits
     * multi-class values such as class="a b" into separate names.
     *
     * @param string $str HTML source.
     * @return array List of class names (may contain duplicates).
     */
    public function searchCSSClasses($str)
    {
        $classes = [];
        $pattern = '/\sclass\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/i';
        if (preg_match_all($pattern, (string) $str, $matches)) {
            foreach ($matches[1] as $value) {
                foreach (preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY) as $name) {
                    $classes[] = $name;
                }
            }
        }

        return $classes;
    }

    /**
     * Finds the src of every <frame> and <iframe> tag.
     *
     * @param string $str HTML source.
     * @return array List of frame URLs as written in the markup.
     */
    public function searchFrames($str)
    {
        preg_match_all(
            '/<i?frame\b[^>]*?\ssrc\s*=\s*["\']?([^"\'\s>]+)/i',
            (string) $str,
            $arr_source
        );

        return isset($arr_source[1]) ? $arr_source[1] : [];
    }

    /**
     * Returns every url(...) reference in a CSS text, without class filtering.
     *
     * @param string $str CSS (or HTML containing CSS).
     * @return array List of referenced URLs with quotes stripped.
     */
    public function xsearchSourcesOnCSS($str)
    {
        preg_match_all('/url\(\s*["\']?([^"\')\s]+)/i', (string) $str, $arr_source);

        return isset($arr_source[1]) ? $arr_source[1] : [];
    }

    /**
     * Returns the url(...) references of CSS rules that apply to the page.
     *
     * A rule counts as applied when its selector mentions at least one class
     * listed in $CSSclasses. Rules without a class selector, and inline
     * style="" attributes, are ignored (as in the original implementation).
     *
     * @param string $str CSS text, or HTML containing <style> blocks.
     * @return array List of referenced URLs; always an array.
     */
    public function searchSourcesOnCSS($str)
    {
        $str = (string) $str;
        $arr_sources = [];
        $used = (array) $this->CSSclasses;

        if (!preg_match_all('/\{([^{}]*)\}/', $str, $blocks, PREG_OFFSET_CAPTURE)) {
            return $arr_sources;
        }

        foreach ($blocks[0] as $i => $block) {
            $body = $blocks[1][$i][0];
            if (!preg_match_all('/url\(\s*["\']?([^"\')\s]+)/i', $body, $urls)) {
                continue;
            }

            // The selector is the text between the previous delimiter and "{".
            // Only a bounded window is inspected to keep the scan linear.
            $offset = $block[1];
            $start = max(0, $offset - 2048);
            $head = (string) substr($str, $start, $offset - $start);
            $selectorLength = strcspn(strrev($head), '{}<>;');
            $selector = ($selectorLength > 0) ? substr($head, -$selectorLength) : '';

            if (!preg_match_all('/\.(-?[_a-zA-Z][_a-zA-Z0-9-]*)/', $selector, $classes)) {
                continue;
            }
            if (array_intersect($classes[1], $used)) {
                foreach ($urls[1] as $url) {
                    $arr_sources[] = $url;
                }
            }
        }

        return $arr_sources;
    }

    /**
     * Collects the resources loaded by framed documents.
     *
     * Each frame is analysed with its own WebpageSize instance. A frame that
     * points back at this page or at one of its ancestors is skipped so that
     * cyclic framesets cannot recurse forever.
     *
     * @param array $framefiles Frame URLs as returned by searchFrames().
     * @return array List of absolute resource URLs found inside the frames.
     */
    public function searchSourcesOnFrames($framefiles)
    {
        $arr_sources = [];
        $ancestors = $this->visited;
        $ancestors[$this->url] = true;

        foreach ((array) $framefiles as $src) {
            $src = trim((string) $src);
            if ($src === '' || $this->hasUnsupportedScheme($src)) {
                continue; // about:blank, javascript:..., etc.
            }

            $framepage = $this->makeAbsolutePath($src, $this->baseurl);
            $page = new WebpageSize();
            if ($this->proxy) {
                $page->setURLviaProxy($framepage, $this->proxy, $this->proxyport);
            } else {
                $page->setURL($framepage);
            }
            if (isset($ancestors[$page->url])) {
                continue;
            }

            $page->visited = $ancestors;
            $arr_sources = array_merge($arr_sources, $page->grabPageSources());
        }

        return $arr_sources;
    }

    /**
     * Downloads linked stylesheets and collects the resources they reference.
     *
     * url() values inside a stylesheet are relative to that stylesheet, so
     * they are resolved against the stylesheet's own absolute URL.
     *
     * @param array $cssfiles Stylesheet URLs as returned by searchCSSLinks().
     * @return array List of absolute resource URLs.
     */
    public function searchSourcesOnCSSFiles($cssfiles)
    {
        $arr_CSSlinks = [];
        foreach ((array) $cssfiles as $src) {
            $CSSpage = $this->makeAbsolutePath($src, $this->baseurl);
            $CSScontent = $this->getContent($CSSpage);
            foreach ($this->searchSourcesOnCSS($CSScontent) as $srclink) {
                $arr_CSSlinks[] = $this->makeAbsolutePath($srclink, $CSSpage);
            }
        }

        return $arr_CSSlinks;
    }

    /**
     * Finds the stylesheets linked from the page.
     *
     * A <link> is taken as a stylesheet when its rel contains "stylesheet" or
     * its href ends in ".css" (a query string or fragment may follow).
     *
     * @param string $str HTML source.
     * @return array List of href values, in document order.
     */
    public function searchCSSLinks($str)
    {
        $links = [];
        if (!preg_match_all('/<link\b[^>]*>/i', (string) $str, $tags)) {
            return $links;
        }

        foreach ($tags[0] as $tag) {
            if (!preg_match('/\shref\s*=\s*["\']?([^"\'\s>]+)/i', $tag, $href)) {
                continue;
            }
            $isStylesheet = preg_match('/\srel\s*=\s*["\']?[^"\'>]*\bstylesheet\b/i', $tag) === 1;
            $looksLikeCss = preg_match('/\.css(?:[?#]|$)/i', $href[1]) === 1;
            if ($isStylesheet || $looksLikeCss) {
                $links[] = $href[1];
            }
        }

        return $links;
    }

    /**
     * Resolves a reference against a base URL.
     *
     * Handles absolute URLs (returned unchanged), protocol-relative
     * ("//host/x"), root-relative ("/x"), and path-relative references
     * including "./" and "../" segments. The base's last path segment is
     * treated as a file unless the base ends in "/". $tailfile is no longer
     * consulted; the base directory is derived from $url itself.
     *
     * @param string $src Reference found in markup or CSS.
     * @param string $url Base URL to resolve against.
     * @return string Absolute URL.
     */
    public function makeAbsolutePath($src, $url)
    {
        $src = trim((string) $src);
        if ($src === '') {
            return $url;
        }
        // Anything with an explicit scheme is already absolute.
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $src)) {
            return $src;
        }

        // Query string and fragment of the base never take part in resolution.
        $base = preg_replace('/[?#].*$/s', '', (string) $url);

        // Split the base into "scheme://authority" and its path.
        $root = '';
        $scheme = '';
        $basePath = $base;
        if (preg_match('#^(?:([a-z][a-z0-9+.\-]*):)?//[^/]*#i', $base, $m)) {
            $root = $m[0];
            $scheme = isset($m[1]) ? $m[1] : '';
            $basePath = (string) substr($base, strlen($root));
        }

        if (substr($src, 0, 2) === '//') {
            return ($scheme !== '' ? $scheme . ':' : '') . $src;
        }

        // Keep the reference's own query/fragment out of dot-segment handling.
        $suffix = '';
        $cut = strcspn($src, '?#');
        if ($cut < strlen($src)) {
            $suffix = substr($src, $cut);
            $src = (string) substr($src, 0, $cut);
        }

        if ($src === '') {
            $path = $basePath;
        } elseif ($src[0] === '/') {
            $path = $src;
        } else {
            $slash = strrpos($basePath, '/');
            $dir = ($slash === false) ? '/' : substr($basePath, 0, $slash + 1);
            $path = $dir . $src;
        }

        // Collapse "." and ".." segments without climbing above the root.
        $segments = explode('/', $path);
        $last = end($segments);
        $out = [];
        foreach ($segments as $segment) {
            if ($segment === '..') {
                if (count($out) > 1) {
                    array_pop($out);
                }
            } elseif ($segment !== '.') {
                $out[] = $segment;
            }
        }
        if ($last === '.' || $last === '..') {
            $out[] = ''; // a trailing dot segment denotes a directory
        }

        return $root . implode('/', $out) . $suffix;
    }
}
?>