Problema

alanfcarta · #1 (**permalink**) 02/11/2010, 12:59

EL PROBLEMA QUE TENGO, ES QUE CUANDO QUIERO ACCEDER A LA SIGUIENTE PAGINA:
http://vimeo.com/ajax/user/home_videos?&jdata={"page":'+str(k)+'}'

me tira una serie de errores, que con otra pagina no me deja, la idea del programa es hacer un crawler, y que los resultados los escriba en un xml.

directamente no me deja bajar el html de la pagina.

Los errores que me tira los dejo debajo del programa.

import re
import urllib.request
import datetime

class Links():
    """Download a page and pull video data out of its HTML with regexes.

    The instance holds one URL at a time; call setter() to retarget it and
    adhtml() to download it. The busca* methods take the downloaded HTML
    and return whatever their pattern captures.

    Every busca* method passes ``html`` through str() before matching, so a
    bytes body is matched against its repr (``b'...'``).
    NOTE(review): decoding the bytes first would be cleaner -- confirm the
    page encoding before changing this.
    """

    # Patterns are compiled once for the class instead of on every call.
    _RE_ENLACE = re.compile(r'<a\s*href=[\'|"](.*?)[\'|"]')
    # A video link is a bare numeric id, optionally preceded by "/".
    _RE_ID_VIDEO = re.compile(r'/?[0-9]+\Z')
    _RE_DESCRIPCION = re.compile(
        '<div id="description">(.*?)<br><br>(.*?)<br><br>(.*?)<br><br>'
        '(.*?)<br><br>(.*?)<br><br>(.*?) </div>')
    _RE_RATE = re.compile('<td>(.*?)</td>')
    _RE_TITULO = re.compile('<div class="title">(.*?)</div>')
    _RE_CATEGORIA = re.compile(
        '<p id="eow-category"><a href="/.*">(.*?)</a></p>')

    def __init__(self, url):
        """Remember ``url`` as the address adhtml() will download."""
        self.__direccion = url

    def setter(self, url):
        """Replace the address that the next adhtml() call downloads."""
        self.__direccion = url

    def adhtml(self):
        """Download the current address and return the raw response body.

        No custom headers are sent. The traceback quoted after this listing
        shows Vimeo answering this request with HTTP 403, which surfaces
        here as urllib.error.HTTPError.
        """
        f = urllib.request.urlopen(self.__direccion)
        try:
            return f.read()
        finally:
            # Release the connection even if read() fails.
            f.close()

    def buscaVideo(self, html):
        """Return the href values in ``html`` that look like video ids.

        The previous filter, ``re.match("[0-9]*", href)``, also matched the
        empty string and therefore accepted every link on the page.
        """
        enlaces = self._RE_ENLACE.findall(str(html))
        return [href for href in enlaces if self._RE_ID_VIDEO.match(href)]

    def buscaDescripcion(self, html):
        """Return one 6-tuple per description block found in ``html``.

        Each tuple holds the six segments separated by ``<br><br>``.
        """
        return self._RE_DESCRIPCION.findall(str(html))

    def buscaRate(self, html):
        """Return the contents of every ``<td>`` cell in ``html``."""
        return self._RE_RATE.findall(str(html))

    def buscaTitulo(self, html):
        """Return the contents of every ``<div class="title">`` in ``html``."""
        return self._RE_TITULO.findall(str(html))

    def buscaCategoria(self, html):
        """Return the link text inside ``<p id="eow-category">`` blocks."""
        return self._RE_CATEGORIA.findall(str(html))

if __name__ == '__main__':
    # Crawl the Vimeo home-video listing pages and write one <item> per
    # video link into videosVimeo.txt as a small XML document.
    url = 'http://vimeo.com/ajax/user/home_videos?&jdata={"page":1}'
    link = Links(url)
    fecha = datetime.date.today()
    dia = str(fecha)
    print(fecha)

    arrayVideo = []
    # "with" closes the output file even if a download or lookup fails.
    with open("videosVimeo.txt", "w") as loco:
        loco.write('<?xml version="1.0"?>\n')
        loco.write('<list origin="vimeo" date="' + dia + '">\n')
        for k in range(1, 35):
            # k is an int, so it must be converted before concatenation.
            url = ('http://vimeo.com/ajax/user/home_videos?&jdata={"page":'
                   + str(k) + '}')
            print(url)
            print(repr(url))

            link.setter(url)
            html = link.adhtml()
            arrayVideo = link.buscaVideo(html)

            # set() drops links that appear more than once on the page.
            for h in set(arrayVideo):
                link.setter("http://www.vimeo.com" + h)
                html = link.adhtml()
                a = link.buscaDescripcion(html)
                rate = link.buscaRate(html)
                titulo = link.buscaTitulo(html)

                loco.write('\t<item>\n')
                # Each field is written only when its pattern matched, so a
                # page with different markup no longer raises IndexError.
                if titulo:
                    loco.write('\t\t <title>' + titulo[0] + '</title>\n')
                loco.write('\t\t<refer>http://www.vimeo.com' + h + '</refer>\n')
                if len(rate) > 1:
                    loco.write('\t\t<rate>' + rate[1] + '</rate>\n')
                # buscaDescripcion returns 6-tuples, so the text is a tuple
                # member. NOTE(review): taking the third segment of the
                # first match is an assumption about what a[2] meant.
                if a and len(a[0]) > 2:
                    loco.write('\t\t<description>' + a[0][2] + '</description>\n')
                # No <category> is written; buscaCategoria() is not called.
                loco.write('\t</item>\n')
        loco.write('</list>\n')

    print("Aca salen los videos")
    # Lists the links of the last page fetched, starting at the first one.
    for video in arrayVideo:
        print(video)

--------------------------------------------------------------------------------------------------------
Traceback (most recent call last):
File "C:\Users\carta\Desktop\kavesa\PeN DRIVE\paquete\Spider vimeo\linksvimeo.py", line 122, in <module>
html=link.adhtml()
File "C:\Users\carta\Desktop\kavesa\PeN DRIVE\paquete\Spider vimeo\linksvimeo.py", line 13, in adhtml
f= urllib.request.urlopen(self.__direccion)
File "C:\Python31\lib\urllib\request.py", line 121, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python31\lib\urllib\request.py", line 355, in open
response = meth(req, response)
File "C:\Python31\lib\urllib\request.py", line 467, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python31\lib\urllib\request.py", line 393, in error
return self._call_chain(*args)
File "C:\Python31\lib\urllib\request.py", line 327, in _call_chain
result = func(*args)
File "C:\Python31\lib\urllib\request.py", line 475, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden

razpeitia · #2 (**permalink**) 02/11/2010, 13:39

No uses expresiones regulares para parsear HTML o XML; en vez de eso, usa un parser
como lxml o BeautifulSoup.

Al parecer no es tan fácil hacer un spider. Lo probé con Python 2.6 y me marcó esto.

Código HTML:

Ver original
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
    <title>Vimeo / 403 Forbidden</title>
        
    <style type="text/css">
        body {
            background:red;
            font-family:arial,san-serif;
            font-size:18px;
            font-weight:normal;
            color:white;
            margin:75px;
        }
    </style>
</head>
<body>
    <p><h1>You are blocked from Vimeo</h1></p>
    <p>The connection you are using has been blocked from communicating with Vimeo's servers. This ban will never be lifted.</p>
    <div style="display:none">1288726562</div>
    <p>If you are human and think this is an error, please <a href="mailto:[email protected]?body=I have been banned. My IP is x.x.x.x and my browser is Python-urllib/1.17">click here</a>.</p>
    <br />
    <p><em>"It's too bad she won't live. But then again, who does?"</em></p>
</body>
</html>

Al parecer también tienes que añadir algunos headers al request que estás haciendo.

alanfcarta · #3 (**permalink**) 02/11/2010, 13:42

Es Python 3... No sé si eso tiene que ver; me faltó aclararlo.

razpeitia · #4 (**permalink**) 02/11/2010, 14:46

Código Python:

Ver original
# Código en Python 2.6
import re
import urllib
import datetime
 
# Opener subclass whose only change is the `version` string. Per the
# Python 2 urllib documentation, `version` is the value the opener sends as
# its User-Agent; the blocked-page HTML quoted earlier in the thread shows
# the default being reported as "Python-urllib/1.17".
class AppURLopener(urllib.FancyURLopener):
    version = "App/1.7"
 
# Install the custom opener as urllib's module-level opener so that the
# urllib.urlopen() calls in this listing go through it.
urllib._urlopener = AppURLopener()
 
class Links():
    """Download a page and pull video data out of its HTML with regexes.

    The instance holds one URL at a time; call setter() to retarget it and
    adhtml() to download it. The busca* methods take the downloaded HTML
    and return whatever their pattern captures, after passing ``html``
    through str().
    """

    # Patterns are compiled once for the class instead of on every call.
    _RE_ENLACE = re.compile(r'<a\s*href=[\'|"](.*?)[\'|"]')
    # A video link is a bare numeric id, optionally preceded by "/".
    _RE_ID_VIDEO = re.compile(r'/?[0-9]+\Z')
    _RE_DESCRIPCION = re.compile(
        '<div id="description">(.*?)<br><br>(.*?)<br><br>(.*?)<br><br>'
        '(.*?)<br><br>(.*?)<br><br>(.*?) </div>')
    _RE_RATE = re.compile('<td>(.*?)</td>')
    _RE_TITULO = re.compile('<div class="title">(.*?)</div>')
    _RE_CATEGORIA = re.compile(
        '<p id="eow-category"><a href="/.*">(.*?)</a></p>')

    def __init__(self, url):
        """Remember ``url`` as the address adhtml() will download."""
        self.__direccion = url

    def setter(self, url):
        """Replace the address that the next adhtml() call downloads."""
        self.__direccion = url

    def adhtml(self):
        """Download the current address and return the response body.

        If the URL cannot be opened, a message naming it is printed and
        SystemExit is raised.
        """
        try:
            f = urllib.urlopen(self.__direccion)
        except IOError:
            # Build one string: under Python 2, print("a", b) prints a tuple.
            print("Fallo en " + str(self.__direccion))
            raise SystemExit
        try:
            return f.read()
        finally:
            # Release the connection even if read() fails.
            f.close()

    def buscaVideo(self, html):
        """Return the href values in ``html`` that look like video ids.

        The previous filter, ``re.match("[0-9]*", href)``, also matched the
        empty string and therefore accepted every link on the page.
        """
        enlaces = self._RE_ENLACE.findall(str(html))
        return [href for href in enlaces if self._RE_ID_VIDEO.match(href)]

    def buscaDescripcion(self, html):
        """Return one 6-tuple per description block found in ``html``.

        Each tuple holds the six segments separated by ``<br><br>``.
        """
        return self._RE_DESCRIPCION.findall(str(html))

    def buscaRate(self, html):
        """Return the contents of every ``<td>`` cell in ``html``."""
        return self._RE_RATE.findall(str(html))

    def buscaTitulo(self, html):
        """Return the contents of every ``<div class="title">`` in ``html``."""
        return self._RE_TITULO.findall(str(html))

    def buscaCategoria(self, html):
        """Return the link text inside ``<p id="eow-category">`` blocks."""
        return self._RE_CATEGORIA.findall(str(html))
 
# Crawl the Vimeo home-video listing pages and write one <item> per video
# link into videosVimeo.txt as a small XML document (Python 2.6 version).
url = 'http://vimeo.com/ajax/user/home_videos?&jdata={"page":1}'
link = Links(url)
fecha = datetime.date.today()
dia = str(fecha)
print(fecha)

arrayVideo = []
# "with" closes the output file even if a download or lookup fails.
with open("videosVimeo.txt", "w") as loco:
    loco.write('<?xml version="1.0"?>\n')
    loco.write('<list origin="vimeo" date="' + dia + '">\n')
    for k in range(1, 35):
        url = ('http://vimeo.com/ajax/user/home_videos?&jdata={"page":'
               + str(k) + '}')
        print(url)
        print(repr(url))
        link.setter(url)
        html = link.adhtml()
        # Dump the raw response for inspection.
        print(html)
        arrayVideo = link.buscaVideo(html)

        # set() drops links that appear more than once on the page.
        for h in set(arrayVideo):
            link.setter("http://www.vimeo.com" + h)
            html = link.adhtml()
            a = link.buscaDescripcion(html)
            rate = link.buscaRate(html)
            titulo = link.buscaTitulo(html)

            loco.write('\t<item>\n')
            # Each field is written only when its pattern matched, so a
            # page with different markup no longer raises IndexError.
            if titulo:
                loco.write('\t\t <title>' + titulo[0] + '</title>\n')
            loco.write('\t\t<refer>http://www.vimeo.com' + h + '</refer>\n')
            if len(rate) > 1:
                loco.write('\t\t<rate>' + rate[1] + '</rate>\n')
            # buscaDescripcion returns 6-tuples, so the text is a tuple
            # member. NOTE(review): taking the third segment of the first
            # match is an assumption about what a[2] meant.
            if a and len(a[0]) > 2:
                loco.write('\t\t<description>' + a[0][2] + '</description>\n')
            # No <category> is written; buscaCategoria() is not called.
            loco.write('\t</item>\n')
    loco.write('</list>\n')

print("Aca salen los videos")
# Lists the links of the last page fetched, starting at the first one.
for video in arrayVideo:
    print(video)

Al parecer, el urllib de Python 3.1 no hace por defecto muchas cosas que el de Python 2.6 sí hace.
Por cierto, tienes problemas con los índices de tus listas. Por eso, entre otras cosas, es mejor usar un parser.

Codigo en python 3.1

Código Python:

Ver original
import re
import urllib.request
import datetime
 
class Links():
    """Download a page and pull video data out of its HTML with regexes.

    The instance holds one URL at a time; call setter() to retarget it and
    adhtml() to download it. The busca* methods take the downloaded HTML
    and return whatever their pattern captures.

    Every busca* method passes ``html`` through str() before matching, so a
    bytes body is matched against its repr (``b'...'``).
    NOTE(review): decoding the bytes first would be cleaner -- confirm the
    page encoding before changing this.
    """

    # Browser-like identification; the thread shows Vimeo answering 403 to
    # urllib's default User-Agent.
    _USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

    # Patterns are compiled once for the class instead of on every call.
    _RE_ENLACE = re.compile(r'<a\s*href=[\'|"](.*?)[\'|"]')
    # A video link is a bare numeric id, optionally preceded by "/".
    _RE_ID_VIDEO = re.compile(r'/?[0-9]+\Z')
    _RE_DESCRIPCION = re.compile(
        '<div id="description">(.*?)<br><br>(.*?)<br><br>(.*?)<br><br>'
        '(.*?)<br><br>(.*?)<br><br>(.*?) </div>')
    _RE_RATE = re.compile('<td>(.*?)</td>')
    _RE_TITULO = re.compile('<div class="title">(.*?)</div>')
    _RE_CATEGORIA = re.compile(
        '<p id="eow-category"><a href="/.*">(.*?)</a></p>')

    def __init__(self, url):
        """Remember ``url`` as the address adhtml() will download."""
        self.__direccion = url

    def setter(self, url):
        """Replace the address that the next adhtml() call downloads."""
        self.__direccion = url

    def adhtml(self):
        """Download the current address and return the raw response body.

        The request carries a browser-like User-Agent header. HTTP errors
        propagate as urllib.error.HTTPError.
        """
        headers = {'User-Agent': self._USER_AGENT}
        # Request the address stored on the instance, not the module-level
        # ``url``; otherwise setter() has no effect on what is downloaded.
        # data=None keeps this a GET: an (empty) str body turns it into a
        # POST and is rejected with TypeError by current Python 3.
        req = urllib.request.Request(self.__direccion, None, headers)
        f = urllib.request.urlopen(req)
        try:
            return f.read()
        finally:
            # Release the connection even if read() fails.
            f.close()

    def buscaVideo(self, html):
        """Return the href values in ``html`` that look like video ids.

        The previous filter, ``re.match("[0-9]*", href)``, also matched the
        empty string and therefore accepted every link on the page.
        """
        enlaces = self._RE_ENLACE.findall(str(html))
        return [href for href in enlaces if self._RE_ID_VIDEO.match(href)]

    def buscaDescripcion(self, html):
        """Return one 6-tuple per description block found in ``html``.

        Each tuple holds the six segments separated by ``<br><br>``.
        """
        return self._RE_DESCRIPCION.findall(str(html))

    def buscaRate(self, html):
        """Return the contents of every ``<td>`` cell in ``html``."""
        return self._RE_RATE.findall(str(html))

    def buscaTitulo(self, html):
        """Return the contents of every ``<div class="title">`` in ``html``."""
        return self._RE_TITULO.findall(str(html))

    def buscaCategoria(self, html):
        """Return the link text inside ``<p id="eow-category">`` blocks."""
        return self._RE_CATEGORIA.findall(str(html))
 
# Crawl the Vimeo home-video listing pages and write one <item> per video
# link into videosVimeo.txt as a small XML document (Python 3.1 version).
url = 'http://vimeo.com/ajax/user/home_videos?&jdata={"page":1}'
link = Links(url)
fecha = datetime.date.today()
dia = str(fecha)
print(fecha)

arrayVideo = []
# "with" closes the output file even if a download or lookup fails.
with open("videosVimeo.txt", "w") as loco:
    loco.write('<?xml version="1.0"?>\n')
    loco.write('<list origin="vimeo" date="' + dia + '">\n')
    for k in range(1, 35):
        url = ('http://vimeo.com/ajax/user/home_videos?&jdata={"page":'
               + str(k) + '}')
        print(url)
        print(repr(url))
        link.setter(url)
        html = link.adhtml()
        arrayVideo = link.buscaVideo(html)

        # set() drops links that appear more than once on the page.
        for h in set(arrayVideo):
            link.setter("http://www.vimeo.com" + h)
            html = link.adhtml()
            a = link.buscaDescripcion(html)
            rate = link.buscaRate(html)
            titulo = link.buscaTitulo(html)

            loco.write('\t<item>\n')
            if titulo:
                loco.write('\t\t <title>' + titulo[0] + '</title>\n')
            loco.write('\t\t<refer>http://www.vimeo.com' + h + '</refer>\n')
            # Each guard checks the list it indexes; the earlier version
            # tested len(titulo) before reading rate[1] and a[2].
            if len(rate) > 1:
                loco.write('\t\t<rate>' + rate[1] + '</rate>\n')
            # buscaDescripcion returns 6-tuples, so the text is a tuple
            # member. NOTE(review): taking the third segment of the first
            # match is an assumption about what a[2] meant.
            if a and len(a[0]) > 2:
                loco.write('\t\t<description>' + a[0][2] + '</description>\n')
            # No <category> is written; buscaCategoria() is not called.
            loco.write('\t</item>\n')
    loco.write('</list>\n')

print("Aca salen los videos")
# Lists the links of the last page fetched, starting at the first one.
for video in arrayVideo:
    print(video)

No sé si está bien la indentación.

alanfcarta · #5 (**permalink**) 02/11/2010, 15:31

Muchas gracias, ahora por lo menos entra y baja el HTML.

Soy nuevo en esto... y no tengo ni idea de qué hiciste acá:

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {}
headers = {'User-Agent':user_agent}
data = urllib.parse.urlencode(values)
req = urllib.request.Request(url, data, headers)

razpeitia · #6 (**permalink**) 02/11/2010, 15:33

Bueno, solo le añadí el header de User-Agent para que pudiera entrar.

alanfcarta · #7 (**permalink**) 02/11/2010, 18:40

¡Realmente te agradezco muchísimo tu ayuda!

Gracias, gracias, gracias y mil gracias más.