EL PROBLEMA QUE TENGO, ES QUE CUANDO QUIERO ACCEDER A LA SIGUIENTE PAGINA:
http://vimeo.com/ajax/user/home_videos?&jdata={"page":'+str(k)+'}'
me tira una serie de errores, que con otra pagina no me deja, la idea del programa es hacer un crawler, y que los resultados los escriba en un xml.
directamente no me deja bajar el html de la pagina.
los errores que me tira los dejo debajo del porgrama.
import re
import urllib.request
import datetime
class Links():
def __init__(self,url):
self.__direccion=url
def setter(self,url):
self.__direccion=url
def adhtml(self):
f= urllib.request.urlopen(self.__direccion)
a=f.read()
return a
def buscaVideo(self, html):
try:
Exprecion = re.compile('<a\s*href=[\'|"](.*?)[\'|"]')
finalVideo=[]
ec = Exprecion.findall(str(html))
for i in range(0,len(ec)):
if (re.match("[0-9]*",ec[i])):
finalVideo.append(ec[i])
except:
print("ocurrio un error")
return (finalVideo)
def buscaDescripcion(self, html):
Exprecion = re.compile('<div id="description">(.*?)<br><br>(.*?)<br><br>(.*?)<b r><br>(.*?)<br><br>(.*?)<br><br>(.*?) </div>')
ec = Exprecion.findall(str(html))
return (ec)
def buscaRate(self,html):
Exprecion = re.compile('<td>(.*?)</td>')
ec = Exprecion.findall(str(html))
return (ec)
def buscaTitulo(self,html):
Exprecion = re.compile('<div class="title">(.*?)</div>')
ec = Exprecion.findall(str(html))
return (ec)
def buscaCategoria(self,html):
Exprecion = re.compile('<p id="eow-category"><a href="/.*">(.*?)</a></p>')
ec = Exprecion.findall(str(html))
return (ec)
if __name__ == '__main__':
url='vimeo.com/ajax/user/home_videos?&jdata={"page":1}'
link=Links(url)
fecha=datetime.date.today()
dia=(str(fecha))
print(fecha)
loco=open("videosVimeo.txt","w")
loco.write("<?xml>\n")
loco.write('<list origin="vimeo" date="'+ dia +'">\n')
for k in range(1,35):
url='http://vimeo.com/ajax/user/home_videos?&jdata={"page":'+k+'}'
print(url)
print (url.__repr__())
link.setter(url)
html=link.adhtml()
arrayVideo=link.buscaVideo(html)
s=set(arrayVideo)
for h in s:
link.setter("http://www.vimeo.com"+h)
html=link.adhtml()
a=link.buscaDescripcion(html)
rate=link.buscaRate(html)
titulo=link.buscaTitulo(html)
loco.write('\t<item>\n')
loco.write('\t\t <title>'+ titulo[0] + '</title>\n')
loco.write('\t\t<refer>http://www.vimeo.com"'+h+'"</refer>\n')
loco.write('\t\t<rate>'+ rate[1] +'</rate>\n')
loco.write('\t\t<description>' +a[2]+ '</description>\n')
#loco.write('\t\t<category>'+categoria[0]+'</category>\n')
loco.write('\t</item>\n')
loco.close()
print("Aca salen los videos")
for j in range(1,len(arrayVideo)):
print(arrayVideo[j])
--------------------------------------------------------------------------------------------------------Traceback (most recent call last):
File "C:\Users\carta\Desktop\kavesa\PeN DRIVE\paquete\Spider vimeo\linksvimeo.py", line 122, in <module>
html=link.adhtml()
File "C:\Users\carta\Desktop\kavesa\PeN DRIVE\paquete\Spider vimeo\linksvimeo.py", line 13, in adhtml
f= urllib.request.urlopen(self.__direccion)
File "C:\Python31\lib\urllib\request.py", line 121, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python31\lib\urllib\request.py", line 355, in open
response = meth(req, response)
File "C:\Python31\lib\urllib\request.py", line 467, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python31\lib\urllib\request.py", line 393, in error
return self._call_chain(*args)
File "C:\Python31\lib\urllib\request.py", line 327, in _call_chain
result = func(*args)
File "C:\Python31\lib\urllib\request.py", line 475, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden