Estoy intentando crear un scrapper web y que una vez extraidos los datos los pase a un excel, el problema viene en que los datos me llegan en formato xml dentro de un array y no se como reemplazar los caracteres para que sean validos para introducirlos en excel. Se que el script se puede reducir en muchas menos linias, pero aun no tengo la experiencia.
He intentando aplicando month.replace("[","").replace("(","").replace(")","").replace(")", "")
Tambien he probado algo similar con re.sub y con lstrip, rstrip pero nada parece funcionar
Código Python:
Ver original
from lxml import html import requests from openpyxl import Workbook wb = Workbook() ws = wb.active def Volume( url ): tree = html.fromstring(requests.get(url).content) month = tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[1]/th/span/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[2]/th/span/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[3]/th/span/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[4]/th/span/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[5]/th/span/text()') tvol = tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[1]/td[4]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[2]/td[4]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[3]/td[4]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[4]/td[4]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[5]/td[4]/text()') atclose = tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[1]/td[11]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[2]/td[11]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[3]/td[11]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[4]/td[11]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[5]/td[11]/text()') change = tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[1]/td[12]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[2]/td[12]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[3]/td[12]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[4]/td[12]/text()'), tree.xpath('//*[@id="volumeDetailProductTable"]/tbody/tr[5]/td[12]/text()') if url == 'http://www.cmegroup.com/trading/agricultural/grain-and-oilseed/corn_quotes_volume_voi.html': ws['A2']=month[0] ws['A3']=month[1] ws['A4']=month[2] ws['A5']=month[3] ws['A6']=month[4] ws['B2']=tvol[0] ws['B3']=tvol[1] ws['B4']=tvol[2] ws['B5']=tvol[3] ws['B6']=tvol[4] ws['C2']=atclose[0] ws['C3']=atclose[1] ws['C4']=atclose[2] ws['C5']=atclose[3] ws['C6']=atclose[4] ws['D2']=change[0] ws['D3']=change[1] ws['D4']=change[2] ws['D5']=change[3] ws['D6']=change[4] elif url == 'http://www.cmegroup.com/trading/agricultural/grain-and-oilseed/wheat_quotes_volume_voi.html': ws['A8']=month[0] ws['A9']=month[1] ws['A10']=month[2] ws['A11']=month[3] ws['A12']=month[4] ws['B8']=tvol[0] ws['B9']=tvol[1] ws['B10']=tvol[2] ws['B11']=tvol[3] ws['B12']=tvol[4] ws['C8']=atclose[0] ws['C9']=atclose[1] ws['C10']=atclose[2] ws['C11']=atclose[3] ws['C12']=atclose[4] ws['D8']=change[0] ws['D9']=change[1] ws['D10']=change[2] ws['D11']=change[3] ws['D12']=change[4] return month[1],tvol[2],atclose[4],change[4] ws['A1']="Vencimiento" ws['B1']="Total Volumen" ws['C1']="Op. Int. At Close" ws['D1']="Op. Int. Change" print Volume('http://www.cmegroup.com/trading/agricultural/grain-and-oilseed/corn_quotes_volume_voi.html') print Volume('http://www.cmegroup.com/trading/agricultural/grain-and-oilseed/wheat_quotes_volume_voi.html') wb.save('balances.xlsx')
El output de ejecutar una vez la funcion Volume mostrando: month, tvol, atclose, change es el siguiente:
Código CONSOLA:
Ver original
((['MAR 16'], ['MAY 16'], ['JUL 16'], ['SEP 16'], ['DEC 16']), (['23,036'], ['144,696'], ['48,048'], ['18,267'], ['21,333']), (['16,103'], ['656,971'], ['287,740'], ['126,250'], ['175,102']), (['-7,746'], ['4,999'], ['3,182'], ['2,095'], ['2,315']))
El error en consola es el siguiente:
Código CONSOLA:
Ver original
zipus@zipus-linux ~/Escritorio $ python scrapper.py Traceback (most recent call last): File "scrapper.py", line 71, in <module> print Volume('http://www.cmegroup.com/trading/agricultural/grain-and-oilseed/corn_quotes_volume_voi.html') File "scrapper.py", line 16, in Volume ws['A2']=month[0] File "/usr/local/lib/python2.7/dist-packages/openpyxl/worksheet/worksheet.py", line 342, in __setitem__ self[key].value = value File "/usr/local/lib/python2.7/dist-packages/openpyxl/cell/cell.py", line 305, in value self._bind_value(value) File "/usr/local/lib/python2.7/dist-packages/openpyxl/cell/cell.py", line 209, in _bind_value raise ValueError("Cannot convert {0} to Excel".format(value)) ValueError: Cannot convert ['MAR 16'] to Excel
Saludos!