#!/home/artddss/python3.4/bin/python3.4
from urllib.request import urlopen # for Python 3
from lxml.etree import XMLSyntaxError
from lxml.html import fromstring
from pandas import DataFrame, ExcelWriter
URL = 'http://www.vsemagazy.ru/'
KATEGORY_PATH = '.menu li '
MAGAZ_PATH = '.tbl-shops tr'
ADRES_PATH = '.tbl-shops'
ADR_PATH = '.post p b'
SUBJ_PATH = '.post p'
def vsemagazy_parse():
f = urlopen(URL)
list_html = f.read().decode('utf-8')
list_doc = fromstring(list_html)
df = DataFrame(columns=('магазин', 'область', 'субъект', 'график работы', 'телефон', 'адрес'))
for kategory in list_doc.cssselect(KATEGORY_PATH):
a = kategory.cssselect('a')[0]
href = a.get('href')
name = a.text
details_html = urlopen(href).read().decode('utf-8')
try:
details_doc = fromstring(details_html)
except XMLSyntaxError:
continue
for magaz in details_doc.cssselect(MAGAZ_PATH):
b = magaz.cssselect('a')[0]
zref = b.get('href')
namb = b.text
drog_html = urlopen(zref).read().decode('utf-8')
try:
dret_doc = fromstring(drog_html)
except XMLSyntaxError:
continue
for adres in dret_doc.cssselect(ADRES_PATH):
c = adres.cssselect('a')[0]
nref = c.get('href')
band = c.text
drt_html = urlopen(nref).read().decode('utf-8')
try:
dry_doc = fromstring(drt_html)
except XMLSyntaxError:
continue
subjekt = dry_doc.cssselect(SUBJ_PATH)[0].text_content()
grafik = dry_doc.cssselect(SUBJ_PATH)[1].text_content()
telephone = dry_doc.cssselect(SUBJ_PATH)[2].text_content()
adres = dry_doc.cssselect(SUBJ_PATH)[3].text_content()
haret_elems_list = [('магазин', name), ('область', namb), ('субъект', subjekt), ('график работы',grafik), ('телефон', telephone), ('адрес', adres)]
print(haret_elems_list)
df = df.append(dict(haret_elems_list), ignore_index=True)
writer = ExcelWriter('magaz.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='1', header=True, index=False)
writer.save()
def main():
vsemagazy_parse()
if __name__ == '__main__':
main()
выходит такая вот ошибка
[('магазин', 'Дикси'), ('область', 'Дикси в Ленинградской области'), ('субъект', 'Субъект РФ: Ленинградская область'), ('график работы', 'Время работы: с 10:00 до 23:00'), ('телефон', 'Телефон горячей линии: 8 (800) 333-02-01'), ('адрес', 'Адрес: Ленинградская область, Шлиссельбург, Шлиссельбург, ул.Луговая, д.4')] Traceback (most recent call last):
File "vsemagazy.py", line 65, in
main()
File "vsemagazy.py", line 62, in main
vsemagazy_parse()
File "vsemagazy.py", line 45, in vsemagazy_parse
drt_html = urlopen(nref).read()
File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.4/urllib/request.py", line 463, in open
response = self._open(req, data)
File "/usr/lib/python3.4/urllib/request.py", line 481, in _open
'_open', req)
File "/usr/lib/python3.4/urllib/request.py", line 441, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 1210, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.4/urllib/request.py", line 1182, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.4/http/client.py", line 1088, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.4/http/client.py", line 1116, in _send_request
self.putrequest(method, url, **skips)
File "/usr/lib/python3.4/http/client.py", line 973, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode character '\u2013' in position 49: ordinal not in range(128)
в чем причина? что не так с кодировкой?