from urllib.request import urlopen # for Python 3
# from urllib2 import urlopen # for Python 2
from lxml.etree import XMLSyntaxError
from lxml.html import fromstring
from pandas import DataFrame, ExcelWriter
# Catalogue listing page that parse_courses() downloads first.
URL = 'http://www.tinko.ru/c-3.html?limit=100&no_cache=true&p=l'
# CSS selector applied to the listing page; each match is expected to contain
# the <a> link to one product's details page.
ITEM_PATH = ' .info-block .product-name'
# CSS selector applied to a details page; the text of the first match is
# stored in the 'description' column.
DESCR_PATH = '.breadcrumb .active'
# CSS selector applied to a details page; each matched <li> is read as a
# (title, value) pair taken from its first two <span> children.
HARET_PATH = '#techdata li'
def _fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8."""
    # The context manager closes the HTTP response even if read/decode fails.
    with urlopen(url) as response:
        return response.read().decode('utf-8')


def parse_courses():
    """Scrape the product catalogue at URL into an Excel workbook.

    Downloads the listing page, follows the link of every element matching
    ITEM_PATH, and collects one row per product: its name, its description
    (first DESCR_PATH match on the details page), its absolute link, and one
    extra column for every (title, value) pair found under HARET_PATH.

    The result is written to 'tinko_ru_price_list.xlsx' in the current
    working directory, on the sheet 'tinko.ru price list'.

    Products whose details page cannot be parsed, or that have no usable
    link, are skipped. Network errors raised by urlopen propagate to the
    caller.
    """
    # Local imports: these names are not provided by the module header.
    from urllib.parse import urljoin
    from lxml.etree import ParserError

    list_doc = fromstring(_fetch_html(URL))

    # Fixed columns first; characteristic columns follow in discovery order.
    columns = ['name', 'description', 'href']
    known_columns = set(columns)
    rows = []

    for elem in list_doc.cssselect(ITEM_PATH):
        anchors = elem.cssselect('a')
        if not anchors:
            continue
        anchor = anchors[0]
        raw_href = anchor.get('href')
        if not raw_href:
            continue
        # Site links may be relative; urlopen() only accepts absolute URLs.
        href = urljoin(URL, raw_href)
        name = anchor.text

        details_html = _fetch_html(href)
        try:
            details_doc = fromstring(details_html)
        except (XMLSyntaxError, ParserError):
            # lxml.html raises ParserError for empty documents.
            continue

        descr_elems = details_doc.cssselect(DESCR_PATH)
        description = descr_elems[0].text_content() if descr_elems else None

        row = {'name': name, 'description': description, 'href': href}
        for item in details_doc.cssselect(HARET_PATH):
            spans = item.cssselect('span')
            if len(spans) < 2:
                # Not a title/value pair; nothing to record.
                continue
            title = spans[0].text_content()
            row[title] = spans[1].text_content()
            if title not in known_columns:
                known_columns.add(title)
                columns.append(title)
        rows.append(row)

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    df = DataFrame(rows, columns=columns)

    # The context manager saves and closes the workbook; ExcelWriter.save()
    # was removed in pandas 2.0.
    with ExcelWriter('tinko_ru_price_list.xlsx', engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='tinko.ru price list',
                    header=True, index=False)
def main():
    """Script entry point: run the scraper via parse_courses()."""
    parse_courses()
# Start the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Вот вам готовый парсер tinko.ru с выгрузкой в Excel (проверял только офлайн, на странице, сохранённой на диск).
Как теперь гонорар делить будем? ;)
UPD: Подправлено.