Есть скрипт парсинга товаров на python 2.7 , работает, но первый спарсенный товар дублируется много раз в таблице, как исправить?
import requests
from bs4 import BeautifulSoup
import csv
import re
def get_html(url):
r = requests.get(url)
return r.text
urls=['http://www.autobody.ru/catalog/9468/']
urll=[]
for url in urls:
html = get_html(url)
soup = BeautifulSoup(html, 'html.parser')
mydivs = soup.findAll('a',class_="banners_images")
urls = []
for i in mydivs:
ur = (i.get('href'))
ur = 'http://www.autobody.ru' + str(ur)
urls.append(ur.encode('utf8'))
if len(urls)==0:
mydivs = soup.findAll('div',class_="forward_catalog_new_link_container")
for i in mydivs:
ur= i.find('a')['href']
ur = 'http://www.autobody.ru' + str(ur)
urls.append(ur.encode('utf8'))
images = []
heads = []
artic = []
atrib = []
price = []
a1=[]
a2=[]
a3=[]
with open('e:\\projects\\1.csv', 'a') as f: # Open the file in binary mode for Python 2.x
f.write(u'\ufeff'.encode('utf8')) # writes "byte order mark" UTF-8 signature
writer = csv.writer(f)
for i in urls:
html = get_html(i)
soup = BeautifulSoup(html, 'html.parser')
head = soup.find('h1').get_text()
heads.append(head.encode('utf8'))
image = [x['src'] for x in soup.findAll('img', {'class': 'detimg'})]
image1 = 'http://www.autobody.ru'+image[0]
images.append(image1.encode('utf8'))
price1 = soup.find('div', class_='price').get_text()
price1 = re.sub(r"c",r"p", price1)
price.append(price1.encode('utf8'))
zo=soup.find('table', class_='tech').find_all('tr')
artic.append(zo[0].get_text().strip().encode('utf8'))
atrib.append(zo[1].get_text().strip().encode('utf8'))
a1.append(zo[2].get_text().strip().encode('utf8'))
try:
a2.append(zo[3].get_text().strip().encode('utf8'))
except:
a2.append(1)
writer.writerows(zip(*[heads, price, artic, images,atrib,a1,a2,urls]))