import csv

import requests
from bs4 import BeautifulSoup as bs

# Request headers; a plain User-Agent is assumed here, since the original
# script referenced a HEADERS constant without defining it.
HEADERS = {'User-Agent': 'Mozilla/5.0'}


def get_html(url):
    r = requests.get(url, headers=HEADERS)
    return r.text

def get_total_pages(html):
    """Read the pager and build the list of catalogue page URLs."""
    pages = []
    soup = bs(html, 'lxml')
    # The last page number sits in the third-from-last <span> of the right-floated pager block.
    pages_block = soup.find('span', style='float:right').find_all('span')[-3].a.text.strip()
    for i in range(1, int(pages_block) + 1):
        pages.append(f'https://www.chinavasion.com/china/wholesale/electronics/Cameras-Accessories/Digital-Cameras/?page={i}')
    return pages

def get_data(pages):
    """Collect product links from every catalogue page, then scrape each product."""
    # First pass: gather product URLs from every listing page.
    urls = []
    for page in pages:
        r = requests.get(page, headers=HEADERS)
        soup = bs(r.text, 'lxml')
        goods = soup.find_all('div', class_='product_tile')
        for data in goods:
            a = data.find('a')['href']
            urls.append({'href': f'https://www.chinavasion.com{a}'})

    # Second pass: visit each product page and pull out its details.
    # The database is created once and returned after the loop, so results
    # from all pages are kept rather than just the first one.
    user_database = []
    for user in urls:
        r = requests.get(user['href'], headers=HEADERS)
        soup = bs(r.text, 'lxml')
        title = soup.find('h1', {'class': 'fn'}).text.strip()
        product_code = soup.find('span', {'class': 'code'}).text.strip()
        price_usd = soup.find('span', {'class': 'ccy'}).text.strip()
        description = soup.find('div', {'class': 'pro_info_parameters'}).text.replace('.', '').strip()
        link = 'https:' + soup.find('div', {'class': 'wj-content-backtop'}).find('a').get('href')
        image_main = 'https:' + soup.find('img', {'class': 'img400'})['src'].strip()
        user_data = {
            'title': title,
            'product_code': product_code,
            'price_usd': price_usd,
            'description': description,
            'link': link,
            'image_main': image_main
        }
        user_database.append(user_data)
    return user_database

def save_csv(user_data):
    """Write the scraped products to a semicolon-separated CSV file."""
    # newline='' prevents blank rows from appearing when writing CSV on Windows.
    with open('oops.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(['Title', 'Product_code', 'Price_usd', 'Description', 'Link', 'Image_main'])
        for i in user_data:
            writer.writerow((i['title'], i['product_code'], i['price_usd'],
                             i['description'], i['link'], i['image_main']))

def main():
    url = 'https://www.chinavasion.com/china/wholesale/electronics/Cameras-Accessories/Digital-Cameras/'
    html = get_html(url)
    pages = get_total_pages(html)
    user_data = get_data(pages)
    save_csv(user_data)


if __name__ == '__main__':
    main()