import requests
from bs4 import BeautifulSoup
import re
import csv
from datetime import datetime
from multiprocessing import Pool
def get_html(url, timeout=30):
    """Fetch *url* and return its HTML body as text.

    Returns None (implicitly) and prints the status code when the
    response is not a 2xx success.  *timeout* (seconds) prevents a
    worker in the multiprocessing Pool from hanging forever on a
    stalled connection.
    """
    r = requests.get(url, timeout=timeout)
    if r.ok:  # any 2xx status
        return r.text  # HTML source of the page
    print(r.status_code)
# collect all product-detail links from one listing page
def get_all_links(html):
    """Return a list of absolute product-detail URLs found in *html*.

    Looks for <a class="product__list--code"> anchors and prefixes
    their relative hrefs with the site origin.
    """
    soup = BeautifulSoup(html, 'lxml')
    anchors = soup.findAll('a', class_='product__list--code')
    # NOTE(review): prefix uses http:// while the listing URLs in main()
    # are https:// — likely causes a redirect per request; confirm.
    return ['http://www.futureelectronics.com' + a.get('href') for a in anchors]
def get_page_data(html):
    """Extract product data from a product-detail page.

    Returns a dict with key 'name'; the value is '' when the page has
    no <h2 class="product-title"> element.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Removed: a leftover debug loop (copied from get_all_links) that
    # re-extracted and printed every product link on each call.
    # 3. product name — explicit None check instead of a bare except,
    # which silently swallowed every possible error.
    title = soup.find('h2', class_='product-title')
    name = title.text.strip() if title is not None else ''
    return {'name': name}
def write_csv(data):
    """Append one product record as a single CSV row.

    Writes to semiconductors_analog.csv in append mode.  newline=''
    is required by the csv module to avoid blank lines on Windows.
    """
    with open('semiconductors_analog.csv', 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # BUG FIX: (data['name']) is just a parenthesized string, not a
        # tuple, so writerow() split the name into one column per
        # character.  Pass a one-element list instead.
        writer.writerow([data['name']])
def make_all(url):
    """Fetch one product page, parse it, and append the result to the CSV."""
    write_csv(get_page_data(get_html(url)))
def main():
    """Scrape the Semiconductors/Analog category and time the run."""
    started = datetime.now()
    # Listing URL template; {} is the page number passed to get_html().
    # Example pages:
    #   .../products?q=%3Arelevance&text=&pageSize=25&page=1
    #   .../products?q=%3Arelevance&text=&pageSize=25&page=2
    pattern = 'https://www.futureelectronics.com/c/semiconductors/analog/products?q=%3Arelevance&text=&pageSize=25&page={}'
    for page in range(0, 1):
        listing_url = pattern.format(str(page))
        product_links = get_all_links(get_html(listing_url))
        # Sequential run took ~0:00:10.5; 20 worker processes fetch the
        # product pages in parallel instead.
        with Pool(20) as pool:
            pool.map(make_all, product_links)
    print(str(datetime.now() - started))
if __name__ == '__main__':
    main()
# Removed: an accidental duplicate paste of get_all_links and a truncated
# copy of get_page_data placed after the __main__ guard, followed by stray
# non-Python lines ("дубль" ... "дубль-4") that made the whole file fail
# to import with a SyntaxError.