# Idea: a parser that collects only the product entries (SKUs) that have no real
# photo, i.e. whose img src points to a file with an .svg extension (placeholder).
# Known issue: the page is scrolled all the way down, yet at most the first
# 30 products get parsed (the plain HTTP fetch never sees the lazy-loaded rest).
import bs4
import requests
import collections
import logging
import csv
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup as bs
# Verbose root-logger config so every parsed/skipped block is visible during runs.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('wb')
# Row schema for one scraped product: brand name plus the (svg) image URL.
ParseResult = collections.namedtuple('ParseResult', ['brand_name', 'url_image'])

# Column headers written as the first row of the CSV file.
HEADERS = ('Brand', 'Link')
# --- Selenium setup: open the category page and scroll to trigger lazy loading ---
driver = webdriver.Chrome(
    'C://Users/roman/AppData/Local/Programs/Python/Python37-32/Lib/site-packages/selenium/common/chromedriver_win32 (1)/chromedriver.exe')
# BUG FIX: the original URL literal contained a raw newline (SyntaxError).
# NOTE(review): the address was truncated in the original paste ("...") —
# restore the full category URL before running.
driver.get('https://upakovka-spb.ru/category/2-odnorazovaya-po...')

SCROLL_PAUSE_TIME = 0.5

# Scroll down in growing absolute steps so lazily-loaded product cards render.
offset = 0
for _ in range(8):
    driver.execute_script("window.scrollBy(0, arguments[0]);", offset)
    offset += 1000
    sleep(2)  # give the page time to load the newly revealed items (uses time.sleep from the top-level import)

# Snapshot of the fully scrolled page. An explicit parser ('lxml', already used
# by Client.parse_page) avoids bs4's "no parser specified" warning and keeps
# parsing consistent across the script.
source_data = driver.page_source
soup = bs(source_data, 'lxml')
class Client:
    """Scrapes product cards from upakovka-spb.ru and saves brand/image rows to CSV.

    Matching rows are accumulated in ``self.result`` as ``ParseResult`` tuples
    and written out by :meth:`save_result`.
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers = {
            'User-Agent': 'Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 84.0.4147.89 Safari / 537.36'
        }
        # Collected ParseResult rows, written to CSV by save_result().
        self.result = []

    def load_page(self, page: int = None):
        """Fetch the category page over plain HTTP and return its HTML.

        NOTE(review): a bare GET only returns the first ~30 products; the
        lazily-loaded remainder requires the scrolled Selenium page source —
        pass ``driver.page_source`` to :meth:`run` instead.
        """
        # BUG FIX: the original URL literal contained a raw newline (SyntaxError).
        # The address was truncated in the original ("...") — restore the full URL.
        url = 'https://upakovka-spb.ru/category/2-odnorazovaya-po...'
        res = self.session.get(url=url)
        res.raise_for_status()
        return res.text

    def parse_page(self, text: str):
        """Parse every product card found in *text*."""
        soup = bs4.BeautifulSoup(text, 'lxml')
        for block in soup.select('div.s-product-block'):
            self.parse_block(block=block)

    def parse_block(self, block):
        """Extract brand name + image URL from one product card.

        Only cards whose <img> src ends in "svg" (the no-photo placeholder)
        are collected; all others are skipped.
        """
        image_tag = block.select_one('img[src$="svg"]')
        if not image_tag:
            logger.error('no svg image in block')
            return
        image = image_tag.get('src')
        if not image:
            # BUG FIX: the original logged and fell through with image == None.
            logger.error('img tag has no src attribute')
            return
        brand_tag = block.select_one('h5.s-product-header')
        if not brand_tag:
            # BUG FIX: the original referenced an undefined name `url` here (NameError).
            logger.error('no brand_name for image %s', image)
            return
        brand_name = brand_tag.text.replace('/', '').strip()
        logger.info('%s, %s', image, brand_name)
        # BUG FIX: the original never stored anything, so the CSV was always empty.
        self.result.append(ParseResult(brand_name=brand_name, url_image=image))

    def save_result(self):
        """Write the collected rows to result.csv (header + one row per product)."""
        path = 'C:/Users/roman/PycharmProjects/new/product_scraper/result.csv'
        # newline='' prevents blank lines between rows on Windows;
        # utf-8 handles non-ASCII brand names regardless of locale.
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
            writer.writerow(HEADERS)
            for item in self.result:
                # BUG FIX: writerows(item) treated each *field* as a row,
                # splitting the strings into one-character columns.
                writer.writerow(item)

    def run(self, text: str = None):
        """Fetch (or accept) page HTML, parse it, and save the CSV.

        :param text: optional pre-rendered HTML — pass ``driver.page_source``
            after scrolling to parse *all* products instead of only the first
            ~30 returned by a plain GET. When omitted, falls back to
            :meth:`load_page`.
        """
        if text is None:
            text = self.load_page()
        self.parse_page(text=text)
        self.save_result()
if __name__ == '__main__':
    # Script entry point: scrape the category page and dump the results to CSV.
    client = Client()
    client.run()