Все было хорошо, скрипт нормально парсил сайт, а в очередной раз отказался работать со словами
line = pagination.text
AttributeError: 'NoneType' object has no attribute 'text'
Перезапуск не помог...
import requests
from bs4 import BeautifulSoup
import os
import csv
from datetime import datetime
start_time = datetime.now()
URL = 'https://www.avito.ru/murmanskaya_oblast/avtomobili/mitsubishi-ASgBAgICAUTgtg3ymCg?cd=1'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/81.0.4044.138 Safari/537.36', 'accept': '*/*'}
HOST = 'https://www.avito.ru'
FILE = 'Cars.svc'
def get_html(url, params=None):
r = requests.get(url, headers=HEADERS, params=params)
return r
def get_pages_count(html):
soup = BeautifulSoup(html, 'html.parser')
pagination = soup.find('div', class_='pagination-root-2oCjZ')
line = pagination.text
p_count = int(line[-8])
if p_count > 1:
return p_count
else:
return 1
def get_content(html):
soup = BeautifulSoup(html, 'html.parser')
items = soup.find_all('div', class_='snippet-horizontal item item_table clearfix js-catalog-item-enum'
' item-with-contact js-item-extended')
cars = []
for item in items:
cars.append({
'Cars': item.find('a', class_='snippet-link').get_text(),
'Settings': item.find('div', class_='specific-params specific-params_block').get_text().replace('\n ', ''),
'Price': item.find('span', class_='snippet-price').get_text(strip=True).replace('\n ', ''),
'City': item.find('span', class_='item-address-georeferences-item__content').get_text(),
'Link': HOST + item.find('a', class_='snippet-link').get('href'),
})
return cars
def save_file(items, path):
with open(path, 'w', newline='', encoding='UTF-8') as file:
writer = csv.writer(file, delimiter=';')
writer.writerow(['Авто', 'Параметры', 'Цена', 'город', 'ссылка'])
for item in items:
writer.writerow([item['Cars'], item['Settings'], item['Price'], item['City'], item['Link']])
def parse():
html = get_html(URL)
if html.status_code == 200:
cars = []
pages_count = get_pages_count(html.text)
for page in range(1, pages_count):
print(f'Парсинг старницы {page} из {pages_count}...')
html = get_html(URL, params={"p": page})
cars.extend(get_content(html.text))
save_file(cars, FILE)
print(cars)
os.startfile(FILE)
else:
print('Error')
parse()
print(datetime.now() - start_time)