@limon_stepan

Не работает парсер на pyhton bs4, что делать?

Всем привет, разбираюсь в парсерах на python. У меня перестал работать парсер для сайта realt.by.
Пример ссылки: https://realt.by/sale-flats/object/2562548/
import pandas
import pandas as pd
import requests
import PySimpleGUI as sg
from bs4 import BeautifulSoup
from time import sleep
import urllib.request


def is_valid(url):
    try:
        urllib.request.urlopen(url)
        return True
    except Exception:
        return False


layout = [
    [sg.Text('Ссылка на объявление'), sg.InputText(key='link')],
    [sg.Button('Применить'), sg.Button('Отмена')],
    [sg.ProgressBar(max_value=10, orientation='h', size=(30,15), key='-PROG-')]
]
window = sg.Window('Добавить объявление', layout)

while True:  # The Event Loop
    event, values = window.read()
    # print(event, values) #debug
    if event in (None, 'Exit', 'Отмена'):
        break

    if event == 'Применить':
        curr = 0
        window["-PROG-"].update_bar(curr + 1)
        curr += 1

        sql_img = []
        sql_link = []
        sql_title = []
        sql_address = []
        sql_rajon = []
        sql_metro = []
        sql_rooms = []
        sql_area = []
        sql_floor = []
        sql_price = []
        sql_comment = []
        sql_status = []

        allAparts = []


        def intTryParse(value):
            try:
                int(value)
                return True
            except ValueError:
                return False


        window["-PROG-"].update_bar(curr + 1)
        curr += 1

        url = values['link']
        if not is_valid(url):
            sg.popup("Некорректная ссылка")
        else:
            data = requests.get(url)
            soup = BeautifulSoup(data.text, features="html.parser")
            img = soup.find('div', attrs={"class": "swiper-wrapper"}).findAll('img', class_="blur-sm scale-105")[1].get('src')

            window["-PROG-"].update_bar(curr + 1)
            curr += 1

            region = soup.find('ul', class_="w-full mb-0.5 -my-1").findAll('li')[4].find('a',
                                                                             class_="focus:outline-none sm:focus:shadow-10bottom transition-colors cursor-pointer text-info-500 hover:text-info-600 active:text-info").text

            if str(region).split(' ')[1] != "район":
                region = soup.find('ul', class_="w-full mb-0.5 -my-1").findAll('li')[3].find('a',
                                                                                 class_="focus:outline-none sm:focus:shadow-10bottom transition-colors cursor-pointer text-info-500 hover:text-info-600 active:text-info").text

            window["-PROG-"].update_bar(curr + 1)
            curr += 1

            title = soup.find('h1',
                        class_='order-1 mb-0.5 md:-order-2 md:mb-4 block w-full !inline-block lg:text-h1Lg text-h1 font-raleway font-bold flex items-center').text

            address = soup.findAll('a',
                       class_="focus:outline-none sm:focus:shadow-10bottom transition-colors cursor-pointer inline md:inline-block mr-4 text-basic hover:text-info-500 active:text-info")

            window["-PROG-"].update_bar(curr + 1)
            curr += 1

            address = address[0].text + " " + address[1].text
            try:
                metro = soup.find('li', class_="align-top inline-flex mr-4 last:mr-0").find('a',
                                                                            class_='focus:outline-none sm:focus:shadow-10bottom transition-colors cursor-pointer inline md:inline-block mr-4 text-basic hover:text-info-500 active:text-info').text.strip()
            except AttributeError:
                metro = "Нету"

            window["-PROG-"].update_bar(curr + 1)
            curr += 1

            info = soup.find('ul', class_='w-full -my-1').findAll('p')
            rooms = info[0].text.strip()

            window["-PROG-"].update_bar(curr + 1)
            curr += 1

            area = info[1].text.strip()
            floor = info[5].text.strip()

            if intTryParse(rooms):
                area = info[2].text.strip()

            window["-PROG-"].update_bar(curr + 1)
            curr += 1

            if intTryParse(floor):
                floor = info[6].text.strip()

            comment = soup.find('section', class_="bg-white flex flex-wrap md:p-6 my-4 rounded-md").text
            price = soup.find('h2',
                  class_='w-full sm:w-auto sm:inline-block sm:mr-1.5 lg:text-h2Lg text-h2 font-raleway font-bold flex items-center').text.strip()

            status = soup.find('div', class_="md:w-full text-info-500 md:items-center relative flex flex-wrap w-1/2 mb-6").find('span', class_="text-subhead md:text-body text-basic w-full").text

            window["-PROG-"].update_bar(curr + 1)
            curr += 1

            sql_img.append(img)
            sql_link.append(url)
            sql_title.append(title)
            sql_address.append(address)
            sql_rajon.append(region)
            sql_metro.append(metro)
            sql_rooms.append(rooms)
            sql_area.append(area)
            sql_floor.append(floor)
            sql_price.append(price)
            sql_comment.append(comment)
            sql_status.append(status)
            allAparts.append([img, url, title, address, region, metro, rooms, area, floor, price, comment, status])

            df_aparts = pandas.DataFrame(
                {'img': sql_img, 'link': sql_link, 'title': sql_title, 'address': sql_address, 'rajon': sql_rajon,
                'metro': sql_metro, 'rooms': sql_rooms, 'area': sql_area, 'floor': sql_floor, 'price': sql_price,
                'comment': sql_comment, 'status': sql_status})

            window["-PROG-"].update_bar(curr + 1)
            curr += 1

            # To excel
            writer = pd.ExcelWriter(r"C:\Users\Kisliy\source\repos\RealEstateAgency\RealEstateAgency\bin\Debug\output.xlsx")
            df_aparts.to_excel(writer)
            writer.close()
            break

Получаю такую ошибку:
Traceback (most recent call last):
File "C:\Users\karat\PycharmProjects\testproj\parser.py", line 70, in
img = soup.find('div', attrs={"class": "swiper-wrapper"}).findAll('img', class_="blur-sm scale-105")[1].get('src')
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'findAll'
  • Вопрос задан
  • 113 просмотров
Решения вопроса 1
datka
@datka
Скорее всего проблема в header-ax

добавь в код
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N)\
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Mobile Safari/537.36"
}


и поменяй
data = requests.get(url, headers=headers)

import requests
from bs4 import BeautifulSoup

url = "https://realt.by/sale-flats/object/2562548/"


headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N)\
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Mobile Safari/537.36"
}

data = requests.get(url, headers=headers)
soup = BeautifulSoup(data.text, features="html.parser")
img = (
    soup.find("div", attrs={"class": "swiper-wrapper"})
    .findAll("img", class_="blur-sm scale-105")[1]
    .get("src")
)
print(img)


https://static.realt.by/thumb/c/600x400/6f57b1d409f96f2b1ede7f082f120b50/ja/e/site15nf8eja/7c30f38145.jpg
Ответ написан
Пригласить эксперта
Ваш ответ на вопрос

Войдите, чтобы написать ответ

Войти через центр авторизации
Похожие вопросы
29 февр. 2024, в 00:43
5000 руб./за проект
28 февр. 2024, в 23:53
1000 руб./за проект
28 февр. 2024, в 23:20
10000 руб./за проект