# NOTE (bug report): the first page parses fine, but when navigating to the
# next page the response body is invalid HTML even though the status code is 200.
import time
# import asyncio
import requests
from bs4 import BeautifulSoup
# from fake_useragent import UserAgent
# from datetime import datetime
class Client:
    """Scraper for Kijiji "apartments & condos" listings in Toronto.

    Root cause of the reported bug ("first page OK, next page returns
    invalid HTML with status 200"): the browser-like headers built in
    ``__init__`` — including the user-agent — were never attached to the
    session or passed to any request, so the site served a bot-challenge
    page instead of the listings. The headers are now installed on the
    session so every request (listing pages and item links) carries them.
    """

    def __init__(self):
        self.counter = 0
        self.headers = {
            'authority': 'www.kijiji.ca',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            # The original referer value was garbled with stray spaces
            # ('https: // www.kijiji.ca / b - apartments ...'); a malformed
            # referer is worse than none for looking like a real browser.
            'referer': 'https://www.kijiji.ca/b-apartments-condos/city-of-toronto/c37l1700273',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        }
        self.session = requests.Session()
        # BUG FIX: headers were defined but never sent with any request.
        self.session.headers.update(self.headers)

    def load_page(self, page):
        """Fetch listings page *page*; return its HTML text, or '' on a 302.

        Raises requests.HTTPError on 4xx/5xx via raise_for_status().
        """
        url = f'https://www.kijiji.ca/b-apartments-condos/city-of-toronto/page-{page}/c37l1700273'
        # The old `if page == 1 / else` issued the identical request in both
        # branches, so the dead branch was removed.
        response = self.session.get(url=url, timeout=30)
        response.raise_for_status()
        print(response.status_code)
        # NOTE(review): requests follows redirects and raise_for_status()
        # already raised on errors, so a 302 here is unlikely; kept as a guard.
        if response.status_code != 302:
            return response.text
        return ''

    def parse_page(self, text: str):
        """Parse one listings page and fetch/parse every ad it links to."""
        soup = BeautifulSoup(text, "lxml")
        all_adds = soup.findAll(class_="search-item")
        parsed_data = {}
        # enumerate() replaces the hand-rolled `count = 0 ... count += 1`.
        for count, link in enumerate(all_adds):
            parsed_data[count] = self.parse_link(link=link, count=count)
            print(parsed_data)
            print('------'*7)

    def parse_link(self, link, count):
        """Follow an ad's detail URL and return its parsed fields ('' on non-200)."""
        url2 = link.get("data-vip-url")
        response2 = self.session.get("https://www.kijiji.ca" + url2, timeout=30)
        response2.raise_for_status()
        if response2.status_code == 200:
            soup_item = BeautifulSoup(response2.text, 'lxml')
            return self.parse_item(soup_item)
        return ''

    def parse_item(self, item):
        """Extract ad fields from a detail-page soup.

        The class names are Kijiji's generated CSS-module suffixes and may
        rot; any missing element triggers AttributeError, in which case every
        field falls back to the sentinel 1 (original behavior, preserved).
        """
        try:
            ad_id = item.select_one("li.currentCrumb-3831268168").find("a").text
            title = item.select_one("h1.title-2323565163").text.strip()
            location = item.select_one("span.address-3617944557").text.strip()
            item_posted = item.select_one("div.datePosted-383942873").find("time").get("datetime")[:10]
            # price = item.select_one("div.priceWrapper-1165431705").find("span").get("content")[:-1]
            # [:-18] strips a fixed trailing phrase from the utilities text —
            # TODO confirm the suffix length against a live page.
            utilities = item.select_one("span.utilities-3542420827").text.strip()[:-18]
            author_id = item.select_one("a.link-2686609741").get("href")[:-28][-10:]
        except AttributeError:
            ad_id = title = location = item_posted = utilities = author_id = 1
        return {
            'ad_id': ad_id,
            'title': title,
            'location': location,
            'item_posted': item_posted,
            'utilities': utilities,
            'author_id': author_id,
        }

    def run(self):
        """Scrape pages 1-3, stopping early if a page yields no HTML."""
        start = time.time()
        for page in range(1, 4):
            time.sleep(0.5)  # be polite to the server between pages
            text = self.load_page(page)
            if text:
                print(f"PAGE NUMBER {page} ---------------------")
                self.parse_page(text)
                print(f"END OF PAGE {page}")
            else:
                break
        end = time.time()
        print(end-start)
if __name__ == '__main__':
    # Script entry point: build a client and scrape the listing pages.
    Client().run()