Здравствуйте.
Не могу спарсить номер телефона с OLX.
Код:
import asyncio
import aiohttp
import csv
from bs4 import BeautifulSoup
from tqdm import tqdm
# Request headers sent with every HTTP call made by this script.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36"
    ),
}
# Read the list of ad URLs, one per line.
# NOTE: this runs at import time and raises FileNotFoundError when
# product_links.txt is not present in the working directory.
# "utf-8-sig" decodes plain UTF-8 as well and drops a leading BOM, so a file
# saved by an editor that writes a BOM does not corrupt the first URL.
with open('product_links.txt', 'r', encoding='utf-8-sig') as f:
    # Blank lines are skipped: an empty string is not a requestable URL.
    links = [line.strip() for line in f if line.strip()]

# Limit on the number of simultaneous requests.
# NOTE(review): on Python < 3.10 asyncio primitives bind to the event loop that
# is current at construction time; building this before asyncio.run() starts
# its own loop may fail there — confirm the target Python version.
sem = asyncio.Semaphore(30)
async def fetch(session, url):
    """Download ``url`` and return the response body as text.

    The request is made while holding the module-level semaphore ``sem`` and
    uses the shared ``HEADERS`` with a timeout value of 10.

    Args:
        session: Client session whose ``get`` is used for the request.
        url: Address to download.

    Returns:
        The decoded body, or None if the request or decoding raised; the
        error is printed in that case.
    """
    async with sem:
        try:
            async with session.get(url, headers=HEADERS, timeout=10) as response:
                body = await response.text()
        except Exception as exc:
            # Best effort: one failing URL is reported and skipped.
            print(f"[Ошибка запроса] {url}: {exc}")
            return None
        return body
async def get_phone_number(session, ad_id):
    """Request the phone number of an ad from the OLX phones endpoint.

    The request is made while holding the module-level semaphore ``sem`` and
    uses the shared ``HEADERS`` with a timeout value of 10.

    Args:
        session: Client session whose ``get`` is used for the request.
        ad_id: Identifier interpolated into the endpoint URL.

    Returns:
        The first element of ``data.phones`` from the JSON body;
        "Нет номера" when the status is not 200 or the list is missing or
        empty; "Ошибка запроса" when the request or decoding raised.
    """
    api_url = f'https://www.olx.ua/api/v1/offers/{ad_id}/phones/'
    async with sem:
        try:
            async with session.get(api_url, headers=HEADERS, timeout=10) as response:
                if response.status != 200:
                    # Report the status instead of silently mapping it to
                    # "no number"; otherwise a refused request looks exactly
                    # like an ad without a phone.
                    # NOTE(review): presumably the endpoint needs
                    # authorization — check the status printed here first.
                    print(f"[HTTP {response.status}] {api_url}")
                    return "Нет номера"
                data = await response.json()
                # Parsing stays inside the try so that an unexpected payload
                # shape is reported as a request error rather than raised.
                phones = data.get("data", {}).get("phones", [])
                return phones[0] if phones else "Нет номера"
        except Exception as e:
            print(f"[Ошибка запроса] {api_url}: {e}")
            return "Ошибка запроса"
def get_subcategory(soup):
    """Return the text of the subcategory link found in a parsed ad page.

    The link is located with a fixed CSS selector that targets the anchor
    inside the fourth ``li`` of the navigation list.

    Args:
        soup: Parsed document exposing ``select_one(css_selector)``.

    Returns:
        The stripped link text, or "Неизвестно" when nothing matches.
    """
    selector = (
        "div#hydrate-root div.css-1ek5um8 div.css-118kolg div.css-16gd35i "
        "div.css-6rrh1l nav[role='navigation'] ol.css-xv75xi li.css-7dfllt:nth-of-type(4) a.css-tyi2d1"
    )
    link = soup.select_one(selector)
    if not link:
        return "Неизвестно"
    return link.get_text(strip=True)
def get_ad_id(soup):
    """Extract the ad identifier from a parsed ad page.

    The element is located with a fixed CSS selector; the identifier is the
    part of its text between the first and the second colon (or the end of
    the text), with surrounding whitespace removed.

    Args:
        soup: Parsed document exposing ``select_one(css_selector)``.

    Returns:
        The identifier string, or None when the element is absent, its text
        contains no colon, or nothing follows the colon.
    """
    id_tag = soup.select_one(
        "div#hydrate-root div.css-1ek5um8 div.css-118kolg div.css-1d90tha "
        "div.css-n9feq4 div.css-1wws9er div.css-cgp8kk div.css-ayk4fp span.css-1i121pa"
    )
    if not id_tag:
        return None
    parts = id_tag.get_text(strip=True).split(":")
    if len(parts) < 2:
        # No colon in the text: report "no id" instead of raising IndexError.
        return None
    # Strip the space that follows the colon so the id can be placed in a URL
    # as is; an empty remainder counts as "no id".
    return parts[1].strip() or None
async def process_link(session, link):
    """Collect the output row for a single ad URL.

    Downloads the page, parses it, extracts the ad id, then requests the
    phone number and reads the subcategory.

    Args:
        session: Client session passed through to the request helpers.
        link: URL of the ad page.

    Returns:
        ``[phone, subcategory, link]``, or None when the page could not be
        downloaded or no ad id was found in it.
    """
    page = await fetch(session, link)
    if page:
        parsed = BeautifulSoup(page, 'html.parser')
        offer_id = get_ad_id(parsed)
        if offer_id:
            number = await get_phone_number(session, offer_id)
            return [number, get_subcategory(parsed), link]
    return None
async def main():
    """Process every URL in ``links`` and write the rows to ads_data.csv.

    All links are scheduled at once and consumed as they complete, so the
    rows are stored in completion order. Links for which ``process_link``
    returns a falsy value are left out. The CSV file is written with a
    header row, UTF-8 encoding, and is overwritten on each run.
    """
    rows = []
    pool = aiohttp.TCPConnector(limit=100)
    async with aiohttp.ClientSession(connector=pool) as session:
        pending = [process_link(session, url) for url in links]
        progress = tqdm(asyncio.as_completed(pending), total=len(links), desc="Обработка")
        for done in progress:
            row = await done
            if not row:
                continue
            rows.append(row)

    with open('ads_data.csv', 'w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        sheet.writerow(['Номер телефона', 'Подкатегория', 'Ссылка'])
        sheet.writerows(rows)
# Script entry point: start the scraper only when the file is run directly.
if __name__ == "__main__":
    asyncio.run(main())