import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.cookies": 2}
options.add_experimental_option("prefs", prefs)
options.add_argument("log-level=3")
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.set_page_load_timeout(400)
driver.implicitly_wait(20)
links = []
def get_page_links():
driver.get('https://www.olx.ua/uslugi/prodazha-biznesa/')
page_links = []
pagination_list = driver.find_elements(By.CSS_SELECTOR, 'ul.css-5tn4wy')
for pagination in pagination_list:
for li in pagination.find_elements(By.TAG_NAME, 'li'):
try:
href = li.find_element(By.TAG_NAME, 'a').get_attribute('href')
if href:
page_links.append(href)
except:
pass
links.extend(page_links)
return page_links
page_links = get_page_links()
def get_links(page_links):
for page_url in page_links:
driver.get(page_url)
pagination_items = driver.find_elements(By.CSS_SELECTOR, 'ul.pagination-list.css-1vdlgt7 li')
page_numbers = [int(item.text) for item in pagination_items if item.text.isdigit()]
max_page = max(page_numbers) if page_numbers else 1
for page_num in range(2, max_page + 1):
page_url_with_num = f"{page_url}?page={page_num}"
links.append(page_url_with_num)
if len(links) >= 30:
return
get_links(page_links)
def get_products(pagination_links):
product_links = []
for page_url in tqdm(pagination_links, desc="Обработка страниц"):
driver.get(page_url)
try:
product_cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-cy="l-card"]')
for card in product_cards:
try:
ad_card_title = card.find_element(By.CSS_SELECTOR, 'div[data-cy="ad-card-title"]')
link_elements = ad_card_title.find_elements(By.CSS_SELECTOR, 'a')
for link in link_elements:
href = link.get_attribute('href')
if href and href not in product_links:
product_links.append(href)
if len(product_links) >= 100:
return product_links
except:
continue
except:
continue
return product_links
product_links = get_products(links)
data = []
for product in tqdm(product_links[:30], desc="Обработка объявлений"):
try:
driver.get(product)
subcategory = driver.find_elements(By.XPATH, '/html/body/div/div[2]/div/div[2]/div[2]/div[2]/nav/ol/li[4]/a')
price = driver.find_elements(By.XPATH, '/html/body/div/div[2]/div/div[2]/div[3]/div[2]/div[1]/div/div[3]/div/div/h3')
title = driver.find_elements(By.XPATH, '/html/body/div/div[2]/div/div[2]/div[3]/div[2]/div[1]/div/div[2]/h4')
data.append({
"Подкатегория": subcategory[0].text if subcategory else "Нет данных",
"Заголовок": title[0].text if title else "Нет данных",
"Цена": price[0].text if price else "Нет данных",
"Ссылка": f'=HYPERLINK("{product}", "Ссылка")'
})
except Exception as e:
print(f"Ошибка при обработке {product}: {e}")
if data:
df = pd.DataFrame(data)
df.to_excel("products.xlsx", index=False)
wb = load_workbook("products.xlsx")
ws = wb.active
for col in ws.columns:
max_length = 0
col_letter = get_column_letter(col[0].column)
if col_letter != get_column_letter(ws.max_column):
for cell in col:
try:
if cell.value:
max_length = max(max_length, len(str(cell.value)))
except:
pass
ws.column_dimensions[col_letter].width = max_length + 2
while ws.max_row > 1 and all(cell[0].value is None for cell in ws.iter_cols(min_col=ws.max_column, max_col=ws.max_column)):
ws.delete_rows(ws.max_row)
while ws.max_column > 1 and all(cell[0].value is None for cell in ws.iter_cols(min_col=ws.max_column, max_col=ws.max_column)):
ws.delete_cols(ws.max_column)
wb.save("products.xlsx")
print("Данные успешно сохранены в products.xlsx")
else:
print("Не удалось собрать достаточно данных.")
driver.quit()