Задать вопрос

Как удалить пустые поля в Excel?

мне надо чтобы все пустые колонки исчезли
67d470a1b9dd0894881730.png
должно быть так:
67d470758d2c7576405283.png
код:
код
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter

options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.cookies": 2}
options.add_experimental_option("prefs", prefs)
options.add_argument("log-level=3")
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.set_page_load_timeout(400)
driver.implicitly_wait(20)

links = []

def get_page_links():
    driver.get('https://www.olx.ua/uslugi/prodazha-biznesa/')
    page_links = []
    pagination_list = driver.find_elements(By.CSS_SELECTOR, 'ul.css-5tn4wy')
    for pagination in pagination_list:
        for li in pagination.find_elements(By.TAG_NAME, 'li'):
            try:
                href = li.find_element(By.TAG_NAME, 'a').get_attribute('href')
                if href:
                    page_links.append(href)
            except:
                pass
    links.extend(page_links)
    return page_links

page_links = get_page_links()

def get_links(page_links):
    for page_url in page_links:
        driver.get(page_url)
        pagination_items = driver.find_elements(By.CSS_SELECTOR, 'ul.pagination-list.css-1vdlgt7 li')
        page_numbers = [int(item.text) for item in pagination_items if item.text.isdigit()]
        max_page = max(page_numbers) if page_numbers else 1
        
        for page_num in range(2, max_page + 1):
            page_url_with_num = f"{page_url}?page={page_num}"
            links.append(page_url_with_num)
            if len(links) >= 30:
                return

get_links(page_links)

def get_products(pagination_links):
    product_links = []
    for page_url in tqdm(pagination_links, desc="Обработка страниц"):
        driver.get(page_url)
        try:
            product_cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-cy="l-card"]')
            for card in product_cards:
                try:
                    ad_card_title = card.find_element(By.CSS_SELECTOR, 'div[data-cy="ad-card-title"]')
                    link_elements = ad_card_title.find_elements(By.CSS_SELECTOR, 'a')
                    for link in link_elements:
                        href = link.get_attribute('href')
                        if href and href not in product_links:
                            product_links.append(href)
                            if len(product_links) >= 100:
                                return product_links
                except:
                    continue
        except:
            continue
    return product_links

product_links = get_products(links)

data = []
for product in tqdm(product_links[:30], desc="Обработка объявлений"):
    try:
        driver.get(product)
        
        subcategory = driver.find_elements(By.XPATH, '/html/body/div/div[2]/div/div[2]/div[2]/div[2]/nav/ol/li[4]/a')
        price = driver.find_elements(By.XPATH, '/html/body/div/div[2]/div/div[2]/div[3]/div[2]/div[1]/div/div[3]/div/div/h3')
        title = driver.find_elements(By.XPATH, '/html/body/div/div[2]/div/div[2]/div[3]/div[2]/div[1]/div/div[2]/h4')
        
        data.append({
            "Подкатегория": subcategory[0].text if subcategory else "Нет данных",
            "Заголовок": title[0].text if title else "Нет данных",
            "Цена": price[0].text if price else "Нет данных",
            "Ссылка": f'=HYPERLINK("{product}", "Ссылка")'
        })
    except Exception as e:
        print(f"Ошибка при обработке {product}: {e}")

if data:
    df = pd.DataFrame(data)
    df.to_excel("products.xlsx", index=False)
    
    wb = load_workbook("products.xlsx")
    ws = wb.active
    
    for col in ws.columns:
        max_length = 0
        col_letter = get_column_letter(col[0].column)
        
        if col_letter != get_column_letter(ws.max_column):
            for cell in col:
                try:
                    if cell.value:
                        max_length = max(max_length, len(str(cell.value)))
                except:
                    pass
            ws.column_dimensions[col_letter].width = max_length + 2
    
    while ws.max_row > 1 and all(cell[0].value is None for cell in ws.iter_cols(min_col=ws.max_column, max_col=ws.max_column)):
        ws.delete_rows(ws.max_row)

    while ws.max_column > 1 and all(cell[0].value is None for cell in ws.iter_cols(min_col=ws.max_column, max_col=ws.max_column)):
        ws.delete_cols(ws.max_column)
    
    wb.save("products.xlsx")
    print("Данные успешно сохранены в products.xlsx")
else:
    print("Не удалось собрать достаточно данных.")

driver.quit()
  • Вопрос задан
  • 80 просмотров
Подписаться 1 Простой 5 комментариев
Пригласить эксперта
Ваш ответ на вопрос

Войдите, чтобы написать ответ

Похожие вопросы