import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
# Create one shared HTTP session; get_data() sends every request through it.
session = requests.Session()
def _text_or_empty(tag):
    """Return the text of a BeautifulSoup tag, or '' when the lookup found nothing."""
    return tag.text if tag is not None else ''


def get_data(url, timeout=30):
    """Download a product page through the shared session and extract its data.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded to
            ``session.get``. Without it a single unresponsive host would block
            the whole run.

    Returns:
        ``None`` when the request failed (the reason is printed), otherwise a
        dict with the keys:

        * ``'title'``  - text of the ``t12nw7s2_pdp`` span inside the
          ``<h1 itemprop="name">`` element, or ``''`` if either is missing;
        * ``'price'``  - text of the ``n12fsaew_pdp`` span, or ``''``;
        * ``'stocks'`` - dict mapping the text of each stock item's
          ``s1c1zkco_pdp`` span to the text of its ``s1nyoy8i_pdp`` span
          (presumably store name -> availability; confirm against the page
          markup). Items lacking either span are skipped.
    """
    try:
        # Redirects are followed explicitly; raise_for_status() turns 4xx/5xx
        # answers into HTTPError so they are reported instead of parsed.
        response = session.get(url, allow_redirects=True, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print("HTTP Error:", errh)
        return None
    except requests.exceptions.Timeout as errt:
        # Checked before ConnectionError: ConnectTimeout derives from both, and
        # a timeout should be reported as a timeout.
        print("Timeout Error:", errt)
        return None
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
        return None
    except requests.exceptions.RequestException as err:
        print("Something went wrong:", err)
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each lookup may return None when the page layout differs (e.g. an error
    # or anti-bot page), so every step is guarded instead of chained.
    heading = soup.find('h1', {'itemprop': 'name'})
    title_span = None
    if heading is not None:
        title_span = heading.find('span', {'class': 't12nw7s2_pdp'})
    title = _text_or_empty(title_span)

    price = _text_or_empty(soup.find('span', {'class': 'n12fsaew_pdp'}))

    stocks = {}
    for item in soup.find_all('li', {'data-qa': 'stock-in-store-item'}):
        key_tag = item.find('span', {'class': 's1c1zkco_pdp'})
        value_tag = item.find('span', {'class': 's1nyoy8i_pdp'})
        if key_tag is None or value_tag is None:
            # Incomplete entry: skip it rather than abort the whole page.
            continue
        stocks[key_tag.text] = value_tag.text

    return {'title': title, 'price': price, 'stocks': stocks}
# Read the list of product URLs. header=None makes pandas treat the first row
# as data, and the single column is named 'urls'.
excel_file_path = 'Ссылка на товар.xlsx'
df_urls = pd.read_excel(excel_file_path, header=None, names=['urls'])

# Blank cells are read as NaN (a float) and would break the string handling
# below, so drop them and normalise the rest to stripped strings.
urls = df_urls['urls'].dropna().astype(str).str.strip()

# Collect the scraped record for every URL that could be processed.
data_list = []
for url in tqdm(urls, desc='Обработка URL', unit='URL'):
    if not url:
        # Cell contained only whitespace.
        continue
    # Default to https:// when the cell holds a bare host/path.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url
    try:
        data = get_data(url)
    except requests.exceptions.RequestException as e:
        print(f"Ошибка при обработке URL {url}: {e}")
        continue
    if data:
        data_list.append(data)
        print(f"URL успешно обработан: {url}")

# Build a DataFrame from the collected records (one row per processed URL).
df = pd.DataFrame(data_list)

# Save the result to an Excel workbook.
output_excel_path = 'Результаты.xlsx'
df.to_excel(output_excel_path, index=False)

# Report where the results were written.
print("Результаты сохранены в файл:", output_excel_path)
При запуске скрипта возникает следующая ошибка:
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at
https://github.com/pandas-dev/pandas/issues/54466
import pandas as pd
Обработка URL: 0%| | 0/55012 [00:00<?, ?URL/s]Error Connecting: HTTPSConnectionPool(host='url', port=443): Max retries exceeded with url: / (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Обработка URL: 0%| | 1/55012 [00:02<35:04:17, 2.30s/URL]HTTP Error: 401 Client Error: Unauthorized for url: ...
Обработка URL: 0%|
и так далее