import imghdr
import os
import mimetypes
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Find the first image of the article. The <figure> is looked up on its own
# because find() returns None when nothing matches, and chaining .find("img")
# onto None would raise AttributeError.
figure = article_soup.find("figure", class_="wp-block-image")
img = figure.find("img") if figure else None
# If there is an image with a usable URL, save it to the "images" folder under
# the article heading's name.
if img and img.get("src"):
    img_url = img["src"]
    # A timeout keeps an unresponsive server from hanging the script forever.
    img_response = requests.get(img_url, timeout=30)
    img_data = img_response.content
    # Only write the file for a successful response, so the body of an error
    # response is never stored as if it were the image.
    if img_response.ok:
        # imghdr.what() returns None when it cannot identify the data; fall
        # back to the extension in the URL (query string stripped) so the
        # file is never saved with a literal ".None" suffix.
        # NOTE: imghdr is deprecated since Python 3.11 and removed in 3.13.
        extension = imghdr.what(None, h=img_data)
        if extension is None:
            url_suffix = os.path.splitext(img_url.split("?")[0])[1]
            extension = url_suffix.lstrip(".") or "bin"
        # open() does not create missing directories.
        os.makedirs("images", exist_ok=True)
        # NOTE(review): `heading` is used verbatim as the file name; a "/" in
        # it would be treated as a path separator - confirm it is sanitized
        # where it is defined.
        with open(f"images/{heading}.{extension}", "wb") as handler:
            handler.write(img_data)
    else:
        print(f"Error: {img_response.status_code} - {img_url}")
# Download the article's first image into the "images" folder, keeping the
# file name taken from the image URL.
# The <figure> is looked up on its own because find() returns None when
# nothing matches, and chaining .find("img") onto None would raise
# AttributeError instead of reaching the "No image found" branch.
figure_tag = article_soup.find("figure", class_="wp-block-image")
img_tag = figure_tag.find("img") if figure_tag else None
if img_tag:
    img_url = img_tag.get("src")
    # .get() returns None for an <img> without a src attribute; such tags are
    # reported as skipped, the same as non-http(s) URLs.
    if img_url and img_url.startswith("http"):
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(img_url, timeout=30)
        if response.status_code == 200:
            save_folder = "images"
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(save_folder, exist_ok=True)
            # Remove _resize parameter if present.
            # NOTE(review): this keeps only the text before "_resize", so a
            # file extension that follows the marker is dropped as well -
            # confirm this matches the URLs actually encountered.
            if "_resize" in img_url:
                img_url = img_url.split("_resize")[0]
            # Remove any query parameters (e.g. "?ssl=1") from the filename.
            filename = os.path.basename(img_url).split("?")[0]
            if filename:
                with open(os.path.join(save_folder, filename), "wb") as f:
                    f.write(response.content)
                print(f"Saved {img_url} to {save_folder}")
            else:
                # A URL ending in "/" yields an empty name; opening the
                # folder itself for writing would raise an OSError.
                print(f"Skipped {img_url}")
        else:
            print(f"Error: {response.status_code} - {img_url}")
    else:
        print(f"Skipped {img_url}")
else:
    print("No image found in the article.")