Недавно писал скрипт, можете взять за основу:
#!/usr/bin/env python
# coding: utf-8
import asyncio
import aiohttp
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from collections import Counter
from pathlib import Path
from itertools import islice
# Selects which URL list is processed: main() reads f'{TARGET}.txt', and
# url_to_filename() uses the same value as its default output directory.
TARGET = 'plate_number_image_url'
# Alternative target; swap it with the line above to process the other list.
# TARGET = 'photo_url'
def url_to_filename(url, base_dir=TARGET, last_n_parts=3) -> Path:
    """Map a URL to a local file path and make sure its directory exists.

    The URL is stripped of surrounding whitespace and split on ``/``; its last
    ``last_n_parts`` components are appended to ``base_dir``.

    Side effect: the parent directory of the resulting path is created
    (including missing ancestors) if it does not exist yet.

    Returns:
        The local path the URL's content should be stored at.
    """
    tail = url.strip().split('/')[-last_n_parts:]
    target = Path(base_dir, *tail)
    target.parent.mkdir(parents=True, exist_ok=True)
    return target
async def download_content_as_bytes(url: str) -> 'bytes | None':
    """Download ``url`` and return the raw response body.

    A fresh ``aiohttp.ClientSession`` is opened for every call.

    Returns:
        The response body as bytes, or ``None`` when the download failed
        (connection problem, invalid URL, HTTP error status, timeout), so the
        caller can skip the file and retry it on a later pass.
    """
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                # An HTTP error page must not be handed to the caller as if it
                # were the requested file; turn 4xx/5xx into an exception.
                response.raise_for_status()
                return await response.read()
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # Best-effort download: network-level failures are expected and simply
        # reported as "no content". Unlike a `return` inside `finally`, this
        # does not swallow cancellation or KeyboardInterrupt.
        return None
async def write_bytes_to_file(content: bytes, filename: str) -> None:
    """Store ``content`` in ``filename`` and report the save on stdout.

    Nothing is written (and nothing is printed) when ``content`` is empty or
    ``None``. An existing file is overwritten.
    """
    if not content:
        return
    with open(filename, 'wb') as sink:
        sink.write(content)
    print(f'SAVED {filename}')
async def file_download_task(url: str) -> None:
    """Download ``url`` into its local file unless that file already exists.

    The destination comes from ``url_to_filename``; the body is fetched with
    ``download_content_as_bytes`` and stored with ``write_bytes_to_file``.
    """
    destination = url_to_filename(url)
    if destination.exists():
        # Already present on disk; nothing to do for this URL.
        return
    payload = await download_content_as_bytes(url)
    await write_bytes_to_file(payload, destination)
async def main(batch_size=1000) -> None:
    """Download every URL listed in ``f'{TARGET}.txt'``, one batch at a time.

    Args:
        batch_size: Maximum number of downloads running concurrently.

    The file is expected to hold one URL per line; blank lines are ignored.
    A failure of an individual download is reported and does not stop the
    remaining downloads.
    """
    with open(f'{TARGET}.txt') as f:
        # Blank lines are not URLs; drop them instead of scheduling downloads
        # that can only fail.
        urls = [url for url in map(str.strip, f) if url]
    print(f'TOTAL: {len(urls)}')
    for start in range(0, len(urls), batch_size):
        batch = urls[start:start + batch_size]
        # asyncio.wait() rejects bare coroutines on Python 3.11+, so let
        # gather() wrap them in tasks. return_exceptions=True keeps one failed
        # download from cancelling the rest of the batch.
        results = await asyncio.gather(
            *map(file_download_task, batch),
            return_exceptions=True,
        )
        for url, result in zip(batch, results):
            if isinstance(result, BaseException):
                print(f'FAILED {url}: {result!r}')
if __name__ == '__main__':
    # Repeat full passes indefinitely: file_download_task skips files that
    # already exist, so every new pass only retries what is still missing.
    # There is no exit condition; stop the script manually.
    while True:
        asyncio.run(main())
Ну и потом проверить скачанные файлы и докачать пустые или повреждённые:
#!/usr/bin/env python
# coding: utf-8
import cv2
import requests
from PIL import Image
from pathlib import Path
def _is_intact_image(path):
    """Return True if ``path`` is a non-empty image both PIL and OpenCV can read."""
    if path.stat().st_size == 0:
        return False
    try:
        # Close the PIL handle right away instead of leaking one per file.
        with Image.open(path) as img:
            if not any(img.size):
                return False
        # cv2.imread() signals failure by returning None, not by raising, so
        # its result has to be checked explicitly.
        return cv2.imread(str(path)) is not None
    except Exception:
        # Any parsing failure means the file is unusable. Catching Exception
        # (not a bare except) keeps Ctrl+C working during a long scan.
        return False


# Scan both download directories and re-fetch every empty or unreadable file.
for root in ['photo_url', 'plate_number_image_url']:
    for path in Path(root).glob('**/*.*'):
        # The glob can also match directories whose name contains a dot.
        if not path.is_file() or _is_intact_image(path):
            continue
        # Rebuild the source URL by replacing the top-level directory with the
        # image host. NOTE(review): assumes every file came from this host.
        url = '/'.join(['http://img03.platesmania.com', *path.parts[1:]])
        try:
            # Without a timeout a stalled connection would hang the scan.
            response = requests.get(url, timeout=30)
        except requests.RequestException as error:
            # One unreachable file must not abort the remaining checks.
            print(f'FAILED {path}: {error}')
            continue
        if response.ok:
            with open(path, 'wb') as fo:
                fo.write(response.content)
            print(f'LOADED {path}')
        else:
            # The server returned an error status for this URL: the file is
            # corrupted or missing there as well.
            print(path)