@gndkg

What is causing the significant slowdown of the program?

Hello, I have code for scraping

scrapper.py:
...

main_data = {
    'organization_name': [],
    'organization_voen': [],
    'organization_address': [],
    'event_name': [],
    'event_number': [],
    'classification_code': [],
    'suggested_price': [],
    'event_start_date': [],
    'submission_deadline': [],
    'envelope_opening_date': [],

    'participation_fee': [],
    'participation_description': [],

    'usage_fee': [],
    'usage_description': [],

    'full_name': [],
    'contact': [],
    'position': [],
    'phone_number': []
}

nested_data = {
    'heading': [],
    'disclosure': [],
    'quantity': [],
    'measure_unit': [],
    'code': []
}

def get_total_items_from_link(api_link='https://etender.gov.az/api/events?EventType=2&PageSize=1&PageNumber=1&EventStatus=1&Keyword=&buyerOrganizationName=&PrivateRfxId=&publishDateFrom=&publishDateTo=&AwardedparticipantName=&AwardedparticipantVoen=&DocumentViewType='):
    try:
        response = requests.get(api_link, timeout=20)
        if response.status_code == 200:
            data = response.json()
            return data['totalItems']
    except requests.RequestException:
        # broader than Timeout alone: also catches connection and HTTP-level errors
        return -1
    return -1
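
# A possible optimization, sketched as an assumption rather than a fix: every
# helper in this file calls requests.get directly, so each request opens a new
# connection. A shared requests.Session reuses connections across the
# thousands of calls made here (the name `http` is not in the original code):
#
#     http = requests.Session()
#     response = http.get(api_link, timeout=20)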

def get_total_items_from_id(id):
    ...

def get_all_events_ids(page_size):
    template = f'https://etender.gov.az/api/events?EventType=2&PageSize={page_size}&PageNumber=1&EventStatus=1&Keyword=&buyerOrganizationName=&PrivateRfxId=&publishDateFrom=&publishDateTo=&AwardedparticipantName=&AwardedparticipantVoen=&DocumentViewType='
    try:
        response = requests.get(template, timeout=20)
        data = response.json()

        events_ids = [item['eventId'] for item in data['items']]
        return sorted(events_ids)
    except requests.RequestException:
        return []

def get_info_from_link(ids):
    cnt = 0
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-images')  # NB: apparently not a recognized Chrome switch; images are usually disabled via prefs or --blink-settings=imagesEnabled=false

    driver = webdriver.Chrome(options=options)

    for id in ids:
        link = f'https://etender.gov.az/main/competition/detail/{id}'
        driver.get(link)
        time.sleep(3)

        selectors = [
           ...
        ]

        for item_list, selector in selectors:
            try:
                element = WebDriverWait(driver, 60).until(
                    EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
                )
                item_list.append(str(element.text))
            except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
                item_list.append("None")

        try:
            participation_description_selector = 'body > app-root > app-competition-detail > main > section.section__3 > div > div > div:nth-child(1) > div > div.content'
            element = WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, participation_description_selector))
            )
            main_data['participation_description'].append(str(element.text))
        except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
            main_data['participation_description'].append("None")

        try:
            usage_description_selector = 'body > app-root > app-competition-detail > main > section.section__3 > div > div > div:nth-child(2) > div > div.content'
            element = WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, usage_description_selector))
            )
            main_data['usage_description'].append(str(element.text))
        except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
            main_data['usage_description'].append("None")

        cnt += 1

        print(f"[LOG] tender number {cnt} has done")

    driver.quit()
    time.sleep(3)
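
# To locate the slowdown, a minimal timing sketch (hypothetical, not in the
# original) could wrap the per-id body of the loop above:
#
#     t0 = time.perf_counter()
#     # ... existing per-id work ...
#     print(f"[LOG] id {id} took {time.perf_counter() - t0:.1f}s")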

def get_fees(ids: list):
    ...

def get_contact(ids):
    ...

def get_all_info_from_table(ids):
    for id in ids:
        heading, disclosure, quantity, measure, code = [], [], [], [], []
        total_items = get_total_items_from_id(id)
        table_link = f'https://etender.gov.az/api/events/{id}/bomLines?PageSize={total_items}&PageNumber=1'

        try:
            response = requests.get(table_link, timeout=10)
            if response.status_code == 200:
                data = response.json()
                for item in data['items']:
                    heading.append(str(item['name']) if item.get('name') else 'None')
                    disclosure.append(str(item['description']) if item.get('description') else 'None')
                    quantity.append(str(item['quantity']) if item.get('quantity') else 'None')
                    measure.append(str(item['unitOfMeasure']) if item.get('unitOfMeasure') else 'None')
                    code.append(str(item['categorycode']) if item.get('categorycode') else 'None')

        except requests.RequestException:
            ...

        nested_data['heading'].append("\n".join(heading))
        nested_data['disclosure'].append("\n".join(disclosure))
        nested_data['quantity'].append("\n".join(quantity))
        nested_data['measure_unit'].append("\n".join(measure))
        nested_data['code'].append("\n".join(code))

def fetch_data(ids):
    with ThreadPoolExecutor(max_workers=16) as executor:
        futures = [
            ...
        ]
        for future in as_completed(futures):
            future.result()
            time.sleep(1)
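
# The futures list above is elided; hypothetically it might dispatch the four
# helpers like this (an assumption, not the original wiring):
#
#     futures = [
#         executor.submit(get_info_from_link, ids),
#         executor.submit(get_fees, ids),
#         executor.submit(get_contact, ids),
#         executor.submit(get_all_info_from_table, ids),
#     ]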


and for loading the fetched data into a MySQL database
loadData.py:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy import Table, MetaData, insert
import time
from scrapper import *
from config import *

engine = create_engine(f'mysql+pymysql://{user}:{password}@{host}:{port}/{db_name}')
Session = sessionmaker(bind=engine)

metadata = MetaData()
events_table = Table('events', metadata, autoload_with=engine)
events_details_table = Table('events_details', metadata, autoload_with=engine)

ids = get_all_events_ids(get_total_items_from_link())
batch_size = 100

for i in range(0, len(ids), batch_size):
    package = ids[i:i + batch_size]
    fetch_data(package)
    session = Session()

    main_data_records = [
        {
            'organization_name': org_name,
            'organization_voen': org_voen,
            'organization_address': org_address,
            'event_name': event_name,
            'event_number': event_number,
            'classification_code': classification_code,
            'suggested_price': suggested_price,
            'event_start_date': event_start_date,
            'submission_deadline': submission_deadline,
            'envelope_opening_date': envelope_opening_date,

            'participation_fee': participation_fee,
            'participation_description': participation_description,

            'usage_fee': usage_fee,
            'usage_description': usage_description,
            
            'full_name': full_name,
            'contact': contact,
            'position': position,
            'phone_number': phone_number
        }
        for org_name, org_voen, org_address, event_name, event_number, classification_code,
            suggested_price, event_start_date, submission_deadline, envelope_opening_date,
            participation_fee, participation_description, usage_fee, usage_description,
            full_name, contact, position, phone_number in zip(
                main_data['organization_name'],
                main_data['organization_voen'],
                main_data['organization_address'],
                main_data['event_name'],
                main_data['event_number'],
                main_data['classification_code'],
                main_data['suggested_price'],
                main_data['event_start_date'],
                main_data['submission_deadline'],
                main_data['envelope_opening_date'],

                main_data['participation_fee'],
                main_data['participation_description'],

                main_data['usage_fee'],
                main_data['usage_description'],

                main_data['full_name'],
                main_data['contact'],
                main_data['position'],
                main_data['phone_number']
            )
    ]

    nested_data_records = [
        {
            'heading': heading,
            'disclosure': disclosure,
            'quantity': quantity,
            'measure_unit': measure_unit,
            'code': code
        }
        for heading, disclosure, quantity, measure_unit, code in zip(
            nested_data['heading'],
            nested_data['disclosure'],
            nested_data['quantity'],
            nested_data['measure_unit'],
            nested_data['code']
        )
    ]

    for record in main_data_records:
        insert_stmt = insert(events_table).values(**record)
        session.execute(insert_stmt)

    for record in nested_data_records:
        insert_stmt = insert(events_details_table).values(**record)
        session.execute(insert_stmt)
    
    session.commit()
    session.close()
    
    time.sleep(5)
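
For reference, a sketch of the same inserts issued as one executemany per table (assuming SQLAlchemy 1.4+, where execute() accepts a list of parameter dicts); this avoids one round-trip per record:

    # one multi-row INSERT per table instead of one statement per record
    session.execute(insert(events_table), main_data_records)
    session.execute(insert(events_details_table), nested_data_records)
    session.commit()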


At the moment get_total_items_from_link() returns a value of around 2500. I have tested loadData with different numbers of IDs; the most it handled at an acceptable speed was 1400. Beyond that it starts to run very slowly. Why?