Здравствуйте, есть код для парсинга
scrapper.py:
...
main_data = {
'organization_name': [],
'organization_voen': [],
'organization_address': [],
'event_name': [],
'event_number': [],
'classification_code': [],
'suggested_price': [],
'event_start_date': [],
'submission_deadline': [],
'envelope_opening_date': [],
'participation_fee': [],
'participation_description': [],
'usage_fee': [],
'usage_description': [],
'full_name': [],
'contact': [],
'position': [],
'phone_number': []
}
nested_data = {
'heading': [],
'disclosure': [],
'quantity': [],
'measure_unit': [],
'code': []
}
def get_total_items_from_link(api_link='https://etender.gov.az/api/events?EventType=2&PageSize=1&PageNumber=1&EventStatus=1&Keyword=&buyerOrganizationName=&PrivateRfxId=&publishDateFrom=&publishDateTo=&AwardedparticipantName=&AwardedparticipantVoen=&DocumentViewType='):
try:
response = requests.get(api_link, timeout=20)
if response.status_code == 200:
data = response.json()
return data['totalItems']
except requests.Timeout:
return -1
return -1
def get_total_items_from_id(id):
...
def get_all_events_ids(page_size):
template = f'https://etender.gov.az/api/events?EventType=2&PageSize={page_size}&PageNumber=1&EventStatus=1&Keyword=&buyerOrganizationName=&PrivateRfxId=&publishDateFrom=&publishDateTo=&AwardedparticipantName=&AwardedparticipantVoen=&DocumentViewType='
try:
response = requests.get(template, timeout=20)
data = response.json()
events_ids = [item['eventId'] for item in data['items']]
return sorted(events_ids)
except requests.Timeout:
return []
def get_info_from_link(ids):
cnt = 0
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--disable-images')
driver = webdriver.Chrome(options=options)
for id in ids:
link = f'https://etender.gov.az/main/competition/detail/{id}'
driver.get(link)
time.sleep(3)
selectors = [
...
]
for item_list, selector in selectors:
try:
element = WebDriverWait(driver, 60).until(
EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
)
item_list.append(str(element.text))
except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
item_list.append("None")
try:
participation_description_selector = 'body > app-root > app-competition-detail > main > section.section__3 > div > div > div:nth-child(1) > div > div.content'
element = WebDriverWait(driver, 60).until(
EC.presence_of_element_located((By.CSS_SELECTOR, participation_description_selector))
)
main_data['participation_description'].append(str(element.text))
except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
main_data['participation_description'].append("None")
try:
usage_description_selector = 'body > app-root > app-competition-detail > main > section.section__3 > div > div > div:nth-child(2) > div > div.content'
element = WebDriverWait(driver, 60).until(
EC.presence_of_element_located((By.CSS_SELECTOR, usage_description_selector))
)
main_data['usage_description'].append(str(element.text))
except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
main_data['usage_description'].append("None")
cnt += 1
print(f"[LOG] tender number {cnt} has done")
driver.quit()
time.sleep(3)
def get_fees(ids: list):
...
def get_contact(ids):
...
def get_all_info_from_table(ids):
for id in ids:
heading, disclosure, quantity, measure, code = [], [], [], [], []
total_items = get_total_items_from_id(id)
table_link = f'https://etender.gov.az/api/events/{id}/bomLines?PageSize={total_items}&PageNumber=1'
try:
response = requests.get(table_link, timeout=10)
if response.status_code == 200:
data = response.json()
for item in data['items']:
heading.append(str(item.get('name', 'None')) if item.get('name') else 'None')
disclosure.append(str(item.get('description', 'None')) if item.get('description') else 'None')
quantity.append(str(item.get('quantity', 'None')) if item.get('quantity') else 'None')
measure.append(str(item.get('unitOfMeasure', 'None')) if item.get('unitOfMeasure') else 'None')
code.append(str(item.get('categorycode', 'None')) if item.get('categorycode') else 'None')
except requests.Timeout:
...
nested_data['heading'].append("\n".join(heading))
nested_data['disclosure'].append("\n".join(disclosure))
nested_data['quantity'].append("\n".join(quantity))
nested_data['measure_unit'].append("\n".join(measure))
nested_data['code'].append("\n".join(code))
def fetch_data(ids):
with ThreadPoolExecutor(max_workers=16) as executor:
futures = [
...
]
for future in as_completed(futures):
future.result()
time.sleep(1)
и для загрузки полученных данных в бд MySQL
loadData.py:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy import Table, MetaData, insert
import time
from scrapper import *
from config import *
engine = create_engine(f'mysql+pymysql://{user}:{password}@{host}:{port}/{db_name}')
Session = sessionmaker(bind=engine)
metadata = MetaData()
events_table = Table('events', metadata, autoload_with=engine)
events_details_table = Table('events_details', metadata, autoload_with=engine)
ids = get_all_events_ids(get_total_items_from_link())
batch_size = 100
for i in range(0, len(ids), batch_size):
package = ids[i:i + batch_size]
fetch_data(package)
session = Session()
main_data_records = [
{
'organization_name': org_name,
'organization_voen': org_voen,
'organization_address': org_address,
'event_name': event_name,
'event_number': event_number,
'classification_code': classification_code,
'suggested_price': suggested_price,
'event_start_date': event_start_date,
'submission_deadline': submission_deadline,
'envelope_opening_date': envelope_opening_date,
'participation_fee': participation_fee,
'participation_description': participation_description,
'usage_fee': usage_fee,
'usage_description': usage_description,
'full_name': full_name,
'contact': contact,
'position': position,
'phone_number': phone_number
}
for org_name, org_voen, org_address, event_name, event_number, classification_code,
suggested_price, event_start_date, submission_deadline, envelope_opening_date,
participation_fee, participation_description, usage_fee, usage_description,
full_name, contact, position, phone_number in zip(
main_data['organization_name'],
main_data['organization_voen'],
main_data['organization_address'],
main_data['event_name'],
main_data['event_number'],
main_data['classification_code'],
main_data['suggested_price'],
main_data['event_start_date'],
main_data['submission_deadline'],
main_data['envelope_opening_date'],
main_data['participation_fee'],
main_data['participation_description'],
main_data['usage_fee'],
main_data['usage_description'],
main_data['full_name'],
main_data['contact'],
main_data['position'],
main_data['phone_number']
)
]
nested_data_records = [
{
'heading': heading,
'disclosure': disclosure,
'quantity': quantity,
'measure_unit': measure_unit,
'code': code
}
for heading, disclosure, quantity, measure_unit, code in zip(
nested_data['heading'],
nested_data['disclosure'],
nested_data['quantity'],
nested_data['measure_unit'],
nested_data['code']
)
]
for record in main_data_records:
insert_stmt = insert(events_table).values(**record)
session.execute(insert_stmt)
for record in nested_data_records:
insert_stmt = insert(events_details_table).values(**record)
session.execute(insert_stmt)
session.commit()
session.close()
time.sleep(5)
На данный момент, функцией
get_total_items_from_link()
будет возвращатся значение около 2500. Я тестировал
loadData с разным количеством айдишников, максимум было 1400(когда работало с приемлемой скоростью). После этого начинает очень медленно работать. Почему?