Здравствуйте. Есть сайт
https://etender.gov.az, также есть код который собирает всю информацию о тендере:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
all_data = {
'Organization_name': [],
'Organization_voen': [],
'Organization_adress': [],
'Event_name': [],
'Event_number': [],
'Classification_code': [],
'Suggested_price': [],
'Event_start_date': [],
'Submission_deadline': [],
'Envelope_opening_date': [],
'Heading': [[]],
'Disclosure': [[]],
'Quantity': [[]],
'Measure_unit': [[]],
'Code': [[]],
'Participation_fee': [],
'Participation_description': [],
'Usage_fee': [],
'Usage_description': [],
'Full_name': [],
'Contact': [],
'Position': [],
'Phone_number': []
}
def get_total_items_from_link(api_link='https://etender.gov.az/api/events?EventType=2&PageSize=1&PageNumber=1&EventStatus=1&Keyword=&buyerOrganizationName=&PrivateRfxId=&publishDateFrom=&publishDateTo=&AwardedparticipantName=&AwardedparticipantVoen=&DocumentViewType=') -> int:
response = requests.get(api_link)
if response.status_code == 200:
data = response.json()
return data['totalItems']
print(f'Error: {response.status_code}')
return -1
def get_total_items_from_id(id: int) -> int:
total_items = -1
template = f'https://etender.gov.az/api/events/{id}/bomLines?PageSize=1&PageNumber=1'
response = requests.get(template)
if response.status_code == 200:
data = response.json()
total_items = data['totalItems']
else:
print(f'Error: {response.status_code}')
return total_items
# возвращает айди каждого события в таблице
def get_all_events_ids(page_size: int) -> list:
template = f'https://etender.gov.az/api/events?EventType=2&PageSize={page_size}&PageNumber=1&EventStatus=1&Keyword=&buyerOrganizationName=&PrivateRfxId=&publishDateFrom=&publishDateTo=&AwardedparticipantName=&AwardedparticipantVoen=&DocumentViewType='
response = requests.get(template)
data = response.json()
events_ids = []
for item in data['items']:
events_ids.append(item['eventId'])
return events_ids
# используя фунуцию get_all_events_ids(), возвращает всю информацию о тендере (шаблон ссылки https://etender.gov.az/main/competition/detail/tender_id)
def get_info_from_link(ids: list) -> list[list]:
Organization_name = []
Organization_voen = []
Organization_adress = []
Event_name = []
Event_number = []
Classification_code = []
Suggested_price = []
Event_start_date = []
Submission_deadline = []
Envelope_opening_date = []
driver = webdriver.Chrome()
for id in ids:
link = f'https://etender.gov.az/main/competition/detail/{id}'
driver.get(link)
selectors = [
...
]
for item_list, selector in selectors:
try:
element = WebDriverWait(driver, 4).until(
EC.presence_of_element_located((By.CSS_SELECTOR, selector))
)
item_list.append(element.text)
except NoSuchElementException:
item_list.append("None")
driver.quit()
return [Organization_name, Organization_voen, Organization_adress, Event_name, Event_number, Classification_code, Suggested_price, Event_start_date, Submission_deadline, Envelope_opening_date]
# возвращают данные по оплатам за участие и просмотр, а также их описания
def get_fees(ids: list) -> list[list]:
participation_fee, usage_fee = [], []
for id in ids:
template = f'https://etender.gov.az/api/events/{id}/info'
response = requests.get(template)
if response.status_code == 200:
data = response.json()
participation_fee.append(data['participationFee'] if len(str(data['participationFee'])) > 0 else 'None')
usage_fee.append(data['viewFee'] if len(str(data['viewFee'])) > 0 else 'None')
else:
print(f'Error: {response.status_code}')
participation_fee.append('None')
usage_fee.append('None')
return [participation_fee, usage_fee]
def get_fees_description(ids: list) -> list[list]:
Participation_fee_desc = []
Usage_fee_desc = []
driver = webdriver.Chrome()
for id in ids:
link = f'https://etender.gov.az/main/competition/detail/{id}'
driver.get(link)
selectors = [
...
]
for item_list, selector in selectors:
try:
element = WebDriverWait(driver, 4).until(
EC.presence_of_element_located((By.CSS_SELECTOR, selector))
)
item_list.append(element.text)
except NoSuchElementException:
item_list.append("None")
driver.quit()
return [Participation_fee_desc, Usage_fee_desc]
def get_contact(ids) -> list[list]:
results = []
for id in ids:
template = f'https://etender.gov.az/api/events/{id}/contact-persons'
response = requests.get(template)
if response.status_code == 200:
data_list = response.json()
for data in data_list:
contact_info = [
data.get('fullName', 'None') if data.get('fullName') else 'None',
data.get('contact', 'None') if data.get('contact') else 'None',
data.get('position', 'None') if data.get('position') else 'None',
data.get('phoneNumber', 'None') if data.get('phoneNumber') else 'None'
]
results.append(contact_info)
else:
print(f'Error for ID {id}: {response.status_code}')
results.append(['None', 'None', 'None', 'None'])
return results
def get_all_info_from_table(ids: list) -> list[list]:
results = []
for id in ids:
total_items = get_total_items_from_id(id)
table_link = f'https://etender.gov.az/api/events/{id}/bomLines?PageSize={total_items}&PageNumber=1'
response = requests.get(table_link)
if response.status_code == 200:
data = response.json()
for item in data['items']:
item_info = [
item.get('name', 'None') if item.get('name') else 'None',
item.get('description', 'None') if item.get('description') else 'None',
item.get('quantity', 'None') if item.get('quantity') else 'None',
item.get('unitOfMeasure', 'None') if item.get('unitOfMeasure') else 'None',
item.get('categoryCode', 'None') if item.get('categoryCode') else 'None'
]
results.append([item_info])
else:
print(f'Error for ID {id}: {response.status_code}')
results.append([['None', 'None', 'None', 'None', 'None']])
return results
def fetch_data_parallel(ids):
part1_futures = []
part2_futures = []
part3_futures = []
part4_futures = []
part5_futures = []
with ThreadPoolExecutor(max_workers=5) as executor:
for id in ids:
part1_futures.append(executor.submit(get_info_from_link, [id]))
part2_futures.append(executor.submit(get_all_info_from_table, [id]))
part3_futures.append(executor.submit(get_fees, [id]))
part4_futures.append(executor.submit(get_fees_description, [id]))
part5_futures.append(executor.submit(get_contact, [id]))
...
# FIXME
for future in as_completed(part2_futures):
value = future.result()
for i in range(len(value)):
all_data['Heading'][0].append(value[i][0][0])
all_data['Disclosure'][0].append(value[i][0][1])
all_data['Quantity'][0].append(value[i][0][2])
all_data['Measure_unit'][0].append(value[i][0][3])
all_data['Code'][0].append(value[i][0][4])
...
fetch_data_parallel([315424, 315423, 315422, 315418, 315417])
df = pd.DataFrame(all_data)
print(df)
В этом коде у меня бывают ошибки касательно блока внутри
for future in as_completed(part2_futures):
, вот ошибка:
ValueError: All arrays must be of the same length
В странице тендера есть таблица с которой я собираю данные 'Heading', 'Disclosure', 'Quantity', 'Measure_unit', 'Code' и так как это таблица там их может быть много из-за чего появляется та ошибка. Я пытался исправить это вложенными списками, но что-то не помогло. Можете предложить то, как хранить такого рода данные для дальнейшего превращения всех собранных данных в датафрейм, а позже и в базу данных?