I have two scripts: the first scrapes the data and the second loads it into a MySQL database.
scrapper:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
main_data = {
'Organization_name': [],
'Organization_voen': [],
'Organization_adress': [],
'Event_name': [],
'Event_number': [],
'Classification_code': [],
'Suggested_price': [],
'Event_start_date': [],
'Submission_deadline': [],
'Envelope_opening_date': [],
'Participation_fee': [],
'Participation_description': [],
'Usage_fee': [],
'Usage_description': [],
'Full_name': [],
'Contact': [],
'Position': [],
'Phone_number': []
}
nested_data = {
'Heading': [],
'Disclosure': [],
'Quantity': [],
'Measure_unit': [],
'Code': []
}
def get_total_items_from_link(api_link='https://etender.gov.az/api/events?EventType=2&PageSize=1&PageNumber=1&EventStatus=1&Keyword=&buyerOrganizationName=&PrivateRfxId=&publishDateFrom=&publishDateTo=&AwardedparticipantName=&AwardedparticipantVoen=&DocumentViewType='):
response = requests.get(api_link)
if response.status_code == 200:
data = response.json()
return data['totalItems']
return -1
def get_total_items_from_id(id):
total_items = -1
template = f'https://etender.gov.az/api/events/{id}/bomLines?PageSize=1&PageNumber=1'
response = requests.get(template)
if response.status_code == 200:
data = response.json()
total_items = data['totalItems']
return total_items
def get_all_events_ids(page_size):
template = f'https://etender.gov.az/api/events?EventType=2&PageSize={page_size}&PageNumber=1&EventStatus=1&Keyword=&buyerOrganizationName=&PrivateRfxId=&publishDateFrom=&publishDateTo=&AwardedparticipantName=&AwardedparticipantVoen=&DocumentViewType='
response = requests.get(template)
data = response.json()
events_ids = []
for item in data['items']:
events_ids.append(item['eventId'])
return events_ids
def get_info_from_link(ids: list):
driver = webdriver.Chrome()
for id in ids:
link = f'https://etender.gov.az/main/competition/detail/{id}'
driver.get(link)
selectors = [
...
]
for item_list, selector in selectors:
try:
element = WebDriverWait(driver, 10).until(
EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
)
item_list.append(str(element.text))
            except (NoSuchElementException, StaleElementReferenceException, TimeoutException):
item_list.append("None")
driver.quit()
def get_fees(ids: list):
for id in ids:
template = f'https://etender.gov.az/api/events/{id}/info'
response = requests.get(template)
if response.status_code == 200:
data = response.json()
main_data['Participation_fee'].append(str(data['participationFee']) if len(str(data['participationFee'])) > 0 else 'None')
main_data['Usage_fee'].append(str(data['viewFee']) if len(str(data['viewFee'])) > 0 else 'None')
else:
main_data['Participation_fee'].append('None')
main_data['Usage_fee'].append('None')
def get_fees_description(ids: list):
driver = webdriver.Chrome()
for id in ids:
link = f'https://etender.gov.az/main/competition/detail/{id}'
driver.get(link)
selectors = [
...
]
for item_list, selector in selectors:
try:
element = WebDriverWait(driver, 4).until(
EC.presence_of_element_located((By.CSS_SELECTOR, selector))
)
item_list.append(str(element.text))
            except (NoSuchElementException, TimeoutException):
item_list.append("None")
driver.quit()
def get_contact(ids):
for id in ids:
template = f'https://etender.gov.az/api/events/{id}/contact-persons'
response = requests.get(template)
if response.status_code == 200:
data_list = response.json()
for data in data_list:
main_data['Full_name'].append(data.get('fullName', 'None') if data.get('fullName') else 'None')
main_data['Contact'].append(data.get('contact', 'None') if data.get('contact') else 'None')
main_data['Position'].append(data.get('position', 'None') if data.get('position') else 'None')
main_data['Phone_number'].append(data.get('phoneNumber', 'None') if data.get('phoneNumber') else 'None')
else:
main_data['Full_name'].append('None')
main_data['Contact'].append('None')
main_data['Position'].append('None')
main_data['Phone_number'].append('None')
def get_all_info_from_table(ids):
for id in ids:
heading, disclosure, quantity, measure, code = [], [], [], [], []
total_items = get_total_items_from_id(id)
table_link = f'https://etender.gov.az/api/events/{id}/bomLines?PageSize={total_items}&PageNumber=1'
response = requests.get(table_link)
if response.status_code == 200:
data = response.json()
for item in data['items']:
heading.append(str(item.get('name', 'None')) if item.get('name') else 'None')
disclosure.append(str(item.get('description', 'None')) if item.get('description') else 'None')
quantity.append(str(item.get('quantity', 'None')) if item.get('quantity') else 'None')
measure.append(str(item.get('unitOfMeasure', 'None')) if item.get('unitOfMeasure') else 'None')
code.append(str(item.get('categoryCode', 'None')) if item.get('categoryCode') else 'None')
else:
heading.append('None')
disclosure.append('None')
quantity.append('None')
measure.append('None')
code.append('None')
nested_data['Heading'].append(heading)
nested_data['Disclosure'].append(disclosure)
nested_data['Quantity'].append(quantity)
nested_data['Measure_unit'].append(measure)
nested_data['Code'].append(code)
def fetch_data(ids):
with ThreadPoolExecutor() as executor:
futures = [
executor.submit(get_info_from_link, ids),
executor.submit(get_all_info_from_table, ids),
executor.submit(get_fees, ids),
executor.submit(get_fees_description, ids),
executor.submit(get_contact, ids)
]
for future in as_completed(futures):
future.result()
dataLoader:
from sqlalchemy import create_engine
from scrapper import *
from config import *
page_size = get_total_items_from_link()
ids = get_all_events_ids(10)
fetch_data(ids)
main_df = pd.DataFrame(main_data)
#nested_df = pd.DataFrame(nested_data)
engine = create_engine(f'mysql+pymysql://{user}:{password}@{host}:{port}/{db_name}')
main_df.to_sql('main_table', con=engine, if_exists='append', index=False)
print("Данные успешно загружены в таблицу main_table.")
The problem is that after the load I look in MySQL Workbench and see that part of the data is simply missing. I also inspected the main_df DataFrame in dataLoader, and it looks exactly the same:
0 BALAKƏN SU MELİORASİYA SİSTEMLƏRİNİN İSTİSMARI... 3800057561 ... Baş mühasib +994507512670
1 BALAKƏN SU MELİORASİYA SİSTEMLƏRİNİN İSTİSMARI... 3800057561 ... Baş mühasib +994507512670
2 ... Baş mühasib +994507512670
3 ... Baş mühasib +994507512670
4 ... Mütəxəssis 012-505-62-31
5 ... Satınalmaların təşkili və müqavilələrin hazırl... +994 12 377 0770 (dax.601)
6 ... Mütəxəssis 012-505-62-31
7 ... Mütəxəssis 012-505-62-31
8 0201598951 ... Şöbə müdiri (+99436) 544 18 60
9 ... Şöbə müdiri (+994) 012 5998780, (daxili 2950)
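Before building the DataFrame, I can at least check that every column list in main_data came out the same length (a minimal sanity check against the dicts from scrapper):

# Print the number of values collected for each column;
# pd.DataFrame(main_data) requires all lists to be equally long.
for column, values in main_data.items():
    print(column, len(values))

If the lengths differ, pd.DataFrame(main_data) raises a ValueError; if they match but some function appended a different number of rows per event, the columns end up silently shifted against each other.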
I assume the problem is somewhere in scrapper. How can I fix it?
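For reference, a quick way to check whether get_contact contributes a different number of rows per event is to count what the contact-persons endpoint returns for a single event. This is only a sketch: it reuses the same API URL as get_contact and assumes the response is the same JSON list that get_contact iterates over; event_id is any id from get_all_events_ids.

import requests

event_id = ids[0]  # any id returned by get_all_events_ids
response = requests.get(f'https://etender.gov.az/api/events/{event_id}/contact-persons')
if response.status_code == 200:
    # get_contact appends one row per entry in this list,
    # while the other collectors append exactly one row per event
    print(len(response.json()))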