@gndkg

Почему в БД отсутствует часть данных?

Здравствуйте! У меня есть два скрипта: первый парсит данные, а второй загружает их в БД MySQL.
scrapper:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

# Column name -> list of scraped values, one list per column.
# A comprehension is used so that every column owns a separate empty list.
main_data = {
    column: []
    for column in (
        # Organization and event details
        'Organization_name',
        'Organization_voen',
        'Organization_adress',
        'Event_name',
        'Event_number',
        'Classification_code',
        'Suggested_price',
        'Event_start_date',
        'Submission_deadline',
        'Envelope_opening_date',
        # Participation fee
        'Participation_fee',
        'Participation_description',
        # Usage fee
        'Usage_fee',
        'Usage_description',
        # Contact person
        'Full_name',
        'Contact',
        'Position',
        'Phone_number',
    )
}

# Column name -> list of per-event value lists for the line-item table.
# Built with a comprehension so each column owns a separate empty list.
nested_data = {
    column: []
    for column in ('Heading', 'Disclosure', 'Quantity', 'Measure_unit', 'Code')
}

def get_total_items_from_link(api_link='https://etender.gov.az/api/events?EventType=2&PageSize=1&PageNumber=1&EventStatus=1&Keyword=&buyerOrganizationName=&PrivateRfxId=&publishDateFrom=&publishDateTo=&AwardedparticipantName=&AwardedparticipantVoen=&DocumentViewType='):
    """Return the ``totalItems`` value reported by an events API endpoint.

    Args:
        api_link: URL to query with a GET request.

    Returns:
        The ``totalItems`` field of the JSON body when the server answers
        with HTTP 200, otherwise ``-1``.
    """
    reply = requests.get(api_link)

    # Guard clause: anything other than HTTP 200 is reported as -1.
    if reply.status_code != 200:
        return -1

    return reply.json()['totalItems']

def get_total_items_from_id(id):
    """Return the number of line items (``bomLines``) reported for an event.

    Args:
        id: Event id interpolated into the ``bomLines`` API URL.

    Returns:
        The ``totalItems`` field of the JSON body when the server answers
        with HTTP 200, otherwise ``-1``.
    """
    reply = requests.get(
        f'https://etender.gov.az/api/events/{id}/bomLines?PageSize=1&PageNumber=1'
    )

    return reply.json()['totalItems'] if reply.status_code == 200 else -1

def get_all_events_ids(page_size):
    """Return the ``eventId`` of every item on the first page of the events API.

    Args:
        page_size: Value sent as the ``PageSize`` query parameter.

    Returns:
        The ``eventId`` values in the order the API returned them, or an
        empty list when the server does not answer with HTTP 200.
    """
    template = f'https://etender.gov.az/api/events?EventType=2&PageSize={page_size}&PageNumber=1&EventStatus=1&Keyword=&buyerOrganizationName=&PrivateRfxId=&publishDateFrom=&publishDateTo=&AwardedparticipantName=&AwardedparticipantVoen=&DocumentViewType='

    response = requests.get(template)

    # Check the status before decoding, like the other API helpers in this
    # module do; an error response would otherwise fail inside .json() or on
    # the missing 'items' key.
    if response.status_code != 200:
        return []

    return [item['eventId'] for item in response.json()['items']]

def get_info_from_link(ids: list):
    """Scrape the detail page of every event in ``ids`` with Selenium.

    For each id the page
    ``https://etender.gov.az/main/competition/detail/{id}`` is opened and, for
    every ``(item_list, selector)`` pair, the text of the element matching the
    CSS selector is appended to ``item_list``. If the element does not become
    visible within 10 seconds (or cannot be read), ``"None"`` is appended
    instead, so each pair always contributes exactly one entry per id.

    NOTE(review): an element may become visible before its text is filled in;
    in that case an empty string is stored — confirm against the live page.

    Args:
        ids: Event ids to visit.
    """
    # Imported locally: the module-level selenium imports do not include it.
    from selenium.common.exceptions import TimeoutException

    driver = webdriver.Chrome()

    try:
        for event_id in ids:
            link = f'https://etender.gov.az/main/competition/detail/{event_id}'
            driver.get(link)

            # (target list, CSS selector) pairs.
            selectors = [
                ...
            ]

            for item_list, selector in selectors:
                try:
                    element = WebDriverWait(driver, 10).until(
                        EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
                    )
                    item_list.append(str(element.text))
                # WebDriverWait.until() signals an unmet condition with
                # TimeoutException; without catching it the placeholder is
                # never written and the column lists get out of step.
                except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
                    item_list.append("None")
    finally:
        # Always close the browser, even when a page or selector fails.
        driver.quit()

def get_fees(ids: list):
    """Append the participation and view fee of every event to ``main_data``.

    For each id the ``/info`` API endpoint is queried. On HTTP 200 the
    ``participationFee`` and ``viewFee`` fields are stored as strings
    (``'None'`` when the string form is empty); on any other status both
    columns receive ``'None'``.

    Args:
        ids: Event ids to query.
    """
    # main_data column -> field name in the API response.
    fee_columns = (
        ('Participation_fee', 'participationFee'),
        ('Usage_fee', 'viewFee'),
    )

    for event_id in ids:
        reply = requests.get(f'https://etender.gov.az/api/events/{event_id}/info')

        if reply.status_code != 200:
            for column, _ in fee_columns:
                main_data[column].append('None')
            continue

        payload = reply.json()
        for column, api_field in fee_columns:
            text = str(payload[api_field])
            main_data[column].append(text or 'None')

def get_fees_description(ids: list):
    """Scrape the fee descriptions of every event in ``ids`` with Selenium.

    For each id the page
    ``https://etender.gov.az/main/competition/detail/{id}`` is opened and, for
    every ``(item_list, selector)`` pair, the text of the element matching the
    CSS selector is appended to ``item_list``. If the element is not present
    in the DOM within 4 seconds, ``"None"`` is appended instead, so each pair
    always contributes exactly one entry per id.

    NOTE(review): the wait only requires the element to be present, not
    visible or populated, so an empty string may be stored — confirm against
    the live page.

    Args:
        ids: Event ids to visit.
    """
    # Imported locally: the module-level selenium imports do not include it.
    from selenium.common.exceptions import TimeoutException

    driver = webdriver.Chrome()

    try:
        for event_id in ids:
            link = f'https://etender.gov.az/main/competition/detail/{event_id}'
            driver.get(link)

            # (target list, CSS selector) pairs.
            selectors = [
                ...
            ]

            for item_list, selector in selectors:
                try:
                    element = WebDriverWait(driver, 4).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                    )
                    item_list.append(str(element.text))
                # WebDriverWait.until() signals an unmet condition with
                # TimeoutException; without catching it the placeholder is
                # never written and the column lists get out of step.
                except (TimeoutException, NoSuchElementException):
                    item_list.append("None")
    finally:
        # Always close the browser, even when a page or selector fails.
        driver.quit()

def get_contact(ids):
    """Append the contact-person details of every event to ``main_data``.

    For each id the ``/contact-persons`` API endpoint is queried and exactly
    one entry is appended to each of the ``Full_name``, ``Contact``,
    ``Position`` and ``Phone_number`` columns, so these columns stay the same
    length as the other per-event columns:

    * one contact person: the field value as a string, or ``'None'`` when the
      field is missing or empty;
    * several contact persons: their values joined with ``'; '``;
    * no contact persons, or a non-200 response: ``'None'``.

    Args:
        ids: Event ids to query.
    """
    # Field name in the API response -> main_data column.
    columns = (
        ('fullName', 'Full_name'),
        ('contact', 'Contact'),
        ('position', 'Position'),
        ('phoneNumber', 'Phone_number'),
    )

    for event_id in ids:
        response = requests.get(
            f'https://etender.gov.az/api/events/{event_id}/contact-persons'
        )
        persons = response.json() if response.status_code == 200 else []

        for api_field, column in columns:
            values = [str(person.get(api_field) or 'None') for person in persons]
            # One entry per event regardless of how many persons were returned.
            main_data[column].append('; '.join(values) if values else 'None')

def get_all_info_from_table(ids):
    """Append the line items (``bomLines``) of every event to ``nested_data``.

    For each id the number of line items is looked up with
    ``get_total_items_from_id`` and all of them are requested in one page.
    Each ``nested_data`` column then receives one list per event holding the
    string value of the corresponding field for every line item (``'None'``
    for missing or falsy values). When the item count lookup fails or the
    request does not return HTTP 200, each column receives ``['None']``.

    Args:
        ids: Event ids to query.
    """
    # Field name in the API response -> nested_data column.
    columns = (
        ('name', 'Heading'),
        ('description', 'Disclosure'),
        ('quantity', 'Quantity'),
        ('unitOfMeasure', 'Measure_unit'),
        ('categoryCode', 'Code'),
    )

    for event_id in ids:
        total_items = get_total_items_from_id(event_id)

        items = None
        # -1 is the helper's failure marker; sending it on as PageSize=-1
        # would be a malformed request, so go straight to the placeholder row.
        if total_items != -1:
            response = requests.get(
                f'https://etender.gov.az/api/events/{event_id}/bomLines?PageSize={total_items}&PageNumber=1'
            )
            if response.status_code == 200:
                items = response.json()['items']

        for api_field, column in columns:
            if items is None:
                cells = ['None']
            else:
                cells = [str(item.get(api_field) or 'None') for item in items]
            nested_data[column].append(cells)

def fetch_data(ids):
    """Run all five scraping routines for ``ids`` in a thread pool.

    Each routine is submitted once with the full id list. The call returns
    after every routine has finished; an exception raised inside a routine is
    re-raised here through ``Future.result()``.

    Args:
        ids: Event ids passed unchanged to every routine.
    """
    workers = (
        get_info_from_link,
        get_all_info_from_table,
        get_fees,
        get_fees_description,
        get_contact,
    )

    with ThreadPoolExecutor() as pool:
        pending = [pool.submit(worker, ids) for worker in workers]

        for finished in as_completed(pending):
            finished.result()


dataLoader:
from sqlalchemy import create_engine

from scrapper import *
from config import *

# Loader script: scrape the events, build a DataFrame and append it to MySQL.

# Total number of events reported by the API (-1 if the request failed).
# NOTE(review): page_size is not used below — the next call asks for a fixed
# 10 events; confirm whether that limit is intentional.
page_size = get_total_items_from_link()
ids = get_all_events_ids(10)
# Fills the module-level main_data / nested_data dicts imported from scrapper.
fetch_data(ids)

# pandas requires every column list in main_data to have the same length.
main_df = pd.DataFrame(main_data)
# The nested (line item) data is currently not loaded:
#nested_df = pd.DataFrame(nested_data)

# user, password, host, port and db_name are presumably provided by config
# via the star import — verify.
engine = create_engine(f'mysql+pymysql://{user}:{password}@{host}:{port}/{db_name}')

# Append the rows to main_table without writing the DataFrame index.
main_df.to_sql('main_table', con=engine, if_exists='append', index=False)
print("Данные успешно загружены в таблицу main_table.")


Проблема в том, что после загрузки данных я смотрю в MySQL Workbench и вижу, что части данных просто нет.
66e374c9bf585210410618.png

Также в коде dataLoader я проверил датафрейм main_df, и он выглядит точно так же:
0  BALAKƏN SU MELİORASİYA SİSTEMLƏRİNİN İSTİSMARI...        3800057561  ...                                        Baş mühasib                      +994507512670
1  BALAKƏN SU MELİORASİYA SİSTEMLƏRİNİN İSTİSMARI...        3800057561  ...                                        Baş mühasib                      +994507512670
2                                                                       ...                                        Baş mühasib                      +994507512670
3                                                                       ...                                        Baş mühasib                      +994507512670
4                                                                       ...                                         Mütəxəssis                      012-505-62-31
5                                                                       ...  Satınalmaların təşkili və müqavilələrin hazırl...         +994 12 377 0770 (dax.601)
6                                                                       ...                                         Mütəxəssis                      012-505-62-31
7                                                                       ...                                         Mütəxəssis                      012-505-62-31
8                                                           0201598951  ...                                        Şöbə müdiri                 (+99436) 544 18 60
9                                                                       ...                                        Şöbə müdiri  (+994) 012 5998780, (daxili 2950)


Я полагаю, что проблема в коде scrapper-а. Как мне решить эту проблему?
  • Вопрос задан
  • 109 просмотров
Пригласить эксперта
Ваш ответ на вопрос

Войдите, чтобы написать ответ

Войти через центр авторизации
Похожие вопросы