Ответы пользователя по тегу Парсинг
  • Как парсить сайт Metro-CC.ru на python через selenium?

    @SwitcherN Автор вопроса
    Начинаю изучать python
    Проблему решил. Нашёл хорошую статью на habr'е. Главный посыл — старайтесь найти API, если он есть. Это будет самый верный путь к решению проблемы. BeautifulSoup, Selenium — как крайние меры.

    Что касается сайта Metro C&C - перехватил XHR-запрос graph. По нему парсится JSON. Пока - по конкретному магазину, дальше надо будет циклом прогнать интересующие магазины. Отдает JSON на 30 продуктов, но можно в data запроса изменить параметр size. Изменил в своем запросе на 2 000. По разным магазинам от 800 до 1100 элементов отдает - в зависимости от матрицы магазина.
    Плюсом в data можно почистить структуру получаемого JSON'а. Ниже 2 примера.

    Код запроса
    import json
    import requests
    
    # Fetch one category page ("vino") of one Metro C&C store from the
    # products-api GraphQL endpoint and save the JSON response to metro.json.

    # Cookies copied from an intercepted browser request.
    # NOTE(review): values such as metro_api_session look session-specific and
    # will presumably expire; confirm and refresh them from the browser.
    cookies = {
        'metro_api_session': 'TVrehtvJIsAlrrmi73LSZ1td5cF4hHucbGsxIzM5',
        '_ga_VHKD93V3FV': 'GS1.1.1703784480.1.1.1703784576.0.0.0',
        'tmr_lvid': 'c6ca40446fcd9234ed280196bdd2bab2',
        'tmr_lvidTS': '1703784486767',
        '_gcl_au': '1.1.1217202258.1703784484',
        '_ym_visorc': 'b',
        '_ym_d': '1703784483',
        '_ym_isad': '2',
        '_ym_uid': '1703784483244081529',
        'uxs_uid': '740bab20-a5a6-11ee-b876-db689cf62a64',
        'mp_88875cfb7a649ab6e6e310368f37a563_mixpanel': '%7B%22distinct_id%22%3A%20%22%24device%3A18cb1780127150a-0d62ba868fe2ce-3d62684b-16a7f0-18cb1780128150a%22%2C%22%24device_id%22%3A%20%2218cb1780127150a-0d62ba868fe2ce-3d62684b-16a7f0-18cb1780128150a%22%2C%22%24initial_referrer%22%3A%20%22%24direct%22%2C%22%24initial_referring_domain%22%3A%20%22%24direct%22%7D',
        '_ga': 'GA1.1.878068636.1703784481',
        '_slfreq': '633ff97b9a3f3b9e90027740%3A633ffa4c90db8d5cf00d7810%3A1703791681%3B64a81e68255733f276099da5%3A64abaf645c1afe216b0a0d38%3A1703791681',
        'directCrm-session': '%7B%22deviceGuid%22%3A%2297972bdb-a82e-4f8d-a11a-24ff81485078%22%7D',
        'mindboxDeviceUUID': '97972bdb-a82e-4f8d-a11a-24ff81485078',
        '_slfs': '1703784479362',
        '_slid': '658db01f4dc372cd800a100c',
        '_slsession': '2953AFDF-D8E0-4CE4-9DC7-CED951138786',
    }

    # Browser-like request headers.
    # Content-Length is intentionally not listed: requests derives it from the
    # actual body, and a stale hard-coded value only misleads the reader.
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Content-Type': 'application/json',
        'Origin': 'https://online.metro-cc.ru',
        'Accept-Language': 'ru',
        'Host': 'api.metro-cc.ru',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15',
        'Referer': 'https://online.metro-cc.ru/',
        # "br" is not advertised: requests can only decode Brotli responses
        # when an optional brotli package is installed.
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }

    # GraphQL document: category metadata, banners, filters and the product
    # list with icons, packing, stock and price details.
    query = """
    query Query($storeId: Int!, $slug: String!, $attributes:[AttributeFilter], $filters: [FieldFilter], $from: Int!, $size: Int!, $sort: InCategorySort, $in_stock: Boolean, $eshop_order: Boolean, $is_action: Boolean, $price_levels: Boolean) {
      category (storeId: $storeId, slug: $slug, inStock: $in_stock, eshopAvailability: $eshop_order, isPromo: $is_action, priceLevels: $price_levels) {
        id
        name
        slug
        id
        parent_id
        meta {
          description
          h1
          title
          keywords
        }
        disclaimer
        description {
          top
          main
          bottom
        }
        breadcrumbs {
          category_type
          id
          name
          parent_id
          parent_slug
          slug
        }
        promo_banners {
          id
          image
          name
          category_ids
          virtual_ids
          type
          sort_order
          url
          is_target_blank
          analytics {
            name
            category
            brand
            type
            start_date
            end_date
          }
        }
        dynamic_categories(from: 0, size: 9999) {
          slug
          name
          id
          category_type
        }
        filters {
          facets {
            key
            total
            filter {
              id
              name
              display_title
              is_list
              is_main
              text_filter
              is_range
              category_id
              category_name
              values {
                slug
                text
                total
              }
            }
          }
        }
        total
        prices {
          max
          min
        }
        pricesFiltered {
          max
          min
        }
        products(attributeFilters: $attributes, from: $from, size: $size, sort: $sort, fieldFilters: $filters) {
          health_warning
          limited_sale_qty
          id
          slug
          name
          name_highlight
          article
          main_article
          main_article_slug
          is_target
          category_id
          url
          images
          pick_up
          rating
          icons {
            id
            badge_bg_colors
            rkn_icon
            caption
            image
            type
            is_only_for_sales
            stores
            caption_settings {
              colors
              text
            }
            stores
            sort
            image_png
            image_svg
            description
            end_date
            start_date
            status
          }
          manufacturer {
            id
            image
            name
          }
          packing {
            size
            type
            pack_factors {
              instamart
            }
          }
          stocks {
            value
            text
            eshop_availability
            scale
            prices_per_unit {
              old_price
              offline {
                price
                old_price
                type
                offline_discount
                offline_promo
              }
              price
              is_promo
              levels {
                count
                price
              }
              online_levels {
                count
                price
                discount
              }
              discount
            }
            prices {
              price
              is_promo
              old_price
              offline {
                old_price
                price
                type
                offline_discount
                offline_promo
              }
              levels {
                count
                price
              }
              online_levels {
                count
                price
                discount
              }
              discount
            }
          }
        }
      }
    }
    """

    # Values bound to the $variables declared in the query above.
    variables = {
        'storeId': 12,
        'sort': 'default',
        'size': 30,  # number of products requested per call
        'from': 0,
        'filters': [{'field': 'main_article', 'value': '0'}],
        'attributes': [],
        'in_stock': False,
        'eshop_order': False,
        'allStocks': False,
        'slug': 'vino',
    }

    # Serialise with json.dumps instead of hand-writing the JSON text, so the
    # quoting and escaping of the body are always valid.
    data = json.dumps({'query': query, 'variables': variables})

    response = requests.post(
        'https://api.metro-cc.ru/products-api/graph',
        headers=headers,
        cookies=cookies,
        data=data,
        timeout=30,  # without a timeout requests may wait indefinitely
    )
    # Fail on an HTTP error status instead of trying to parse an error page.
    response.raise_for_status()

    # Explicit UTF-8: ensure_ascii=False writes non-ASCII text as-is, which
    # breaks when the platform default encoding is not UTF-8.
    with open('metro.json', 'w', encoding='utf-8') as file:
        json.dump(response.json(), file, indent=4, ensure_ascii=False)


    Оптимизированный код запроса
    import json
    import requests
    
    # Fetch up to 2000 products of the "vino" category for one Metro C&C store
    # from the products-api GraphQL endpoint, requesting only a reduced set of
    # fields, and save the JSON response to metro.json.

    # Cookies copied from an intercepted browser request.
    # NOTE(review): values such as metro_api_session look session-specific and
    # will presumably expire; confirm and refresh them from the browser.
    cookies = {
        'metro_api_session': 'TVrehtvJIsAlrrmi73LSZ1td5cF4hHucbGsxIzM5',
        '_ga_VHKD93V3FV': 'GS1.1.1703784480.1.1.1703784576.0.0.0',
        'tmr_lvid': 'c6ca40446fcd9234ed280196bdd2bab2',
        'tmr_lvidTS': '1703784486767',
        '_gcl_au': '1.1.1217202258.1703784484',
        '_ym_visorc': 'b',
        '_ym_d': '1703784483',
        '_ym_isad': '2',
        '_ym_uid': '1703784483244081529',
        'uxs_uid': '740bab20-a5a6-11ee-b876-db689cf62a64',
        'mp_88875cfb7a649ab6e6e310368f37a563_mixpanel': '%7B%22distinct_id%22%3A%20%22%24device%3A18cb1780127150a-0d62ba868fe2ce-3d62684b-16a7f0-18cb1780128150a%22%2C%22%24device_id%22%3A%20%2218cb1780127150a-0d62ba868fe2ce-3d62684b-16a7f0-18cb1780128150a%22%2C%22%24initial_referrer%22%3A%20%22%24direct%22%2C%22%24initial_referring_domain%22%3A%20%22%24direct%22%7D',
        '_ga': 'GA1.1.878068636.1703784481',
        '_slfreq': '633ff97b9a3f3b9e90027740%3A633ffa4c90db8d5cf00d7810%3A1703791681%3B64a81e68255733f276099da5%3A64abaf645c1afe216b0a0d38%3A1703791681',
        'directCrm-session': '%7B%22deviceGuid%22%3A%2297972bdb-a82e-4f8d-a11a-24ff81485078%22%7D',
        'mindboxDeviceUUID': '97972bdb-a82e-4f8d-a11a-24ff81485078',
        '_slfs': '1703784479362',
        '_slid': '658db01f4dc372cd800a100c',
        '_slsession': '2953AFDF-D8E0-4CE4-9DC7-CED951138786',
    }

    # Browser-like request headers.
    # Content-Length is intentionally not listed: requests derives it from the
    # actual body, and a stale hard-coded value only misleads the reader.
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Content-Type': 'application/json',
        'Origin': 'https://online.metro-cc.ru',
        'Accept-Language': 'ru',
        'Host': 'api.metro-cc.ru',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15',
        'Referer': 'https://online.metro-cc.ru/',
        # "br" is not advertised: requests can only decode Brotli responses
        # when an optional brotli package is installed.
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }

    shop_id = "26"

    # GraphQL document trimmed to the category identity and the product list
    # with manufacturer, packing, stock and per-unit price fields.
    query = """
    query Query($storeId: Int!, $slug: String!, $attributes:[AttributeFilter], $filters: [FieldFilter], $from: Int!, $size: Int!, $sort: InCategorySort, $in_stock: Boolean, $eshop_order: Boolean, $is_action: Boolean, $price_levels: Boolean) {
      category (storeId: $storeId, slug: $slug, inStock: $in_stock, eshopAvailability: $eshop_order, isPromo: $is_action, priceLevels: $price_levels) {
        id
        name
        slug
        id
        parent_id
        products(attributeFilters: $attributes, from: $from, size: $size, sort: $sort, fieldFilters: $filters) {
          id
          slug
          name
          name_highlight
          article
          main_article
          main_article_slug
          is_target
          category_id
          url
          images
          pick_up
          rating
          manufacturer {
            id
            image
            name
          }
          packing {
            size
            type
            pack_factors {
              instamart
            }
          }
          stocks {
            value
            text
            eshop_availability
            scale
            prices_per_unit {
              old_price
              price
              is_promo
              discount
            }
          }
        }
      }
    }
    """

    # Values bound to the $variables declared in the query above.
    variables = {
        # $storeId is declared Int!, so the store id is sent as a JSON number.
        'storeId': int(shop_id),
        'sort': 'default',
        'size': 2000,  # number of products requested per call
        'from': 0,
        'filters': [{'field': 'main_article', 'value': '0'}],
        'attributes': [],
        'in_stock': False,
        'eshop_order': False,
        'allStocks': False,
        'slug': 'vino',
    }

    # Serialise with json.dumps instead of an f-string template, so no brace
    # escaping is needed and the body is always valid JSON.
    data = json.dumps({'query': query, 'variables': variables})

    response = requests.post(
        'https://api.metro-cc.ru/products-api/graph',
        headers=headers,
        cookies=cookies,
        data=data,
        timeout=30,  # without a timeout requests may wait indefinitely
    )
    # Fail on an HTTP error status instead of trying to parse an error page.
    response.raise_for_status()

    # Explicit UTF-8: ensure_ascii=False writes non-ASCII text as-is, which
    # breaks when the platform default encoding is not UTF-8.
    with open('metro.json', 'w', encoding='utf-8') as file:
        json.dump(response.json(), file, indent=4, ensure_ascii=False)
    Ответ написан
    Комментировать