There is a parser built on Scrapy (offers_spider.py and pipelines.py). pipelines.py saves the scraped data into the products_id folder. The problem is that when you send two or more identical links for parsing, it creates only one id.json file for the duplicates. When duplicates occur, I want to get id(1).json, id(2).json instead, where id = product_id.
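As a minimal sketch of that naming scheme (the helper name next_free_path and its arguments are my assumptions, not part of the project), a function that probes for the first free file name could look like this:

import os

def next_free_path(directory: str, product_id: str) -> str:
    # Hypothetical helper: returns "<directory>/<id>.json" if that name is free,
    # otherwise "<directory>/<id>(1).json", "<directory>/<id>(2).json", and so on.
    path = os.path.join(directory, f"{product_id}.json")
    counter = 1
    while os.path.exists(path):
        path = os.path.join(directory, f"{product_id}({counter}).json")
        counter += 1
    return path

A sketch of how this could be wired into close_spider follows the pipelines.py listing below.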
Code of offers_spider.py:
import glob
import json
import os
import re

import scrapy

from datascraper.items import OfferItem


def url_offer(model_id: str) -> str:
    # Card endpoint; model_id is a single id or several ids joined with ';'.
    return f'https://card.wb.ru/cards/detail?appType=1&curr=rub&dest=-1257786&' \
           f'regions=80,38,83,4,64,33,68,70,30,40,86,75,69,1,66,110,48,22,31,71,114&' \
           f'spp=0&nm={model_id}'


def url_offer_analog(model_id: int) -> str:
    return f'https://identical-products.wildberries.ru/api/v1/identical?nmID={model_id}'


def model_id_from_url_with_nmID(url: str) -> str:
    return re.search(r'nmID=(\d+)', url).group(1)


def model_id_from_url_with_nm(url: str) -> str:
    # Extracts the first id from the 'nm=id1;id2;...' query parameter.
    return re.search(r"nm=(\d+);", url).group(1)


class OffersSpider(scrapy.Spider):
    name = 'offers'

    def __init__(self, message_dir='', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.message_dir = message_dir
        self.file_name = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../', 'messages'))

    def start_requests(self):
        # Collect queued links from the message files and request analogs for each id.
        model_ids = []
        message_files = glob.glob(os.path.join(self.file_name, "messages_queue_*.json"))
        for message_file in message_files:
            with open(message_file, "r") as file:
                for line in file:
                    model_ids.append(line.strip())
        for model_id in model_ids:
            model_id = int(re.search(r"/(\d+)/", model_id).group(1))
            yield scrapy.Request(url=url_offer_analog(model_id),
                                 callback=self.offer_analog_parse,
                                 dont_filter=True)

    def offer_analog_parse(self, response):
        # Prepend the original id to its analog ids and join them for the card URL.
        model_id = response.json()
        model_id.insert(0, model_id_from_url_with_nmID(response.request.url))
        model_id = [str(elem) for elem in model_id]
        model_id = ';'.join(model_id) if len(model_id) > 1 else model_id[0] + ';'
        yield response.follow(url_offer(model_id), callback=self.offer_parse)

    def offer_parse(self, response):
        wildberries_data = response.json()['data']['products']
        for wildberries_item in wildberries_data:
            item = OfferItem(product_id=model_id_from_url_with_nm(response.request.url),
                             name=wildberries_item['name'],
                             price=wildberries_item['priceU'] // 100,
                             sale_price=wildberries_item['salePriceU'] // 100)
            yield item
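For reference, the id string assembled in offer_analog_parse and parsed back in offer_parse round-trips like this (a standalone illustration; the ids are made up):

ids = ['100500', '200600', '300700']  # made-up example ids
nm = ';'.join(ids) if len(ids) > 1 else ids[0] + ';'
url = url_offer(nm)  # ends with ...&spp=0&nm=100500;200600;300700
assert model_id_from_url_with_nm(url) == '100500'  # the first id becomes product_id

This is also why every product returned for the joined request is stored under the first id: offer_parse always extracts the id before the first ';'.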
Code of pipelines.py:
import json

from datascraper.items import OfferItem

"""
Shape of the accumulated data:
offers: [{"title": "test", "price": 2, "sale_price": 1}, {"title": "test", "price": 4, "sale_price": 3}]
"offer": [79664, 47990, 69990, 79664]
"""


class OffersPipeline:
    # Note: a class-level dict is shared by all instances of this pipeline.
    product_item = {}

    def process_item(self, item, spider):
        if isinstance(item, OfferItem):
            # Offers are grouped by product_id, so duplicate links are merged
            # into one entry here; this is why only a single id.json appears.
            if item['product_id'] not in self.product_item:
                self.product_item[item['product_id']] = [
                    {"title": item['name'],
                     "price": item['price'],
                     "sale_price": item['sale_price']}
                ]
            else:
                self.product_item[item['product_id']].append({"title": item['name'],
                                                              "price": item['price'],
                                                              "sale_price": item['sale_price']})
        return item

    def close_spider(self, spider):
        # One file per product_id; an existing file with the same name is overwritten.
        for product_id, offers in self.product_item.items():
            data = json.dumps({"product_id": product_id, "offers": offers}, ensure_ascii=False)
            with open(f"products_id/{product_id}.json", 'w', encoding="utf-8") as file:
                file.write(data + '\n')
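With a helper like next_free_path from the sketch above, close_spider could write non-clobbering files. This is only a sketch under two assumptions: that the products_id folder may need to be created, and that one suffixed file per duplicate is acceptable. Note that within a single run product_item already merges duplicate product_ids into one entry, so producing one file per submitted link would additionally require counting how many times each id was queued (for example, in start_requests):

import os

def close_spider(self, spider):
    os.makedirs("products_id", exist_ok=True)  # assumption: create the folder if it is missing
    for product_id, offers in self.product_item.items():
        data = json.dumps({"product_id": product_id, "offers": offers}, ensure_ascii=False)
        # next_free_path picks products_id/id.json, id(1).json, id(2).json, ...
        with open(next_free_path("products_id", product_id), 'w', encoding="utf-8") as file:
            file.write(data + '\n')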