Как быстро парсить большие объёмы данных в Python? Например, вам нужно парсить данные из 1000 источников — как сделать это быстро? Какую технологию следует использовать для быстрого парсинга сайтов? С помощью этого кода я разбираю сайты за 1 минуту, но мне нужно, чтобы это работало быстрее.
rssParser.py
import feedparser
def rssParser(url):
    """Fetch the feed at *url* and return its entries as plain dicts.

    Each returned dict has the keys 'title', 'link', 'summary',
    'thumbnail' and 'published'. Any field the entry does not provide is
    returned as an empty string, so a single incomplete entry no longer
    aborts the whole feed with AttributeError.

    'thumbnail' carries the entry's ``media_content`` value unchanged when
    the entry has one.
    """
    parsed = feedparser.parse(url)
    # Entries are dict-like; .get() covers optional fields uniformly
    # instead of guarding only some of them with hasattr().
    return [
        {
            'title': entry.get('title', ''),
            'link': entry.get('link', ''),
            'summary': entry.get('summary', ''),
            'thumbnail': entry.get('media_content', ''),
            'published': entry.get('published', ''),
        }
        for entry in parsed.entries
    ]
views.py
from django.http import JsonResponse
import logging, sys
from api.rssParser import rssParser
# URL templates; the placeholder is the category slug / section name.
_TRIBUNE_FEED = (
    'https://www.chicagotribune.com/arcio/rss/category/{}/'
    '?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true'
    '&sort=display_date:desc#nt=instory-link'
)
_NYTIMES_FEED = 'https://rss.nytimes.com/services/xml/rss/nyt/{}.xml'

# Response key -> feed URL. Insertion order is the key order of the JSON
# response, so keep it stable.
_FEED_URLS = {
    'chicagoTribuneBusiness': _TRIBUNE_FEED.format('business'),
    'chicagoTribuneEntertainment': _TRIBUNE_FEED.format('entertainment'),
    'chicagoTribuneLifestyles': _TRIBUNE_FEED.format('lifestyles'),
    'chicagoTribuneWorld': _TRIBUNE_FEED.format('nation-world'),
    'chicagoTribuneOpinion': _TRIBUNE_FEED.format('opinion'),
    'chicagoTribunePolitics': _TRIBUNE_FEED.format('politics'),
    'chicagoTribuneRealEstate': _TRIBUNE_FEED.format('real-estate'),
    'chicagoTribuneSports': _TRIBUNE_FEED.format('sports'),
    'nyTimesUS': _NYTIMES_FEED.format('HomePage'),
    'nyTimesWorld': _NYTIMES_FEED.format('World'),
    'nyTimesNYRegion': _NYTIMES_FEED.format('NYRegion'),
    'nyTimesBusiness': _NYTIMES_FEED.format('Business'),
    'nyTimesTechnology': _NYTIMES_FEED.format('Technology'),
    'nyTimesSports': _NYTIMES_FEED.format('Sports'),
    'nyTimesScience': _NYTIMES_FEED.format('Science'),
    'nyTimesClimate': _NYTIMES_FEED.format('Climate'),
    'nyTimesSpace': _NYTIMES_FEED.format('Space'),
    'nyTimesArts': _NYTIMES_FEED.format('Arts'),
    'nyTimesFashionandStyle': _NYTIMES_FEED.format('FashionandStyle'),
    'nyTimesTravel': _NYTIMES_FEED.format('Travel'),
    'nyTimesRealEstate': _NYTIMES_FEED.format('RealEstate'),
    'nyTimesAutomobiles': _NYTIMES_FEED.format('Automobiles'),
}

# Upper bound on simultaneous feed downloads.
_MAX_FEED_WORKERS = 32


def index(request):
    """Return every configured feed's articles as one JSON object.

    The response maps each key of ``_FEED_URLS`` to the list produced by
    ``rssParser`` for that feed's URL.

    Feeds are downloaded concurrently rather than one after another: the
    work is network-bound, so total latency is roughly that of the slowest
    feed instead of the sum of all of them. An exception raised while
    parsing any feed propagates to the caller, as it did when the feeds
    were fetched sequentially.
    """
    # Function-scope import keeps this block self-contained.
    from concurrent.futures import ThreadPoolExecutor

    names = list(_FEED_URLS)
    workers = max(1, min(_MAX_FEED_WORKERS, len(names)))
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # map() yields results in input order, so names and results line up.
        articles = list(pool.map(rssParser, _FEED_URLS.values()))
    return JsonResponse(dict(zip(names, articles)))