import requests
import csv
from bs4 import BeautifulSoup as bs
# Request headers sent with every HTTP request. The user-agent is a
# mobile Chrome string — presumably so ria.ru serves the mobile markup
# whose CSS classes this parser selects on; verify against the site.
headers = {
'accept': '*/*',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Mobile Safari/537.36'
}
# Base search endpoint; RIA_NEWS builds one URL per section by appending
# the section name as the query value.
base_url = 'https://ria.ru/search/?query='
def RIA_NEWS(base_url, headers):
    """Scrape article previews and full texts from ria.ru section searches.

    Parameters
    ----------
    base_url : str
        Search URL used only for the initial availability check.
    headers : dict
        HTTP headers (accept, user-agent) sent with every request.

    Returns
    -------
    list[dict]
        One dict per article with keys 'title', 'href', 'date', 'img',
        'content' (content is the concatenated HTML of the article's
        'article__text' divs). Empty list when the site is unreachable.
    """
    news = []
    section_news = ['Спорт', 'Политика', 'Наука', 'Экономика', 'Культура']
    # Build one search URL per section; keep the section name intact
    # instead of overwriting the loop variable as the original did.
    urls = [f'https://ria.ru/search/?query={section}' for section in section_news]

    # A single Session for the whole crawl reuses the connection pool;
    # the original opened a brand-new Session for every request.
    session = requests.Session()
    check = session.get(base_url, headers=headers, timeout=10)
    if check.status_code != 200:
        print('ERROR')
        return news

    for section, search_url in zip(section_news, urls):
        print('\n', section, '\n')
        listing = session.get(search_url, headers=headers, timeout=10)
        soup = bs(listing.content, 'lxml')
        for div in soup.find_all('div', attrs={'class': 'list-item'}):
            try:
                # Look the title anchor up once instead of twice.
                link = div.find('a', attrs={'class': 'list-item__title color-font-hover-only'})
                title = link.text
                href = link['href']
                date = div.find('div', attrs={'class': 'list-item__date'}).text
                img = div.find('img', attrs={'class': 'responsive_img m-list-img'})['src']

                article = session.get(href, headers=headers, timeout=10)
                article_soup = bs(article.content, 'lxml')
                # Reset per article: the original left `content` holding the
                # previous article's text when no main block was found, and
                # reset it inside the inner loop, discarding earlier divs.
                parts = []
                for main_div in article_soup.find_all('div', attrs={'class': 'layout-article__main'}):
                    for text_div in main_div.find_all('div', attrs={'class': 'article__text'}):
                        parts.append(str(text_div))
                content = ''.join(parts)
            except (AttributeError, TypeError, KeyError, requests.RequestException):
                # Missing element in the markup or a failed article fetch:
                # skip this list item, keep crawling the rest.
                continue
            news.append({
                'title': title,
                'href': href,
                'date': date,
                'img': img,
                'content': content,
            })
            print(title)
    return news
def file_w(news):
    """Write scraped articles to 'parser_news.csv' (UTF-8, header row first).

    Parameters
    ----------
    news : list[dict]
        Dicts with keys 'title', 'href', 'date', 'img', 'content',
        as produced by RIA_NEWS.
    """
    # newline='' is required by the csv module: without it the writer's
    # \r\n row endings get doubled by text-mode translation on Windows.
    with open('parser_news.csv', 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(('Название', 'Ссылка на статью', 'Дата', 'Ссылка на картинку', 'Контент'))
        for item in news:
            writer.writerow((item['title'], item['href'], item['date'], item['img'], item['content']))
if __name__ == "__main__":
    # Script entry point: crawl the sections, then persist the results.
    # The guard keeps an `import` of this module from firing a network crawl.
    news = RIA_NEWS(base_url, headers)
    file_w(news)