# --- Scraper: гранты.рф grant catalogue (JavaScript-rendered, fetched via Selenium) ---
import json
import time
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

warnings.filterwarnings("ignore")
url = 'https://гранты.рф/data/grants/list'
OUT_FILENAME = 'out.json'
options = webdriver.EdgeOptions()
options.add_argument("--headless")
driver = webdriver.Edge(options=options)
driver.get(url)
time.sleep(1.5)
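# Optionally wait for the grant cards to render instead of relying only on the
# fixed sleep above (a sketch using Selenium's explicit-wait API; the 10-second
# timeout is an assumption, the class name comes from the selector used below):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "competition-card-wrapper")))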
html_parsed = driver.page_source
driver.quit()
urls = []
soup = BeautifulSoup(html_parsed, 'html.parser')
def get_soup(url, **kwargs):
    response = requests.get(url, **kwargs, verify=False)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, features='html.parser')
    else:
        soup = None
    return soup
def crawl_products(url):
    # The list page is rendered by JavaScript, so links are collected from the
    # soup built above out of the Selenium page source.
    for tag in soup.select('.competition-card-wrapper'):
        href = tag.attrs['href']
        urls.append('https://гранты.рф{}'.format(href))
    print(urls)
    return urls
def parse_products(urls):
    data = []
    for page in urls:
        soup = get_soup(page)
        if soup is None:
            break
        for tr in soup.select(".grants-competition-page-hero__title"):
            # A new dict per entry; reusing one dict would make every list
            # element point to the same (last) record.
            item = {}
            item['Название'] = tr.select_one("p").text
            data.append(item)
    return data
def dump_to_json(filename, data, **kwargs):
    kwargs.setdefault('ensure_ascii', False)
    kwargs.setdefault('indent', 1)
    with open(filename, 'w', encoding="utf-8") as f:
        json.dump(data, f, **kwargs)
def main():
    urls = crawl_products(url)
    data = parse_products(urls)
    dump_to_json(OUT_FILENAME, data)

if __name__ == '__main__':
    main()
df = pd.read_json('./out.json')
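# Optional follow-up (an assumption about the intent of the spreadsheet imports
# in the original script, not part of the pipeline above): export the DataFrame
# loaded above to Excel for manual review. Requires an Excel writer backend
# such as openpyxl; the output filename is hypothetical.
df.to_excel('grants.xlsx', index=False)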
# --- Scraper: rfbr.ru contests (status taken from the listing-page icons) ---
import requests
import json
import re
import warnings
from bs4 import BeautifulSoup

warnings.filterwarnings("ignore")

PAGES_COUNT = 82
OUT_FILENAME = 'out.json'
def get_soup(url, **kwargs):
    response = requests.get(url, **kwargs, verify=False)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, features='html.parser')
    else:
        soup = None
    return soup
def crawl_products(pages_count):
    urls = []
    fmt = 'https://www.rfbr.ru/rffi/ru/contest?CONTEST_STATUS_ID=-1&CONTEST_TYPE=-1&CONTEST_YEAR=-1&page={page}'
    for page_n in range(1, 1 + pages_count):
        print('page: {}'.format(page_n))
        page_url = fmt.format(page=page_n)
        soup = get_soup(page_url)
        if soup is None:
            break
        for tag in soup.select('.tr .link'):
            href = tag.attrs['href']
            url = 'https://www.rfbr.ru/rffi/ru/contest{}'.format(href)
            urls.append((url, page_n))
    return urls
def parse_products(urls):
    data = []
    for url in urls:
        print('product: {}'.format(url[0]))
        soup = get_soup(url[0])
        if soup is None:
            break
        # Application deadline: strip non-ASCII characters, HTML tags,
        # underscores and spaces, then cut off the fixed label prefix.
        amount = ''
        for j in soup.find_all("main", {"class": "template__main"}):
            for jj in j.find_all("div", {"class": "sfc l-3 mt-5 mb-10 lh-xl"}):
                ja = re.sub(r'[^\x00-\x7f]', r'', str(jj))
                jo = re.sub(r'\<[^>]*\>', '', str(ja))
                ji = re.sub(r'_', '', str(jo))
                ju = re.sub(r' ', '', str(ji))
                je = re.sub(r' :', '', str(ju))
                amount = je[13:]
        response = requests.get(url[0], verify=False)
        sp = BeautifulSoup(response.text, "lxml")
        document = {}
        item = {}
        # Attached documents, grouped by section heading.
        dcs = sp(attrs={"class": "list-in article"})
        for z in dcs:
            document[z.h2.text] = list(z.ol.stripped_strings)
            # document[z.h2.text] = tuple(z.ol.stripped_strings)
        # As a single comma-separated string:
        for z in dcs:
            document[z.h2.text] = ', '.join(z.ol.stripped_strings)
        try:
            article = [l.get_text(strip=True) for l in soup.find_all("p") if l.get_text(strip=True)]
            art = str(article).replace("['", '').replace("']", '')
        except Exception:
            article = [l.get_text(strip=True) for l in soup.find_all("h3") if l.get_text(strip=True)]
            art = str(article).replace("['", '').replace("']", '')
        name = [i.get_text(strip=True) for i in soup.find_all("h1") if i.get_text(strip=True)]
        ame = str(name).replace("['", '').replace("']", '')
        # Contest status: alt text of the status icon on the listing page.
        image = ''
        p = get_soup(f'https://www.rfbr.ru/rffi/ru/contest?page={url[1]}')
        if p is not None:
            for img_td in p.select('.tr'):
                img = img_td.select_one("img")
                if img is not None:
                    image = img.get('alt')
        item['Название'] = ame
        item['Статус'] = image
        item['Время окончания приема заявок'] = amount
        item['Полное описание условий конкурса'] = art
        item['Документы'] = document
        data.append(item)
    return data
def dump_to_json(filename, data, **kwargs):
    kwargs.setdefault('ensure_ascii', False)
    kwargs.setdefault('indent', 1)
    with open(filename, 'w', encoding="utf-8") as f:
        json.dump(data, f, **kwargs)

def main():
    urls = crawl_products(PAGES_COUNT)
    data = parse_products(urls)
    dump_to_json(OUT_FILENAME, data)

if __name__ == '__main__':
    main()
# --- Scraper: rscf.ru contests ---
import json
import warnings

import requests
from bs4 import BeautifulSoup

warnings.filterwarnings("ignore")
BASE_URL = 'https://www.rscf.ru/contests'
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0'
items = []
max_page = 10
for page in range(1, max_page + 1):
    url = f'{BASE_URL}/?PAGEN_2={page}/' if page > 1 else BASE_URL
    print(url)
    rs = session.get(url, verify=False)
    rs.raise_for_status()
    soup = BeautifulSoup(rs.content, 'html.parser')
    for item in soup.select('.classification-table-row.contest-table-row'):
        number = item.select_one('.contest-num').text
        title = item.select_one('.contest-name').text
        date = item.select_one('.contest-date').text.replace("\n", "").replace("Подать заявку", "")
        # Collapse newlines and repeated whitespace in the documents column.
        documents = " ".join(item.select_one('.contest-docs').text.split())
        try:
            synopsis = [s.get_text(strip=True) for s in item.select(".contest-status") if s.get_text(strip=True)]
            del synopsis[:1]
            syn = str(synopsis).replace("['", '').replace("']", '')
        except Exception:
            synopsis = [s.get_text(strip=True) for s in item.select(".contest-success") if s.get_text(strip=True)]
            del synopsis[:1]
            syn = str(synopsis).replace("['", '').replace("']", '')
        items.append({
            'Номер': number,
            'Наименование конкурса': title,
            'Приём заявок': date,
            'Статус': syn,
            'Документы': documents,
        })

with open('out.json', 'w', encoding='utf-8') as f:
    json.dump(items, f, indent=4, ensure_ascii=False)
# Example entries from the resulting out.json:
{
    "Номер": "92",
    "Наименование конкурса": " Конкурс на получение грантов РНФ по мероприятию «Проведение фундаментальных научных исследований и поисковых научных исследований отдельными научными группами»",
    "Приём заявок": "до 15.11.2023 17:00",
    "Статус": "Прием заявок",
    "Документы": "Извещение Конкурсная документация"
},
{
    "Номер": "3005",
    "Наименование конкурса": "Конкурс на получение грантов РНФ «Проведение пилотных проектов НИОКР в рамках стратегических инициатив Президента РФ в научно-технологической сфере» по теме: «Разработка нитрид-галлиевого СВЧ-транзистора S-диапазона с выходной мощностью не менее 120 Вт»",
    "Приём заявок": "до 02.06.2023 17:00",
    "Статус": "Конкурс завершен",
    "Документы": "Извещение Конкурсная документация Список победителей"
},
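# A minimal sanity check on the scraped RSCF data (a sketch, not part of the
# original script): reload out.json and keep only contests that are still
# accepting applications ("Прием заявок", as in the sample entries above).
import json

with open('out.json', encoding='utf-8') as f:
    contests = json.load(f)

open_calls = [c for c in contests if c['Статус'] == 'Прием заявок']
for c in open_calls:
    print(c['Номер'], c['Приём заявок'], c['Наименование конкурса'])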
# --- Scraper: rfbr.ru contests (full descriptions; status fixed to "Заявки не принимаются") ---
import requests
import json
import re
import warnings
from bs4 import BeautifulSoup

warnings.filterwarnings("ignore")

PAGES_COUNT = 100
OUT_FILENAME = 'out.json'
def get_soup(url, **kwargs):
    response = requests.get(url, **kwargs, verify=False)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, features='html.parser')
    else:
        soup = None
    return soup
def crawl_products(pages_count):
    urls = []
    fmt = 'https://www.rfbr.ru/rffi/ru/contest?CONTEST_STATUS_ID=-1&CONTEST_TYPE=-1&CONTEST_YEAR=-1&page={page}'
    for page_n in range(1, 1 + pages_count):
        print('page: {}'.format(page_n))
        page_url = fmt.format(page=page_n)
        soup = get_soup(page_url)
        if soup is None:
            break
        for tag in soup.select('.tr .link'):
            href = tag.attrs['href']
            url = 'https://www.rfbr.ru/rffi/ru/contest{}'.format(href)
            urls.append(url)
    return urls
def parse_products(urls):
    data = []
    for url in urls:
        print('product: {}'.format(url))
        soup = get_soup(url)
        if soup is None:
            break
        name = ''
        for i in soup.find_all("h1"):
            name = i.text
        # Application deadline: strip non-ASCII characters, HTML tags,
        # underscores and spaces, then cut off the fixed label prefix.
        amount = ''
        for j in soup.find_all("main", {"class": "template__main"}):
            for jj in j.find_all("div", {"class": "sfc l-3 mt-5 mb-10 lh-xl"}):
                ja = re.sub(r'[^\x00-\x7f]', r'', str(jj))
                jo = re.sub(r'\<[^>]*\>', '', str(ja))
                ji = re.sub(r'_', '', str(jo))
                ju = re.sub(r' ', '', str(ji))
                je = re.sub(r' :', '', str(ju))
                amount = je[13:]
        response = requests.get(url, verify=False)
        sp = BeautifulSoup(response.text, "lxml")
        document = {}
        # Attached documents, grouped by section heading.
        dcs = sp(attrs={"class": "list-in article"})
        for z in dcs:
            document[z.h2.text] = list(z.ol.stripped_strings)
            # document[z.h2.text] = tuple(z.ol.stripped_strings)
        # As a single comma-separated string:
        for z in dcs:
            document[z.h2.text] = ', '.join(z.ol.stripped_strings)
        try:
            article = [l.get_text(strip=True) for l in soup.find_all("p") if l.get_text(strip=True).startswith('Условия')]
            art = str(article).replace("['", '').replace("']", '')
        except Exception:
            article = [l.get_text(strip=True) for l in soup.find_all("strong") if l.get_text(strip=True).startswith('Условия')]
            art = str(article).replace("['", '').replace("']", '')
        item = {
            'Название': name,
            'Статус': 'Заявки не принимаются',
            'Время окончания приема заявок': amount,
            'Полное описание условий конкурса': art,
        }
        item['Документы'] = document
        data.append(item)
    return data
def dump_to_json(filename, data, **kwargs):
    kwargs.setdefault('ensure_ascii', False)
    kwargs.setdefault('indent', 1)
    with open(filename, 'w', encoding="utf-8") as f:
        json.dump(data, f, **kwargs)

def main():
    urls = crawl_products(PAGES_COUNT)
    data = parse_products(urls)
    dump_to_json(OUT_FILENAME, data)

if __name__ == '__main__':
    main()
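# The scrapers above disable TLS verification (verify=False) and silence the
# resulting warnings. A sketch of a more robust alternative (the retry counts
# and backoff are assumptions): reuse one Session with automatic retries and
# pass it to get_soup() in place of bare requests.get().
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=retry))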