import requests
from bs4 import BeautifulSoup
import re
import os
import json
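
# Walks the programs menu on the fasie.ru start page, then pulls the
# description, program text, contacts and documents from each program page
# and mirrors the attached files locally.
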
class Parser:
    """Scrapes a single program page on fasie.ru."""

    def __init__(self, href) -> None:
        self.url = "https://fasie.ru"
        self.href = href
        self.source = self.url + self.href['href']
        self.name = self.href.text
        try:
            r = requests.get(self.source, timeout=20)
        except requests.RequestException:
            print(f'err. conn: {self.source} in "def __init__"')
            raise  # without a response there is nothing to parse
        soup = BeautifulSoup(r.text, "html.parser")
        # The page lays its content out as <section> blocks inside a
        # div.tabs container: program, documents, description, ..., contacts.
        self.section = soup.find('div', {'class': 'tabs'}).find_all('section')
    def get_description(self):
        # section[2] holds the free-text description tab.
        lines = re.findall(r"[^\n\t\r]+", self.section[2].text)
        return '\n'.join(map(str.strip, lines))

    def get_program(self):
        # section[0] holds the program overview tab.
        lines = re.findall(r"[^\n\t\r]+", self.section[0].text)
        return '\n'.join(map(str.strip, lines))
    def get_contact(self):
        contacts = []
        # Matches a phone number optionally followed by an extension,
        # e.g. "+7 (495) 231-19-06 (доб. 196)" ("доб." is Russian for "ext.").
        pattern = r"(\+?[\d\(\) -]+)\s\(?доб\.\s?\d{3}\)?"
        if self.section[-1].find('tr'):
            # Layout 1: contacts in a table, one <tr> per person.
            for row in self.section[-1].find_all('tr'):
                d = {'name': row.find('h4').text.strip().replace('\xa0', ' '),
                     'tel': ''}
                tmp = re.search(pattern, row.text)
                if tmp:
                    d['tel'] = tmp[1].strip()
                d['email'] = row.find('a').text
                contacts.append(d)
        elif self.section[-1].find('b'):
            # Layout 2: bold names followed by phone/e-mail runs.
            names = [b.text for b in self.section[-1].find_all('b') if b.text.strip()]
            tels = re.findall(pattern, self.section[-1].text)
            emails = self.section[-1].find_all('a')
            for name, tel, email in zip(names, tels, emails):
                contacts.append({
                    'name': name.strip().replace('\xa0', ' '),
                    'tel': tel.strip(),
                    'email': email.text,
                })
        else:
            # Layout 3 (fallback): plain paragraphs, at most one contact each.
            for p in self.section[-1].find_all('p', recursive=False):
                if p.find('a'):
                    d = {'name': '', 'tel': ''}
                    # was re.search(pattern, p): a Tag is not a string
                    tmp = re.search(pattern, p.text)
                    if tmp:
                        d['tel'] = tmp[1].strip()
                    d['email'] = p.find('a').text
                    contacts.append(d)
        return contacts
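    # Illustrative shape of each record returned by get_contact (values are
    # placeholders; any field may be empty if the page omits it):
    #   {'name': 'Last First', 'tel': '+7 (000) 000-00-00', 'email': '...'}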
    def get_documents(self):
        docs = []
        for a in self.section[1].find_all('a'):
            d = {}
            d['source'] = self.url + a['href']
            # Mirror the last two path components under the program's
            # local directory, with spaces replaced by underscores.
            d['path'] = '.' + self.href['href'] + '/'.join(
                a['href'].replace('%20', '_').rsplit('/', 2)[-2:])
            d['name'] = d['path'].rsplit('/', 1)[-1]
            d['extension'] = d['name'].rsplit('.', 1)[-1]
            try:
                r = requests.get(d['source'], timeout=20)
            except requests.RequestException:
                print(f"err. conn: {d['source']} in 'def get_documents'")
                continue
            os.makedirs(os.path.dirname(d['path']), exist_ok=True)
            with open(d['path'], 'wb') as f:
                f.write(r.content)
            d['size'] = len(r.content)
            docs.append(d)
        return docs
    def run(self):
        # Collect everything the page offers into one JSON-serialisable dict.
        return {
            'source': self.source,
            'name': self.name,
            'description': self.get_description(),
            'programs': self.get_program(),
            'contacts': self.get_contact(),
            'documents': self.get_documents(),
        }
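
# Minimal standalone usage sketch (the <a> tag would normally come from the
# index page; the href below is hypothetical):
#   tag = BeautifulSoup('<a href="/programs/example/">Example</a>',
#                       "html.parser").a
#   result = Parser(tag).run()  # fetches the page, downloads its documents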
def main():
    url = "https://fasie.ru"
    r = requests.get(url, timeout=20)
    soup = BeautifulSoup(r.text, "html.parser")
    # The programs menu sits in the second div.wrap; the second unclassed
    # <ul> nested inside its first unclassed <ul> lists the program pages
    # (fragile: depends on the current markup of the start page).
    divs = soup.find_all('div', class_='wrap')
    programs_list = divs[1].find('ul', class_='').find_all('ul', class_='')[1]
    hrefs = programs_list.find_all('a')
    data = []
    for href in hrefs:
        data.append(Parser(href).run())
    with open('output.json', 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()