def remove_unreachable(self):
    """Prune grammar variables that cannot be reached from the start symbol.

    Performs a graph traversal from ``self.start`` over the production
    rules, then rewrites ``self.variables`` and ``self.productions`` to
    keep only the reachable variables.  Variables without an entry in
    ``self.productions`` are simply treated as having no rules.
    """
    reachable = {self.start}
    frontier = [self.start]
    # Worklist traversal: equivalent to iterating to a fixed point.
    while frontier:
        current = frontier.pop()
        # A variable may have no productions (e.g. after other cleanups).
        for rule in self.productions.get(current, ()):
            for symbol in rule:
                if symbol in self.variables and symbol not in reachable:
                    reachable.add(symbol)
                    frontier.append(symbol)
    # Keep only productions whose left-hand side is reachable.
    self.productions = {
        variable: rules
        for variable, rules in self.productions.items()
        if variable in reachable
    }
    self.variables = reachable
def remove_unreachable(self):
    """Remove grammar variables that are unreachable from the start symbol.

    Fixes two defects of the previous version:
    * a ``KeyError`` was raised when a reachable variable had no entry in
      ``self.productions`` (e.g. after ``remove_unproductive`` deleted it);
      we now look rules up with ``dict.get``;
    * the computed ``reachable`` set was never applied — ``self.variables``
      and ``self.productions`` are now actually pruned.
    """
    reachable = set()
    prev_reachable = set()
    reachable.add(self.start)
    # Iterate to a fixed point: stop when no new variable was discovered.
    while reachable != prev_reachable:
        prev_reachable = reachable.copy()
        for variable in prev_reachable:
            # A variable may have no productions at all — skip it safely.
            for rule in self.productions.get(variable, ()):
                for symbol in rule:
                    if symbol in self.variables:
                        reachable.add(symbol)
    # Apply the result: drop everything that is not reachable.
    self.productions = {
        variable: rules
        for variable, rules in self.productions.items()
        if variable in reachable
    }
    self.variables = reachable
import requests
from bs4 import BeautifulSoup
import re
import os
from urllib.request import urlopen
import json
from urllib.parse import unquote
class Parser:
    """Scraper for one programme page of fasie.ru.

    ``href`` is a BeautifulSoup ``<a>`` tag taken from the site's programme
    list; its ``href`` attribute and text give the page URL and the
    programme name.  The page's ``div.tabs`` sections are cached in
    ``self.section`` and mined by the ``get_*`` methods.
    """

    def __init__(self, href) -> None:
        self.url = "https://fasie.ru"
        self.href = href
        self.source = self.url + self.href['href']
        self.name = self.href.text
        try:
            r = requests.get(self.source, timeout=20)
        except requests.RequestException:
            # Was a bare `except:` that printed and then crashed with a
            # NameError on `r`; report and re-raise the real error instead.
            print(f'err. conn: {self.source} in "def __init__"')
            raise
        soup = BeautifulSoup(r.text, "html.parser")
        self.section = soup.find('div', {'class': 'tabs'}).find_all('section')

    def get_description(self):
        """Return the 'description' tab text, one stripped line per row."""
        l = re.findall(r"[^\n\t\r]+", self.section[2].text)
        return '\n'.join(map(str.strip, l))

    def get_program(self):
        """Return the 'program' tab text, one stripped line per row."""
        l = re.findall(r"[^\n\t\r]+", self.section[0].text)
        return '\n'.join(map(str.strip, l))

    def get_contact(self):
        """Extract contacts (name / tel / email) from the last section.

        The site uses three layouts: a table of ``<tr>`` rows, a run of
        ``<b>`` name headers, or plain ``<p>`` paragraphs; each branch
        handles one layout.
        """
        l = []
        # Group 1 is the phone number itself, before the "доб." extension.
        pattern = r"(\+?[\d\(\) -]+)\s\(?доб\.\s?\d{3}\)?"
        pattern_email = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b"
        if self.section[-1].find('tr'):
            for i in self.section[-1].find_all('tr'):
                d = {}
                d['name'] = i.find('h4').text.strip().replace('\xa0', ' ').split('\r', 1)[0]
                tmp = re.search(pattern, i.text)
                d['tel'] = tmp[1] if tmp else ''
                tmp = re.search(pattern_email, i.text)
                d['email'] = tmp[0] if tmp else ''
                # Skip rows where every field came out empty.
                if sum(map(len, d.values())):
                    l.append(d)
        elif self.section[-1].find('b'):
            name = [i.text for i in self.section[-1].find_all('b') if i.text.strip()]
            tel = re.findall(pattern, self.section[-1].text)
            email = [i.text for i in self.section[-1].find_all('a') if i.text]
            for i in zip(name, tel, email):
                d = {}
                d['name'] = i[0].strip().replace('\xa0', ' ')
                d['tel'] = re.sub(r'\s\(?доб\.\s?\d{3}\)', '', i[1].strip())
                d['email'] = i[2].strip()
                l.append(d)
        else:
            for i in self.section[-1].find_all('p', recursive=False):
                if i.find('a'):
                    d = {}
                    d['name'] = ''
                    # Fix: search the paragraph's text — passing the Tag
                    # itself raised TypeError; use group 1 like the other
                    # branches so the "доб." extension is not included.
                    tmp = re.search(pattern, i.text)
                    d['tel'] = tmp[1] if tmp else ''
                    d['email'] = i.find('a').text
                    l.append(d)
        return l

    def get_documents(self):
        """Download every linked document and return their metadata.

        Each entry carries source URL, local path, file name, extension and
        size in bytes; files are written under ``./<programme-href>/``.
        """
        l = []
        for i in self.section[1].find_all('a'):
            # `get` avoids a KeyError for anchors without an href attribute.
            if i.get('href'):
                i['href'] = i['href'].replace(self.url, '')
                name = unquote(i['href'])
                d = {}
                d['source'] = self.url + '/' + i['href'].lstrip('/')
                d['path'] = f"./{self.href['href'].strip('/')}/{name}"
                d['name'] = name.rsplit('/', 1)[-1]
                d['extension'] = name.rsplit('.', 1)[-1]
                try:
                    r = requests.get(d['source'], timeout=20)
                except requests.RequestException:
                    # Best effort: skip unreachable documents, keep going.
                    print(f"err. conn: {d['source']}")
                    continue
                if r.status_code == 200:
                    try:
                        os.makedirs(os.path.dirname(d['path']), exist_ok=True)
                    except OSError:
                        print(f"Ошибка при создание папки\nТег: {i}\nname{d['path']}")
                        raise
                    try:
                        with open(d['path'], 'wb') as f:
                            f.write(r.content)
                    except OSError:
                        print(f"Ошибка при создание файла\nТег: {i}")
                        raise
                    d['size'] = len(r.content)
                    l.append(d)
                else:
                    print(f"{d['source']} no response")
        return l

    def run(self):
        """Return the full scraped record for this programme page."""
        d = {
            'source': self.source,
            'name': self.name,
            'description': self.get_description(),
            'programs': self.get_program(),
            'contacts': self.get_contact(),
            'documents': self.get_documents()
        }
        return d
def main():
    """Scrape all programme pages of fasie.ru and dump them to output.json."""
    url = "https://fasie.ru"
    # Close the connection deterministically (it was never closed before).
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find_all('div', class_='wrap')
    # The second unclassed <ul> inside the second .wrap holds the programme links.
    programms_list = div[1].find('ul', class_='').find_all('ul', class_='')[1]
    hrefs = programms_list.find_all('a')
    data = [Parser(i).run() for i in hrefs]
    with open('output.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False))
# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
import requests
from bs4 import BeautifulSoup
import re
import os
from urllib.request import urlopen
import json
class Parser:
    """Scraper for one programme page of fasie.ru.

    ``href`` is a BeautifulSoup ``<a>`` tag from the site's programme list;
    its ``href`` attribute and text give the page URL and programme name.
    The page's ``div.tabs`` sections are cached in ``self.section``.
    """

    def __init__(self, href) -> None:
        self.url = "https://fasie.ru"
        self.href = href
        self.source = self.url + self.href['href']
        self.name = self.href.text
        try:
            r = requests.get(self.source, timeout=20)
        except requests.RequestException:
            # Was a bare `except:` that printed and then crashed with a
            # NameError on `r`; report and re-raise the real error instead.
            print(f'err. conn: {self.source} in "def __init__"')
            raise
        soup = BeautifulSoup(r.text, "html.parser")
        self.section = soup.find('div', {'class': 'tabs'}).find_all('section')

    def get_description(self):
        """Return the 'description' tab text, one stripped line per row."""
        l = re.findall(r"[^\n\t\r]+", self.section[2].text)
        return '\n'.join(map(str.strip, l))

    def get_program(self):
        """Return the 'program' tab text, one stripped line per row."""
        l = re.findall(r"[^\n\t\r]+", self.section[0].text)
        return '\n'.join(map(str.strip, l))

    def get_contact(self):
        """Extract contacts (name / tel / email) from the last section.

        Handles three page layouts: a ``<tr>`` table, ``<b>`` name headers,
        or plain ``<p>`` paragraphs.
        """
        l = []
        # Group 1 is the phone number itself, before the "доб." extension.
        pattern = r"(\+?[\d\(\) -]+)\s\(?доб\.\s?\d{3}\)?"
        if self.section[-1].find('tr'):
            for i in self.section[-1].find_all('tr'):
                d = {}
                d['name'] = i.find('h4').text.strip().replace('\xa0', ' ')
                d['tel'] = ''
                tmp = re.search(pattern, i.text)
                if tmp:
                    d['tel'] = tmp[1].strip()
                # A row may have no <a>; avoid AttributeError on None.
                link = i.find('a')
                d['email'] = link.text if link else ''
                l.append(d)
        elif self.section[-1].find('b'):
            name = [i.text for i in self.section[-1].find_all('b') if i.text.strip()]
            tel = re.findall(pattern, self.section[-1].text)
            email = self.section[-1].find_all('a')
            for i in zip(name, tel, email):
                d = {}
                d['name'] = i[0].strip().replace('\xa0', ' ')
                d['tel'] = i[1].strip()
                d['email'] = i[2].text
                l.append(d)
        else:
            for i in self.section[-1].find_all('p', recursive=False):
                if i.find('a'):
                    d = {}
                    d['name'] = ''
                    d['tel'] = ''
                    # Fix: search the paragraph's text — passing the Tag
                    # itself raised TypeError.
                    tmp = re.search(pattern, i.text)
                    if tmp:
                        d['tel'] = tmp[1].strip()
                    d['email'] = i.find('a').text
                    l.append(d)
        return l

    def get_documents(self):
        """Download every linked document and return their metadata.

        Each entry carries source URL, local path, file name, extension and
        size in bytes; files are written under ``./<programme-href>/``.
        """
        l = []
        for i in self.section[1].find_all('a'):
            # `get` avoids a KeyError for anchors without an href attribute.
            if i.get('href'):
                d = {}
                d['source'] = self.url + '/' + i['href'].lstrip('/')
                d['path'] = '.' + self.href['href'] + '/'.join(i['href'].replace('%20', '_').rsplit('/', 2)[-2:])
                d['name'] = d['path'].rsplit('/', 1)[-1]
                d['extension'] = d['name'].rsplit('.', 1)[-1]
                try:
                    r = requests.get(d['source'], timeout=20)
                except requests.RequestException:
                    # Best effort: skip unreachable documents, keep going.
                    print(f"err. conn: {d['source']}")
                    continue
                if r.status_code == 200:
                    os.makedirs(os.path.dirname(d['path']), exist_ok=True)
                    try:
                        with open(d['path'], 'wb') as f:
                            f.write(r.content)
                    except OSError:
                        print(f"Ошибка при создание файла\nТег: {i}")
                        raise
                    d['size'] = len(r.content)
                    l.append(d)
                else:
                    print(f"{d['source']} no response")
        return l

    def run(self):
        """Return the full scraped record for this programme page."""
        d = {
            'source': self.source,
            'name': self.name,
            'description': self.get_description(),
            'programs': self.get_program(),
            'contacts': self.get_contact(),
            'documents': self.get_documents()
        }
        return d
def main():
    """Scrape all programme pages of fasie.ru and dump them to output.json."""
    url = "https://fasie.ru"
    # Close the connection deterministically (it was never closed before).
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find_all('div', class_='wrap')
    # The second unclassed <ul> inside the second .wrap holds the programme links.
    programms_list = div[1].find('ul', class_='').find_all('ul', class_='')[1]
    hrefs = programms_list.find_all('a')
    data = [Parser(i).run() for i in hrefs]
    with open('output.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False))
# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
<a href=""> </a>
Ты будешь наблюдать, как метод remove_unproductive стирает self.variables и self.productions.
У тебя происходит такой сценарий:
Метод transform вызывает метод remove_unproductive, и тот стирает твои self.productions и self.variables. Дальше метод transform вызывает метод remove_unreachable, и метод remove_unreachable не может найти ключ "S" в словаре self.productions, потому что метод remove_unproductive стёр словарь self.productions.