import re

import requests
from bs4 import BeautifulSoup


def get_html(url):
    # Disable compressed transfer so response.content is plain HTML bytes
    headers = {'Accept-Encoding': 'identity'}
    response = requests.get(url, headers=headers)
    return response.content


def parse_html(html):
    soup = BeautifulSoup(html, features="html.parser")
    links = []
    # Match internal article links: /wiki/ paths without a colon, which
    # excludes namespace pages such as Special:, File: and Category:
    for link in soup.find_all('a', attrs={'href': re.compile("^(/wiki/)((?!:).)*$")}):
        links.append('https://en.wikipedia.org' + link['href'])
    return links


def main_link(html):
    # The canonical <link> tag holds the article's permanent URL,
    # which is the page that Special:Random redirected to
    soup = BeautifulSoup(html, features="html.parser")
    canonical = soup.find('link', rel='canonical')
    return canonical['href']


def main():
    article1 = main_link(get_html('https://en.wikipedia.org/wiki/Special:Random'))
    print('article 1 ' + article1)
    parse_art1 = parse_html(get_html(article1))
    print('links article 1:')
    print(parse_art1)


if __name__ == '__main__':
    main()
Output:

article 1 https://en.wikipedia.org/wiki/Larissa_Marolt
links article 1:
['https://en.wikipedia.org/wiki/Carinthia_(state)', 'https://en.wikipedia.org/wiki/Austria', 'https://en.wikipedia.org/wiki/Klagenfurt', 'https://en.wikipedia.org/wiki/Fashion_model', 'https://en.wikipedia.org/wiki/Austria%27s_Next_Topmodel,_Cycle_1', 'https://en.wikipedia.org/wiki/Austria%27s_Next_Topmodel', 'https://en.wikipedia.org/wiki/Germany%27s_Next_Topmodel,_Cycle_4', 'https://en.wikipedia.org/wiki/Germany%27s_Next_Top_Model', 'https://en.wikipedia.org/wiki/Carinthia_(state)', 'https://en.wikipedia.org/wiki/Sankt_Kanzian_am_Klopeiner_See', 'https://en.wikipedia.org/wiki/Freedom_Party_of_Austria', 'https://en.wikipedia.org/wiki/Gymnasium_(Germany)', 'https://en.wikipedia.org/wiki/Swarovski',
..... many more links .....
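
The list above can contain repeats (for example, https://en.wikipedia.org/wiki/Carinthia_(state) appears twice), since the same article is often linked more than once in the page markup. If unique links are wanted, a minimal sketch that deduplicates while preserving the original order (unique_links is a hypothetical helper, not part of the script above):

def unique_links(links):
    # dict.fromkeys keeps only the first occurrence of each URL, in order
    return list(dict.fromkeys(links))

# usage, e.g.:
# parse_art1 = unique_links(parse_html(get_html(article1)))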