from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import urlparse

resp = urllib.request.urlopen("https://toster.ru")
soup = BeautifulSoup(resp, 'html.parser',
                     from_encoding=resp.info().get_param('charset'))

# keep links that are relative (empty netloc) or mention toster.ru
for link in soup.find_all('a', href=True):
    href = link['href']
    if not urlparse(href).netloc or 'toster.ru' in href:
        print(href)
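Note that the substring check 'toster.ru' in href is also true for external links that merely mention toster.ru in the path or query string. A stricter check (just a sketch, comparing the parsed host with the site's own host) could look like this:

from urllib.parse import urlparse

SITE = 'toster.ru'

def is_internal(href, site=SITE):
    host = urlparse(href).netloc
    # relative link (no host at all) or a host on the site itself
    return host == '' or host == site or host.endswith('.' + site)

print(is_internal('/q/12345'))                         # True
print(is_internal('https://toster.ru/q/12345'))        # True
print(is_internal('https://google.com/?q=toster.ru'))  # False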
links = [
    'http://example.com/page.php',
    '//example.com/page2.php',
    'page3.php',
    '/page4.php',
]

# these are plain strings, not bs4 tags, so no ['href'] lookup here
for link in links:
    if not urlparse(link).netloc:
        print(link)
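For reference, here is what urlparse actually reports for each of these forms (nothing beyond the standard library):

from urllib.parse import urlparse

for link in ('http://example.com/page.php', '//example.com/page2.php',
             'page3.php', '/page4.php'):
    # scheme-relative '//...' URLs still carry a netloc; bare paths do not
    print(link, '->', urlparse(link).netloc or '(empty)')
# http://example.com/page.php -> example.com
# //example.com/page2.php -> example.com
# page3.php -> (empty)
# /page4.php -> (empty)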
links = [
    'http://example.com/page.php',
    '//example.com/page2.php',
    'google.com',
    'http://amazon.com'   # NB: missing comma - this literal and the next one
    'page3.php',          # are glued into 'http://amazon.compage3.php',
    '/page4.php',         # which is why page3.php never reaches the output
]

for href in links:
    if not urlparse(href).netloc or 'example.com' in href:
        print(href)
# result:
http://example.com/page.php
//example.com/page2.php
google.com
/page4.php
# but it should be:
http://example.com/page.php
//example.com/page2.php
page3.php
/page4.php
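The reason google.com slips through is that urlparse sees a bare 'google.com' (no scheme, no leading //) as a relative path, so its netloc is empty - syntactically it is indistinguishable from 'page3.php'. One workaround (only a heuristic of mine, with a hand-picked TLD list) is to treat such a string as a host when its suffix looks like a TLD rather than a file extension:

from urllib.parse import urlparse

SITE_HOST = 'example.com'
# assumption: a short, non-exhaustive list of TLDs used only as a heuristic
COMMON_TLDS = {'com', 'net', 'org', 'ru', 'io'}

def looks_like_bare_domain(href):
    first = href.split('/', 1)[0]           # 'google.com', 'page3.php', ''
    return '.' in first and first.rsplit('.', 1)[-1].lower() in COMMON_TLDS

def is_internal(href, site_host=SITE_HOST):
    netloc = urlparse(href).netloc
    if netloc:                               # absolute or '//'-style URL
        return netloc == site_host
    return not looks_like_bare_domain(href)  # relative path, probably internal

for href in ['http://example.com/page.php', '//example.com/page2.php',
             'google.com', 'http://amazon.com', 'page3.php', '/page4.php']:
    if is_internal(href):
        print(href)
# prints page.php, page2.php, page3.php, page4.php; skips google and amazon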
http://example.com/page.php
//example.com/page2.php
http://example.com/google.com
http://example.com/page4.php
example.com/google.com

http://example.com/page.php
//example.com/page2.php
http://example.com/page3.php
http://example.com/page4.php
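If the goal in that last list is to end up with absolute URLs on the site itself, urljoin from urllib.parse resolves all the relative forms; a minimal sketch, assuming http://example.com/ as the base (note that urljoin also normalizes the scheme-relative //example.com/... form):

from urllib.parse import urljoin

base = 'http://example.com/'
for href in ('http://example.com/page.php', '//example.com/page2.php',
             'page3.php', '/page4.php'):
    print(urljoin(base, href))
# http://example.com/page.php
# http://example.com/page2.php
# http://example.com/page3.php
# http://example.com/page4.php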