import requests

# Base catalogue URL; individual pages are addressed as .../page-<n>/
base_url = "https://www.ursus.ru/catalogue/zashchita_ot_padeniy_s_vysoty/page-"
pages = 8

for i in range(1, pages + 1):
    print(f"Page: {i}")
    print(f"{base_url}{i}/")
    # Fix: the request previously appended "." instead of "/" to the page
    # number, producing a URL different from the one printed above; also
    # keep the response instead of discarding it.
    response = requests.get(f"{base_url}{i}/")
# Browser-like mobile User-Agent so the target site serves the normal page
# instead of rejecting an obvious script.
# NOTE: the backslash at the end of the next line is a continuation inside
# the string literal, so the two pieces join into one UA string.
headers = {
"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N)\
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Mobile Safari/537.36"
}
# NOTE(review): `url` is not defined in this snippet — presumably it was
# assigned earlier in the original script; confirm before running.
data = requests.get(url, headers=headers)
import requests
from bs4 import BeautifulSoup

# Fetch a realt.by flat listing and print the URL of its second gallery image.
url = "https://realt.by/sale-flats/object/2562548/"
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N)\
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Mobile Safari/537.36"
}

data = requests.get(url, headers=headers)
soup = BeautifulSoup(data.text, features="html.parser")

# Walk the gallery step by step: wrapper div -> blurred thumbnails -> second one.
gallery = soup.find("div", attrs={"class": "swiper-wrapper"})
thumbnails = gallery.findAll("img", class_="blur-sm scale-105")
img = thumbnails[1].get("src")
print(img)
# Output: https://static.realt.by/thumb/c/600x400/6f57b1d409f96f2b1ede7f082f120b50/ja/e/site15nf8eja/7c30f38145.jpg
from bs4 import BeautifulSoup
import requests

# Request the Instagram profile page with a desktop-browser User-Agent and
# dump the prettified HTML of whatever the server returns.
estrade_url = 'https://www.instagram.com/estrade.pmk/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}

page = requests.get(estrade_url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())
import requests
from bs4 import BeautifulSoup

# Collect the per-document page links from the uristhome.ru catalogue and
# append them to download_links.txt (the file the downloader script reads).
main_url = 'https://uristhome.ru'
docs_url = "https://uristhome.ru/document"
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

down_link = []
r = requests.get(docs_url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')

# Fix: iterate the <a> tags inside the list rather than the <ul>'s raw
# children — those include whitespace text nodes that have no .attrs.
doc_list = soup.find("ul", {"class": "y_articles-document-list"})
for anchor in doc_list.find_all("a"):
    down_link.append(main_url + anchor.attrs['href'])

# Fix: write the list that was actually built (`docs` was undefined here)
# and use the same file name the downloader script opens.
with open('download_links.txt', 'a') as nf:
    nf.writelines('\n'.join(down_link))
import requests
from bs4 import BeautifulSoup

# Visit every page listed in download_links.txt, pull the direct file link
# out of its "filefield-file" widget, and append the results to
# documents_link.txt.
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

docs = []
counter = 0
# Fix: context manager instead of a leaked open() handle.
with open('download_links.txt', 'r') as down_link:
    for links in down_link:
        links = links.strip()  # drop the trailing newline before requesting
        if not links:
            continue
        try:
            r = requests.get(links, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            x = soup.find("div", {"class": "filefield-file"}).find("a").attrs['href']
        except (requests.RequestException, AttributeError):
            # Fix: catch only the expected failures (network error, or a
            # page without the download block) instead of a bare
            # `except: pass` that also hides programming errors.
            continue
        counter += 1
        print(counter)
        print(x)
        docs.append(x)

with open('documents_link.txt', 'a') as nf:
    nf.writelines('\n'.join(docs))
import requests
from bs4 import BeautifulSoup


def get_first_news():
    """Fetch the random-joke page from anekdot.ru and print each joke's text."""
    headers = {
        # Real-browser User-Agent so the site serves the normal page.
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36 OPR/84.0.4316.52'
    }
    url = 'https://www.anekdot.ru/random/anekdot/'
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Every joke lives in a <div class="text"> block.
    anecdot = soup.find_all('div', class_="text")
    for article in anecdot:
        article_title = article.text.strip()
        print(article_title)


get_first_news()
# NOTE(review): `html` is not defined in this snippet — presumably it is a
# contact block like the sample markup further below; confirm before running.
soup = BeautifulSoup(html)
# Each contact field is marked by an <i> icon tag; the value of interest is
# the text node that immediately follows the icon, hence `.next` below.
author_text = soup.find('i',{'class':'icon icon-user'})
email_text = soup.find('i',{'class':'icon icon-support'})
phone_text = soup.find('i',{'class':'icon icon-phone'})
print(author_text.next)
print(email_text.next)
print(phone_text.next)
# Output:
# Пыльнев Анатолий
# tollik36@mail.ru
# 89055663563
from bs4 import BeautifulSoup

# Sample contact block: author, e-mail and phone inside one <td>, each value
# preceded by an <i> icon tag.
html="""
<tr>
<td>
<br/><br/>
<i class="icon icon-user" data-selector=".icon" title="Автор"></i> Барышева Олеся<br/>
<i class="icon icon-support" data-selector=".icon" title="E-mail"></i> olesya052019@bk.ru<br/>
<i class="icon icon-phone" data-selector=".icon" title="Телефон"></i> 89188565504<br/>
</td>
</tr>
"""
# Fix: name the parser explicitly — without it BeautifulSoup picks whichever
# parser happens to be installed (emitting a warning), and different parsers
# can build different trees.
soup = BeautifulSoup(html, 'html.parser')
my_text = soup.find('td')
# get_text() flattens the cell; split() collapses whitespace, yielding
# ['Surname', 'Name', 'email', 'phone'].
print(my_text.get_text().split())
# Output:
# ['Барышева', 'Олеся', 'olesya052019@bk.ru', '89188565504']
# Барышева Олеся
# olesya052019@bk.ru
# 89188565504
from bs4 import BeautifulSoup

# Same contact-block layout as above, but with three <tr>/<td> entries;
# print the flattened fields for every cell.
html="""
<tr>
<td>
<br/><br/>
<i class="icon icon-user" data-selector=".icon" title="Автор"></i> Барышева Олеся<br/>
<i class="icon icon-support" data-selector=".icon" title="E-mail"></i> olesya052019@bk.ru<br/>
<i class="icon icon-phone" data-selector=".icon" title="Телефон"></i> 89188565504<br/>
</td>
</tr>
<tr>
<td>
<br/><br/>
<i class="icon icon-user" data-selector=".icon" title="Автор"></i> Иван Иванович<br/>
<i class="icon icon-support" data-selector=".icon" title="E-mail"></i> obi_van_ia9@bk.ru<br/>
<i class="icon icon-phone" data-selector=".icon" title="Телефон"></i> 232321113312<br/>
</td>
</tr>
<tr>
<td>
<br/><br/>
<i class="icon icon-user" data-selector=".icon" title="Автор"></i> Темный лорд<br/>
<i class="icon icon-support" data-selector=".icon" title="E-mail"></i> pirojok51@mail.ru<br/>
<i class="icon icon-phone" data-selector=".icon" title="Телефон"></i> 80002111122<br/>
</td>
</tr>
"""
# Fix: explicit parser (stable across environments, no warning) and proper
# indentation of the loop body (it was flush-left: a SyntaxError).
soup = BeautifulSoup(html, 'html.parser')
my_text = soup.findAll('td')
for text in my_text:
    print(text.get_text().split())
# Output:
# ['Барышева', 'Олеся', 'olesya052019@bk.ru', '89188565504']
# ['Иван', 'Иванович', 'obi_van_ia9@bk.ru', '232321113312']
# ['Темный', 'лорд', 'pirojok51@mail.ru', '80002111122']
from bs4 import BeautifulSoup
import re

# Product-card fragment: the visible text is split around <script>
# placeholders, so a plain get_text() is full of newlines and space runs.
html = """
<p class="order-quantity j-orders-count-wrapper" data-link="class{merge: selectedNomenclature^ordersCount < 1 toggle='hide'}">Купили
<span data-link="{include tmpl='productCardOrderCount' ^~ordersCount=selectedNomenclature^ordersCount}">
<script type="jsv#29_"></script>
<script type="jsv#27^"></script>
<script type="jsv#30_"></script>
<script type="jsv#26^"></script>более 700 раз<script type="jsv/26^">
</script>
<script type="jsv/30_"></script>
<script type="jsv/27^"></script>
<script type="jsv/29_"></script>
</span>
</p>
"""
# Fix: explicit parser so the tree does not depend on what is installed.
soup = BeautifulSoup(html, 'html.parser')
# Replace newlines with spaces, then squeeze repeated blanks to one space.
full_text = re.sub(' +', ' ', soup.find('p').get_text().strip().replace(u'\n', u' '))
print(full_text)
# Pull out the digit groups, e.g. ['700'].
number = re.findall("[0-9]+", soup.find('p').get_text())
# Fix: `nunber` was a typo for `number` and raised NameError.
print(number)
# Установка зависимостей (в консоли, открытой через Win+R -> cmd):
#   pip install bs4
#   pip install requests
# NOTE(review): `soup` is not defined in this fragment — presumably it comes
# from one of the surrounding snippets; confirm before running.
item = soup.find('a', class_="sih-inspect-magnifier")
# Fix: the attribute key must be the string 'href' — the bare name `href`
# was undefined and raised NameError.
print(item['href'])
If you look at the source code for the page, you'll see that some javascript generates the webpage. What you see in the element browser is the webpage after the script has been run, and beautifulsoup just gets the html file. In order to parse the rendered webpage you'll need to use something like Selenium to render the webpage for you.
So, for example, this is how it would look with Selenium:
from bs4 import BeautifulSoup
import selenium.webdriver as webdriver
url = 'http://instagram.com/umnpics/'
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source)
for x in soup.findAll('li', {'class': 'photo'}):
    print(x)
soup.findAll('li', {'class':'photo'})
меняете на ваши нужды <span class="post-stats__comments-count" title="Читать комментарии">21</span>
import requests
from bs4 import BeautifulSoup

# Print how many comments a Habr post has, read from its stats counter span.
pageURL = "https://habr.com/ru/post/442800/"
req = requests.get(pageURL)
soup = BeautifulSoup(req.content, 'html.parser')
# Robustness: the counter span can be absent (layout change, error page) —
# report that instead of crashing with AttributeError on .get_text().
span = soup.find('span', {'class': 'post-stats__comments-count'})
if span is None:
    print('Счётчик комментариев не найден на ' + pageURL)
else:
    comments = span.get_text()
    print('Количество комментариев на '+ pageURL + ' : ' +comments)