from collections import Counter
import re
from lxml.html import fromstring
from lxml.html.clean import Cleaner
import requests
def extract_text(node):
    """Return the text content of ``node`` with all markup removed.

    The subtree is walked in document order, gathering each element's
    leading text and the tail text that follows every child element.
    Each fragment is stripped of surrounding whitespace, blank fragments
    are discarded, and the remainder is joined with single spaces.
    The tail of ``node`` itself is not included.
    """
    fragments = []

    def collect(element):
        # Text located between the opening tag and the first child.
        if element.text:
            fragments.append(element.text.strip())
        for descendant in element.iterchildren():
            collect(descendant)
            # Text right after a child's closing tag is stored on the child
            # as its tail, so it is picked up here, after the child's subtree.
            if descendant.tail:
                fragments.append(descendant.tail.strip())

    collect(node)
    # Fragments that were whitespace-only are empty after strip(); drop them.
    return ' '.join(filter(None, fragments))
def count_words(text):
    """Count how often each whitespace-delimited token occurs in ``text``.

    Returns a ``Counter`` mapping every token to its number of occurrences.
    Tokens are compared exactly as they appear: case and any attached
    punctuation are preserved.
    """
    tally = Counter()
    for token in re.split(r'\s', text):
        # Consecutive whitespace characters produce empty pieces; skip them.
        if token:
            tally[token] += 1
    return tally
# Download the page. The timeout keeps a stalled server from hanging the
# script, and raise_for_status() prevents counting the words of an HTTP
# error page as if it were the requested content.
response = requests.get('https://toster.ru/q/276749', timeout=30)
response.raise_for_status()
html = response.content.decode('utf-8')

root = fromstring(html)
# Run lxml's Cleaner with its default settings over the parsed tree
# (the tree is modified in place) before extracting text.
Cleaner()(root)

# Extract the visible text once, from <body>, and count that.
# NOTE(review): the previous version computed `text` but then re-extracted
# from `root`, which walked the tree twice and also counted <head> text
# such as the page title. Confirm body-only counting is the intended result.
text = extract_text(root.body)
words_count = count_words(text)

# One '"word": count' line per distinct word, most frequent first.
print('\n'.join(('"%s": %i' % (word, count) for word, count in words_count.most_common())))
Функция `extract_text` взята из одного моего проекта, слегка адаптирована и упрощена.