from fake_useragent import UserAgent
import aiohttp
import asyncio
from bs4 import BeautifulSoup as bs
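# Asynchronous scraper for dok.ua: collects rubric links from the main menu,
# then catalog links from each rubric page.
# Dependencies: pip install aiohttp beautifulsoup4 lxml fake-useragent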
BASE_URL = 'https://dok.ua/ua'  # Ukrainian-language section of the site
async def getPage(session, url):
    """Fetch a single page with a random User-Agent and return its HTML."""
    user_agent = {'user-agent': UserAgent().random}
    async with session.get(url, headers=user_agent) as r:
        return await r.text(encoding='utf-8')
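# Hedged sketch (not in the original): getPage has no error handling, so one
# failed request aborts the whole gather(). A variant with a timeout, a status
# check, and simple exponential-backoff retries; the 15 s budget, 3 attempts,
# and the name getPageWithRetry are assumptions.
async def getPageWithRetry(session, url, attempts=3):
    timeout = aiohttp.ClientTimeout(total=15)  # assumed per-request time budget
    for attempt in range(attempts):
        try:
            async with session.get(url, headers={'user-agent': UserAgent().random},
                                   timeout=timeout) as r:
                r.raise_for_status()  # raise on 4xx/5xx responses
                return await r.text(encoding='utf-8')
        except (aiohttp.ClientError, asyncio.TimeoutError):
            if attempt == attempts - 1:
                raise  # out of retries: surface the last error
            await asyncio.sleep(2 ** attempt)  # back off 1 s, 2 s, ...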
async def getPages(session, urls):
    """Fetch all URLs concurrently and return their HTML in input order."""
    tasks = [asyncio.create_task(getPage(session, url)) for url in urls]
    return await asyncio.gather(*tasks)
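# Hedged sketch (not in the original): gather() launches every request at
# once, which can overwhelm the target site. asyncio.Semaphore caps how many
# run concurrently; the limit of 5 and the name getPagesBounded are assumptions.
async def getPagesBounded(session, urls, limit=5):
    semaphore = asyncio.Semaphore(limit)

    async def bounded(url):
        async with semaphore:  # at most `limit` requests in flight at a time
            return await getPage(session, url)

    return await asyncio.gather(*(bounded(u) for u in urls))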
async def createSession(urls):
    """Open one client session, with the Ukrainian-language cookie, for all requests."""
    cookie_lang = {'lang': 'ua'}
    async with aiohttp.ClientSession(cookies=cookie_lang) as session:
        return await getPages(session, urls)
def requests(urls):
    """Synchronous entry point; note the name shadows the `requests` library."""
    return asyncio.run(createSession(urls))
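# Example usage: requests() spins up its own event loop via asyncio.run, so it
# must be called from synchronous code only (never from inside a coroutine).
#   pages = requests([BASE_URL])
#   first_page_html = pages[0]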
def getSoup(result):
    """Parse HTML with the lxml parser (requires the `lxml` package)."""
    return bs(result, 'lxml')
def parseRubricsLinks():
    """Collect absolute links to every rubric from the main menu."""
    r = requests([BASE_URL])
    soup = getSoup(r[0])
    rubrics = soup.find_all('a', class_='menu-list__link')
    rubrics_links = []
    for rubric in rubrics:
        href = rubric.get('href')
        if href is not None:  # skip anchors without an href attribute
            rubrics_links.append(BASE_URL + href)
    return rubrics_links
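# Hedged sketch (not in the original): plain string concatenation breaks if an
# href is already absolute. urllib.parse.urljoin handles both forms; note it
# resolves a leading '/' against the domain root rather than appending to
# BASE_URL, so results can differ from the concatenation above. The helper
# name absolutize is an assumption.
from urllib.parse import urljoin

def absolutize(href):
    return urljoin(BASE_URL + '/', href)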
def parseCatalogsLinks():
    """Collect deduplicated catalog links from the first nine rubric pages."""
    rubrics = parseRubricsLinks()
    catalog_links = []
    for result in requests(rubrics[:9]):  # limit to the first nine rubrics
        soup = getSoup(result)
        links = soup.find_all('a', class_='menu-mob-level3__link')
        for link in links:
            print(link)  # debug output: inspect each matched tag
            link_url = link.get('href')
            if link_url is not None:
                catalog_links.append(f'{BASE_URL}{link_url}')
    return list(set(catalog_links))
def main():
    parseCatalogsLinks()


if __name__ == '__main__':
    main()