Сайт
https://www.udemy.com/courses/business/
Пользовался библиотекой phpQuery и simple_html_dom, обе не помогают.
Я так-же пробовал парсить через CURL, не получается.
Как спарсить?
<?
include 'simple_html_dom.php';
require_once 'phpQuery/phpQuery/phpQuery.php';
$siteList = array(
'https://www.udemy.com/courses/business/',
'https://www.udemy.com/courses/business/finance-courses/'
);
ini_set('max_execution_time', 0);
$url = 'https://www.udemy.com/courses/business/';
function get_content ($base) {
$curl = curl_init();
// curl_setopt($curl, CURLOPT_PROXY, '96.96.33.133:1080');
// curl_setopt($curl, CURLOPT_PROXYPORT, '1080');
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($curl, CURLOPT_HTTPHEADER, array(
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
"X-Requested-With: XMLHttpRequest",
"Referer: https://client.work-zilla.com/away?url=https%3A%2F%2Fwww.udemy.com%2Fcourses%2Fbusiness%2F%3Fsort%3Dnewest",
"Connection: keep-alive",
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding: gzip, deflate, br",
"Accept-Language: ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
"Cache-Control: max-age=0",
"Host: www.udemy.com",
"Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: same-origin",
"Sec-Fetch-User: ?1",
"Upgrade-Insecure-Requests: 1",
"Content-Type: text/html; charset=utf-8"
));
curl_setopt($curl, CURLOPT_HEADER, true);
// curl_setopt($curl, CURLOPT_POSTFIELDS, array(
// 'event '=> 'pageview',
// 'metadata' => '{"url":"https://www.udemy.com/courses/business/","user_agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36","language":"ru-RU","page_has_microdata":false,"screen_width":1920,"screen_height":1080,"og_data":{"$og_title":"Онлайн-курсы по бизнесу","$og_description":"Богатая библиотека курсов по электронному бизнесу и предпринимательству. Узнайте, как основать собственную компанию, что такое маркетинг, реклама, финансы и начните учиться уже сегодня.","$og_image_url":"https://www.udemy.com/staticx/udemy/images/v6/default-meta-image.png","$og_video":null},"hosted_deeplink_data":{}}',
// 'initial_referrer' => 'https://client.work-zilla.com/away?url=https%3A%2F%2Fwww.udemy.com%2Fcourses%2Fbusiness%2F%3Fsort%3Dnewest',
// 'browser_fingerprint_id' => '776518790625545054',
// 'identity_id' => '777756563292973063',
// 'sdk' => 'web2.25.0',
// 'session_id' => '777756563309493432',
// 'branch_key' => 'key_live_dkmkLOFE7jz8nKqIcFl0hliaamb5MyJ6'
// ));
curl_setopt($curl, CURLOPT_POSTFIELDS, 'event=pageview&metadata=%7B%22url%22%3A%22https%3A%2F%2Fwww.udemy.com%2Fcourses%2Fbusiness%2F%22%2C%22user_agent%22%3A%22Mozilla%2F5.0%20(Windows%20NT%2010.0%3B%20Win64%3B%20x64)%20AppleWebKit%2F537.36%20(KHTML%2C%20like%20Gecko)%20Chrome%2F80.0.3987.163%20Safari%2F537.36%22%2C%22language%22%3A%22ru-RU%22%2C%22page_has_microdata%22%3Afalse%2C%22screen_width%22%3A1920%2C%22screen_height%22%3A1080%2C%22og_data%22%3A%7B%22%24og_title%22%3A%22%D0%9E%D0%BD%D0%BB%D0%B0%D0%B9%D0%BD-%D0%BA%D1%83%D1%80%D1%81%D1%8B%20%D0%BF%D0%BE%20%D0%B1%D0%B8%D0%B7%D0%BD%D0%B5%D1%81%D1%83%22%2C%22%24og_description%22%3A%22%D0%91%D0%BE%D0%B3%D0%B0%D1%82%D0%B0%D1%8F%20%D0%B1%D0%B8%D0%B1%D0%BB%D0%B8%D0%BE%D1%82%D0%B5%D0%BA%D0%B0%20%D0%BA%D1%83%D1%80%D1%81%D0%BE%D0%B2%20%D0%BF%D0%BE%20%D1%8D%D0%BB%D0%B5%D0%BA%D1%82%D1%80%D0%BE%D0%BD%D0%BD%D0%BE%D0%BC%D1%83%20%D0%B1%D0%B8%D0%B7%D0%BD%D0%B5%D1%81%D1%83%20%D0%B8%20%D0%BF%D1%80%D0%B5%D0%B4%D0%BF%D1%80%D0%B8%D0%BD%D0%B8%D0%BC%D0%B0%D1%82%D0%B5%D0%BB%D1%8C%D1%81%D1%82%D0%B2%D1%83.%20%D0%A3%D0%B7%D0%BD%D0%B0%D0%B9%D1%82%D0%B5%2C%20%D0%BA%D0%B0%D0%BA%20%D0%BE%D1%81%D0%BD%D0%BE%D0%B2%D0%B0%D1%82%D1%8C%20%D1%81%D0%BE%D0%B1%D1%81%D1%82%D0%B2%D0%B5%D0%BD%D0%BD%D1%83%D1%8E%20%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8E%2C%20%D1%87%D1%82%D0%BE%20%D1%82%D0%B0%D0%BA%D0%BE%D0%B5%20%D0%BC%D0%B0%D1%80%D0%BA%D0%B5%D1%82%D0%B8%D0%BD%D0%B3%2C%20%D1%80%D0%B5%D0%BA%D0%BB%D0%B0%D0%BC%D0%B0%2C%20%D1%84%D0%B8%D0%BD%D0%B0%D0%BD%D1%81%D1%8B%20%D0%B8%20%D0%BD%D0%B0%D1%87%D0%BD%D0%B8%D1%82%D0%B5%20%D1%83%D1%87%D0%B8%D1%82%D1%8C%D1%81%D1%8F%20%D1%83%D0%B6%D0%B5%20%D1%81%D0%B5%D0%B3%D0%BE%D0%B4%D0%BD%D1%8F.%22%2C%22%24og_image_url%22%3A%22https%3A%2F%2Fwww.udemy.com%2Fstaticx%2Fudemy%2Fimages%2Fv6%2Fdefault-meta-image.png%22%2C%22%24og_video%22%3Anull%7D%2C%22hosted_deeplink_data%22%3A%7B%7D%7D&initial_referrer=https%3A%2F%2Fclient.work-zilla.com%2Faway%3Furl%3Dhttps%253A%252F%252Fwww.udemy.com%252Fcourses%252Fbusiness%252F%253Fsort%253Dnewest&browser_fingerprint_id=776518790625545054&identity_id=777756563292973063&sdk=web2.25.0&session_id=777756563309493432&branch_key=key_live_dkmkLOFE7jz8nKqIcFl0hliaamb5MyJ6');
curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($curl, CURLOPT_URL, $base);
curl_setopt($curl, CURLOPT_REFERER, $base);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($curl, CURLOPT_ENCODING ,"");
curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/531.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36");
curl_setopt($curl, CURLOPT_COOKIE, "evi=SlFYNkxYDm4DQRx1TFgObkdREXBCQAMtE0kedlpUCGATQR52VkBPNxMFCXtfTlc6UFERd1tRRTEdURl3V1JXdkpRXWNUWlluRxIJe1hVTXxMDgdjGwYZJUUWCXsVQEd6CUMJexVAA24LQgdjGANXdgdEHnUTHwg=; PHPSESSID=jl0i13pn3157qca807jgp0jqa7; ServerName=WoW+Circle+3.3.5a+x5; serverId=1");
$str = curl_exec($curl);
$str = iconv('UTF-8', 'UTF-8', $str);
$info = curl_getinfo($curl);
curl_close($curl);
return $str;
}
$str = get_content($url);
echo $str;
// sleep(5);
$html_base = new simple_html_dom();
$html_base->load($str);
print_r($html_base->plaintext );
$html_base->clear();
unset($html_base);
?>
Если посмотрите, я там пробовал отправлять куки с помощью CURLOPT_POSTFIELDS. Используя оба способа, выходит ошибка "403 Forbidden".
Если не использовать, сайт вроде как показывает, но не то, что нужно.