diff --git a/app/app.php b/app/app.php index 39a1a0c..6da9e2e 100644 --- a/app/app.php +++ b/app/app.php @@ -1,6 +1,4 @@ getSites($dbNiimko); -$specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko); -$orgs = ContingentManager::getInstance()->getOrgs($dbOpendata); +// $specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko); +// $orgs = ContingentManager::getInstance()->getOrgs($dbOpendata); -$exceptionsOrgHtml = ContingentManager::getInstance()->getExceptionsHtml('error-html.log'); -$exceptionsOrgHttpCurl = ContingentManager::getInstance()->getExceptionsHttpCurl('error-http-curl.log'); +// $exceptionsOrgHtml = ContingentManager::getInstance()->getExceptionsHtml('select-html-error.log'); +// $exceptionsOrgHttpCurl = ContingentManager::getInstance()->getExceptionsHttpCurl('select-http-error.log'); // print_r($exceptionsOrgHttpCurl); -$start = 0; +// echo count($exceptionsOrgHttpCurl) . " - http-error sites" . PHP_EOL; -for ($i = $start; $i < count($sites); $i++) { +// $start = 794; +// for ($i = $start; $i < count($sites); $i++) { +// // Нет URL сайта вуза +// if (empty($sites[$i]['site'])) { +// $message = implode(' ', $sites[$i]); +// Logger::log($pathLogErrorHttp, $message); +// Logger::log($pathErrorHttp, implode(' ', $sites[$i])); +// continue; +// } +// // Уже в базе +// if (in_array($sites[$i]['org_id'], $orgs)) { +// continue; +// } +// // С ошибками разметки игнорируем +// if (in_array($sites[$i]['org_id'], $exceptionsOrgHtml)) { +// continue; +// } +// // Без ошибок http игнорируем +// if (!in_array($sites[$i]['org_id'], $exceptionsOrgHttpCurl)) { +// continue; +// } +// try { +// $baseUri = ContingentManager::getInstance()->buildBaseUri($sites[$i]['site']); +// $client = new Client([ +// 'force_ip_resolve' => 'v4', +// 'debug' => fopen("debug-http.log", "a"), +// 'base_uri' => $baseUri, +// 'allow_directs' => [ +// 'max' => 5, +// 'strict' => true, +// 'referer' => true, +// 'protocols' => ['http', 'https'], +// 'track_redirects' => true +// ], +// 'connect_timeout' => 300.0, +// 'verify' => false, +// 'headers' => [ +// 'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36', +// 'Content-Type' => 'text/html;charset=utf-8' +// ] +// ]); +// $response = $client->get('', [ +// 'on_stats' => function (TransferStats $stats) use (&$url) { +// $url = $stats->getEffectiveUri(); +// } +// ]); + +// print(($i+1). '. ' . implode(' ', $sites[$i]) . PHP_EOL); +// if (substr($url, -1) == '/') { +// $url = $url."sveden/education/"; +// } else { +// $url = $url."/sveden/education/"; +// } +// echo $url .PHP_EOL; +// $response = $client->get($url, [ +// 'on_stats' => function (TransferStats $stats) use (&$url) { +// $url = $stats->getEffectiveUri(); +// } +// ]); +// echo $url . PHP_EOL; + +// $html = $response->getBody()->getContents(); +// if (empty($html)) { +// $message = implode(' ', $sites[$i]); +// Logger::log($pathLogErrorHtml, $message); +// continue; +// } +// $parser = new ContingentParser($html, '//tr[@itemprop="eduChislen"]//'); +// $contingent = $parser->getDataTable(); + +// // Добавляем поле spec_id по spec_code +// ContingentManager::getInstance()->addSpecId($contingent, $specializations); + +// // Добавляем поле org_id +// ContingentManager::getInstance()->addOrgId($contingent, $sites[$i]['org_id']); +// print_r($contingent); +// if (empty($contingent)) { +// $message = implode(' ', $sites[$i]); +// Logger::log($pathLogErrorHtml, $message); +// } else { +// $set = ContingentManager::getInstance()->checkContingent($contingent); +// if ($set) { +// // Заносим в базу +// ContingentManager::getInstance()->insertContingent($dbOpendata, $contingent); +// } else { +// $message = implode(' ', $sites[$i]); +// Logger::log($pathLogErrorHtml, $message); +// } +// unset($contingent); +// } +// } catch (ClientException +// | RequestException +// | ConnectException +// | ServerException +// | MalformedUriException $e +// ) { +// $message = implode(' ', $sites[$i]) . " " . $e->getCode() . " " . $e->getMessage(); +// Logger::log($pathLogErrorHttp, $message); +// Logger::log($pathErrorHttp, implode(' ', $sites[$i])); +// } +// } + +$specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko); +$orgs = ContingentManager::getInstance()->getOrgs($dbOpendata); +// $sites = Yaml::parse(file_get_contents(dirname(__FILE__) ."/sites.yaml")); +// print_r($sites); + +for ($i = 0; $i < count($sites); $i++) { // Нет URL сайта вуза if (empty($sites[$i]['site'])) { - $message = $sites[$i]['org_id'] . ' ' . $sites[$i]['site']; - Logger::log($pathLogErrorHttp, $message); + // $message = implode(' ', $sites[$i]); + // Logger::log($pathLogErrorHttp, $message); + // Logger::log($pathErrorHttp, implode(' ', $sites[$i])); continue; } // Уже в базе @@ -45,82 +151,65 @@ for ($i = $start; $i < count($sites); $i++) { continue; } // С ошибками разметки игнорируем - if (in_array($sites[$i]['org_id'], $exceptionsOrgHtml)) { - continue; + // if (in_array($sites[$i]['org_id'], $exceptionsOrgHtml)) { + // continue; + // } + // Без ошибок http игнорируем + // if (!in_array($sites[$i]['org_id'], $exceptionsOrgHttpCurl)) { + // continue; + // } + print(($i+1). '. ' . implode(' ', $sites[$i]) . PHP_EOL); + $uri = trim(ContingentManager::getInstance()->buildBaseUri($sites[$i]['site'])); + $uri = str_replace("_","/", $uri); + if (substr($uri, -1) == '/') { + $uri = $uri."sveden/education/"; + } else { + $uri = $uri."/sveden/education/"; } - if (!in_array($sites[$i]['org_id'], $exceptionsOrgHttpCurl)) { - continue; - } - try { - $baseUri = ContingentManager::getInstance()->buildBaseUri($sites[$i]['site']); - $onRedirect = function( - RequestInterface $request, - ResponseInterface $res, - UriInterface $uri - ) { - echo 'Redirecting! ' . $request->getUri() . ' to ' . $uri . "\n"; - }; - $client = new Client([ - 'force_ip_resolve' => 'v4', - 'debug' => fopen("debug-http.log", "a"), - 'base_uri' => $baseUri, - 'allow_directs' => [ - 'max' => 5, - 'strict' => true, - 'referer' => true, - 'protocols' => ['http', 'https'], - 'on_redirect' => $onRedirect, - 'track_redirects' => true - ], - 'connect_timeout' => 300.0, - 'verify' => false, - // 'http_errors' => false, - 'headers' => [ - 'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36', - 'Content-Type' => 'text/html;charset=utf-8' - ] - ]); - print(($i+1). '. ' . implode(' ', $sites[$i]) . "\n"); - $response = $client->get('/sveden/education/'); - echo $response->getStatusCode() .PHP_EOL; - var_dump($response->getHeaderLine("'X-Guzzle-Redirect-History") . PHP_EOL); - - $html = $response->getBody()->getContents(); - if (empty($html)) { - $message = implode(' ', $sites[$i]); - Logger::log($pathLogErrorHtml, $message); - continue; - } - $parser = new ContingentParser($html, '//tr[@itemprop="eduChislen"]//'); - $contingent = $parser->getDataTable(); - - // Добавляем поле spec_id по spec_code - ContingentManager::getInstance()->addSpecId($contingent, $specializations); - - // Добавляем поле org_id - ContingentManager::getInstance()->addOrgId($contingent, $sites[$i]['org_id']); - print_r($contingent); - if (empty($contingent)) { - $message = implode(' ', $sites[$i]); - Logger::log($pathLogErrorHtml, $message); - } else { - $set = ContingentManager::getInstance()->checkContingent($contingent); - if ($set) { - // Заносим в базу - ContingentManager::getInstance()->insertContingent($dbOpendata, $contingent); - } else { - $message = implode(' ', $sites[$i]); - Logger::log($pathLogErrorHtml, $message); - } - unset($contingent); - } - } catch (ClientException - | RequestException - | ConnectException - | ServerException - | MalformedUriException $e - ) { - $message = implode(' ', $sites[$i]) . "\t" . $e->getCode() . "\t" . $e->getMessage(); + echo $uri . PHP_EOL; + $ch = curl_init($uri); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); + curl_setopt($ch, CURLOPT_HEADER, false); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + $html = curl_exec($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + if ($httpCode != 200) { + $errno = curl_errno($ch); + $message = implode(' ', $sites[$i]); + $message .= " cURL error ({$errno}): ".curl_strerror($errno); Logger::log($pathLogErrorHttp, $message); - } -} + unset($httpCode); + continue; + } + + curl_close($ch); + echo "HTTP-code: " . $httpCode . PHP_EOL; + if (empty($html)) continue; + + $parser = new ContingentParser($html, '//tr[@itemprop="eduChislen"]//'); + $contingent = $parser->getDataTable(); + // Добавляем поле spec_id по spec_code + ContingentManager::getInstance()->addSpecId($contingent, $specializations); + // Добавляем поле org_id + ContingentManager::getInstance()->addOrgId($contingent, $sites[$i]['org_id']); + + + if (empty($contingent)) { + echo "empty". PHP_EOL; + $message = implode(' ', $sites[$i]); + Logger::log($pathLogErrorHtml, $message); + } else { + print_r($contingent); + $set = ContingentManager::getInstance()->checkContingent($contingent); + if ($set) { + // Заносим в базу + ContingentManager::getInstance()->insertContingent($dbOpendata, $contingent); + } else { + $message = implode(' ', $sites[$i]); + Logger::log($pathLogErrorHtml, $message); + } + unset($contingent); + unset($httpCode); + } +} \ No newline at end of file diff --git a/app/library/ContingentManager.php b/app/library/ContingentManager.php index fecd39f..f20d1c0 100644 --- a/app/library/ContingentManager.php +++ b/app/library/ContingentManager.php @@ -37,6 +37,31 @@ final class ContingentManager return $sites; } + public function getSitesFromMiccedu(Database $db, array $params) : array + { + // select site, vuzkod as org_id from opendata.miccedu_monitoring + // where year = 2023 and (vuzkod = :val1 or vuzkod = :val2 or ...) + $year = 2023; + foreach ($params as $key => $org) { + $params[$key] = (int)$org['org_id']; + } + $query = $this->builder->select() + ->setTable('miccedu_monitoring') + ->setColumns(['org_id' => 'vuzkod','site']) + ->where('AND') + ->equals('year', $year) + ->subWhere('OR'); + foreach ($params as $orgId) { + $query->equals('vuzkod', $orgId); + } + $query = $query->end(); + $sql = $this->builder->writeFormatted($query); + array_unshift($params, $year); + $sites = $db->selectQuery($sql, $params); + + return $sites; + } + public function insertContingent(Database $db, array $contingent) : void { $params = ['spec_code', 'spec_name', 'edu_level', 'edu_forms', 'contingent', 'spec_id', 'org_id']; @@ -84,11 +109,21 @@ final class ContingentManager public function buildBaseUri(string $url): string { // Строит -> https:// - if (strpos($url,'https://') === false && strpos($url,'http://') === false) { + $url = str_replace("www/", "www.", $url); + if (strpos($url,'https://') === false + && strpos($url,'http://') === false + ) { $url = "http://$url"; } + $url = str_replace("https://", "http://", $url); + $arr = parse_url($url); + $url = $arr['scheme'] .'://'. $arr['host'] . '/'; + + // $url = str_replace("www.", "", $url); + $url = str_replace("_","/", $url); + $url = $url."sveden/education/"; - return $url; + return trim($url); } public function addSpecId(array &$contingent, array $specializations) : void @@ -141,12 +176,12 @@ final class ContingentManager $data = explode (' ', $str); if (preg_match("/^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])$/", $data[0]) && $data[3] != PHP_EOL) { - $orgHttpError[] = $data[2]; - // $orgHttpError[] = ['org_id' => $data[2], 'site' => $data[3]]; + // $orgHttpError[] = $data[2]; + $orgHttpError[] = ['org_id' => $data[2], 'site' => $data[3]]; } } - $orgHttpError = array_unique($orgHttpError); - sort($orgHttpError); + // $orgHttpError = array_unique($orgHttpError); + ksort($orgHttpError); return $orgHttpError; } } \ No newline at end of file diff --git a/app/library/ContingentParser.php b/app/library/ContingentParser.php index dd50758..a4d4f21 100644 --- a/app/library/ContingentParser.php +++ b/app/library/ContingentParser.php @@ -36,10 +36,10 @@ class ContingentParser 'UTF-8', $encoding ); - $html = str_replace('windows-1251','utf-8', $html); - } else { - $dom->loadHTML(mb_convert_encoding($html,'HTML-ENTITIES','UTF-8')); + $html = str_replace('windows-1251','utf-8', $html); } + $dom->loadHTML(mb_convert_encoding($html,'HTML-ENTITIES','UTF-8')); + $this->xpath = new \DOMXPath($dom); $this->template = $template; } diff --git a/curl-helper.php b/curl-helper.php index 850d436..fe4c82c 100644 --- a/curl-helper.php +++ b/curl-helper.php @@ -7,45 +7,8 @@ use App\Library\Logger; use Symfony\Component\Yaml\Yaml; require_once(dirname(__FILE__) ."/vendor/autoload.php"); +// require_once(dirname(__FILE__) ."/test.php"); -function curl_redir_exec($ch) -{ - static $curl_loops = 0; - static $curl_max_loops = 20; - if ($curl_loops++ >= $curl_max_loops) { - $curl_loops = 0; - return false; - } - curl_setopt($ch, CURLOPT_HEADER, true); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - $data = curl_exec($ch); - list($header, $data) = explode("\n\n", $data, 2); - $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE); - if ($http_code == 301 || $http_code == 302) { - $matches = []; - preg_match('/Location:(.*?)\n/', $header, $matches); - $url = @parse_url(trim(array_pop($matches))); - if (!$url) { - //couldn't process the url to redirect to - $curl_loops = 0; - return $data; - } - $last_url = parse_url(curl_getinfo($ch, CURLINFO_EFFECTIVE_URL)); - if (!$url['scheme']) - $url['scheme'] = $last_url['scheme']; - if (!$url['host']) - $url['host'] = $last_url['host']; - if (!$url['path']) - $url['path'] = $last_url['path']; - $new_url = $url['scheme'] . '://' . $url['host'] . $url['path'] . ($url['query']?'?'.$url['query']:''); - curl_setopt($ch, CURLOPT_URL, $new_url); - // debug('Redirecting to', $new_url); - return curl_redir_exec($ch); - } else { - $curl_loops=0; - return $data; - } -} $pathLogErrorHttp = __DIR__.'/log/'. date('Y-m-d') . '/error-http-curl.log'; $pathLogErrorHtml = __DIR__.'/log/'. date('Y-m-d') . '/error-html.log'; @@ -58,8 +21,9 @@ $dbNiimko = new Database(new DatabaseConfig('niimko')); // $sites = ContingentManager::getInstance()->getSites($dbNiimko); $specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko); $orgs = ContingentManager::getInstance()->getOrgs($dbOpendata); -$sites = Yaml::parse(file_get_contents(dirname(__FILE__) ."/sites.yaml")); -// $sites = ContingentManager::getInstance()->getExceptionsHttpCurl('log/2024-08-27/error-http-curl.log'); +// $sites = Yaml::parse(file_get_contents(dirname(__FILE__) ."/sites.yaml")); +$sites = ContingentManager::getInstance()->getExceptionsHttpCurl('log/2024-08-28/error-http-curl.log'); +$sites = ContingentManager::getInstance()->getSitesFromMiccedu($dbOpendata, $sites); // print_r($sites); for ($i = 0; $i < count($sites); $i++) { // Нет URL сайта вуза @@ -82,23 +46,22 @@ for ($i = 0; $i < count($sites); $i++) { // continue; // } print(($i+1). '. ' . implode(' ', $sites[$i]) . PHP_EOL); - $uri = trim(ContingentManager::getInstance()->buildBaseUri($sites[$i]['site'])); - $uri = str_replace("_","/", $uri); - if (substr($uri, -1) == '/') { - $uri = $uri."sveden/education/"; - } else { - $uri = $uri."/sveden/education/"; - } + $uri = ContingentManager::getInstance()->buildBaseUri($sites[$i]['site']); echo $uri . PHP_EOL; + $ua = 'Mozilla/5.0 (X11; Linux x86_64) ' + .'AppleWebKit/537.36 (KHTML, like Gecko) ' + .'Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36'; + // $html = get_content($uri); $ch = curl_init($uri); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); curl_setopt($ch, CURLOPT_HEADER, false); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36'); + curl_setopt($ch, CURLOPT_USERAGENT, $ua); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); - + curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 90); $html = curl_exec($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); if ($httpCode != 200 && $httpCode != 0) { $message = implode(' ', $sites[$i]) . ' ' . $httpCode;