diff --git a/script.php b/add_recording.php similarity index 96% rename from script.php rename to add_recording.php index 629e0e2..b0e1778 100644 --- a/script.php +++ b/add_recording.php @@ -7,7 +7,8 @@ use Symfony\Component\Yaml\Yaml; // сохраняются в yaml-файле. Скрипт парсит этот файл и заносит в БД require_once "vendor/autoload.php"; -function array_depth(array $array) { +function array_depth(array $array) +{ $max_depth = 1; foreach ($array as $value) { diff --git a/app/app.php b/app/app.php index f3a14aa..ec5ead4 100644 --- a/app/app.php +++ b/app/app.php @@ -4,6 +4,7 @@ namespace App; use App\Library\ContingentManager; use App\Library\DatabaseConfig; use App\Library\Logger; +use GuzzleHttp\Psr7\Exception\MalformedUriException; use GuzzleHttp\Exception\ClientException; use GuzzleHttp\Exception\ConnectException; use GuzzleHttp\Exception\RequestException; @@ -11,6 +12,10 @@ use GuzzleHttp\Exception\ServerException; use App\Library\ContingentParser; use App\Library\Database; use GuzzleHttp\Client; +use GuzzleHttp\Psr7\Request; +use Psr\Http\Message\RequestInterface; +use Psr\Http\Message\ResponseInterface; +use Psr\Http\Message\UriInterface; $pathLogErrorHtml = 'error-html.log'; $pathLogErrorHttp = 'error-http-curl.log'; @@ -21,29 +26,13 @@ $dbNiimko = new Database(new DatabaseConfig('niimko')); $sites = ContingentManager::getInstance()->getSites($dbNiimko); $specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko); $orgs = ContingentManager::getInstance()->getOrgs($dbOpendata); -// print_r($sites); -// print_r($specializations); -// print_r($org); -// $errorSites = []; -// $filename = 'error-html.log'; -// $array = file($filename); -// for ($i = 0; $i < count($array); $i++) { -// $arr = explode(' ', $array[$i]); -// if (!in_array($arr[2], $orgs)) { -// $errorSites[] = $arr[2]; -// } -// } -// $filename = 'error-http.log'; -// $array = file($filename); -// for ($i = 0; $i < count($array); $i++) { -// $arr = explode(' ', $array[$i]); -// if (!in_array($arr[2], $orgs)) { -// $errorSites[] = $arr[2]; -// } -// } +$exceptionsOrgHtml = ContingentManager::getInstance()->getExceptionsHtml('error-html.log'); +$exceptionsOrgHttpCurl = ContingentManager::getInstance()->getExceptionsHttpCurl('error-http-curl.log'); -$start = 999; +// print_r($exceptionsOrgHttpCurl); + +$start = 0; for ($i = $start; $i < count($sites); $i++) { // Нет URL сайта вуза @@ -57,13 +46,33 @@ for ($i = $start; $i < count($sites); $i++) { continue; } // С ошибками разметки игнорируем - // if (in_array($sites[$i]['org_id'], $errorSites)) { - // continue; - // } + if (in_array($sites[$i]['org_id'], $exceptionsOrgHtml)) { + continue; + } + if (!in_array($sites[$i]['org_id'], $exceptionsOrgHttpCurl)) { + continue; + } try { + $baseUri = ContingentManager::getInstance()->buildBaseUri($sites[$i]['site']); + $onRedirect = function( + RequestInterface $request, + ResponseInterface $res, + UriInterface $uri + ) { + echo 'Redirecting! ' . $request->getUri() . ' to ' . $uri . "\n"; + }; $client = new Client([ - 'allow_directs' => true, - 'track_redirects' => true, + 'force_ip_resolve' => 'v4', + 'debug' => fopen("debug-http.log", "a"), + 'base_uri' => $baseUri, + 'allow_directs' => [ + 'max' => 5, + 'strict' => true, + 'referer' => true, + 'protocols' => ['http', 'https'], + 'on_redirect' => $onRedirect, + 'track_redirects' => true + ], 'connect_timeout' => 300.0, 'verify' => false, // 'http_errors' => false, @@ -72,17 +81,14 @@ for ($i = $start; $i < count($sites); $i++) { 'Content-Type' => 'text/html;charset=utf-8' ] ]); - - $url = $sites[$i]['site']; - $url = ContingentManager::getInstance()->buildURL($url); - print(($i+1).". Current url: $url\n"); - - $response = $client->get($url); + print(($i+1). '. ' . implode(' ', $sites[$i]) . "\n"); + $response = $client->get('/sveden/education/'); echo $response->getStatusCode() .PHP_EOL; + var_dump($response->getHeaderLine("'X-Guzzle-Redirect-History") . PHP_EOL); $html = $response->getBody()->getContents(); if (empty($html)) { - $message = $sites[$i]['org_id'] . ' ' . $sites[$i]['site']; + $message = implode(' ', $sites[$i]); Logger::log($pathLogErrorHtml, $message); continue; } @@ -96,7 +102,7 @@ for ($i = $start; $i < count($sites); $i++) { ContingentManager::getInstance()->addOrgId($contingent, $sites[$i]['org_id']); print_r($contingent); if (empty($contingent)) { - $message = $sites[$i]['org_id'] . ' ' . $sites[$i]['site']; + $message = implode(' ', $sites[$i]); Logger::log($pathLogErrorHtml, $message); } else { $set = ContingentManager::getInstance()->checkContingent($contingent); @@ -104,7 +110,7 @@ for ($i = $start; $i < count($sites); $i++) { // Заносим в базу ContingentManager::getInstance()->insertContingent($dbOpendata, $contingent); } else { - $message = $sites[$i]['org_id'] . ' ' . $sites[$i]['site']; + $message = implode(' ', $sites[$i]); Logger::log($pathLogErrorHtml, $message); } unset($contingent); @@ -112,7 +118,8 @@ for ($i = $start; $i < count($sites); $i++) { } catch (ClientException | RequestException | ConnectException - | ServerException $e + | ServerException + | MalformedUriException $e ) { $message = implode(' ', $sites[$i]) . "\t" . $e->getCode() . "\t" . $e->getMessage(); Logger::log($pathLogErrorHttp, $message); diff --git a/app/library/ContingentManager.php b/app/library/ContingentManager.php index c83d70f..fecd39f 100644 --- a/app/library/ContingentManager.php +++ b/app/library/ContingentManager.php @@ -81,21 +81,12 @@ final class ContingentManager return $orgs; } - public function buildURL(string $url): string + public function buildBaseUri(string $url): string { - // Строит -> https:///sveden/education/ - $offset = strpos($url, '/', strlen('http://')); - if ($offset) { - $url = substr_replace($url, '', $offset); + // Строит -> https:// + if (strpos($url,'https://') === false && strpos($url,'http://') === false) { + $url = "http://$url"; } - - $url = "$url/sveden/education/"; - if (str_contains($url, "http://")) { - $url = str_replace("http://","https://", $url); - } else { - $url = "https://$url"; - } - $url = str_replace("www.","", $url); return $url; } @@ -130,4 +121,32 @@ final class ContingentManager } return $count ? true : false; } + + public function getExceptionsHtml(string $filename) : array + { + $errorSites = []; + $array = file($filename); + for ($i = 0; $i < count($array); $i++) { + $arr = explode(' ', $array[$i]); + $errorSites[] = $arr[2]; + } + return $errorSites; + } + + public function getExceptionsHttpCurl(string $filename) : array + { + $array = file($filename); + $orgHttpError = []; + foreach ($array as $str) { + $data = explode (' ', $str); + if (preg_match("/^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])$/", $data[0]) + && $data[3] != PHP_EOL) { + $orgHttpError[] = $data[2]; + // $orgHttpError[] = ['org_id' => $data[2], 'site' => $data[3]]; + } + } + $orgHttpError = array_unique($orgHttpError); + sort($orgHttpError); + return $orgHttpError; + } } \ No newline at end of file diff --git a/app/library/ContingentParser.php b/app/library/ContingentParser.php index b14f92d..60017c3 100644 --- a/app/library/ContingentParser.php +++ b/app/library/ContingentParser.php @@ -27,15 +27,18 @@ class ContingentParser { libxml_use_internal_errors(true); $dom = new \DOMDocument( - encoding: "UTF-8" + // encoding: "UTF-8" ); - $html = mb_convert_encoding( - $html, - 'UTF-8', - mb_detect_encoding($html, 'UTF-8, windows-1251') - ); - $html = str_replace('windows-1251','utf-8', $html); - $dom->loadHTML($html); + if (mb_detect_encoding($html, 'UTF-8, windows-1251') != "UTF-8") { + $html = mb_convert_encoding( + $html, + 'UTF-8', + mb_detect_encoding($html, 'UTF-8, windows-1251') + ); + $html = str_replace('windows-1251','utf-8', $html); + } else { + $dom->loadHTML(mb_convert_encoding($html,'HTML-ENTITIES','UTF-8')); + } $this->xpath = new \DOMXPath($dom); $this->template = $template; } diff --git a/app/library/Database.php b/app/library/Database.php index 3972838..ceaae08 100644 --- a/app/library/Database.php +++ b/app/library/Database.php @@ -68,18 +68,13 @@ class Database $stmt->execute(); Logger::log(self::$logFile, "Запрос выполнен успешно!"); } catch (PDOException $e) { - // При ошибке запроса сохраняем валидные данные в yaml-файл - $yaml = Yaml::dump([$params]); - file_put_contents(__DIR__ . '/not-recorded-in-db.yaml', $yaml, FILE_APPEND); $message = "Ошибка запроса:" . $e->getMessage(); Logger::log(self::$logFile, $message); - // TODO узнать код ошибки - // SQLSTATE[HY000]: General error: 2006 MySQL server has gone away - // if ($e->getCode() == 0) { - // $yaml = Yaml::dump($params); - // file_put_contents(__DIR__ . '/not-recorded-in-db.yaml', $yaml); - // exit(1); - // } + // При ошибке запроса сохраняем валидные данные в yaml-файл + if ($e->getCode() === "HY000") { + $yaml = Yaml::dump($params); + file_put_contents(__DIR__ . '/not-recorded-in-db.yaml', $yaml, FILE_APPEND); + } } } } \ No newline at end of file