sveden-parser/app/app.php

215 lines
8.3 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
use App\Library\ContingentManager;
use App\Library\DatabaseConfig;
use App\Library\Logger;
use GuzzleHttp\Psr7\Exception\MalformedUriException;
use GuzzleHttp\Exception\ClientException;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Exception\ServerException;
use App\Library\ContingentParser;
use App\Library\Database;
use GuzzleHttp\Client;
use GuzzleHttp\TransferStats;
use Symfony\Component\Yaml\Yaml;
// Per-day log directory. The date is resolved exactly once so that all log
// files of a single run land in the same directory even if the run starts
// just before midnight.
$logDir = __DIR__ . '/../log/' . date('Y-m-d');

// Sites whose page was fetched but yielded no usable contingent table.
$pathLogErrorHtml = $logDir . '/error-html.log';
// Sites that failed at the transport/HTTP level (with error details).
$pathLogErrorHttp = $logDir . '/error-http-curl.log';
// Plain list of sites that failed at the HTTP level (no error details).
$pathErrorHttp = $logDir . '/error-http.log';

// Database handles, configured by the 'opendata' and 'niimko' profiles.
$dbOpendata = new Database(new DatabaseConfig('opendata'));
$dbNiimko = new Database(new DatabaseConfig('niimko'));

// List of sites to crawl; the loop below reads the 'site' and 'org_id'
// keys of each entry.
$sites = ContingentManager::getInstance()->getSites($dbNiimko);
// $specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko);
// $orgs = ContingentManager::getInstance()->getOrgs($dbOpendata);
// $exceptionsOrgHtml = ContingentManager::getInstance()->getExceptionsHtml('select-html-error.log');
// $exceptionsOrgHttpCurl = ContingentManager::getInstance()->getExceptionsHttpCurl('select-http-error.log');
// print_r($exceptionsOrgHttpCurl);
// echo count($exceptionsOrgHttpCurl) . " - http-error sites" . PHP_EOL;
// $start = 794;
// for ($i = $start; $i < count($sites); $i++) {
// // Нет URL сайта вуза
// if (empty($sites[$i]['site'])) {
// $message = implode(' ', $sites[$i]);
// Logger::log($pathLogErrorHttp, $message);
// Logger::log($pathErrorHttp, implode(' ', $sites[$i]));
// continue;
// }
// // Уже в базе
// if (in_array($sites[$i]['org_id'], $orgs)) {
// continue;
// }
// // С ошибками разметки игнорируем
// if (in_array($sites[$i]['org_id'], $exceptionsOrgHtml)) {
// continue;
// }
// // Без ошибок http игнорируем
// if (!in_array($sites[$i]['org_id'], $exceptionsOrgHttpCurl)) {
// continue;
// }
// try {
// $baseUri = ContingentManager::getInstance()->buildBaseUri($sites[$i]['site']);
// $client = new Client([
// 'force_ip_resolve' => 'v4',
// 'debug' => fopen("debug-http.log", "a"),
// 'base_uri' => $baseUri,
// 'allow_directs' => [
// 'max' => 5,
// 'strict' => true,
// 'referer' => true,
// 'protocols' => ['http', 'https'],
// 'track_redirects' => true
// ],
// 'connect_timeout' => 300.0,
// 'verify' => false,
// 'headers' => [
// 'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36',
// 'Content-Type' => 'text/html;charset=utf-8'
// ]
// ]);
// $response = $client->get('', [
// 'on_stats' => function (TransferStats $stats) use (&$url) {
// $url = $stats->getEffectiveUri();
// }
// ]);
// print(($i+1). '. ' . implode(' ', $sites[$i]) . PHP_EOL);
// if (substr($url, -1) == '/') {
// $url = $url."sveden/education/";
// } else {
// $url = $url."/sveden/education/";
// }
// echo $url .PHP_EOL;
// $response = $client->get($url, [
// 'on_stats' => function (TransferStats $stats) use (&$url) {
// $url = $stats->getEffectiveUri();
// }
// ]);
// echo $url . PHP_EOL;
// $html = $response->getBody()->getContents();
// if (empty($html)) {
// $message = implode(' ', $sites[$i]);
// Logger::log($pathLogErrorHtml, $message);
// continue;
// }
// $parser = new ContingentParser($html, '//tr[@itemprop="eduChislen"]//');
// $contingent = $parser->getDataTable();
// // Добавляем поле spec_id по spec_code
// ContingentManager::getInstance()->addSpecId($contingent, $specializations);
// // Добавляем поле org_id
// ContingentManager::getInstance()->addOrgId($contingent, $sites[$i]['org_id']);
// print_r($contingent);
// if (empty($contingent)) {
// $message = implode(' ', $sites[$i]);
// Logger::log($pathLogErrorHtml, $message);
// } else {
// $set = ContingentManager::getInstance()->checkContingent($contingent);
// if ($set) {
// // Заносим в базу
// ContingentManager::getInstance()->insertContingent($dbOpendata, $contingent);
// } else {
// $message = implode(' ', $sites[$i]);
// Logger::log($pathLogErrorHtml, $message);
// }
// unset($contingent);
// }
// } catch (ClientException
// | RequestException
// | ConnectException
// | ServerException
// | MalformedUriException $e
// ) {
// $message = implode(' ', $sites[$i]) . " " . $e->getCode() . " " . $e->getMessage();
// Logger::log($pathLogErrorHttp, $message);
// Logger::log($pathErrorHttp, implode(' ', $sites[$i]));
// }
// }
// Reference data: specializations are passed to addSpecId() for every parsed
// table; $orgs holds the org ids that are skipped as "already in the database".
$specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko);
$orgs = ContingentManager::getInstance()->getOrgs($dbOpendata);
// $sites = Yaml::parse(file_get_contents(dirname(__FILE__) ."/sites.yaml"));
// print_r($sites);

// Main crawl: for every site that is not stored yet, download
// <site>/sveden/education/, parse the contingent table out of the HTML and
// insert it into the opendata database. Failures are written to the per-day
// log files instead of aborting the run.
$siteCount = count($sites);
for ($i = 0; $i < $siteCount; $i++) {
    $site = $sites[$i];

    // The organisation has no site URL.
    if (empty($site['site'])) {
        // $message = implode(' ', $sites[$i]);
        // Logger::log($pathLogErrorHttp, $message);
        // Logger::log($pathErrorHttp, implode(' ', $sites[$i]));
        continue;
    }
    // Already in the database.
    // NOTE(review): loose comparison is kept on purpose; the types of org_id
    // on both sides are not visible here, so strict mode could change results.
    if (in_array($site['org_id'], $orgs)) {
        continue;
    }
    // Skip organisations with known markup errors (disabled).
    // if (in_array($sites[$i]['org_id'], $exceptionsOrgHtml)) {
    // continue;
    // }
    // Skip organisations without known HTTP errors (disabled).
    // if (!in_array($sites[$i]['org_id'], $exceptionsOrgHttpCurl)) {
    // continue;
    // }

    // Human-readable description of the site, reused for console and logs.
    $siteLabel = implode(' ', $site);
    print(($i + 1) . '. ' . $siteLabel . PHP_EOL);

    $uri = trim(ContingentManager::getInstance()->buildBaseUri($site['site']));
    // NOTE(review): this replaces every underscore in the whole URI with a
    // slash, including underscores in the host or path. Presumably it undoes
    // an encoding applied by buildBaseUri() - confirm before changing.
    $uri = str_replace("_", "/", $uri);
    // Append the section path, adding a separating slash only when needed.
    $uri .= (substr($uri, -1) === '/') ? 'sveden/education/' : '/sveden/education/';
    echo $uri . PHP_EOL;

    $ch = curl_init($uri);
    if ($ch === false) {
        Logger::log($pathLogErrorHttp, $siteLabel . ' cURL error: could not initialise handle');
        continue;
    }
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        // SECURITY NOTE(review): peer verification is disabled, so the
        // downloaded HTML is not protected against tampering in transit.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_HEADER => false,
        CURLOPT_FOLLOWLOCATION => true,
        // NOTE(review): no total transfer timeout is set; a stalled server
        // can block the crawl. Consider CURLOPT_TIMEOUT.
    ]);
    $html = curl_exec($ch);
    // Read everything we need from the handle, then release it right away so
    // that no later `continue` can skip the cleanup.
    $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $errno = curl_errno($ch);
    curl_close($ch);

    if ($html === false || $httpCode !== 200) {
        // Distinguish transport failures (DNS, TLS, timeout, ...) from a
        // completed exchange that merely returned a non-200 status.
        $message = $errno !== 0
            ? $siteLabel . " cURL error ({$errno}): " . curl_strerror($errno)
            : $siteLabel . " HTTP error ({$httpCode})";
        Logger::log($pathLogErrorHttp, $message);
        continue;
    }
    echo "HTTP-code: " . $httpCode . PHP_EOL;

    // Nothing to parse.
    if (empty($html)) {
        continue;
    }

    $parser = new ContingentParser($html, '//tr[@itemprop="eduChislen"]//');
    $contingent = $parser->getDataTable();
    // Add the spec_id field, looked up by spec_code.
    ContingentManager::getInstance()->addSpecId($contingent, $specializations);
    // Add the org_id field.
    ContingentManager::getInstance()->addOrgId($contingent, $site['org_id']);

    if (empty($contingent)) {
        echo "empty" . PHP_EOL;
        Logger::log($pathLogErrorHtml, $siteLabel);
        continue;
    }

    print_r($contingent);
    if (ContingentManager::getInstance()->checkContingent($contingent)) {
        // Store in the database.
        ContingentManager::getInstance()->insertContingent($dbOpendata, $contingent);
    } else {
        Logger::log($pathLogErrorHtml, $siteLabel);
    }
    unset($contingent, $httpCode);
}