Добавлена выборка актуальных сайтов по непройденным вариантам

This commit is contained in:
Alexander 2024-08-29 09:36:04 +03:00
parent 5e0f59f8c5
commit 74ba48620c
4 changed files with 237 additions and 150 deletions

View File

@ -1,6 +1,4 @@
<?php
namespace App;
use App\Library\ContingentManager;
use App\Library\DatabaseConfig;
use App\Library\Logger;
@ -12,32 +10,140 @@ use GuzzleHttp\Exception\ServerException;
use App\Library\ContingentParser;
use App\Library\Database;
use GuzzleHttp\Client;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use GuzzleHttp\TransferStats;
use Symfony\Component\Yaml\Yaml;
$pathLogErrorHtml = 'error-html.log';
$pathLogErrorHttp = 'error-http-curl.log';
$pathLogErrorHtml = __DIR__.'/../log/'. date('Y-m-d') . '/error-html.log';
$pathLogErrorHttp = __DIR__.'/../log/'. date('Y-m-d') . '/error-http-curl.log';
$pathErrorHttp = __DIR__.'/../log/'. date('Y-m-d') . '/error-http.log';
$dbOpendata = new Database(new DatabaseConfig('opendata'));
$dbNiimko = new Database(new DatabaseConfig('niimko'));
$sites = ContingentManager::getInstance()->getSites($dbNiimko);
$specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko);
$orgs = ContingentManager::getInstance()->getOrgs($dbOpendata);
// $specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko);
// $orgs = ContingentManager::getInstance()->getOrgs($dbOpendata);
$exceptionsOrgHtml = ContingentManager::getInstance()->getExceptionsHtml('error-html.log');
$exceptionsOrgHttpCurl = ContingentManager::getInstance()->getExceptionsHttpCurl('error-http-curl.log');
// $exceptionsOrgHtml = ContingentManager::getInstance()->getExceptionsHtml('select-html-error.log');
// $exceptionsOrgHttpCurl = ContingentManager::getInstance()->getExceptionsHttpCurl('select-http-error.log');
// print_r($exceptionsOrgHttpCurl);
$start = 0;
// echo count($exceptionsOrgHttpCurl) . " - http-error sites" . PHP_EOL;
for ($i = $start; $i < count($sites); $i++) {
// $start = 794;
// for ($i = $start; $i < count($sites); $i++) {
// // Нет URL сайта вуза
// if (empty($sites[$i]['site'])) {
// $message = implode(' ', $sites[$i]);
// Logger::log($pathLogErrorHttp, $message);
// Logger::log($pathErrorHttp, implode(' ', $sites[$i]));
// continue;
// }
// // Уже в базе
// if (in_array($sites[$i]['org_id'], $orgs)) {
// continue;
// }
// // С ошибками разметки игнорируем
// if (in_array($sites[$i]['org_id'], $exceptionsOrgHtml)) {
// continue;
// }
// // Без ошибок http игнорируем
// if (!in_array($sites[$i]['org_id'], $exceptionsOrgHttpCurl)) {
// continue;
// }
// try {
// $baseUri = ContingentManager::getInstance()->buildBaseUri($sites[$i]['site']);
// $client = new Client([
// 'force_ip_resolve' => 'v4',
// 'debug' => fopen("debug-http.log", "a"),
// 'base_uri' => $baseUri,
// 'allow_directs' => [
// 'max' => 5,
// 'strict' => true,
// 'referer' => true,
// 'protocols' => ['http', 'https'],
// 'track_redirects' => true
// ],
// 'connect_timeout' => 300.0,
// 'verify' => false,
// 'headers' => [
// 'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36',
// 'Content-Type' => 'text/html;charset=utf-8'
// ]
// ]);
// $response = $client->get('', [
// 'on_stats' => function (TransferStats $stats) use (&$url) {
// $url = $stats->getEffectiveUri();
// }
// ]);
// print(($i+1). '. ' . implode(' ', $sites[$i]) . PHP_EOL);
// if (substr($url, -1) == '/') {
// $url = $url."sveden/education/";
// } else {
// $url = $url."/sveden/education/";
// }
// echo $url .PHP_EOL;
// $response = $client->get($url, [
// 'on_stats' => function (TransferStats $stats) use (&$url) {
// $url = $stats->getEffectiveUri();
// }
// ]);
// echo $url . PHP_EOL;
// $html = $response->getBody()->getContents();
// if (empty($html)) {
// $message = implode(' ', $sites[$i]);
// Logger::log($pathLogErrorHtml, $message);
// continue;
// }
// $parser = new ContingentParser($html, '//tr[@itemprop="eduChislen"]//');
// $contingent = $parser->getDataTable();
// // Добавляем поле spec_id по spec_code
// ContingentManager::getInstance()->addSpecId($contingent, $specializations);
// // Добавляем поле org_id
// ContingentManager::getInstance()->addOrgId($contingent, $sites[$i]['org_id']);
// print_r($contingent);
// if (empty($contingent)) {
// $message = implode(' ', $sites[$i]);
// Logger::log($pathLogErrorHtml, $message);
// } else {
// $set = ContingentManager::getInstance()->checkContingent($contingent);
// if ($set) {
// // Заносим в базу
// ContingentManager::getInstance()->insertContingent($dbOpendata, $contingent);
// } else {
// $message = implode(' ', $sites[$i]);
// Logger::log($pathLogErrorHtml, $message);
// }
// unset($contingent);
// }
// } catch (ClientException
// | RequestException
// | ConnectException
// | ServerException
// | MalformedUriException $e
// ) {
// $message = implode(' ', $sites[$i]) . " " . $e->getCode() . " " . $e->getMessage();
// Logger::log($pathLogErrorHttp, $message);
// Logger::log($pathErrorHttp, implode(' ', $sites[$i]));
// }
// }
$specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko);
$orgs = ContingentManager::getInstance()->getOrgs($dbOpendata);
// $sites = Yaml::parse(file_get_contents(dirname(__FILE__) ."/sites.yaml"));
// print_r($sites);
for ($i = 0; $i < count($sites); $i++) {
// Нет URL сайта вуза
if (empty($sites[$i]['site'])) {
$message = $sites[$i]['org_id'] . ' ' . $sites[$i]['site'];
Logger::log($pathLogErrorHttp, $message);
// $message = implode(' ', $sites[$i]);
// Logger::log($pathLogErrorHttp, $message);
// Logger::log($pathErrorHttp, implode(' ', $sites[$i]));
continue;
}
// Уже в базе
@ -45,65 +151,56 @@ for ($i = $start; $i < count($sites); $i++) {
continue;
}
// С ошибками разметки игнорируем
if (in_array($sites[$i]['org_id'], $exceptionsOrgHtml)) {
continue;
// if (in_array($sites[$i]['org_id'], $exceptionsOrgHtml)) {
// continue;
// }
// Без ошибок http игнорируем
// if (!in_array($sites[$i]['org_id'], $exceptionsOrgHttpCurl)) {
// continue;
// }
print(($i+1). '. ' . implode(' ', $sites[$i]) . PHP_EOL);
$uri = trim(ContingentManager::getInstance()->buildBaseUri($sites[$i]['site']));
$uri = str_replace("_","/", $uri);
if (substr($uri, -1) == '/') {
$uri = $uri."sveden/education/";
} else {
$uri = $uri."/sveden/education/";
}
if (!in_array($sites[$i]['org_id'], $exceptionsOrgHttpCurl)) {
continue;
}
try {
$baseUri = ContingentManager::getInstance()->buildBaseUri($sites[$i]['site']);
$onRedirect = function(
RequestInterface $request,
ResponseInterface $res,
UriInterface $uri
) {
echo 'Redirecting! ' . $request->getUri() . ' to ' . $uri . "\n";
};
$client = new Client([
'force_ip_resolve' => 'v4',
'debug' => fopen("debug-http.log", "a"),
'base_uri' => $baseUri,
'allow_directs' => [
'max' => 5,
'strict' => true,
'referer' => true,
'protocols' => ['http', 'https'],
'on_redirect' => $onRedirect,
'track_redirects' => true
],
'connect_timeout' => 300.0,
'verify' => false,
// 'http_errors' => false,
'headers' => [
'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36',
'Content-Type' => 'text/html;charset=utf-8'
]
]);
print(($i+1). '. ' . implode(' ', $sites[$i]) . "\n");
$response = $client->get('/sveden/education/');
echo $response->getStatusCode() .PHP_EOL;
var_dump($response->getHeaderLine("'X-Guzzle-Redirect-History") . PHP_EOL);
$html = $response->getBody()->getContents();
if (empty($html)) {
echo $uri . PHP_EOL;
$ch = curl_init($uri);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
$html = curl_exec($ch);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
if ($httpCode != 200) {
$errno = curl_errno($ch);
$message = implode(' ', $sites[$i]);
Logger::log($pathLogErrorHtml, $message);
$message .= " cURL error ({$errno}): ".curl_strerror($errno);
Logger::log($pathLogErrorHttp, $message);
unset($httpCode);
continue;
}
curl_close($ch);
echo "HTTP-code: " . $httpCode . PHP_EOL;
if (empty($html)) continue;
$parser = new ContingentParser($html, '//tr[@itemprop="eduChislen"]//');
$contingent = $parser->getDataTable();
// Добавляем поле spec_id по spec_code
ContingentManager::getInstance()->addSpecId($contingent, $specializations);
// Добавляем поле org_id
ContingentManager::getInstance()->addOrgId($contingent, $sites[$i]['org_id']);
print_r($contingent);
if (empty($contingent)) {
echo "empty". PHP_EOL;
$message = implode(' ', $sites[$i]);
Logger::log($pathLogErrorHtml, $message);
} else {
print_r($contingent);
$set = ContingentManager::getInstance()->checkContingent($contingent);
if ($set) {
// Заносим в базу
@ -113,14 +210,6 @@ for ($i = $start; $i < count($sites); $i++) {
Logger::log($pathLogErrorHtml, $message);
}
unset($contingent);
}
} catch (ClientException
| RequestException
| ConnectException
| ServerException
| MalformedUriException $e
) {
$message = implode(' ', $sites[$i]) . "\t" . $e->getCode() . "\t" . $e->getMessage();
Logger::log($pathLogErrorHttp, $message);
unset($httpCode);
}
}

View File

@ -37,6 +37,31 @@ final class ContingentManager
return $sites;
}
/**
 * Looks up site addresses in the `miccedu_monitoring` table for a set of organisations.
 *
 * The generated statement is equivalent to:
 *
 *   SELECT vuzkod AS org_id, site
 *   FROM miccedu_monitoring
 *   WHERE year = :year AND (vuzkod = :id1 OR vuzkod = :id2 OR ...)
 *
 * @param Database $db     Connection the query is executed on.
 * @param array    $params List of rows, each carrying an 'org_id' entry (other keys are ignored).
 * @param int      $year   Monitoring year to filter on; defaults to 2023, the value that
 *                         used to be hard-coded.
 *
 * @return array Whatever Database::selectQuery() returns for the statement; an empty array
 *               when no organisation ids were supplied.
 */
public function getSitesFromMiccedu(Database $db, array $params, int $year = 2023) : array
{
    // Collect every organisation id once, in first-seen order. The caller's list can
    // contain the same organisation several times, and repeating it only bloats the
    // OR group without changing the result set.
    $seen = [];
    foreach ($params as $org) {
        $seen[(int)$org['org_id']] = true;
    }
    $orgIds = array_keys($seen);

    // With nothing to look up the OR group would be empty, which presumably drops the
    // organisation filter altogether; answer "no sites" without touching the database.
    if ($orgIds === []) {
        return [];
    }

    $query = $this->builder->select()
        ->setTable('miccedu_monitoring')
        ->setColumns(['org_id' => 'vuzkod','site'])
        ->where('AND')
        ->equals('year', $year)
        ->subWhere('OR');
    foreach ($orgIds as $orgId) {
        $query->equals('vuzkod', $orgId);
    }
    $query = $query->end();
    $sql = $this->builder->writeFormatted($query);

    // Bound values follow the order in which the conditions were added:
    // the year first, then one value per organisation id.
    $values = array_merge([$year], $orgIds);
    return $db->selectQuery($sql, $values);
}
public function insertContingent(Database $db, array $contingent) : void
{
$params = ['spec_code', 'spec_name', 'edu_level', 'edu_forms', 'contingent', 'spec_id', 'org_id'];
@ -84,11 +109,21 @@ final class ContingentManager
/**
 * Normalises a site address taken from the database into an absolute URL.
 *
 * Steps visible in this listing: "www/" is rewritten to "www.", an "http://"
 * prefix is added when the address has no scheme, "https://" is downgraded to
 * "http://", and the URL is rebuilt as scheme://host/ from parse_url().
 * The trailing lines additionally replace "_" with "/", append
 * "sveden/education/" and trim the result.
 *
 * NOTE(review): this span appears to interleave the removed and the added lines
 * of the commit (the diff markers are not shown) — hence two `if` openers and
 * two `return` statements. Confirm against the repository which lines are live.
 * NOTE(review): parse_url() output is indexed without checking for 'scheme' /
 * 'host'; presumably callers only pass well-formed hosts — verify.
 *
 * @param string $url Site address, with or without a scheme.
 *
 * @return string The rebuilt URL.
 */
public function buildBaseUri(string $url): string
{
// Builds -> https://<base_uri> (note: the code below actually rewrites https:// to http://)
if (strpos($url,'https://') === false && strpos($url,'http://') === false) {
$url = str_replace("www/", "www.", $url);
if (strpos($url,'https://') === false
&& strpos($url,'http://') === false
) {
$url = "http://$url";
}
$url = str_replace("https://", "http://", $url);
$arr = parse_url($url);
$url = $arr['scheme'] .'://'. $arr['host'] . '/';
return $url;
// $url = str_replace("www.", "", $url);
$url = str_replace("_","/", $url);
$url = $url."sveden/education/";
return trim($url);
}
public function addSpecId(array &$contingent, array $specializations) : void
@ -141,12 +176,12 @@ final class ContingentManager
$data = explode (' ', $str);
if (preg_match("/^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])$/", $data[0])
&& $data[3] != PHP_EOL) {
$orgHttpError[] = $data[2];
// $orgHttpError[] = ['org_id' => $data[2], 'site' => $data[3]];
// $orgHttpError[] = $data[2];
$orgHttpError[] = ['org_id' => $data[2], 'site' => $data[3]];
}
}
$orgHttpError = array_unique($orgHttpError);
sort($orgHttpError);
// $orgHttpError = array_unique($orgHttpError);
ksort($orgHttpError);
return $orgHttpError;
}
}

View File

@ -37,9 +37,9 @@ class ContingentParser
$encoding
);
$html = str_replace('windows-1251','utf-8', $html);
} else {
$dom->loadHTML(mb_convert_encoding($html,'HTML-ENTITIES','UTF-8'));
}
$dom->loadHTML(mb_convert_encoding($html,'HTML-ENTITIES','UTF-8'));
$this->xpath = new \DOMXPath($dom);
$this->template = $template;
}

View File

@ -7,45 +7,8 @@ use App\Library\Logger;
use Symfony\Component\Yaml\Yaml;
require_once(dirname(__FILE__) ."/vendor/autoload.php");
// require_once(dirname(__FILE__) ."/test.php");
/**
 * Executes a cURL handle and follows HTTP redirects by hand.
 *
 * Useful when CURLOPT_FOLLOWLOCATION cannot be relied on. The handle is switched
 * to CURLOPT_HEADER + CURLOPT_RETURNTRANSFER so the Location header can be read;
 * after every redirect CURLOPT_URL on the handle is replaced with the new target.
 *
 * @param resource|\CurlHandle $ch Initialised cURL handle with the start URL set.
 *
 * @return string|false Body of the final response; the body of the redirect response
 *                      itself when its Location header is missing or unusable; false
 *                      when the transfer fails or more than 20 requests would be needed.
 */
function curl_redir_exec($ch)
{
    // Same budget as before (20 requests), but kept in a local counter instead of
    // static state, so an aborted run cannot leak its count into the next call.
    $maxRequests = 20;
    $redirectCodes = [301, 302, 303, 307, 308];

    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    for ($request = 0; $request < $maxRequests; $request++) {
        $response = curl_exec($ch);
        if ($response === false) {
            // Transport-level failure (DNS, timeout, TLS...): there is nothing to split.
            return false;
        }

        // HTTP separates headers from the body with CRLF CRLF, so splitting on "\n\n"
        // is unreliable; cURL reports the exact size of the header section instead.
        $headerSize = (int) curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $header = (string) substr($response, 0, $headerSize);
        $body = (string) substr($response, $headerSize);

        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if (!in_array($httpCode, $redirectCodes, true)) {
            return $body;
        }

        // Header names are case-insensitive (HTTP/2 sends them in lower case).
        if (!preg_match('/^Location:[ \t]*(.+?)\s*$/mi', $header, $matches)) {
            return $body;
        }
        $target = parse_url($matches[1]);
        if ($target === false) {
            // Could not process the URL to redirect to.
            return $body;
        }

        $last = parse_url((string) curl_getinfo($ch, CURLINFO_EFFECTIVE_URL)) ?: [];

        // Fill in whatever the Location value leaves out from the URL just requested.
        $scheme = $target['scheme'] ?? ($last['scheme'] ?? 'http');
        if (isset($target['host'])) {
            $authority = $target['host'] . (isset($target['port']) ? ':' . $target['port'] : '');
            $path = $target['path'] ?? '/';
        } else {
            $authority = ($last['host'] ?? '') . (isset($last['port']) ? ':' . $last['port'] : '');
            $lastPath = $last['path'] ?? '/';
            $path = $target['path'] ?? '';
            if ($path === '') {
                $path = $lastPath;
            } elseif ($path[0] !== '/') {
                // Relative reference: resolve it against the directory of the last path
                // instead of gluing it straight onto the host name.
                $slash = strrpos($lastPath, '/');
                $path = ($slash === false ? '/' : substr($lastPath, 0, $slash + 1)) . $path;
            }
        }
        if ($authority === '') {
            return $body;
        }
        $query = (isset($target['query']) && $target['query'] !== '') ? '?' . $target['query'] : '';

        curl_setopt($ch, CURLOPT_URL, $scheme . '://' . $authority . $path . $query);
    }

    // Redirect budget exhausted (probably a redirect loop).
    return false;
}
$pathLogErrorHttp = __DIR__.'/log/'. date('Y-m-d') . '/error-http-curl.log';
$pathLogErrorHtml = __DIR__.'/log/'. date('Y-m-d') . '/error-html.log';
@ -58,8 +21,9 @@ $dbNiimko = new Database(new DatabaseConfig('niimko'));
// $sites = ContingentManager::getInstance()->getSites($dbNiimko);
$specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko);
$orgs = ContingentManager::getInstance()->getOrgs($dbOpendata);
$sites = Yaml::parse(file_get_contents(dirname(__FILE__) ."/sites.yaml"));
// $sites = ContingentManager::getInstance()->getExceptionsHttpCurl('log/2024-08-27/error-http-curl.log');
// $sites = Yaml::parse(file_get_contents(dirname(__FILE__) ."/sites.yaml"));
$sites = ContingentManager::getInstance()->getExceptionsHttpCurl('log/2024-08-28/error-http-curl.log');
$sites = ContingentManager::getInstance()->getSitesFromMiccedu($dbOpendata, $sites);
// print_r($sites);
for ($i = 0; $i < count($sites); $i++) {
// Нет URL сайта вуза
@ -82,23 +46,22 @@ for ($i = 0; $i < count($sites); $i++) {
// continue;
// }
print(($i+1). '. ' . implode(' ', $sites[$i]) . PHP_EOL);
$uri = trim(ContingentManager::getInstance()->buildBaseUri($sites[$i]['site']));
$uri = str_replace("_","/", $uri);
if (substr($uri, -1) == '/') {
$uri = $uri."sveden/education/";
} else {
$uri = $uri."/sveden/education/";
}
$uri = ContingentManager::getInstance()->buildBaseUri($sites[$i]['site']);
echo $uri . PHP_EOL;
$ua = 'Mozilla/5.0 (X11; Linux x86_64) '
.'AppleWebKit/537.36 (KHTML, like Gecko) '
.'Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36';
// $html = get_content($uri);
$ch = curl_init($uri);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36');
curl_setopt($ch, CURLOPT_USERAGENT, $ua);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 90);
$html = curl_exec($ch);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
if ($httpCode != 200 && $httpCode != 0) {
$message = implode(' ', $sites[$i]) . ' ' . $httpCode;