2024-08-08 12:32:27 +02:00
|
|
|
|
<?php
|
|
|
|
|
namespace App;
|
|
|
|
|
|
2024-08-13 15:05:24 +02:00
|
|
|
|
use App\Library\ContingentManager;
|
2024-08-13 12:15:21 +02:00
|
|
|
|
use App\Library\DatabaseConfig;
|
2024-08-12 14:14:49 +02:00
|
|
|
|
use GuzzleHttp\Exception\ClientException;
|
|
|
|
|
use GuzzleHttp\Exception\ConnectException;
|
|
|
|
|
use GuzzleHttp\Exception\RequestException;
|
|
|
|
|
use GuzzleHttp\Exception\ServerException;
|
2024-08-12 15:58:12 +02:00
|
|
|
|
use GuzzleHttp\RequestOptions;
|
2024-08-08 15:38:54 +02:00
|
|
|
|
use App\Library\ContingentParser;
|
2024-08-08 12:32:27 +02:00
|
|
|
|
use App\Library\Database;
|
|
|
|
|
use GuzzleHttp\Client;
|
|
|
|
|
|
2024-08-13 12:15:21 +02:00
|
|
|
|
// Bootstrap: open both database connections and load the reference data
// the crawl needs.
$dbOpendata = new Database(new DatabaseConfig('opendata'));
$dbNiimko = new Database(new DatabaseConfig('niimko'));

// Sites to visit and the specialization list, both read through the niimko connection.
$sites = ContingentManager::getInstance()->getSites($dbNiimko);
$specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko);

// Organizations that already have rows in sveden_education_contingent.
$orgRows = $dbOpendata->selectQuery(
    'SELECT DISTINCT org_id FROM sveden_education_contingent'
);

// Debug dump of the site list.
print_r($sites);

// Flatten the result rows into a plain list of org_id values.
$orgs = [];
foreach ($orgRows as $row) {
    $orgs[] = $row['org_id'];
}
unset($orgRows);
|
|
|
|
|
|
|
|
|
|
// Collect the org_ids recorded in the error logs of previous runs so the
// crawl loop can skip them. Each log line has the form
// "<date> <time> <org_id> <site>", so the org_id is the third
// space-separated field.
$errorSites = [];

// Read the logs from the same location the crawl loop appends to
// (one directory above this script), not from the current working directory.
$errorLogFiles = [
    __DIR__ . '/../error-html.log',
    __DIR__ . '/../error-http.log',
];

foreach ($errorLogFiles as $errorLogFile) {
    // On a first run the logs do not exist yet; there is nothing to skip.
    if (!is_readable($errorLogFile)) {
        continue;
    }

    $logLines = file($errorLogFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($logLines === false) {
        continue;
    }

    foreach ($logLines as $logLine) {
        $fields = explode(' ', $logLine);

        // Ignore malformed lines that do not carry an org_id field.
        if (!isset($fields[2])) {
            continue;
        }

        // Loose comparison on purpose: log values are strings, while $orgs
        // holds whatever type the database layer returned.
        if (!in_array($fields[2], $orgs)) {
            $errorSites[] = $fields[2];
        }
    }
}
|
2024-08-12 15:58:12 +02:00
|
|
|
|
|
2024-08-13 15:05:24 +02:00
|
|
|
|
// Main crawl: for every site from offset $start onward, download the page,
// extract the contingent table, enrich it with spec_id / org_id and validate
// it. Failures are appended to error-http.log (no URL, transport error or
// non-200 response) or error-html.log (nothing parsed / validation failed).
$status = null;

// Index in $sites at which the crawl starts.
$start = 600;

$httpErrorLog = __DIR__ . '/../error-http.log';
$htmlErrorLog = __DIR__ . '/../error-html.log';

// The client configuration does not depend on the site, so build it once.
// "headers" is a top-level request option; nested inside "allow_redirects"
// it was silently ignored. The User-Agent must be a single line, because
// header values may not contain line breaks.
$client = new Client([
    RequestOptions::ALLOW_REDIRECTS => [
        'max' => 10,
        'strict' => true,
        'referer' => true,
        'track_redirects' => true,
    ],
    RequestOptions::HEADERS => [
        'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
            . 'Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36',
        // Previously split into a "Content-Type" entry and a separate "charset" entry.
        'Content-Type' => 'text/html; charset=utf-8',
    ],
    RequestOptions::TIMEOUT => 300,
]);

$siteCount = count($sites);

for ($i = $start; $i < $siteCount; $i++) {
    $site = $sites[$i];

    // The organization has no site URL.
    if (empty($site['site'])) {
        $log = date('Y-m-d H:i:s') . ' ' . $site['org_id'] . ' ' . $site['site'];
        file_put_contents($httpErrorLog, $log . PHP_EOL, FILE_APPEND);
        continue;
    }

    // Already present in the database.
    if (in_array($site['org_id'], $orgs)) {
        continue;
    }

    // Failed on a previous run (listed in one of the error logs): skip.
    if (in_array($site['org_id'], $errorSites)) {
        continue;
    }

    // Reset per-site state so nothing leaks in from the previous iteration.
    $status = 0;
    $contingent = null;

    try {
        $url = ContingentManager::getInstance()->buildURL($site['site']);
        print(($i + 1) . ". Current url: $url\n");

        $response = $client->get($url);
        $status = $response->getStatusCode();

        $html = $response->getBody()->getContents();
        $parser = new ContingentParser($html, '//tr[@itemprop="eduChislen"]//');
        $contingent = $parser->getDataTable();

        // Add the spec_id field based on spec_code.
        ContingentManager::getInstance()->addSpecId($contingent, $specializations);

        // Add the org_id field.
        ContingentManager::getInstance()->addOrgId($contingent, $site['org_id']);

        print_r($contingent);
    } catch (RequestException | ConnectException $e) {
        // ClientException and ServerException are subclasses of
        // RequestException, so this covers 4xx/5xx responses as well as
        // connection-level failures.
        $status = 0;
    }

    // Transport failure or any response other than 200.
    if ($status !== 200) {
        $log = date('Y-m-d H:i:s') . ' ' . $site['org_id'] . ' ' . $site['site'];
        file_put_contents($httpErrorLog, $log . PHP_EOL, FILE_APPEND);
        continue;
    }

    // Page was fetched, but no contingent rows were parsed or they failed validation.
    if (empty($contingent) || !ContingentManager::getInstance()->checkContingent($contingent)) {
        $log = date('Y-m-d H:i:s') . ' ' . $site['org_id'] . ' ' . $site['site'];
        file_put_contents($htmlErrorLog, $log . PHP_EOL, FILE_APPEND);
        continue;
    }

    // Insert into the database (currently disabled).
    // ContingentManager::getInstance()->insertContingent($dbOpendata, $contingent);
}
|