sveden-parser/app/app.php

142 lines
4.6 KiB
PHP
Raw Normal View History

2024-08-08 13:32:27 +03:00
<?php
namespace App;
use App\Library\ContingentManager;
use App\Library\DatabaseConfig;
2024-08-16 15:00:37 +03:00
use App\Library\Logger;
use GuzzleHttp\Exception\ClientException;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Exception\ServerException;
use GuzzleHttp\RequestOptions;
use App\Library\ContingentParser;
2024-08-08 13:32:27 +03:00
use App\Library\Database;
use GuzzleHttp\Client;
2024-08-16 15:00:37 +03:00
// Paths of the two error logs. They are written through Logger below and
// re-read further down to find organisations that already failed.
$pathLogErrorHtml = 'error-html.log';
$pathLogErrorHttp = 'error-http.log';
Logger::log($pathLogErrorHtml, 'start');
Logger::log($pathLogErrorHttp, 'start');

$dbOpendata = new Database(new DatabaseConfig('opendata'));
$dbNiimko = new Database(new DatabaseConfig('niimko'));

// NOTE(review): this unconditional exit stops the script right after the
// "start" markers are logged and the database objects are created, so nothing
// below it runs. It looks like a temporary debugging stop; confirm before
// removing it.
exit(0);

$sites = ContingentManager::getInstance()->getSites($dbNiimko);
$specializations = ContingentManager::getInstance()->getSpecializations($dbNiimko);

// Organisations that already have rows in sveden_education_contingent.
$sql = 'SELECT DISTINCT org_id FROM sveden_education_contingent';
$org = $dbOpendata->selectQuery($sql);
print_r($sites);

$orgs = [];
foreach ($org as $row) {
    $orgs[] = $row['org_id'];
}
unset($org);

// Organisations mentioned in either error log that are not in the database.
// Each log line is split on single spaces and the third field is taken as the
// organisation id; presumably Logger::log prefixes the message with two
// space-separated fields (date and time) - verify against Logger.
$errorSites = [];
foreach ([$pathLogErrorHtml, $pathLogErrorHttp] as $logPath) {
    // A missing or unreadable log simply contributes no entries.
    if (!is_readable($logPath)) {
        continue;
    }
    $logLines = file($logPath, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($logLines === false) {
        continue;
    }
    foreach ($logLines as $logLine) {
        $fields = explode(' ', $logLine);
        // Skip lines that are too short to carry a third field.
        if (!isset($fields[2])) {
            continue;
        }
        if (!in_array($fields[2], $orgs)) {
            $errorSites[] = $fields[2];
        }
    }
}
// One HTTP client is shared by every request: its configuration does not
// depend on the site being fetched.
$client = new Client([
    RequestOptions::ALLOW_REDIRECTS => [
        'max' => 10,
        'strict' => true,
        'referer' => true,
        'track_redirects' => true,
    ],
    // Headers are a top-level request option; nested under the redirect
    // settings they are never sent. Header values must be single-line.
    RequestOptions::HEADERS => [
        'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
            . 'Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36',
        'Content-Type' => 'text/html; charset=utf-8',
    ],
]);

// Index of the first entry of $sites to process; earlier entries are skipped.
$start = 600;
$siteCount = count($sites);
for ($i = $start; $i < $siteCount; $i++) {
    $site = $sites[$i];
    // Log line shared by every failure branch below: "<org_id> <site>".
    $message = $site['org_id'] . ' ' . $site['site'];

    // The organisation has no site URL.
    if (empty($site['site'])) {
        Logger::log($pathLogErrorHttp, $message);
        continue;
    }
    // Already in the database.
    if (in_array($site['org_id'], $orgs)) {
        continue;
    }
    // Organisations already recorded in the error logs are ignored.
    if (in_array($site['org_id'], $errorSites)) {
        continue;
    }

    // Reset per-iteration state so a previous site's result can never be
    // mistaken for this one's.
    $status = null;
    $contingent = null;
    try {
        $url = ContingentManager::getInstance()->buildURL($site['site']);
        print(($i + 1) . ". Current url: $url\n");
        $response = $client->get($url, ['timeout' => 300]);
        $status = $response->getStatusCode();
        $html = $response->getBody()->getContents();
        $parser = new ContingentParser($html, '//tr[@itemprop="eduChislen"]//');
        $contingent = $parser->getDataTable();
        // Add the spec_id field based on spec_code.
        ContingentManager::getInstance()->addSpecId($contingent, $specializations);
        // Add the org_id field.
        ContingentManager::getInstance()->addOrgId($contingent, $site['org_id']);
        print_r($contingent);
    } catch (RequestException | ConnectException $e) {
        // RequestException also covers ClientException and ServerException
        // (4xx/5xx responses); ConnectException is listed separately because
        // it is not a RequestException in every Guzzle version.
        $status = 0;
    } finally {
        if ($status !== 200) {
            // Transport failure or any non-200 response.
            Logger::log($pathLogErrorHttp, $message);
        } elseif (empty($contingent) || !ContingentManager::getInstance()->checkContingent($contingent)) {
            // Page fetched, but no table was extracted or it failed validation.
            Logger::log($pathLogErrorHtml, $message);
        } else {
            // Insert into the database.
            // NOTE(review): the insert is currently disabled - confirm whether
            // it should be re-enabled.
            // ContingentManager::getInstance()->insertContingent($dbOpendata, $contingent);
        }
    }
}
Logger::log($pathLogErrorHtml, 'stop');
Logger::log($pathLogErrorHttp, 'stop');