2024-09-03 15:41:45 +03:00
|
|
|
|
<?php
|
|
|
|
|
namespace ContingentParser;
|
|
|
|
|
|
|
|
|
|
use ContingentParser\Database\DatabaseFacade;
|
|
|
|
|
use ContingentParser\Http\HttpClientFacade;
|
|
|
|
|
use ContingentParser\Http\UrlBuilder;
|
|
|
|
|
use ContingentParser\Logger\HtmlLogger;
|
|
|
|
|
use ContingentParser\Parser\ContingentFacade;
|
|
|
|
|
|
2024-09-06 14:11:38 +03:00
|
|
|
|
/**
 * Top-level facade of the contingent parser.
 *
 * Wires together the database, HTTP, URL-building, parsing and logging
 * collaborators and exposes the two operations used by callers: obtaining
 * the list of sites to process and collecting the contingent table data
 * for a single site.
 */
final class Facade
{
    /**
     * Matches an absolute http(s) URL. Used to tell whether the link found
     * on a page can be requested directly or has to be resolved against the
     * site's base URL.
     */
    private const ABSOLUTE_URL_PATTERN = '/^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$/';

    /** Source of sites, specialties and known universities; sink for parsed data. */
    private DatabaseFacade $databaseFacade;

    /** Downloads page content. */
    private HttpClientFacade $httpClientFacade;

    /** Extracts the link and the contingent table from downloaded HTML. */
    private ContingentFacade $contingentFacade;

    /** Builds the request URL from the stored site address and checks found URIs. */
    private UrlBuilder $urlBuilder;

    /** Records sites for which no contingent data could be obtained. */
    private HtmlLogger $htmlLogger;

    /**
     * Creates the facade with default collaborators.
     *
     * The HTML logger writes to the relative path 'log/html.log'.
     */
    public function __construct()
    {
        $this->databaseFacade = new DatabaseFacade();
        $this->httpClientFacade = new HttpClientFacade();
        $this->urlBuilder = new UrlBuilder();
        $this->contingentFacade = new ContingentFacade();
        $this->htmlLogger = new HtmlLogger('log/html.log');
    }

    /**
     * Returns the array of sites to process.
     *
     * With no (or empty) $params the result comes from
     * DatabaseFacade::getSitesFromNiimko(); otherwise $params is forwarded to
     * DatabaseFacade::getSitesFromMiccedu().
     *
     * @param array $params Sites for which updated URLs are needed
     * @return array
     */
    public function getSites(array $params = []): array
    {
        if (!$params) {
            return $this->databaseFacade->getSitesFromNiimko();
        }

        return $this->databaseFacade->getSitesFromMiccedu($params);
    }

    /**
     * Collects, from the page microdata, the data of the table
     * "Information on the number of students" in the "Education" section,
     * and stores it in the database.
     *
     * Sites rejected by isExit() are skipped silently. When no valid
     * contingent is obtained, "No result" is printed and the organization id
     * with the built URL is written to the HTML log.
     *
     * @param array $site Site record containing the organization id
     *                    ('org_id') and the URL ('site')
     * @return void
     */
    public function collectDataFromContingent(array $site): void
    {
        ['org_id' => $orgId, 'site' => $url] = $site;

        if ($this->isExit($site)) {
            return;
        }

        $url = $this->urlBuilder->build($url);
        Printer::println(implode(' ', $site), Color::GREEN);

        $html = $this->fetchContingentHtml($url, $site);

        // Extract the student headcount table data.
        $contingent = $this->contingentFacade->getContingent(
            $html,
            $this->databaseFacade->specialties(),
            $site['org_id']
        );

        if ($this->contingentFacade->isValidContingent($contingent)
            && $contingent
        ) {
            // Store the result in the database.
            Printer::print_r($contingent, Color::BLUE);
            $this->databaseFacade->insertContingent($contingent);
        } else {
            Printer::println("No result", Color::RED);
            $this->htmlLogger->log("$orgId $url");
        }

        Printer::println();
    }

    /**
     * Downloads the page that should hold the contingent table.
     *
     * The base page is fetched first. If ContingentFacade::getLink() finds a
     * link in it, that link is followed: an absolute URL is requested
     * directly, any other URI accepted by UrlBuilder::checkUri() is passed
     * to the HTTP client together with the base URL. When there is no link,
     * or the link is rejected, the base page content is returned.
     *
     * @param mixed $url  Request URL as returned by UrlBuilder::build()
     * @param array $site Site record forwarded to the HTTP client
     * @return mixed Whatever HttpClientFacade::getContentOfSite() returns
     */
    private function fetchContingentHtml($url, array $site)
    {
        $html = $this->httpClientFacade->getContentOfSite($url, $site);

        $uri = $this->contingentFacade->getLink($html);
        if (!$uri) {
            return $html;
        }

        if (preg_match(self::ABSOLUTE_URL_PATTERN, $uri)) {
            // Fix: the absolute link itself must be requested. Previously the
            // base URL was fetched a second time here, so the detected link
            // was never followed and the same page was parsed again.
            // NOTE(review): assumes getContentOfSite() accepts any absolute
            // URL as its first argument - confirm against HttpClientFacade.
            return $this->httpClientFacade->getContentOfSite($uri, $site);
        }

        if ($this->urlBuilder->checkUri($uri)) {
            return $this->httpClientFacade->getContentOfSite($url, $site, $uri);
        }

        return $html;
    }

    /**
     * Exit condition: tells whether the site must be skipped.
     *
     * A site is skipped when it has no URL, or when its organization id is
     * already present in DatabaseFacade::universities().
     *
     * @param array $site Site record with 'org_id' and 'site' keys
     * @return bool True when the site must not be processed
     */
    private function isExit(array $site): bool
    {
        // Short-circuit keeps the original order: the database lookup is
        // only performed when the site has a URL. The comparison is left
        // non-strict on purpose - the id types on both sides are not visible
        // here (TODO confirm before switching to strict mode).
        return !$site['site']
            || in_array($site['org_id'], $this->databaseFacade->universities());
    }
}
|