Перенес код

This commit is contained in:
2024-09-03 20:16:34 +03:00
parent 88d85865a0
commit 5344b31f97
1716 changed files with 146261 additions and 6896 deletions

View File

@ -0,0 +1,29 @@
<?php
namespace ContingentParser;
/**
 * ANSI escape sequences used to colour console output.
 *
 * Every case is backed by the raw escape string that is written to the
 * terminal; WHITE holds the `ESC[0m` sequence.
 */
enum Color : string
{
    case WHITE = "\033[0m";
    case GREEN = "\033[92m";
    case RED = "\033[91m";
    case BLUE = "\033[94m";

    /**
     * Translate a colour name into the matching enum case.
     *
     * @param string $color 'green', 'red' or 'blue' (exact, lowercase)
     * @return Color the matching case, or WHITE for any other name
     */
    public static function create(string $color) : Color
    {
        return match ($color) {
            'green' => self::GREEN,
            'red' => self::RED,
            'blue' => self::BLUE,
            default => self::WHITE,
        };
    }

    /**
     * Return the raw escape sequence backing this case.
     */
    public function tostring() : string
    {
        return $this->value;
    }
}

View File

@ -8,79 +8,126 @@ use PDO;
final class Database
{
private PDO $_pdo;
private static $_logFile = 'log/database.log';
private DatabaseConfig $_databaseConfig;
private DatabaseLogger $_logger;
private PDO $pdo;
private static $logfile = 'log/database.log';
private DatabaseConfig $databaseConfig;
private DatabaseLogger $logger;
private const NO_CONNECT = "HY000";
/**
* Конструктор
* @param \ContingentParser\Database\DatabaseConfig $config
* Конфигурация подключения к базе данных
*/
public function __construct(DatabaseConfig $config)
{
$this->_logger = new DatabaseLogger(self::$_logFile);
$this->_databaseConfig = $config;
$this->logger = new DatabaseLogger(self::$logfile);
$this->databaseConfig = $config;
try {
$dsn = $this->_databaseConfig->getDsn();
$username = $this->_databaseConfig->getUsername();
$password = $this->_databaseConfig->getPassword();
$this->_pdo = new PDO(
$dsn = $this->databaseConfig->getDsn();
$username = $this->databaseConfig->getUsername();
$password = $this->databaseConfig->getPassword();
$this->pdo = new PDO(
$dsn,
$username,
$password,
[PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION]
);
$message = "Подключение к {$this->_databaseConfig->getDBName()} успешно!";
$this->_logger->log($message);
$message = "Подключение к {$this->databaseConfig->getDBName()} успешно!";
$this->logger->log($message);
} catch (PDOException $e) {
$message = "Ошибка подключения к {$this->_databaseConfig->getDBName()}: {$e->getMessage()}";
$this->_logger->log($message);
$message = "Ошибка подключения к {$this->databaseConfig->getDBName()}: {$e->getMessage()}";
$this->logger->log($message);
}
}
/**
* Сообщение о разрыве соединения
*/
public function __destruct()
{
$message = "Подключение к {$this->_databaseConfig->getDBName()} прервано!";
$this->_logger->log($message);
$message = "Подключение к {$this->databaseConfig->getDBName()} прервано!";
$this->logger->log($message);
}
// Массив $params должен начанаться с 1
/**
* Выборка данных из базы
* @param string $sql
* SQL-запрос
* @param array $params
* Параметры запроса
* @return array
*/
public function select(string $sql, array $params = []) : array
{
try {
$stmt = $this->_pdo->prepare($sql);
for ($i = 1; $i < count($params); $i++) {
$stmt->bindParam(":v$i", $params[$i]);
$stmt = $this->pdo->prepare($sql);
if (!empty($params)) {
for ($i = 0; $i < count($params); $i++) {
$stmt->bindParam(":v".($i+1), $params[$i]);
}
}
$stmt->execute();
$array = $stmt->fetchAll(PDO::FETCH_ASSOC);
} catch (PDOException $e) {
$message = "Ошибка запроса: " . $e->getMessage();
$this->_logger->log($message);
$this->logger->log($message);
} finally {
return $array;
}
}
/**
* Добавление данных в базу
* @param string $sql
* SQL-запрос
* @param array $params
* Параметры запроса
* @return void
*/
public function insert(string $sql, array $params)
{
try {
$stmt = $this->_pdo->prepare($sql);
for ($i = 0; $i < count($params); $i++) {
$stmt->bindParam(":spec_code".$i+1, $params[$i]['spec_code']);
$stmt->bindParam(":spec_name".$i+1, $params[$i]['spec_name']);
$stmt->bindParam(":edu_forms".$i+1, $params[$i]['edu_forms']);
$stmt->bindParam(":edu_level".$i+1, $params[$i]['edu_level']);
$stmt->bindParam(":contingent".$i+1, $params[$i]['contingent']);
$stmt->bindParam(":org_id".$i+1, $params[$i]['org_id']);
$stmt->bindParam(":spec_id".$i+1, $params[$i]['spec_id']);
$stmt = $this->pdo->prepare($sql);
$count = 1;
$size = count($params[0]);
foreach ($params as $param) {
for ($i = $count; $i <= $size; $i++) {
$param = array_values($param);
$stmt->bindParam(":v$i", $param[$i-$count]);
}
$count += count($param);
$size += count($param);
}
$stmt->execute();
$this->_logger->log("Запрос выполнен успешно!");
$this->logger->log("Запрос выполнен успешно!");
} catch (PDOException $e) {
$message = "Ошибка запроса:" . $e->getMessage();
$this->_logger->log($message);
$this->logger->log($message);
// При ошибке запроса сохраняем валидные данные в yaml-файл
if ($e->getCode() === "HY000") {
if ($e->getCode() === self::NO_CONNECT) {
$yaml = Yaml::dump($params);
file_put_contents('/not-recorded-in-db.yaml', $yaml, FILE_APPEND);
file_put_contents('not-recorded-in-db.yaml', $yaml, FILE_APPEND);
}
}
}
}
/**
 * Run a data-modifying statement (UPDATE) against the database.
 *
 * The n-th element of $params (0-based) is bound to the named
 * placeholder ":v{n+1}", so the SQL is expected to use :v1, :v2, ...
 * A PDOException is not propagated: its message is written to the
 * database log instead.
 *
 * @param string $sql    SQL text with :vN placeholders
 * @param array  $params values to bind, indexed from 0
 * @return void
 */
public function update(string $sql, array $params)
{
    try {
        $statement = $this->pdo->prepare($sql);
        $total = count($params);
        $index = 0;
        while ($index < $total) {
            $placeholder = ":v" . ($index + 1);
            $statement->bindParam($placeholder, $params[$index]);
            $index++;
        }
        $statement->execute();
        $this->logger->log("Запрос выполнен успешно!");
    } catch (PDOException $error) {
        $this->logger->log("Ошибка запроса:" . $error->getMessage());
    }
}
}

View File

@ -3,25 +3,25 @@ namespace ContingentParser\Database;
final class DatabaseConfig
{
private string $_driver;
private string $_host;
private string $_dbname;
private string $_port;
private string $_charset;
private string $_username;
private string $_password;
private string $driver;
private string $host;
private string $dbname;
private string $port;
private string $charset;
private string $username;
private string $password;
public function __construct(string $db)
{
$config = $this->getDataEnv($db);
$this->_driver = $config['DB_DRIVER'];
$this->_host = $config['DB_HOST'];
$this->_dbname = $config['DB_NAME'];
$this->_port = $config['DB_PORT'];
$this->_charset = $config["DB_CHARSET"];
$this->_username = $config['DB_USERNAME'];
$this->_password = $config['DB_PASSWORD'];
$this->driver = $config['DB_DRIVER'];
$this->host = $config['DB_HOST'];
$this->dbname = $config['DB_NAME'];
$this->port = $config['DB_PORT'];
$this->charset = $config["DB_CHARSET"];
$this->username = $config['DB_USERNAME'];
$this->password = $config['DB_PASSWORD'];
}
private function getDataEnv(string $db) : array
@ -39,24 +39,24 @@ final class DatabaseConfig
public function getDBName(): string
{
return $this->_dbname;
return $this->dbname;
}
public function getDsn() : string
{
return $this->_driver.":host=".$this->_host
.";dbname=".$this->_dbname
.";charset=".$this->_charset
.";port=".$this->_port;
return $this->driver.":host=".$this->host
.";dbname=".$this->dbname
.";charset=".$this->charset
.";port=".$this->port;
}
public function getUsername() : string
{
return $this->_username;
return $this->username;
}
public function getPassword() : string
{
return $this->_password;
return $this->password;
}
}
}

View File

@ -0,0 +1,213 @@
<?php
namespace ContingentParser\Database;
use NilPortugues\Sql\QueryBuilder\Builder\GenericBuilder;
/**
 * Facade over the two project databases, `niimko` and `opendata`.
 *
 * On construction it opens both connections and caches the list of
 * specialties (from niimko) and the ids of organisations that already
 * have contingent rows (from opendata).
 */
class DatabaseFacade
{
    private Database $opendata;
    private Database $niimko;
    /** Rows of ['spec_id' => ..., 'spec_code' => ...] loaded by getSpecialties(). */
    private array $specialties;
    /** org_id values loaded by getUniversities(). */
    private array $universities;

    /**
     * Open both connections and preload the cached lookups.
     */
    public function __construct()
    {
        $this->niimko = new Database(new DatabaseConfig('niimko'));
        $this->opendata = new Database(new DatabaseConfig('opendata'));
        $this->specialties = $this->getSpecialties();
        $this->universities = $this->getUniversities();
    }

    /**
     * Fetch organisation ids and site URLs from niimko.
     *
     * @return array rows of ['org_id' => ..., 'site' => ...]
     */
    public function getSitesFromNiimko() : array
    {
        /*
        SELECT kod AS org_id, site FROM niimko.s_vuzes
        WHERE ootype = 'vuz' AND deleted = 'n' AND fake = 'n' AND country = 'RU'
        */
        $builder = new GenericBuilder();
        // Values are listed in the same order as the equals() calls below,
        // because Database::select() binds them positionally to :v1, :v2, ...
        $params = ['vuz', 'n', 'n', 'RU'];
        $query = $builder->select()
            ->setTable('s_vuzes')
            ->setColumns(['org_id' => 'kod', 'site'])
            ->where('AND')
            ->equals('ootype', 'vuz')
            ->equals('deleted', 'n')
            ->equals('fake', 'n')
            ->equals('country', 'RU')
            ->end();
        $sql = $builder->write($query);

        return $this->niimko->select($sql, $params);
    }

    /**
     * Fetch site URLs from opendata.miccedu_monitoring for the given organisations.
     *
     * @param array $params rows that each contain an 'org_id' key
     * @param int   $year   monitoring year to query (defaults to the
     *                      previously hard-coded 2023)
     * @return array rows of ['org_id' => ..., 'site' => ...]
     */
    public function getSitesFromMiccedu(array $params, int $year = 2023) : array
    {
        /*
        SELECT site, vuzkod AS org_id FROM opendata.miccedu_monitoring
        WHERE year = :v1 AND (vuzkod = :v2 OR vuzkod = :v3 OR ...)
        */
        $builder = new GenericBuilder();
        $orgIds = [];
        foreach ($params as $org) {
            $orgIds[] = (int)$org['org_id'];
        }
        $query = $builder->select()
            ->setTable('miccedu_monitoring')
            ->setColumns(['org_id' => 'vuzkod','site'])
            ->where('AND')
            ->equals('year', $year)
            ->subWhere('OR');
        foreach ($orgIds as $orgId) {
            $query->equals('vuzkod', $orgId);
        }
        $query = $query->end();
        $sql = $builder->write($query);

        // The year is the first placeholder, followed by the ids.
        return $this->opendata->select($sql, array_merge([$year], $orgIds));
    }

    /**
     * Insert contingent rows into opendata.sveden_education_contingent
     * with a single multi-row INSERT.
     *
     * The column list and the first VALUES tuple come from the query
     * builder (using the keys of the first row); the remaining tuples are
     * appended here with consecutively numbered :vN placeholders. The
     * number of placeholders per tuple follows the number of columns in
     * the first row, so it is no longer tied to exactly seven columns.
     *
     * @param array $contingent list of associative rows that share the same keys
     * @return void
     */
    public function insertContingent(array $contingent) : void
    {
        /*
        INSERT INTO sveden_education_contingent
        (org_id, spec_id, spec_code, spec_name, edu_level, edu_forms, contingent)
        VALUES
        (:v1, :v2, :v3, :v4, :v5, :v6, :v7), (:v8, ...), ...
        */
        if (empty($contingent)) {
            // Nothing to insert; previously this dereferenced a missing row 0.
            return;
        }
        $contingent = array_values($contingent);
        $columnCount = count($contingent[0]);
        $rowCount = count($contingent);

        $builder = new GenericBuilder();
        $query = $builder->insert()
            ->setTable('sveden_education_contingent')
            ->setValues($contingent[0]);
        $sql = $builder->write($query);

        for ($row = 1; $row < $rowCount; $row++) {
            $first = $row * $columnCount + 1;
            $placeholders = [];
            for ($offset = 0; $offset < $columnCount; $offset++) {
                $placeholders[] = ":v" . ($first + $offset);
            }
            $sql .= " (" . implode(", ", $placeholders) . ")\n";
        }
        // Normalise whitespace around VALUES and put commas between tuples.
        $sql = preg_replace('/\)\s*VALUES\s*/', ') VALUES ', $sql);
        $sql = preg_replace('/\)\s*\(/', '), (', $sql);

        $this->opendata->insert($sql, $contingent);
    }

    /**
     * Cached specialties loaded in the constructor.
     *
     * @return array rows of ['spec_id' => ..., 'spec_code' => ...]
     */
    public function specialties() : array
    {
        // Fixed: this accessor used to return the universities list.
        return $this->specialties;
    }

    /**
     * Cached ids of organisations that already have rows in
     * sveden_education_contingent.
     *
     * @return array list of org_id values
     */
    public function universities() : array
    {
        // Fixed: this accessor used to return the specialties list.
        return $this->universities;
    }

    /**
     * Load specialty ids and codes from niimko.
     *
     * @return array rows of ['spec_id' => ..., 'spec_code' => ...]
     */
    private function getSpecialties() : array
    {
        /*
        SELECT id AS spec_id, kod AS spec_code FROM niimko.s_specs
        WHERE oopkodes = 'gos3p'
        */
        $builder = new GenericBuilder();
        $params = ['gos3p'];
        $query = $builder->select()
            ->setTable('s_specs')
            ->setColumns(['spec_id' =>'id', 'spec_code' => 'kod'])
            ->where()
            ->equals('oopkodes','gos3p')
            ->end();
        $sql = $builder->write($query);

        return $this->niimko->select($sql, $params);
    }

    /**
     * Load the distinct organisation ids present in
     * opendata.sveden_education_contingent.
     *
     * @return array list of org_id values
     */
    private function getUniversities() : array
    {
        /*
        SELECT DISTINCT org_id FROM sveden_education_contingent
        */
        $builder = new GenericBuilder();
        // A WHERE clause is built and then stripped from the generated SQL,
        // and DISTINCT is injected by text replacement.
        $query = $builder->select()
            ->setTable('sveden_education_contingent')
            ->setColumns(['org_id'])
            ->where()
            ->greaterThan('org_id', 0)
            ->end();
        $sql = $builder->write($query);
        $sql = preg_replace("/ WHERE.*/", '', $sql);
        $sql = preg_replace('/SELECT/', 'SELECT DISTINCT', $sql);
        $universities = $this->opendata->select($sql);

        return array_column($universities, 'org_id');
    }

    /**
     * Update site URLs in niimko.s_vuzes with one CASE-based UPDATE.
     *
     * Accepts either a flat list [kod1, site1, kod2, site2, ...] (what the
     * original loop logic consumed) or a list of rows
     * [['org_id' => ..., 'site' => ...], ...] (what the original docblock
     * described); rows are flattened to the flat form before binding.
     *
     * NOTE(review): the statement targets niimko.s_vuzes but is executed
     * through the opendata connection — confirm this is intended.
     *
     * @param array $params organisation codes and their new site URLs
     * @return void
     */
    public function updateSitesOpendata(array $params) : void
    {
        /*
        UPDATE niimko.s_vuzes
        SET site = CASE kod
            WHEN :v1 THEN :v2
            WHEN :v3 THEN :v4
            ...
            ELSE site
        END
        WHERE kod IN (:v5, :v6, ...)
        */
        $values = [];
        foreach ($params as $item) {
            if (is_array($item)) {
                $values[] = $item['org_id'];
                $values[] = $item['site'];
            } else {
                $values[] = $item;
            }
        }
        $count = count($values);
        if ($count === 0) {
            // An empty IN () list is invalid SQL; there is nothing to update.
            return;
        }

        $sql = "UPDATE niimko.s_vuzes\nSET site = CASE kod\n";
        for ($i = 0; $i < $count; $i += 2) {
            $sql .= "WHEN :v" . ($i + 1) . " THEN :v" . ($i + 2) . "\n";
            // Each code is bound a second time for the IN (...) list, under
            // its own placeholder number.
            $values[] = $values[$i];
        }
        // ELSE keeps the current value; the previous "ELSE kod" would have
        // written the organisation code into the site column.
        $sql .= "ELSE site\nEND\nWHERE kod in(";
        $total = count($values);
        for ($i = $count + 1; $i <= $total; $i++) {
            $sql .= ":v" . $i . ",\n";
        }
        $sql = rtrim($sql, ",\n") . ")\n";

        $this->opendata->update($sql, $values);
    }
}

View File

@ -1,215 +1,102 @@
<?php
namespace ContingentParser;
use ContingentParser\Database\Database;
use ContingentParser\Parser\ContingentParser;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\ClientException;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Exception\ServerException;
use GuzzleHttp\Psr7\Exception\MalformedUriException;
use GuzzleHttp\TransferStats;
use NilPortugues\Sql\QueryBuilder\Builder\GenericBuilder;
use ContingentParser\Database\DatabaseFacade;
use ContingentParser\Http\HttpClientFacade;
use ContingentParser\Http\UrlBuilder;
use ContingentParser\Logger\HtmlLogger;
use ContingentParser\Parser\ContingentFacade;
class Facade
{
private GenericBuilder $_builder;
private DatabaseFacade $databaseFacade;
private HttpClientFacade $httpClientFacade;
private ContingentFacade $contingentFacade;
private HtmlLogger $htmlLogger;
/**
* Конструктор
*/
public function __construct()
{
$this->_builder = new GenericBuilder();
$this->databaseFacade = new DatabaseFacade();
$this->httpClientFacade = new HttpClientFacade();
$this->contingentFacade = new ContingentFacade();
$this->htmlLogger = new HtmlLogger('log/html.log');
}
public function getSitesFromDatabase(Database $db) : array
/**
* Получить массив сайтов
* @param array $params
* Массив сайтов, для которых нужны обновлённые URL
* @return array
*/
public function getSites(array $params = []) : array
{
// SELECT kod AS org_id, site FROM niimko.s_vuzes
// WHERE ootype = 'vuz' AND deleted = 'n' AND fake = 'n'
$params = [1 => 'vuz', 'n', 'n', 'RU'];
$query = $this->_builder->select()
->setTable('s_vuzes')
->setColumns(['org_id' => 'kod', 'site'])
->where('AND')
->equals('ootype', 'vuz')
->equals('deleted', 'n')
->equals('fake', 'n')
->equals('country', 'RU')
->end();
$sql = $this->_builder->write($query);
$sites = $db->select($sql, $params);
return $sites;
}
public function getSpecialtiesFromDatabase(Database $db) : array
{
// SELECT id AS spec_id, kod AS spec_code FROM niimko.s_specs
// WHERE oopkodes = 'gos3p'
$params = [1 => 'gos3p'];
$query = $this->_builder->select()
->setTable('s_specs')
->setColumns(['spec_id' =>'id', 'spec_code' => 'kod'])
->where()
->equals('oopkodes','gos3p')
->end();
$sql = $this->_builder->write($query);
var_dump($sql);
$specialties = $db->select($sql, $params);
return $specialties;
}
public function getUniversitiesFromDatabase(Database $db) : array
{
// SELECT DISTINCT org_id FROM sveden_education_contingent
$params = [1 => 'org_id'];
$query = $this->_builder->select()
->setTable('sveden_education_contingent')
->setColumns(['org_id'])
->where()
->greaterThan('org_id', 0)
->end();
$sql = $this->_builder->write($query);
$sql = preg_replace("/ WHERE.*/", '', $sql);
$sql = preg_replace('/SELECT/', 'SELECT DISTINCT', $sql);
$specialties = $db->select($sql, $params);
return $specialties;
}
public function getBaseUri(string $url) : string
{
// Строит -> https://<base_uri>
$url = trim(strtolower($url));
$url = preg_replace('/\s+/', '', $url);
$url = str_replace("www/", "www.", $url);
$url = str_replace("http:\\\\", "", $url);
if (!preg_match('#^https?://#', $url)) {
$url = "https://$url";
if (empty($params)) {
return $this->databaseFacade->getSitesFromNiimko();
} else {
return $this->databaseFacade->getSitesFromMiccedu($params);
}
// $url = str_replace("http://", "https://", $url);
$arr = parse_url($url);
$url = $arr['scheme'] . '://' . $arr['host'] . '/';
// $url = str_replace("www.", "", $url);
$url = str_replace("_", "/", $url);
return trim($url);
}
/**
* Cобирает из микроразметки данные таблицы
* "Информация о численности обучающихся" в разделе "Образование"
* @param array $site
* Сайт содержащий id организации и URL
* @return void
*/
public function collectDataFromContingent(array $site) : void
{
list('org_id' => $orgId, 'site' => $url) = $site;
// Нет URL сайта вуза
if (empty($site)) {
// $httpLogger->log($orgId);
return;
}
// Уже в базе
if (in_array($orgId, $this->databaseFacade->universities())) {
return;
}
$urlBuilder = new UrlBuilder();
$url = $urlBuilder->build($url);
Printer::println(implode(' ', $site), 'green');
$html = $this->httpClientFacade->processEducationContingentSites(
$url,
$site
);
public function handleEducationContingentSites(
string $uri,
array $site
) : string {
try {
$client = new Client(
$this->setConfigClient($uri)
);
// Запрос по базовому uri
$response = $client->get('', [
'on_stats' => function (TransferStats $stats) use (&$url) {
$url = $stats->getEffectiveUri();
}
]);
print("Redirect $uri -> $url" . PHP_EOL);
if (substr($url, -1) == '/') {
$url .= "sveden/education/";
$contingent = $this->contingentFacade->getContingent(
$html,
$this->databaseFacade->specialties(),
$orgId
);
if (empty($contingent)) {
Printer::println("No result", 'red');
$this->htmlLogger->log("$orgId $url");
} else {
if ($this->contingentFacade->isValidContingent($contingent)) {
// Заносим в базу
Printer::print_r($contingent, 'blue');
// $this->databaseFacade->insertContingent($contingent);
} else {
$url .= "/sveden/education/";
$this->htmlLogger->log("$orgId $url");
Printer::println("No result", 'red');
}
print("Parsing for $url" . PHP_EOL);
$response = $client->get($url);
$html = $response->getBody()->getContents();
} catch (ClientException
| RequestException
| ConnectException
| ServerException
| MalformedUriException $e
) {
$html = '';
} finally {
return $html;
}
Printer::println();
}
private function setConfigClient(string $baseUri) : array
public function getExclusionSites(string $path) : array
{
return [
'force_ip_resolve' => 'v4',
'debug' => fopen("debug-http.log", "a"),
'base_uri' => $baseUri,
'allow_directs' => [
'max' => 5,
'strict' => true,
'referer' => true,
'protocols' => ['http', 'https'],
'track_redirects' => true
],
'connect_timeout' => 300.0,
'verify' => false,
'headers' => [
'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36',
'Content-Type' => 'text/html;charset=utf-8'
]
];
}
public function getContingent(
string $html,
?array $specialties,
int $orgId
) : array {
$parser = new ContingentParser($html, '//tr[@itemprop="eduChislen"]//');
$contingent = $parser->getDataTable();
// $this->addSpecId($contingent, $specialties);
$this->addOrgId($contingent, $orgId);
return $contingent;
}
private function addSpecId(array &$contingent, array $specialties) : void
{
$specIdMap = array_column($specialties, 'spec_id', 'spec_code');
print_r($specIdMap);
foreach ($contingent as $key => $con) {
$contingent[$key]['spec_id'] = $specIdMap[$con['spec_code']] ?? null;
$logs = file($path);
$result = [];
foreach ($logs as $log) {
$data = explode(' ', $log);
$result[] = [
'org_id' => $data[2],
'site' => $data[3] ? $data[3] : ''
];
}
}
private function addOrgId(array &$contingent, int $orgId): void
{
foreach ($contingent as &$con) {
$con['org_id'] = $orgId;
}
}
public function isValidContingent(array $contingent) : bool
{
$count = 0;
foreach ($contingent as $value) {
$count += $value['contingent'];
}
return $count ? true : false;
}
public function insertContingent(array $contingent) : void
{
$countAtributes = count($contingent[0]);
$size = count($contingent) * ($countAtributes - 1);
$query = $this->_builder->insert()
->setTable('sveden_education_contingent')
->setValues([
'org_id' => '',
'spec_id' => '',
'edu_code' => '',
'edu_name' => '',
'edu_form' => '',
'edu_level' => '',
'contingent' => ''
]);
$sql = $this->_builder->writeFormatted($query);
for ($i = $countAtributes; $i <= $size;) {
$sql .= " (:v".(++$i).", :v".(++$i).", :v".(++$i).", :v".(++$i).", :v".(++$i).", :v".(++$i).", :v".(++$i).")\n";
}
echo $sql;
return $result;
}
}

View File

@ -0,0 +1,99 @@
<?php
namespace ContingentParser\Http;
use ContingentParser\Logger\HttpLogger;
use ContingentParser\Printer;
use CurlHandle;
/**
 * Small cURL wrapper that downloads one URL for one site and can report
 * the outcome of the last transfer.
 */
final class CurlHelper
{
    /** Maximum number of manual Location hops getContent() will perform. */
    private const MAX_REDIRECTS = 10;

    private CurlHandle $curl;
    private string $url;
    private array $site;

    /**
     * Initialise the cURL session.
     *
     * @param string $url  URL to download
     * @param array  $site organisation id and base site URL (used only in log messages)
     * @throws \RuntimeException when the cURL session cannot be created
     */
    public function __construct(string $url, array $site)
    {
        $this->url = $url;
        $this->site = $site;
        $handle = curl_init();
        if ($handle === false) {
            throw new \RuntimeException('Unable to initialise a cURL session');
        }
        $this->curl = $handle;
        curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, true);
        // Certificate and host verification are switched off.
        curl_setopt($this->curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($this->curl, CURLOPT_SSL_VERIFYHOST, false);
        // Response headers are included in the returned text; getContent()
        // inspects them for Location headers.
        curl_setopt($this->curl, CURLOPT_HEADER, true);
        curl_setopt($this->curl, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($this->curl, CURLOPT_USERAGENT,
            'Mozilla/5.0 (X11; Linux x86_64) '
            .'AppleWebKit/537.36 (KHTML, like Gecko) '
            .'Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36'
        );
        curl_setopt($this->curl, CURLOPT_CONNECTTIMEOUT, 90);
    }

    /**
     * Close the session.
     */
    public function __destruct()
    {
        // The handle is unset if the constructor threw before assigning it.
        if (isset($this->curl)) {
            curl_close($this->curl);
        }
    }

    /**
     * Download the page and return the raw response (headers followed by body).
     *
     * While the response headers contain an absolute http(s) Location
     * target, the request is repeated against that target, at most
     * MAX_REDIRECTS times (the previous implementation recursed without a
     * bound). A failed transfer yields an empty string.
     *
     * @return string
     */
    public function getContent() : string
    {
        $html = '';
        for ($hop = 0; $hop <= self::MAX_REDIRECTS; $hop++) {
            curl_setopt($this->curl, CURLOPT_URL, $this->url);
            $response = curl_exec($this->curl);
            // curl_exec() returns false on failure.
            $html = is_string($response) ? $response : '';
            if (!$this->checkLocation($this->url, $html)) {
                break;
            }
        }
        return $html;
    }

    /**
     * Look for a Location header in the response and, if one is found,
     * replace $url with its target.
     *
     * Only the header section is searched and the header name must start
     * a line, so body text and headers such as Content-Location no longer
     * match. Targets that are not absolute http(s) URLs are ignored, so
     * the response already obtained is kept instead of being replaced by
     * a request to an unusable URL.
     *
     * @param string $url  current URL; overwritten with the redirect target
     * @param string $html raw response (headers + body) of the last transfer
     * @return bool true when $url was changed and the request should be repeated
     */
    private function checkLocation(string &$url, string $html) : bool
    {
        $headerSize = (int) curl_getinfo($this->curl, CURLINFO_HEADER_SIZE);
        $headers = substr($html, 0, $headerSize);
        if (!preg_match('/^location:(.*?)\n/im', $headers, $matches)) {
            return false;
        }
        // NOTE(review): this whitelist also strips '?', '=', '&', '%' and
        // '_', which alters targets that carry a query string — confirm
        // whether that is intended.
        $target = preg_replace("/[^a-z0-9\-:.\/,]/iu", '', $matches[1]);
        if (!$target || !preg_match('#^https?://#i', $target)) {
            return false;
        }
        $url = $target;
        return true;
    }

    /**
     * Report the result of the last transfer.
     *
     * A non-200 HTTP status is printed and logged together with the code;
     * a status of 0 is logged together with the cURL error; a 200 status
     * is only printed.
     *
     * @return void
     */
    public function reportError() : void
    {
        $httpLogger = new HttpLogger('log/http-curl.log');
        $httpCode = curl_getinfo($this->curl, CURLINFO_HTTP_CODE);
        if ($httpCode == 200) {
            Printer::println("HTTP-code: $httpCode", 'blue');
            return;
        }
        if ($httpCode == 0) {
            $errno = curl_errno($this->curl);
            $message = implode(' ', $this->site);
            $message .= " cURL error ({$errno}): ".curl_strerror($errno);
            $httpLogger->log($message);
            return;
        }
        Printer::println("HTTP-code: $httpCode", 'red');
        $message = implode(' ', $this->site) . ' HTTP-code(' . $httpCode.')';
        $httpLogger->log($message, $httpCode);
    }
}

View File

@ -0,0 +1,112 @@
<?php
namespace ContingentParser\Http;
use ContingentParser\Printer;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\ClientException;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Exception\ServerException;
use GuzzleHttp\Psr7\Exception\MalformedUriException;
use GuzzleHttp\TransferStats;
/**
 * Downloads the "sveden/education/" page of a site with Guzzle and falls
 * back to CurlHelper when Guzzle reports an error.
 */
final class HttpClientFacade
{
    /** Configuration used for the most recently created client. */
    private array $config;

    public function __construct() {}

    /**
     * Download the education page of a site.
     *
     * The base URL is requested first (its effective URI is only printed),
     * then "<base>/sveden/education/" is requested and its body returned.
     * When Guzzle throws one of the handled exceptions the education page
     * is requested once more through CurlHelper.
     *
     * @param string $url  base URL of the site
     * @param array  $site organisation id and base URL (used for logging)
     * @return string downloaded markup; may be empty when both attempts fail
     */
    public function processEducationContingentSites(
        string $url,
        array $site
    ) : string {
        // Built up front so that the cURL fallback always targets the
        // education page, even when the very first request fails.
        $educationUrl = $url . (substr($url, -1) == '/' ? '' : '/');
        $educationUrl .= "sveden/education/";
        try {
            $client = $this->createClient($url);
            $redirectUrl = null;
            // Request the base URI to learn where it redirects to.
            $client->get('', [
                'on_stats' => function (TransferStats $stats) use (&$redirectUrl) {
                    $redirectUrl = $stats->getEffectiveUri();
                }
            ]);
            Printer::println("Redirect $url -> $redirectUrl");
            Printer::println("Parsing for $educationUrl");
            $response = $client->get($educationUrl);
            $httpCode = $response->getStatusCode();
            Printer::println("HTTP-code: $httpCode", 'blue');

            return $response->getBody()->getContents();
        } catch (ClientException
            | RequestException
            | ConnectException
            | ServerException
            | MalformedUriException $e
        ) {
            Printer::println("HTTP-code: ".$e->getCode(), 'blue');

            // No "finally { return ... }" here: any other exception now
            // propagates unchanged instead of being swallowed.
            return $this->handleException($educationUrl, $site);
        }
    }

    /**
     * Retry the download with CurlHelper and log the result.
     *
     * @param string $url  URL to download
     * @param array  $site organisation id and base URL (used for logging)
     * @return string
     */
    private function handleException(string $url, array $site) : string
    {
        $curlHelper = new CurlHelper($url, $site);
        $html = $curlHelper->getContent();
        $curlHelper->reportError();
        return $html;
    }

    /**
     * Create a Guzzle client whose base URI is the given URL.
     *
     * @param string $url
     * @return \GuzzleHttp\Client
     */
    private function createClient(string $url) : Client
    {
        $this->config = $this->config() + ["base_uri" => $url];
        return new Client($this->config);
    }

    /**
     * Client configuration shared by all requests.
     *
     * @return array
     */
    private function config() : array
    {
        return [
            'force_ip_resolve' => 'v4',
            // Opened in "w" mode, so the debug log is truncated every time
            // a client is created.
            'debug' => fopen("log/debug-http.log", "w"),
            // Fixed key: it was spelled 'allow_directs', which Guzzle does
            // not recognise, so these redirect settings were never applied.
            'allow_redirects' => [
                'max' => 5,
                'strict' => true,
                'referer' => true,
                'protocols' => ['http', 'https'],
                'track_redirects' => true
            ],
            'connect_timeout' => 300.0,
            // TLS certificate verification is disabled.
            'verify' => false,
            'headers' => [
                'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) '
                    .'AppleWebKit/537.36 (KHTML, like Gecko) '
                    .'Chrome/124.0.0.0 YaBrowser/24.6.0.0 Safari/537.36',
                'Content-Type' => 'text/html;charset=utf-8'
            ]
        ];
    }
}

View File

@ -0,0 +1,30 @@
<?php
namespace ContingentParser\Http;
/**
 * Normalises a site address taken from the database into
 * "<scheme>://<host>/" form.
 */
class UrlBuilder
{
    public function __construct() {}

    /**
     * Build the base URL of a site.
     *
     * The input is lower-cased, stripped of all whitespace, has "www/"
     * corrected to "www." and a leading "http:\\" removed, and gets
     * "http://" prepended when no http(s) scheme is present. Only the
     * scheme and host of the result are kept.
     *
     * When the address cannot be parsed the missing parts are treated as
     * empty strings (yielding ":///" in the worst case) instead of
     * dereferencing the failed parse_url() result.
     *
     * @param string $url address as stored in the database
     * @return string
     */
    public function build(string $url) : string
    {
        $url = trim(strtolower($url));
        $url = preg_replace('/\s+/', '', $url);
        $url = str_replace("www/", "www.", $url);
        $url = str_replace("http:\\\\", "", $url);
        if (!preg_match('#^https?://#', $url)) {
            $url = "http://$url";
        }

        // parse_url() returns false for seriously malformed input and may
        // omit the 'host' key.
        $parts = parse_url($url);
        $scheme = is_array($parts) ? ($parts['scheme'] ?? '') : '';
        $host = is_array($parts) ? ($parts['host'] ?? '') : '';
        $url = $scheme . '://' . $host . '/';

        // NOTE(review): underscores in the host are turned into slashes;
        // the reason is not visible here — confirm against the source data.
        $url = str_replace("_", "/", $url);
        return trim($url);
    }
}

View File

@ -9,4 +9,4 @@ final class DatabaseLogger extends Logger
$logMessage = "[$date] $message\n";
file_put_contents($this->_path, $logMessage, FILE_APPEND);
}
}
}

View File

@ -0,0 +1,12 @@
<?php
namespace ContingentParser\Logger;
/**
 * Logger that appends timestamped messages to the file given to the
 * constructor.
 */
class HtmlLogger extends Logger
{
    /**
     * Append "[Y-m-d H:i:s] <message>" followed by a newline to the log file.
     *
     * @param string $message text to record
     * @return void
     */
    public function log(string $message) : void
    {
        $line = sprintf("[%s] %s\n", date('Y-m-d H:i:s'), $message);
        file_put_contents($this->_path, $line, FILE_APPEND);
    }
}

View File

@ -3,8 +3,68 @@ namespace ContingentParser\Logger;
final class HttpLogger extends Logger
{
public function log(string $message) : void
private const ARR_HTTP_STATUS_CODE = array(
100 => 'Continue',
101 => 'Switching Protocols',
200 => 'OK',
201 => 'Created',
202 => 'Accepted',
203 => 'Non-Authoritative Information',
204 => 'No Content',
205 => 'Reset Content',
206 => 'Partial Content',
300 => 'Multiple Choices',
301 => 'Moved Permanently',
302 => 'Found',
303 => 'See Other',
304 => 'Not Modified',
305 => 'Use Proxy',
306 => 'Switch Proxy',
307 => 'Temporary Redirect',
400 => 'Bad Request',
401 => 'Unauthorized',
402 => 'Payment Required',
403 => 'Forbidden',
404 => 'Not Found',
405 => 'Method Not Allowed',
406 => 'Not Acceptable',
407 => 'Proxy Authentication Required',
408 => 'Request Timeout',
409 => 'Conflict',
410 => 'Gone',
411 => 'Length Required',
412 => 'Precondition Failed',
413 => 'Payload Too Large',
414 => 'URI Too Long',
415 => 'Unsupported Media Type',
416 => 'Range Not Satisfiable',
417 => 'Expectation Failed',
418 => 'I\'m a teapot',
429 => 'Too Many Requests',
451 => 'Unavailable For Legal Reasons',
500 => 'Internal Server Error',
501 => 'Not Implemented',
502 => 'Bad Gateway',
503 => 'Service Unavailable',
504 => 'Gateway Timeout',
505 => 'HTTP Version Not Supported',
506 => 'Variant Also Negotiates',
507 => 'Insufficient Storage',
508 => 'Loop Detected',
509 => 'Bandwidth Limit Exceeded',
510 => 'Not Extended',
511 => 'Network Authentication Required'
);
/**
 * Append a timestamped message to the log file, optionally followed by
 * the reason phrase of an HTTP status code.
 *
 * @param string   $message  text to record
 * @param int|null $httpCode HTTP status code; when null or 0 only the
 *                           message is written. Codes missing from
 *                           ARR_HTTP_STATUS_CODE are recorded as
 *                           "Unknown Status" instead of triggering an
 *                           undefined-index lookup.
 * @return void
 */
public function log(string $message, ?int $httpCode = null) : void
{
    $date = date('Y-m-d H:i:s');
    $logMessage = "[$date] $message";
    if (!empty($httpCode)) {
        $logMessage .= " " . (self::ARR_HTTP_STATUS_CODE[$httpCode] ?? 'Unknown Status');
    }
    file_put_contents($this->_path, $logMessage . "\n", FILE_APPEND);
}
}
}

View File

@ -4,9 +4,9 @@ namespace ContingentParser\Logger;
/**
 * Base class for loggers: stores the path it is constructed with and
 * leaves the log() implementation to subclasses.
 */
abstract class Logger
{
// Path passed to the constructor; subclasses in this project append to it.
protected string $_path;
/**
 * @param string $path value stored in $_path
 */
public function __construct(string $path)
{
$this->_path = $path;
}
/**
 * Record a message.
 *
 * @param string $message text to record
 * @return void
 */
abstract public function log(string $message) : void;
}
}

View File

@ -0,0 +1,71 @@
<?php
namespace ContingentParser\Parser;
/**
 * Builds contingent (student headcount) records from a page's markup and
 * enriches them with specialty and organisation ids.
 */
class ContingentFacade
{
    /**
     * Parse the contingent table and attach ids to every row.
     *
     * @param string $html        markup of the page
     * @param array  $specialties rows of ['spec_id' => ..., 'spec_code' => ...]
     * @param int    $orgId       organisation id stored in every row
     * @return array rows returned by the parser, each extended with
     *               'spec_id' and 'org_id'
     */
    public function getContingent(
        string $html,
        array $specialties,
        int $orgId
    ) : array {
        $rows = (new ContingentParser($html))->getDataTable();
        $this->addSpecId($rows, $specialties);
        $this->addOrgId($rows, $orgId);

        return $rows;
    }

    /**
     * Tell whether the rows carry any headcount at all.
     *
     * @param array $contingent rows that each contain a 'contingent' value
     * @return bool true when the sum of all 'contingent' values is non-zero
     */
    public function isValidContingent(array $contingent) : bool
    {
        $total = 0;
        foreach ($contingent as $row) {
            $total = $total + $row['contingent'];
        }

        return (bool) $total;
    }

    /**
     * Set 'spec_id' on every row by looking up its 'spec_code' among the
     * known specialties; rows with an unknown code get null.
     *
     * @param array $contingent  rows to modify in place
     * @param array $specialties rows of ['spec_id' => ..., 'spec_code' => ...]
     * @return void
     */
    private function addSpecId(array &$contingent, array $specialties) : void
    {
        // spec_code => spec_id lookup table
        $idByCode = array_column($specialties, 'spec_id', 'spec_code');
        foreach ($contingent as &$row) {
            $row['spec_id'] = $idByCode[$row['spec_code']] ?? null;
        }
        unset($row);
    }

    /**
     * Set 'org_id' on every row.
     *
     * @param array $contingent rows to modify in place
     * @param int   $orgId      organisation id
     * @return void
     */
    private function addOrgId(array &$contingent, int $orgId): void
    {
        foreach (array_keys($contingent) as $index) {
            $contingent[$index]['org_id'] = $orgId;
        }
    }
}

View File

@ -6,8 +6,8 @@ use DOMXPath;
class ContingentParser
{
private DOMXPath $xpath;
private string $template;
private ?DOMXPath $xpath;
private const TEMPLATE = '//tr[@itemprop="eduChislen"]//';
private const ENCODING = "UTF-8";
private const FIELDS = [
"eduCode" => "td",
@ -17,17 +17,19 @@ class ContingentParser
"numberAll" => ["th", "td"]
];
public function __construct(string $html, string $template)
public function __construct(string $html)
{
libxml_use_internal_errors(true);
$dom = new DOMDocument(
encoding: self::ENCODING
);
$this->setEncoding($html);
$dom->loadHTML($html);
$this->xpath = new DOMXPath($dom);
$this->template = $template;
if (empty($html)) {
$this->xpath = null;
} else {
$this->setEncoding($html);
$dom->loadHTML($html);
$this->xpath = new DOMXPath($dom);
}
}
private function setEncoding(string &$html) : void
@ -41,25 +43,33 @@ class ContingentParser
);
$html = str_replace('windows-1251',self::ENCODING, $html);
}
$html = mb_convert_encoding($html,'HTML-ENTITIES','UTF-8');
}
private function parse() : array
{
$data = [];
foreach (self::FIELDS as $field => $tag) {
if (!is_array($tag)) {
$data[$field] = $this->xpath->query($this->template . $tag . "[@itemprop=\"$field\"]");
$data[$field] = $this->xpath->query(
self::TEMPLATE . $tag . "[@itemprop=\"$field\"]"
);
} else {
$th = $this->xpath->query($this->template . $tag[0] . "[@itemprop=\"$field\"]");
$td = $this->xpath->query($this->template . $tag[1] . "[@itemprop=\"$field\"]");
$th = $this->xpath->query(
self::TEMPLATE . $tag[0] . "[@itemprop=\"$field\"]"
);
$td = $this->xpath->query(
self::TEMPLATE . $tag[1] . "[@itemprop=\"$field\"]"
);
$data[$field] = $th->length > $td->length ? $th : $td;
}
}
return $data;
}
public function getDataTable() : array
{
if (empty($this->xpath)) return [];
$data = $this->parse();
$records = [];
if ($data == null) return [];
@ -86,4 +96,4 @@ class ContingentParser
}
return $records;
}
}
}

View File

@ -30,4 +30,4 @@ class ContingentRow
"contingent" => $this->contingent
];
}
}
}

View File

@ -0,0 +1,26 @@
<?php
namespace ContingentParser;
/**
 * Console output helpers that wrap text in the escape sequences provided
 * by the Color enum.
 */
class Printer
{
    /**
     * Print text in the given colour, followed by the WHITE sequence.
     *
     * @param string $text  text to print
     * @param string $color colour name understood by Color::create()
     * @return void
     */
    public static function print(string $text = '', string $color = '') : void
    {
        $start = Color::create($color)->tostring();
        $end = Color::WHITE->tostring();
        print($start . $text . $end);
    }

    /**
     * Same as print(), followed by a line break.
     *
     * @param string $text  text to print
     * @param string $color colour name understood by Color::create()
     * @return void
     */
    public static function println(string $text = '', string $color = '') : void
    {
        self::print($text, $color);
        print(PHP_EOL);
    }

    /**
     * Dump a value with print_r() between the colour sequence and the
     * WHITE sequence.
     *
     * @param mixed  $value value to dump
     * @param string $color colour name understood by Color::create()
     * @return void
     */
    public static function print_r(mixed $value, string $color = '') : void
    {
        print(Color::create($color)->tostring());
        print_r($value);
        print(Color::WHITE->tostring());
    }
}

View File

@ -1 +0,0 @@
<?php