добавил абстракций и первые варианты для парсинга результатов приема

This commit is contained in:
2024-10-10 11:51:24 +03:00
parent 984f6bda0a
commit 3b6fecec6c
310 changed files with 7831 additions and 44954 deletions

View File

@ -1,149 +0,0 @@
<?php
namespace SvedenParser\ContingentParser;
use SvedenParser\Http\HttpClient;
use SvedenParser\Http\UrlBuilder;
use SvedenParser\Logger\HtmlLogger;
use SvedenParser\Color;
use SvedenParser\Printer;
/**
 * Facade over the contingent ("student headcount") pipeline.
 *
 * It resolves the list of university sites, downloads the "Education" pages,
 * extracts the headcount table through ContingentService and logs the sites
 * that produced no usable result.
 */
final class ContingentFacade
{
    private ContingentRepository $contingentRepository;
    private HttpClient $httpClient;
    private ContingentService $contingentService;
    private HtmlLogger $htmlLogger;

    /**
     * Creates the repository, HTTP client, service and HTML logger used by the facade.
     */
    public function __construct()
    {
        $this->contingentRepository = new ContingentRepository();
        $this->httpClient = new HttpClient();
        $this->contingentService = new ContingentService();
        $this->htmlLogger = new HtmlLogger('log/html.log');
    }

    /**
     * Returns the sites to process.
     *
     * @param array $params Sites whose URLs need refreshing; when empty the
     *                      result of getSitesFromNiimko() is returned instead
     * @return array
     */
    public function getSites(array $params = []): array
    {
        if (!$params) {
            return $this->contingentRepository->getSitesFromNiimko();
        }

        return $this->contingentRepository->getSitesFromMiccedu($params);
    }

    /**
     * Collects, from the page microdata, the rows of the table
     * "Information about the number of students" in the "Education" section.
     *
     * @param array $site Site record holding the organisation id ('org_id') and URL ('site')
     * @return void
     */
    public function collectDataFromContingent(array $site): void
    {
        if ($this->isExit($site)) {
            return;
        }

        ['org_id' => $orgId, 'site' => $url] = $site;
        $url = UrlBuilder::build($url);
        Printer::println(implode(' ', $site), Color::GREEN);

        $html = $this->httpClient->getContentOfSite($url, $site, 'sveden/education/');
        if (!$html) {
            return;
        }

        // The section page may only link to the page that holds the table.
        $uri = $this->contingentService->getLink($html);
        Printer::println($uri, Color::YELLOW);
        if ($uri) {
            $pattern = '/^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$/';
            if (preg_match($pattern, $uri)) {
                // Absolute link: request it as is.
                $html = $this->httpClient->getContentOfSite($uri, $site);
            } elseif (UrlBuilder::checkUri($uri)) {
                // Relative link: root-relative paths are used verbatim, the
                // others are resolved against the section directory.
                $path = str_starts_with($uri, '/') ? $uri : "sveden/education/$uri";
                $html = $this->httpClient->getContentOfSite($url, $site, $path);
            }
        }

        // Extract the headcount table.
        $contingent = $this->contingentService->getContingent(
            $html,
            $this->contingentRepository->specialties(),
            $orgId
        );

        if ($contingent && $this->contingentService->isValidContingent($contingent)) {
            // Persisting is currently disabled; the rows are only printed.
            Printer::print_r($contingent, Color::BLUE);
            // $this->contingentRepository->insertContingent($contingent);
        } else {
            Printer::println("No result", Color::RED);
            $this->htmlLogger->log("$orgId $url");
        }
        Printer::println();
    }

    /**
     * Returns the organisation ids exposed by the repository's universities() list.
     *
     * @return array
     */
    public function getOrgInOpendata(): array
    {
        return $this->contingentRepository->universities();
    }

    /**
     * Tells whether the site must be skipped.
     *
     * @param array $site Site record with 'org_id' and 'site' keys
     * @return bool True when there is no URL or the organisation is already listed
     */
    private function isExit(array $site): bool
    {
        // No site URL for the university.
        if (!$site['site']) {
            return true;
        }

        // Already stored. The comparison stays loose on purpose: ids produced
        // by getSitesFromLog() are strings.
        return in_array($site['org_id'], $this->contingentRepository->universities());
    }

    /**
     * Reads "org_id site" pairs from a space-separated text file.
     *
     * Lines that do not contain at least two space-separated fields are
     * skipped. An unreadable file yields an empty list and a printed message.
     *
     * @param string $path Path of the file to read
     * @return array List of ['org_id' => string, 'site' => string]
     */
    public function getSitesFromLog(string $path): array
    {
        $result = [];

        // file() reports failure by returning false, not by throwing.
        $lines = is_readable($path) ? file($path) : false;
        if ($lines === false) {
            Printer::println("Cannot read file: $path", Color::RED);
            return $result;
        }

        foreach ($lines as $line) {
            $fields = explode(' ', $line);
            if (count($fields) < 2) {
                continue;
            }
            $result[] = [
                'org_id' => trim($fields[0]),
                'site' => trim($fields[1]),
            ];
        }

        return $result;
    }
}

View File

@ -0,0 +1,90 @@
<?php
namespace SvedenParser\ContingentParser;
use SvedenParser\Http\UrlBuilder;
use SvedenParser\Color;
use SvedenParser\Manager;
use SvedenParser\Printer;
/**
 * Manager for the "student headcount" (contingent) table found in the
 * "Education" section of a university site.
 */
final class ContingentManager extends Manager
{
    /**
     * Plugs the contingent-specific repository and service into the base manager.
     */
    public function __construct()
    {
        parent::__construct();
        $this->repository = new ContingentRepository();
        $this->service = new ContingentService();
    }

    /**
     * Collects, from the page microdata, the rows of the table
     * "Information about the number of students" in the "Education" section.
     *
     * @param array $site Site record holding the organisation id ('org_id') and URL ('site')
     * @return void
     */
    public function collectData(array $site): void
    {
        if ($this->isExit($site)) {
            return;
        }

        ['org_id' => $orgId, 'site' => $url] = $site;
        $url = UrlBuilder::build($url);
        Printer::println(implode(' ', $site), Color::GREEN);

        $html = $this->httpClient->getContentOfSite($url, $site, 'sveden/education/');
        if (!$html) {
            return;
        }

        // The section page may only link to the page that holds the table.
        $link = $this->service->getLink($html);
        Printer::println($link, Color::YELLOW);
        if ($link) {
            $absoluteUrl = '/^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$/';
            if (preg_match($absoluteUrl, $link)) {
                $html = $this->httpClient->getContentOfSite($link, $site);
            } elseif (UrlBuilder::checkUri($link)) {
                // Root-relative paths go as they are; anything else is
                // resolved against the section directory.
                $path = str_starts_with($link, '/') ? $link : "sveden/education/$link";
                $html = $this->httpClient->getContentOfSite($url, $site, $path);
            }
        }

        // Extract the headcount table.
        $records = $this->service->getData(
            $html,
            $this->repository->specialties(),
            $orgId
        );

        $hasData = $records && $this->service->isValidData($records);
        if (!$hasData) {
            Printer::println("No result", Color::RED);
            $this->htmlLogger->log("$orgId $url");
        } else {
            // Persisting is switched off for now; the rows are only printed.
            Printer::print_r($records, Color::BLUE);
            // $this->repository->insert($records);
        }
        Printer::println();
    }
}

View File

@ -6,16 +6,12 @@
namespace SvedenParser\ContingentParser;
use SvedenParser\Color;
use SvedenParser\Parser;
use SvedenParser\Printer;
use DOMDocument;
use DOMXPath;
final class ContingentParser
final class ContingentParser extends Parser
{
private ?DOMXPath $xpath;
private DOMDocument $dom;
private const TEMPLATE = '//tr[@itemprop="eduChislen"]//';
private const ENCODING = "UTF-8";
private const FIELDS = [
"eduCode" => "td",
"eduName" => "td",
@ -24,40 +20,11 @@ final class ContingentParser
"numberAll" => ["th", "td"]
];
public function __construct(string $html)
{
libxml_use_internal_errors(true);
$this->dom = new DOMDocument(
encoding: self::ENCODING
);
if (empty($html)) {
$this->xpath = null;
} else {
$this->setEncoding($html);
$this->dom->loadHTML($html);
$this->xpath = new DOMXPath($this->dom);
}
}
private function setEncoding(string &$html): void
{
$encoding = mb_detect_encoding($html, 'UTF-8, windows-1251');
if ($encoding != self::ENCODING) {
$html = mb_convert_encoding(
$html,
self::ENCODING,
$encoding
);
$html = str_replace('windows-1251',self::ENCODING, $html);
}
$html = mb_convert_encoding($html,'HTML-ENTITIES','UTF-8');
}
public function getDataTable(): array
{
if (empty($this->xpath)) return [];
$data = $this->parseContingent();
var_dump($data);
$data = $this->parse();
$records = [];
if ($data == null) return [];
@ -88,7 +55,7 @@ final class ContingentParser
return $records;
}
private function parseContingent(): array
protected function parse(): array
{
$data = [];
foreach (self::FIELDS as $field => $tag) {

View File

@ -1,94 +1,18 @@
<?php
namespace SvedenParser\ContingentParser;
use SvedenParser\Database\Database;
use SvedenParser\Database\DatabaseConfig;
use NilPortugues\Sql\QueryBuilder\Builder\GenericBuilder;
use SvedenParser\Repository;
final class ContingentRepository
final class ContingentRepository extends Repository
{
private Database $opendata;
private Database $niimko;
public const FILE_ADD_RECORDING = Database::FILE_ADD_RECORDING;
private array $specialties;
private array $universities;
/**
* Конструктор
*/
public function __construct()
{
$this->niimko = new Database(new DatabaseConfig('niimko'));
$this->opendata = new Database(new DatabaseConfig('opendata'));
$this->specialties = $this->getSpecialties();
$this->universities = $this->getUniversities();
}
/**
* Извлечение URL сайтов из базы данных niimko
* @return array
*/
public function getSitesFromNiimko(): array
{
/*
SELECT kod AS org_id, site FROM niimko.s_vuzes
WHERE ootype = 'vuz' AND deleted = 'n' AND fake = 'n'
*/
$builder = new GenericBuilder();
$params = ['vuz', 'n', 'n', 'RU'];
$query = $builder->select()
->setTable('s_vuzes')
->setColumns(['org_id' => 'kod', 'site'])
->where('AND')
->equals('ootype', 'vuz')
->equals('deleted', 'n')
->equals('fake', 'n')
->equals('country', 'RU')
->end();
$sql = $builder->write($query);
$sites = $this->niimko->select($sql, $params);
return $sites;
}
/**
* Извлечение сайтов базы данных opendata
* из таблицы miccedu_monitoring.
* @param array $params
* Сайты, у которых устаревшие URL
* @return array
*/
public function getSitesFromMiccedu(array $params): array
{
/*
SELECT site, vuzkod AS org_id FROM opendata.miccedu_monitoring
WHERE year = 2023 AND (vuzkod = :val1 OR vuzkod = :val2 OR ...)
*/
$builder = new GenericBuilder();
$year = 2023;
foreach ($params as $key => $org) {
$params[$key] = (int)$org['org_id'];
}
$query = $builder->select()
->setTable('miccedu_monitoring')
->setColumns(['org_id' => 'vuzkod','site'])
->where('AND')
->equals('year', $year)
->subWhere('OR');
foreach ($params as $orgId) {
$query->equals('vuzkod', $orgId);
}
$query = $query->end();
$sql = $builder->write($query);
array_unshift($params, $year);
$sites = $this->opendata->select($sql, $params);
return $sites;
}
/**
* Внесение данных численности обучающихся в базу данных opendata
* @param array $contingent
* @param array $data
* Массив записей численности по специальностям
* @return void
*/
public function insertContingent(array $contingent): void
public function insert(array $data): void
{
/*
INSERT INTO sveden_education_contingent
@ -98,12 +22,12 @@ final class ContingentRepository
...
*/
$builder = new GenericBuilder();
$countAtributes = count($contingent[0]);
$size = $countAtributes * (count($contingent) - 1);
$countAtributes = count($data[0]);
$size = $countAtributes * (count($data) - 1);
$query = $builder->insert()
->setTable('sveden_education_contingent')
->setValues(
$contingent[0]
$data[0]
);
$sql = $builder->write($query);
for ($i = $countAtributes; $i <= $size;) {
@ -112,105 +36,8 @@ final class ContingentRepository
}
$sql = preg_replace('/\)\s*VALUES\s*/', ') VALUES ', $sql);
$sql = preg_replace('/\)\s*\(/', '), (', $sql);
$this->opendata->insert($sql, $contingent);
}
/**
* Публичное получение специальностей
* @return array
*/
public function specialties(): array
{
return $this->specialties ? $this->specialties : [];
}
/**
* Публичное получение id вузов, занесенных в базу opendata
* @return array
*/
public function universities(): array
{
return $this->universities ? $this->universities : [];
}
/**
* Извлечение кодов специальности из базы данных niimko
* @return array
*/
private function getSpecialties(): array
{
/*
SELECT id AS spec_id, kod AS spec_code FROM niimko.s_specs
WHERE oopkodes = 'gos3p'
*/
$builder = new GenericBuilder();
$params = ['gos3p'];
$query = $builder->select()
->setTable('s_specs')
->setColumns(['spec_id' =>'id', 'spec_code' => 'kod'])
->where()
->equals('oopkodes','gos3p')
->end();
$sql = $builder->write($query);
$specialties = $this->niimko->select($sql, $params);
return $specialties;
}
/**
* Извлечение id вузов, занесенных в базу opendata
* @return array
*/
private function getUniversities(): array
{
/*
SELECT DISTINCT org_id FROM sveden_education_contingent
*/
$builder = new GenericBuilder();
$query = $builder->select()
->setTable('sveden_education_contingent')
->setColumns(['org_id'])
->where()
->greaterThan('org_id', 0)
->end();
$sql = $builder->write($query);
$sql = preg_replace("/ WHERE.*/", '', $sql);
$sql = preg_replace('/SELECT/', 'SELECT DISTINCT', $sql);
$universities = $this->opendata->select($sql);
return array_column($universities, 'org_id');
}
/**
* Обновление сайтов в базе данных niimko
* @param array $params
* Массив [['org_id' => val1, 'site' => val1,],...]
* @return void
*/
public function updateSitesOpendata(array $params): void
{
/*
UPDATE niimko.s_vuzes
SET site = CASE kod
WHEN :v1 THEN :v2
WHEN :v3 THEN :v4
...
ELSE kod
END
WHERE kod IN (:v1, :v2...)
*/
$count = count($params);
for ($i = 0; $i < $count; $i++) {
if ($i % 2 == 0) {
$params[] = $params[$i];
}
}
$sql = "UPDATE niimko.s_vuzes\nSET site = CASE kod\n";
for ($i = 0; $i < $count;) {
$sql .= "WHEN :v".++$i." THEN :v".++$i."\n";
}
$sql .= "ELSE kod\nEND\nWHERE kod in(";
for ($i = $count++; $i < count($params);) {
$sql .= ":v".++$i.",\n";
}
$sql = rtrim($sql,",\n") .")\n";
$this->opendata->update($sql, $params);
$this->opendata->insert($sql, $data);
}
}

View File

@ -1,7 +1,8 @@
<?php
namespace SvedenParser\ContingentParser;
use SvedenParser\Service;
final class ContingentService
final class ContingentService extends Service
{
/**
* Получить данные о численности
@ -10,11 +11,8 @@ final class ContingentService
* @param int $orgId Идентификатор организации
* @return array
*/
public function getContingent(
string $html,
array $specialties,
int $orgId
): array {
public function getData(string $html, array $specialties, int $orgId): array
{
$parser = new ContingentParser($html);
$contingent = $parser->getDataTable();
$this->addSpecId($contingent, $specialties);
@ -27,7 +25,7 @@ final class ContingentService
* @param array $contingent Массив численности по специальностям
* @return bool
*/
public function isValidContingent(array $contingent): bool
public function isValidData(array $contingent): bool
{
$count = 0;
foreach ($contingent as $value) {
@ -35,31 +33,7 @@ final class ContingentService
}
return $count ? true : false;
}
/**
* Добавить идентификатор специальности в запись численности
* @param array $contingent Массив численности по специальностям
* @param array $specialties Массив специальностей
* @return void
*/
private function addSpecId(array &$contingent, array $specialties): void
{
$specIdMap = array_column($specialties, 'spec_id', 'spec_code');
foreach ($contingent as $key => $con) {
$contingent[$key]['spec_id'] = $specIdMap[$con['spec_code']] ?? null;
}
}
/**
* Добавить идентификатор организации в запись численности
* @param array $contingent Массив численности по специальностям
* @param int $orgId Идентифиактор организации
* @return void
*/
private function addOrgId(array &$contingent, int $orgId): void
{
foreach ($contingent as &$con) {
$con['org_id'] = $orgId;
}
}
/**
*
* @param string $html

View File

@ -9,9 +9,9 @@ use PDO;
final class Database
{
private PDO $pdo;
public const FILE_ADD_RECORDING ='not-recorded-in-db.yaml';
public const FILE_ADD_RECORDING = SVEDEN_PARSER . '/not-recorded-in-db.yaml';
private const ERR_NO_CONNECT = "HY000";
private static $logfile = 'log/database.log';
private static $logfile = SVEDEN_PARSER . '/log/database.log';
private DatabaseConfig $databaseConfig;
private DatabaseLogger $logger;
/**

View File

@ -26,7 +26,7 @@ final class DatabaseConfig
private function getDataEnv(string $db): array
{
$envVars = parse_ini_file('.env', true);
$envVars = parse_ini_file(SVEDEN_PARSER . '/.env', true);
$db = strtoupper($db);
$config = [];
foreach ($envVars as $dbname => $dbconfig) {

View File

@ -83,7 +83,7 @@ final class CurlHelper
*/
public function isError(): bool
{
$httpLogger = new HttpLogger('log/http-curl.log');
$httpLogger = new HttpLogger(SVEDEN_PARSER . '/log/http-curl.log');
$httpCode = curl_getinfo($this->curl, CURLINFO_HTTP_CODE);

View File

@ -86,7 +86,7 @@ final class HttpClient
{
return [
'force_ip_resolve' => 'v4',
'debug' => fopen("log/debug-http.log", "w"),
'debug' => fopen(SVEDEN_PARSER . "/log/debug-http.log", "w"),
'allow_directs' => [
'max' => 5,
'strict' => true,

View File

@ -7,6 +7,6 @@ final class DatabaseLogger extends Logger
{
$date = date('Y-m-d H:i:s');
$logMessage = "[$date] $message\n";
file_put_contents($this->_path, $logMessage, FILE_APPEND);
file_put_contents($this->path, $logMessage, FILE_APPEND);
}
}

View File

@ -7,6 +7,6 @@ final class HtmlLogger extends Logger
{
$date = date('Y-m-d H:i:s');
$logMessage = "[$date] $message\n";
file_put_contents($this->_path, $logMessage, FILE_APPEND);
file_put_contents($this->path, $logMessage, FILE_APPEND);
}
}

View File

@ -60,11 +60,11 @@ final class HttpLogger extends Logger
$date = date('Y-m-d H:i:s');
if (!$httpCode) {
$logMessage = "[$date] $message\n";
file_put_contents($this->_path, $logMessage, FILE_APPEND);
file_put_contents($this->path, $logMessage, FILE_APPEND);
} else {
$logMessage = "[$date] $message "
.self::ARR_HTTP_STATUS_CODE[$httpCode]."\n";
file_put_contents($this->_path, $logMessage, FILE_APPEND);
file_put_contents($this->path, $logMessage, FILE_APPEND);
}
}
}

View File

@ -3,10 +3,10 @@ namespace SvedenParser\Logger;
abstract class Logger
{
protected string $_path;
protected string $path;
public function __construct(string $path)
{
$this->_path = $path;
$this->path = $path;
}
}

71
src/Manager.php Normal file
View File

@ -0,0 +1,71 @@
<?php
namespace SvedenParser;
use SvedenParser\Http\HttpClient;
use SvedenParser\Logger\HtmlLogger;
/**
 * Base class of the per-table managers.
 *
 * It owns the HTTP client and the HTML logger, and offers the site-list
 * helpers shared by the concrete managers, which must assign $service and
 * $repository themselves.
 */
abstract class Manager
{
    protected Service $service;
    protected Repository $repository;
    protected HttpClient $httpClient;
    protected HtmlLogger $htmlLogger;

    /**
     * Creates the HTTP client and the HTML logger (log/html.log under SVEDEN_PARSER).
     */
    public function __construct()
    {
        $this->httpClient = new HttpClient();
        $this->htmlLogger = new HtmlLogger(SVEDEN_PARSER . '/log/html.log');
    }

    /**
     * Collects the data of one site.
     *
     * @param array $site Site record with 'org_id' and 'site' keys
     * @return void
     */
    abstract public function collectData(array $site): void;

    /**
     * Returns the sites to process.
     *
     * @param array $params Sites whose URLs need refreshing; when empty the
     *                      result of getSitesFromNiimko() is returned instead
     * @return array
     */
    public function getSites(array $params = []): array
    {
        if (!$params) {
            return $this->repository->getSitesFromNiimko();
        }

        return $this->repository->getSitesFromMiccedu($params);
    }

    /**
     * Tells whether the site must be skipped.
     *
     * @param array $site Site record with 'org_id' and 'site' keys
     * @return bool True only when the record has no URL
     */
    protected function isExit(array $site): bool
    {
        // No site URL for the university.
        if (!$site['site']) {
            return true;
        }

        // Already stored.
        // NOTE(review): the early exit below is commented out in the source,
        // so organisations that are already listed are processed again —
        // confirm whether this is a temporary switch.
        if (in_array($site['org_id'], $this->repository->universities())) {
            // return true;
        }

        return false;
    }

    /**
     * Reads "org_id site" pairs from a space-separated text file.
     *
     * Lines that do not contain at least two space-separated fields are
     * skipped. An unreadable file yields an empty list and a printed message.
     *
     * @param string $path Path of the file to read
     * @return array List of ['org_id' => string, 'site' => string]
     */
    public function getSitesFromLog(string $path): array
    {
        $result = [];

        // file() reports failure by returning false, not by throwing.
        $lines = is_readable($path) ? file($path) : false;
        if ($lines === false) {
            Printer::println("Cannot read file: $path", Color::RED);
            return $result;
        }

        foreach ($lines as $line) {
            $fields = explode(' ', $line);
            if (count($fields) < 2) {
                continue;
            }
            $result[] = [
                'org_id' => trim($fields[0]),
                'site' => trim($fields[1]),
            ];
        }

        return $result;
    }
}

43
src/Parser.php Normal file
View File

@ -0,0 +1,43 @@
<?php
namespace SvedenParser;
/**
 * Base HTML parser: loads a page into a DOMDocument and exposes it, together
 * with a DOMXPath, to the concrete table parsers.
 */
abstract class Parser
{
    /** XPath over the loaded document; null when the supplied HTML was empty. */
    protected ?\DOMXPath $xpath;
    protected \DOMDocument $dom;
    protected const ENCODING = "UTF-8";

    /**
     * @param string $html Page markup; an empty value leaves the document
     *                     empty and $xpath set to null
     */
    public function __construct(string $html)
    {
        // Real-world pages are rarely valid; keep libxml warnings out of the output.
        libxml_use_internal_errors(true);
        $this->dom = new \DOMDocument(
            encoding: self::ENCODING
        );

        if (empty($html)) {
            $this->xpath = null;
            return;
        }

        $this->setEncoding($html);
        $this->dom->loadHTML($html);
        $this->xpath = new \DOMXPath($this->dom);
    }

    /**
     * Normalises the markup in place so that loadHTML() reads it correctly:
     * windows-1251 input is converted to UTF-8, then every non-ASCII character
     * is replaced with a numeric character reference.
     *
     * @param string $html Markup, modified by reference
     * @return void
     */
    private function setEncoding(string &$html): void
    {
        $encoding = mb_detect_encoding($html, 'UTF-8, windows-1251');

        // A failed detection (false) must not be handed on as a source encoding.
        if ($encoding !== false && $encoding != self::ENCODING) {
            $html = mb_convert_encoding(
                $html,
                self::ENCODING,
                $encoding
            );
            $html = str_replace('windows-1251', self::ENCODING, $html);
        }

        // Replaces mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'),
        // which is deprecated since PHP 8.2.
        $html = mb_encode_numericentity(
            $html,
            [0x80, 0x10FFFF, 0, 0x1FFFFF],
            self::ENCODING
        );
    }

    /**
     * Returns the parsed table as a list of records.
     *
     * @return array
     */
    abstract public function getDataTable(): array;

    /**
     * Runs the queries that locate the table cells.
     *
     * @return array
     */
    abstract protected function parse(): array;

    /**
     * Returns the href of the link that leads to the table page.
     *
     * @return string
     */
    abstract public function getLink(): string;
}

View File

@ -0,0 +1,81 @@
<?php
namespace SvedenParser\PriemParser;
use SvedenParser\Color;
use SvedenParser\Http\UrlBuilder;
use SvedenParser\Manager;
use SvedenParser\Printer;
/**
 * Manager for the admission results ("priem") table of a university site.
 */
final class PriemManager extends Manager
{
    /**
     * Plugs the admission-specific repository and service into the base manager.
     */
    public function __construct()
    {
        parent::__construct();
        $this->repository = new PriemRepository();
        $this->service = new PriemService();
    }

    /**
     * Downloads the "Education" section of the site, follows the link to the
     * admission results page when one is found, and prints the parsed rows.
     *
     * @param array $site Site record holding the organisation id ('org_id') and URL ('site')
     * @return void
     */
    public function collectData(array $site): void
    {
        if ($this->isExit($site)) {
            return;
        }

        ['org_id' => $orgId, 'site' => $url] = $site;
        $url = UrlBuilder::build($url);
        Printer::println(implode(' ', $site), Color::GREEN);

        $html = $this->httpClient->getContentOfSite($url, $site, 'sveden/education/');
        if (!$html) {
            return;
        }

        // The section page may only link to the page that holds the table.
        $link = $this->service->getLink($html);
        Printer::println($link, Color::YELLOW);
        if ($link) {
            $absoluteUrl = '/^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$/';
            if (preg_match($absoluteUrl, $link)) {
                $html = $this->httpClient->getContentOfSite($link, $site);
            } elseif (UrlBuilder::checkUri($link)) {
                // Root-relative paths go as they are; anything else is
                // resolved against the section directory.
                $path = str_starts_with($link, '/') ? $link : "sveden/education/$link";
                $html = $this->httpClient->getContentOfSite($url, $site, $path);
            }
        }

        // Extract the admission table.
        $records = $this->service->getData(
            $html,
            $this->repository->specialties(),
            $orgId
        );

        $hasData = $records && $this->service->isValidData($records);
        if (!$hasData) {
            Printer::println("No result", Color::RED);
            $this->htmlLogger->log("$orgId $url");
        } else {
            // Persisting is switched off for now; the rows are only printed.
            Printer::print_r($records, Color::BLUE);
            // $this->repository->insert($records);
        }
        Printer::println();
    }
}

View File

@ -0,0 +1,97 @@
<?php
namespace SvedenParser\PriemParser;
use SvedenParser\Color;
use SvedenParser\Parser;
use SvedenParser\Printer;
/**
 * Parser of the admission results table: rows marked with
 * itemprop="eduPriem" whose cells carry the itemprop names listed in FIELDS.
 */
final class PriemParser extends Parser
{
    /** XPath prefix shared by all cell queries. */
    private const TEMPLATE = '//tr[@itemprop="eduPriem"]//';

    /** Cell itemprop name => tag that carries it. */
    private const FIELDS = [
        "eduCode" => "td",
        "eduName" => "td",
        "eduLevel" => "td",
        "eduForm" => "td",
        "numberBF" => "td",
        "numberBR" => "td",
        "numberBM" => "td",
        "numberP" => "td",
        "score" => "td"
    ];

    /** Anchor text that identifies the link to the admission results page. */
    private const LINK_TEXT = "Информация о результатах приёма";

    /**
     * Builds one record per table row.
     *
     * An empty list is returned when nothing was loaded, when any column is
     * missing, or when the columns do not all have the same number of cells.
     * A row rejected by PriemRow is reported and skipped.
     *
     * @return array List of the arrays produced by PriemRow::getData()
     */
    public function getDataTable(): array
    {
        if (!$this->xpath) {
            return [];
        }

        $data = $this->parse();
        if (!$data) {
            return [];
        }

        // Every column must be present and aligned with the others,
        // otherwise cells cannot be paired into rows reliably.
        $rowCount = $data['eduName']->length;
        foreach ($data as $cells) {
            if ($cells->length == 0 || $cells->length != $rowCount) {
                return [];
            }
        }

        $records = [];
        for ($i = 0; $i < $rowCount; $i++) {
            try {
                $row = new PriemRow(
                    $data['eduCode']->item($i)->textContent,
                    $data['eduName']->item($i)->textContent,
                    $data['eduLevel']->item($i)->textContent,
                    $data['eduForm']->item($i)->textContent,
                    $data['score']->item($i)->textContent,
                    [
                        $data['numberBF']->item($i)->textContent,
                        $data['numberBR']->item($i)->textContent,
                        $data['numberBM']->item($i)->textContent,
                        $data['numberP']->item($i)->textContent,
                    ],
                );
                $records[] = $row->getData();
            } catch (\Exception $e) {
                Printer::println($e->getMessage(), Color::RED);
            }
        }

        return $records;
    }

    /**
     * Queries the cells of every column listed in FIELDS.
     *
     * @return array Map of field name => query result for that column
     */
    protected function parse(): array
    {
        $data = [];
        foreach (self::FIELDS as $field => $tag) {
            $data[$field] = $this->xpath->query(
                self::TEMPLATE . $tag . "[@itemprop=\"$field\"]"
            );
        }

        return $data;
    }

    /**
     * Finds the link to the admission results page.
     *
     * The anchor text is compared with "ё" folded to "е", so both the
     * "приёма" and the "приема" spellings are recognised.
     *
     * @return string The href of the first matching anchor, or '' when none matches
     */
    public function getLink(): string
    {
        $needle = str_replace('ё', 'е', self::LINK_TEXT);

        foreach ($this->dom->getElementsByTagName('a') as $anchor) {
            $text = str_replace('ё', 'е', $anchor->textContent);
            if (str_contains($text, $needle)) {
                return $anchor->getAttribute('href');
            }
        }

        return '';
    }
}

View File

@ -0,0 +1,11 @@
<?php
namespace SvedenParser\PriemParser;
use SvedenParser\Repository;
/**
* Repository of the admission results ("priem") parser.
*/
final class PriemRepository extends Repository
{
/**
* Stores the parsed admission records.
* Not implemented yet: the body is empty, so calling it has no effect.
* @param array $data Records to store
* @return void
*/
public function insert(array $data): void
{
}
}

View File

@ -0,0 +1,54 @@
<?php
namespace SvedenParser\PriemParser;
/**
 * One row of the admission results table, normalised from raw cell text.
 */
class PriemRow
{
    /** Sum of all admission counters of the row. */
    private int $all;

    /** Sum of the counters except the one stored under index 3. */
    private int $budget;

    private float $avgScore;

    /**
     * @param string $eduCode    Raw text of the "eduCode" cell
     * @param string $eduName    Raw text of the "eduName" cell
     * @param string $eduLevel   Raw text of the "eduLevel" cell
     * @param string $eduForm    Raw text of the "eduForm" cell
     * @param string $avgScore   Raw text of the score cell; a decimal comma is accepted
     * @param array  $contingent Raw counters; index 3 is excluded from the budget sum
     * @throws \Exception When the score is negative
     */
    public function __construct(
        private string $eduCode,
        private string $eduName,
        private string $eduLevel,
        private string $eduForm,
        string $avgScore,
        array $contingent,
    ) {
        // Normalise first, validate second: comparing the raw cell text with 0
        // falls back to a string comparison for values such as "\n 72,3" and
        // would reject them as negative.
        $score = (float) str_replace(',', '.', trim($avgScore));
        if ($score < 0) {
            throw new \Exception('Недействительная средняя сумма набранных баллов обучающихся!');
        }

        $this->eduCode = trim($eduCode);
        $this->eduName = trim($eduName);
        $this->eduLevel = trim($eduLevel);
        $this->eduForm = trim($eduForm);
        $this->avgScore = $score;
        $this->calcContingent($contingent);
    }

    /**
     * Returns the row as an associative array.
     *
     * NOTE(review): the 'contongent' key looks like a misspelling of
     * 'contingent'; it is kept unchanged because it is part of the returned
     * shape — confirm with the consumers before renaming.
     *
     * @return array
     */
    public function getData(): array
    {
        return [
            'spec_code' => $this->eduCode,
            'spec_name' => $this->eduName,
            'edu_level' => $this->eduLevel,
            'edu_forms' => $this->eduForm,
            'avgScore' => $this->avgScore,
            'contongent' => $this->all,
            'budget' => $this->budget,
        ];
    }

    /**
     * Sums the counters into the total and the budget figures.
     *
     * Every value is cast to int before it is added, so an empty or
     * non-numeric cell counts as 0 instead of raising a TypeError.
     *
     * @param array $contingent Raw counters; index 3 is left out of the budget sum
     * @return void
     */
    private function calcContingent(array $contingent): void
    {
        $all = 0;
        $budget = 0;
        foreach ($contingent as $key => $value) {
            $count = (int) $value;
            $all += $count;
            if ($key !== 3) {
                $budget += $count;
            }
        }

        $this->all = $all;
        $this->budget = $budget;
    }
}

View File

@ -0,0 +1,42 @@
<?php
namespace SvedenParser\PriemParser;
use SvedenParser\Service;
/**
 * Service layer of the admission results ("priem") parser.
 */
final class PriemService extends Service
{
    /**
     * Parses the admission table and tags every record with the speciality
     * id and the organisation id.
     *
     * @param string $html        Markup of the university page
     * @param array  $specialties List of specialities used to resolve spec_id
     * @param int    $orgId       Organisation identifier
     * @return array Parsed records
     */
    public function getData(string $html, array $specialties, int $orgId): array
    {
        $rows = (new PriemParser($html))->getDataTable();
        $this->addSpecId($rows, $specialties);
        $this->addOrgId($rows, $orgId);

        return $rows;
    }

    /**
     * Validates the admission records.
     *
     * The per-row summing of the 'contingent' values is disabled in the
     * source, so every input is currently reported as valid.
     *
     * @param array $contingent Parsed records
     * @return bool Always true
     */
    public function isValidData(array $contingent): bool
    {
        return true;
    }

    /**
     * Extracts the link to the admission results page from the markup.
     *
     * @param string $html Markup of the university page
     * @return string
     */
    public function getLink(string $html): string
    {
        return (new PriemParser($html))->getLink();
    }
}

184
src/Repository.php Normal file
View File

@ -0,0 +1,184 @@
<?php
namespace SvedenParser;
use NilPortugues\Sql\QueryBuilder\Builder\GenericBuilder;
use SvedenParser\Database\Database;
use SvedenParser\Database\DatabaseConfig;
/**
* Base repository shared by the concrete parsers.
* Opens the "niimko" and "opendata" database connections and, on construction,
* loads the list of specialities and the list of already stored organisation ids.
*/
abstract class Repository
{
protected Database $opendata;
protected Database $niimko;
public const FILE_ADD_RECORDING = Database::FILE_ADD_RECORDING;
protected array $specialties;
protected array $universities;
/**
* Opens both connections and eagerly runs the two lookup queries
* (getSpecialties() and getUniversities()).
*/
public function __construct()
{
$this->niimko = new Database(new DatabaseConfig('niimko'));
$this->opendata = new Database(new DatabaseConfig('opendata'));
$this->specialties = $this->getSpecialties();
$this->universities = $this->getUniversities();
}
/**
* Fetches the site URLs from the niimko database
* @return array
*/
public function getSitesFromNiimko(): array
{
/*
SELECT kod AS org_id, site FROM niimko.s_vuzes
WHERE ootype = 'vuz' AND deleted = 'n' AND fake = 'n'
*/
// The query built below additionally filters on country = 'RU'.
$builder = new GenericBuilder();
// Values bound to the statement, in the same order as the equals() calls.
$params = ['vuz', 'n', 'n', 'RU'];
$query = $builder->select()
->setTable('s_vuzes')
->setColumns(['org_id' => 'kod', 'site'])
->where('AND')
->equals('ootype', 'vuz')
->equals('deleted', 'n')
->equals('fake', 'n')
->equals('country', 'RU')
->end();
$sql = $builder->write($query);
$sites = $this->niimko->select($sql, $params);
return $sites;
}
/**
* Fetches sites from the opendata database,
* table miccedu_monitoring.
* @param array $params
* Sites whose URLs are outdated; each item must carry an 'org_id' key
* @return array
*/
public function getSitesFromMiccedu(array $params): array
{
/*
SELECT site, vuzkod AS org_id FROM opendata.miccedu_monitoring
WHERE year = 2023 AND (vuzkod = :val1 OR vuzkod = :val2 OR ...)
*/
$builder = new GenericBuilder();
// The monitoring year is hard-coded.
$year = 2023;
// Reduce every site record to its integer organisation id.
foreach ($params as $key => $org) {
$params[$key] = (int)$org['org_id'];
}
$query = $builder->select()
->setTable('miccedu_monitoring')
->setColumns(['org_id' => 'vuzkod','site'])
->where('AND')
->equals('year', $year)
->subWhere('OR');
foreach ($params as $orgId) {
$query->equals('vuzkod', $orgId);
}
$query = $query->end();
$sql = $builder->write($query);
// The year is the first condition, so it goes first in the bound values.
array_unshift($params, $year);
$sites = $this->opendata->select($sql, $params);
return $sites;
}
/**
* Public accessor for the specialities
* @return array
*/
public function specialties(): array
{
return $this->specialties ? $this->specialties : [];
}
/**
* Public accessor for the ids of the universities already stored in opendata
* @return array
*/
public function universities(): array
{
return $this->universities ? $this->universities : [];
}
/**
* Fetches the speciality codes from the niimko database
* @return array
*/
private function getSpecialties(): array
{
/*
SELECT id AS spec_id, kod AS spec_code FROM niimko.s_specs
WHERE oopkodes = 'gos3p'
*/
$builder = new GenericBuilder();
$params = ['gos3p'];
$query = $builder->select()
->setTable('s_specs')
->setColumns(['spec_id' =>'id', 'spec_code' => 'kod'])
->where()
->equals('oopkodes','gos3p')
->end();
$sql = $builder->write($query);
$specialties = $this->niimko->select($sql, $params);
return $specialties;
}
/**
* Fetches the ids of the universities already stored in opendata
* @return array
*/
private function getUniversities(): array
{
/*
SELECT DISTINCT org_id FROM sveden_education_contingent
*/
$builder = new GenericBuilder();
$query = $builder->select()
->setTable('sveden_education_contingent')
->setColumns(['org_id'])
->where()
->greaterThan('org_id', 0)
->end();
$sql = $builder->write($query);
// The generated WHERE clause is stripped again and DISTINCT is injected
// by editing the SQL text, so the statement runs without bound values.
$sql = preg_replace("/ WHERE.*/", '', $sql);
$sql = preg_replace('/SELECT/', 'SELECT DISTINCT', $sql);
$universities = $this->opendata->select($sql);
return array_column($universities, 'org_id');
}
/**
* Updates the sites in niimko.s_vuzes
* NOTE(review): the statement targets niimko.s_vuzes but is executed through
* the opendata connection - confirm this is intended.
* @param array $params
* Array [['org_id' => val1, 'site' => val1,],...]
* NOTE(review): the loops below index $params as a flat list and re-append
* the items at even positions for the IN (...) list, which looks like it
* expects [kod1, site1, kod2, site2, ...] - verify against the caller.
* @return void
*/
public function updateSitesOpendata(array $params): void
{
/*
UPDATE niimko.s_vuzes
SET site = CASE kod
WHEN :v1 THEN :v2
WHEN :v3 THEN :v4
...
ELSE kod
END
WHERE kod IN (:v1, :v2...)
*/
$count = count($params);
// Append a copy of every even-indexed item; these feed the IN (...) list.
for ($i = 0; $i < $count; $i++) {
if ($i % 2 == 0) {
$params[] = $params[$i];
}
}
$sql = "UPDATE niimko.s_vuzes\nSET site = CASE kod\n";
// Two placeholders per WHEN/THEN pair; $i advances inside the expression.
for ($i = 0; $i < $count;) {
$sql .= "WHEN :v".++$i." THEN :v".++$i."\n";
}
$sql .= "ELSE kod\nEND\nWHERE kod in(";
// Placeholders for the appended items continue the numbering after $count.
for ($i = $count++; $i < count($params);) {
$sql .= ":v".++$i.",\n";
}
$sql = rtrim($sql,",\n") .")\n";
$this->opendata->update($sql, $params);
}
/**
* Stores the parsed records; implemented by the concrete repositories.
* @param array $data
* @return void
*/
abstract public function insert(array $data): void;
}

35
src/Service.php Normal file
View File

@ -0,0 +1,35 @@
<?php
namespace SvedenParser;
/**
 * Base service: shared helpers that tag parsed records with identifiers,
 * plus the contract every concrete service implements.
 */
abstract class Service
{
    /**
     * Adds the speciality identifier to every record.
     *
     * The id is looked up by the record's 'spec_code'; records whose code is
     * not present in $specialties get null.
     *
     * @param array $contingent  Records, modified in place
     * @param array $specialties Rows with 'spec_id' and 'spec_code' keys
     * @return void
     */
    protected function addSpecId(array &$contingent, array $specialties): void
    {
        $idByCode = array_column($specialties, 'spec_id', 'spec_code');

        foreach ($contingent as &$record) {
            $record['spec_id'] = $idByCode[$record['spec_code']] ?? null;
        }
        unset($record);
    }

    /**
     * Adds the organisation identifier to every record.
     *
     * @param array $contingent Records, modified in place
     * @param int   $orgId      Organisation identifier
     * @return void
     */
    protected function addOrgId(array &$contingent, int $orgId): void
    {
        foreach (array_keys($contingent) as $index) {
            $contingent[$index]['org_id'] = $orgId;
        }
    }

    /**
     * Extracts the link to the page holding the table.
     *
     * @param string $html Page markup
     * @return string
     */
    abstract public function getLink(string $html): string;

    /**
     * Parses the table and returns its records.
     *
     * @param string $html        Page markup
     * @param array  $specialties List of specialities
     * @param int    $orgId       Organisation identifier
     * @return array
     */
    abstract public function getData(string $html, array $specialties, int $orgId): array;

    /**
     * Tells whether the parsed records are usable.
     *
     * @param array $data Parsed records
     * @return bool
     */
    abstract public function isValidData(array $data): bool;
}