2024-08-08 12:32:27 +02:00
|
|
|
<?php
|
2024-09-03 14:41:45 +02:00
|
|
|
namespace ContingentParser\Parser;
|
2024-08-08 12:32:27 +02:00
|
|
|
|
2024-09-03 14:41:45 +02:00
|
|
|
use DOMDocument;
|
|
|
|
use DOMXPath;
|
2024-08-08 12:32:27 +02:00
|
|
|
|
2024-08-08 15:38:54 +02:00
|
|
|
/**
 * Extracts the rows of the "eduChislen" table from an HTML page.
 *
 * The markup is normalised to UTF-8 and loaded into a DOM once, in the
 * constructor. Cells are then located through XPath by their `itemprop`
 * attribute and assembled into records by getDataTable().
 */
class ContingentParser
{
    /** XPath prefix selecting descendants of every <tr itemprop="eduChislen">. */
    private const TEMPLATE = '//tr[@itemprop="eduChislen"]//';

    /** Encoding the markup is converted to before it is handed to libxml. */
    private const ENCODING = "UTF-8";

    /**
     * Maps an `itemprop` value to the tag (or list of candidate tags) that
     * carries it. When several tags are listed, parse() keeps the candidate
     * with the most matches.
     */
    private const FIELDS = [
        "eduCode" => "td",
        "eduName" => "td",
        "eduLevel" => "td",
        "eduForm" => "td",
        "numberAll" => ["th", "td"],
    ];

    /** XPath handle over the loaded document; null when the input was empty. */
    private ?DOMXPath $xpath;

    /**
     * Loads the given markup so that it can be queried later.
     *
     * @param string $html Raw page markup; an empty value (as judged by
     *                     empty()) yields a parser whose getDataTable()
     *                     returns an empty array.
     */
    public function __construct(string $html)
    {
        // Collect libxml parse errors internally instead of emitting warnings.
        libxml_use_internal_errors(true);

        if (empty($html)) {
            $this->xpath = null;
            return;
        }

        $this->setEncoding($html);

        $dom = new DOMDocument(encoding: self::ENCODING);
        $dom->loadHTML($html);
        // The internal error buffer is never read here; drop its contents so
        // it does not keep growing when many documents are parsed in one process.
        libxml_clear_errors();

        $this->xpath = new DOMXPath($dom);
    }

    /**
     * Converts the markup in place to UTF-8 and then escapes every non-ASCII
     * character as a numeric HTML entity.
     *
     * @param string $html Markup to normalise; modified by reference.
     */
    private function setEncoding(string &$html): void
    {
        $encoding = mb_detect_encoding($html, 'UTF-8, windows-1251');

        // Detection may return false; in that case the bytes are left alone
        // rather than passing false on as the source encoding.
        if ($encoding !== false && $encoding !== self::ENCODING) {
            $html = mb_convert_encoding($html, self::ENCODING, $encoding);
            // Keep any charset declaration inside the markup consistent with
            // the bytes that were just transcoded.
            $html = str_replace('windows-1251', self::ENCODING, $html);
        }

        // Numeric entities replace the 'HTML-ENTITIES' pseudo-encoding of
        // mb_convert_encoding(), which is deprecated as of PHP 8.2. The map
        // covers every code point from U+0080 upwards.
        $html = mb_encode_numericentity(
            $html,
            [0x80, 0x10FFFF, 0, 0x1FFFFF],
            self::ENCODING
        );
    }

    /**
     * Runs one XPath query per entry of FIELDS.
     *
     * @return array Node lists keyed by field name. For a field with several
     *               candidate tags the list with the most nodes is kept; on
     *               a tie the later candidate wins.
     */
    private function parse(): array
    {
        $data = [];

        foreach (self::FIELDS as $field => $tags) {
            $best = null;

            foreach ((array) $tags as $tag) {
                $nodes = $this->xpath->query(
                    self::TEMPLATE . $tag . "[@itemprop=\"$field\"]"
                );

                // ">=" lets a later candidate replace an earlier one of equal
                // length, so "td" is preferred over "th" unless "th" has more.
                if ($best === null || $nodes->length >= $best->length) {
                    $best = $nodes;
                }
            }

            $data[$field] = $best;
        }

        return $data;
    }

    /**
     * Builds one record per table row.
     *
     * @return array The value of ContingentRow::getData() for every row, in
     *               document order. Empty when no markup was loaded, when any
     *               field has no matches, or when the fields do not all have
     *               the same number of matches.
     */
    public function getDataTable(): array
    {
        if ($this->xpath === null) {
            return [];
        }

        $data = $this->parse();

        // Every column must be present and have the same number of cells;
        // otherwise cells cannot be paired up into rows by index.
        $rowCount = $data['eduName']->length;
        if ($rowCount === 0) {
            return [];
        }
        foreach ($data as $nodes) {
            if ($nodes->length !== $rowCount) {
                return [];
            }
        }

        $records = [];
        for ($i = 0; $i < $rowCount; $i++) {
            $row = new ContingentRow(
                $data['eduCode']->item($i)->textContent,
                $data['eduName']->item($i)->textContent,
                $data['eduLevel']->item($i)->textContent,
                $data['eduForm']->item($i)->textContent,
                (int) $data['numberAll']->item($i)->textContent
            );
            $records[] = $row->getData();
        }

        return $records;
    }
}
|