sveden-parser/ContingentParser/Parser/ContingentParser.php

118 lines
3.6 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
/**
* Парсер информации об образовательной организации
* с её сайта с использованием микроразметки
*/
namespace ContingentParser\Parser;
use DOMDocument;
use DOMXPath;
/**
 * Parses an HTML page and extracts, via `itemprop` microdata attributes,
 * the rows of the "eduChislen" table and the link to the page titled
 * "Информация о численности обучающихся".
 *
 * The constructor switches libxml to internal error handling (and leaves it
 * switched on), so malformed markup does not emit PHP warnings.
 */
final class ContingentParser
{
    /** XPath evaluator bound to the parsed document; null when no HTML was given. */
    private ?DOMXPath $xpath;

    /** Parsed document; stays empty when the constructor received no HTML. */
    private DOMDocument $dom;

    /** XPath prefix selecting any descendant of a row marked itemprop="eduChislen". */
    private const TEMPLATE = '//tr[@itemprop="eduChislen"]//';

    /** Encoding every input is normalised to before parsing. */
    private const ENCODING = "UTF-8";

    /**
     * Maps each itemprop name to the tag that carries it. A list of tags means
     * the cell may be marked up with either one; the tag yielding more matches
     * is used (the later tag wins a tie).
     */
    private const FIELDS = [
        "eduCode" => "td",
        "eduName" => "td",
        "eduLevel" => "td",
        "eduForm" => "td",
        "numberAll" => ["th", "td"],
    ];

    /** Conversion map for mb_encode_numericentity(): every code point above ASCII. */
    private const NON_ASCII_MAP = [0x80, 0x10FFFF, 0, 0x1FFFFF];

    /**
     * @param string $html Raw page markup in UTF-8 or windows-1251. A blank
     *                     string yields a parser that returns no data.
     */
    public function __construct(string $html)
    {
        libxml_use_internal_errors(true);
        $this->dom = new DOMDocument(
            encoding: self::ENCODING
        );
        $this->xpath = null;

        if (trim($html) === '') {
            return;
        }

        $this->dom->loadHTML($this->normalizeEncoding($html));
        // Internal error handling buffers every parse error; drop them so the
        // buffer does not grow with each page parsed in the same process.
        libxml_clear_errors();
        $this->xpath = new DOMXPath($this->dom);
    }

    /**
     * Returns the markup as ASCII-only UTF-8: windows-1251 input is converted
     * to UTF-8 and every non-ASCII character becomes a numeric entity, so the
     * HTML parser decodes the text the same way regardless of any charset
     * declaration inside the document.
     */
    private function normalizeEncoding(string $html): string
    {
        $detected = mb_detect_encoding($html, 'UTF-8, windows-1251');

        // A failed detection (false) is treated as "already UTF-8" rather than
        // being passed on as a source encoding.
        if ($detected !== false && $detected !== self::ENCODING) {
            $html = mb_convert_encoding(
                $html,
                self::ENCODING,
                $detected
            );
            // Keep charset declarations in the markup consistent with the new bytes.
            $html = str_replace('windows-1251', self::ENCODING, $html);
        }

        // Replacement for the 'HTML-ENTITIES' pseudo-encoding deprecated in PHP 8.2.
        return mb_encode_numericentity($html, self::NON_ASCII_MAP, self::ENCODING);
    }

    /**
     * Builds one record per table row.
     *
     * @return array List of ContingentRow::getData() results, in document
     *               order. Empty when there is no document, when any field has
     *               no matches, or when the fields do not all have the same
     *               number of matches (rows could not be aligned by index).
     */
    public function getDataTable(): array
    {
        if ($this->xpath === null) {
            return [];
        }

        $columns = $this->parseContingent();
        $rowCount = $columns['eduName']->length;
        if ($rowCount === 0) {
            return [];
        }
        foreach ($columns as $cells) {
            if ($cells->length !== $rowCount) {
                return [];
            }
        }

        $records = [];
        for ($i = 0; $i < $rowCount; $i++) {
            $contingentRow = new ContingentRow(
                $columns['eduCode']->item($i)->textContent,
                $columns['eduName']->item($i)->textContent,
                $columns['eduLevel']->item($i)->textContent,
                $columns['eduForm']->item($i)->textContent,
                (int)$columns['numberAll']->item($i)->textContent
            );
            $records[] = $contingentRow->getData();
        }
        return $records;
    }

    /**
     * Runs one XPath query per candidate tag of every field in FIELDS.
     *
     * @return array Node lists keyed by itemprop name. Must only be called
     *               when a document has been loaded ($this->xpath is set).
     */
    private function parseContingent(): array
    {
        $data = [];
        foreach (self::FIELDS as $field => $tags) {
            $best = null;
            foreach ((array) $tags as $tag) {
                $cells = $this->xpath->query(
                    self::TEMPLATE . $tag . "[@itemprop=\"{$field}\"]"
                );
                // ">=" lets the later tag win a tie.
                if ($best === null || $cells->length >= $best->length) {
                    $best = $cells;
                }
            }
            $data[$field] = $best;
        }
        return $data;
    }

    /**
     * Finds the first <a> element whose text contains
     * "Информация о численности обучающихся".
     *
     * @return string The anchor's href attribute exactly as written in the
     *                markup, or '' when no such anchor exists.
     */
    public function getLink(): string
    {
        $needle = "Информация о численности обучающихся";
        foreach ($this->dom->getElementsByTagName('a') as $anchor) {
            if (str_contains($anchor->textContent, $needle)) {
                return $anchor->getAttribute('href');
            }
        }
        return '';
    }
}