Работает обработка ссылок на таблицы с численностью

This commit is contained in:
2024-09-06 14:11:38 +03:00
parent 04374fef40
commit 2be45826c1
1698 changed files with 138656 additions and 174 deletions

View File

@ -1,6 +1,7 @@
<?php
namespace ContingentParser\Http;
use ContingentParser\Color;
use ContingentParser\Logger\HttpLogger;
use ContingentParser\Printer;
use CurlHandle;
@ -12,16 +13,17 @@ final class CurlHelper
private CurlHandle|bool $curl;
private string $url;
private array $site;
private int $countRedirect;
private const MAX_REDIRECT = 5;
/**
* Конструктор
* Инициализация сессии
* @param string $url
* URL сайта
* @param array $site
* Идентификатор организации и базовый URL сайта
* @param string $url URL сайта
* @param array $site Идентификатор организации и базовый URL сайта
*/
public function __construct(string $url, array $site)
{
$this->countRedirect = 0;
$this->url = $url;
$this->site = $site;
@ -49,21 +51,25 @@ final class CurlHelper
* Получить html-разметку
* @return string
*/
public function getContent() : string
public function getContent(): string
{
curl_setopt($this->curl, CURLOPT_URL, $this->url);
$html = curl_exec($this->curl);
if ($this->checkLocation($this->url, $html)) {
$html = $this->getContent();
if ($this->countRedirect < self::MAX_REDIRECT) {
curl_setopt($this->curl, CURLOPT_URL, $this->url);
$html = curl_exec($this->curl);
if ($this->checkLocation($this->url, $html)) {
$this->countRedirect++;
$html = $this->getContent();
}
return $html;
}
return $html;
return '';
}
/**
* Summary of checkLocation
* @param string $html
* @return bool
*/
private function checkLocation(string &$url, string $html) : bool
private function checkLocation(string &$url, string $html): bool
{
preg_match('/location:(.*?)\n/i', $html, $matches);
if (empty($matches)) return false;
@ -77,14 +83,14 @@ final class CurlHelper
* Сообщить об ошибке
* @return void
*/
public function reportError() : void
public function reportError(): void
{
$httpLogger = new HttpLogger('log/http-curl.log');
$httpCode = curl_getinfo($this->curl, CURLINFO_HTTP_CODE);
if ($httpCode != 200 && $httpCode != 0) {
Printer::println("HTTP-code: $httpCode", 'red');
Printer::println("HTTP-code: $httpCode", Color::RED);
$message = implode(' ', $this->site) . ' HTTP-code(' . $httpCode.')';
$httpLogger->log($message, $httpCode);
} else if ($httpCode == 0) {
@ -93,7 +99,7 @@ final class CurlHelper
$message .= " cURL error ({$errno}): ".curl_strerror($errno);
$httpLogger->log($message);
} else {
Printer::println("HTTP-code: $httpCode", 'blue');
Printer::println("HTTP-code: $httpCode", Color::BLUE);
}
}
}

View File

@ -1,6 +1,7 @@
<?php
namespace ContingentParser\Http;
use ContingentParser\Color;
use ContingentParser\Printer;
use GuzzleHttp\Client;
use GuzzleHttp\TransferStats;
@ -17,10 +18,11 @@ final class HttpClientFacade
* @param array $site Идентификатор организации и базовый URL
* @return string
*/
public function processEducationContingentSites(
public function getContentOfSite(
string $url,
array $site
) : string {
array $site,
string $uri = "sveden/education/"
): string {
try {
$client = $this->createClient($url);
// Запрос по базовому uri
@ -30,18 +32,18 @@ final class HttpClientFacade
}
]);
Printer::println("Redirect $url -> $redirectUrl");
$url .= substr($url, -1) == '/' ? '':'/';
$url .= "sveden/education/study";
$url .= substr($url, -1) == '/' ? '' : '/';
$url .= substr($uri, 0, 1) == '/' ? substr($uri, 1) : $uri;
Printer::println("Parsing for $url");
$response = $client->get($url);
$httpCode = $response->getStatusCode();
Printer::println("HTTP-code: $httpCode", 'blue');
Printer::println("HTTP-code: $httpCode", Color::BLUE);
$html = $response->getBody()->getContents();
} catch (\Exception $e
) {
Printer::println("HTTP-code: ".$e->getCode(), 'red');
Printer::println("HTTP-code: ".$e->getCode(), Color::RED);
$html = $this->handleException($url, $site);
} finally {
return $html;
@ -75,7 +77,7 @@ final class HttpClientFacade
* Конфигурация клиента
* @return array
*/
private function config() : array
private function config(): array
{
return [
'force_ip_resolve' => 'v4',
@ -87,7 +89,7 @@ final class HttpClientFacade
'protocols' => ['http', 'https'],
'track_redirects' => true
],
'connect_timeout' => 300.0,
'connect_timeout' => 90.0,
'verify' => false,
'headers' => [
'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) '

View File

@ -1,16 +1,15 @@
<?php
namespace ContingentParser\Http;
class UrlBuilder
final class UrlBuilder
{
public function __construct() {}
/**
* Строит валидный URL сайта
* @param string $url
* Изначальный URL
* @param string $url Изначальный URL
* @return string
*/
public function build(string $url) : string
public function build(string $url): string
{
// Строит -> https://<base_uri>
$url = trim(strtolower($url));
@ -18,13 +17,26 @@ class UrlBuilder
$url = str_replace("www/", "www.", $url);
$url = str_replace("http:\\\\", "", $url);
if (!preg_match('#^https?://#', $url)) {
$url = "http://$url";
$url = "https://$url";
}
// $url = str_replace("http://", "https://", $url);
$url = str_replace("http://", "https://", $url);
$arr = parse_url($url);
$url = $arr['scheme'] . '://' . $arr['host'] . '/';
// $url = str_replace("www.", "", $url);
$url = str_replace("www.", "", $url);
$url = str_replace("_", "/", $url);
return trim($url);
}
/**
 * Report whether a URI passes the link filter.
 *
 * Returns false when the URI ends with a document extension
 * (.pdf, .docx, .doc) or starts with "javascript"; true otherwise.
 * Matching is case-insensitive and ignores surrounding whitespace, so
 * "FILE.PDF" and " JavaScript:void(0)" are rejected as well.
 *
 * NOTE(review): presumably used to skip links that do not lead to an
 * HTML page — confirm against the callers.
 *
 * @param string $uri URI (or href value) to check
 * @return bool true if the URI is acceptable, false if it must be skipped
 */
public function checkUri(string $uri): bool
{
    // Normalise once so that every comparison below is case-insensitive.
    $normalized = strtolower(trim($uri));

    // Pseudo-links such as "javascript:void(0)" are never fetchable.
    if (str_starts_with($normalized, 'javascript')) {
        return false;
    }

    // Document downloads are rejected by their extension.
    foreach (['.pdf', '.docx', '.doc'] as $extension) {
        if (str_ends_with($normalized, $extension)) {
            return false;
        }
    }

    return true;
}
}