diff --git a/README.md b/README.md index 656931f..018af09 100644 --- a/README.md +++ b/README.md @@ -110,3 +110,19 @@ following config in config.yml. SilverStripe\ExternalLinks\Tasks\CurlLinkChecker:: bypass_cache: 1 ``` + +## Headers + +You may want to set headers to be sent with the CURL request (e.g. a user agent) to prevent websites from rejecting the request because they think it comes from a bot. +You can set them with the following config in config.yml. + +```yaml +# Headers +SilverStripe\ExternalLinks\Tasks\CurlLinkChecker: + user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0" + headers: + - "accept-encoding: gzip, deflate, br" + - "referer: https://www.domain.com/" + - "sec-fetch-mode: navigate" + ... +``` \ No newline at end of file diff --git a/src/Tasks/CurlLinkChecker.php b/src/Tasks/CurlLinkChecker.php index df6fc76..cc8f2d1 100644 --- a/src/Tasks/CurlLinkChecker.php +++ b/src/Tasks/CurlLinkChecker.php @@ -31,6 +31,24 @@ class CurlLinkChecker implements LinkChecker */ private static $bypass_cache = false; + /** + * Default user agent to send with the CURL request + * Override via YAML config + * + * @config + * @var string + */ + private static $user_agent = ''; + + /** + * Custom headers to send with the CURL request + * + * @config + * @var array + */ + + private static $headers = []; + /** * Return cache * @@ -66,11 +84,28 @@ class CurlLinkChecker implements LinkChecker // No cached result so just request $handle = curl_init($href); curl_setopt($handle, CURLOPT_RETURNTRANSFER, true); + curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 5); + curl_setopt($handle, CURLOPT_TIMEOUT, 10); if ($this->config()->get('follow_location')) { curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true); } - curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 5); - curl_setopt($handle, CURLOPT_TIMEOUT, 10); + + // Add user agent + $userAgent = trim($this->config()->get('user_agent')); + if ($userAgent) { + curl_setopt($handle, CURLOPT_USERAGENT , $userAgent); + } + + // Other 
headers + if ($headers = $this->config()->get('headers')) { + if (is_array($headers)) { + curl_setopt($handle, CURLOPT_HTTPHEADER , $headers); + } else { + curl_setopt($handle, CURLOPT_HTTPHEADER , array($headers)); + } + } + + // Retrieve http code curl_exec($handle); $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE); curl_close($handle); @@ -79,6 +114,7 @@ class CurlLinkChecker implements LinkChecker // Cache result $this->getCache()->set($cacheKey, $httpCode); } + return $httpCode; } }