2013-08-25 23:49:49 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/**
 * Build task that walks the live SiteTree in batches, requests every external
 * link found in each page's Content via cURL, and records the links whose HTTP
 * status falls outside the accepted range as BrokenExternalLink records.
 *
 * Progress is persisted in a BrokenExternalPageTrackStatus record (one per
 * run) and one BrokenExternalPageTrack row per page, so an interrupted run
 * whose status is still 'Running' is resumed rather than restarted.
 *
 * Configuration read from the 'CheckExternalLinks' config key:
 *  - IgnoreCodes: array of HTTP codes that are treated as 200
 *  - Delay: integer number of seconds; when set and QueuedJobService exists,
 *    a CheckExternalLinksJob is queued that far in the future
 */
class CheckExternalLinks extends BuildTask {

	public static $pageToProcess;

	protected $title = 'Checking broken External links in the SiteTree';

	protected $description = 'A task that records external broken links in the SiteTree';

	protected $enabled = true;

	// number of pages whose content was fully scanned by this instance
	private $completedPages;

	// number of live pages this instance attempted to scan
	private $totalPages;

	/**
	 * Process all outstanding pages batch by batch, mark the run as completed,
	 * clear tracking rows from older runs and optionally queue the next run.
	 *
	 * @param SS_HTTPRequest $request Unused by this task
	 */
	public function run($request) {
		// Batches are processed iteratively; the follow-up job below is
		// therefore queued exactly once per invocation.
		while (true) {
			$handled = $this->processNextBatch();

			// do we need to carry on running the job
			$track = self::getLatestTrack();
			if (!$track) break;

			if ($track->CompletedPages >= $track->TotalPages) {
				$track->Status = 'Completed';
				$track->write();

				// clear any old previous data
				$rows = BrokenExternalPageTrack::get()
					->exclude('TrackID', $track->ID);
				foreach ($rows as $row) {
					$row->delete();
				}
				break;
			}

			if ($handled === 0) {
				// Nothing left to process but the counters disagree; stop
				// instead of spinning forever on an empty batch.
				$this->updateJobInfo('No unprocessed pages left to check');
				break;
			}

			$this->updateJobInfo("Running next batch {$track->CompletedPages}/{$track->TotalPages}");
		}

		// run this again if queued jobs exists and is a valid int
		$queuedJob = Config::inst()->get('CheckExternalLinks', 'Delay');
		if (is_int($queuedJob) && class_exists('QueuedJobService')) {
			$checkLinks = new CheckExternalLinksJob();
			singleton('QueuedJobService')
				->queueJob($checkLinks, date('Y-m-d H:i:s', time() + $queuedJob));
		}
	}

	/**
	 * @return BrokenExternalPageTrackStatus|null The status record with the
	 *         highest ID, or null when none exists
	 */
	public static function getLatestTrack() {
		$track = BrokenExternalPageTrackStatus::get()->sort('ID', 'DESC')->first();
		if (!$track || !$track->exists()) return null;
		return $track;
	}

	/**
	 * @return int|null ID of the latest status record, or null when none exists
	 */
	public static function getLatestTrackID() {
		$track = self::getLatestTrack();
		if (!$track || !$track->exists()) return null;
		return $track->ID;
	}

	/**
	 * @return string|null Status of the latest status record, or null when
	 *         none exists
	 */
	public static function getLatestTrackStatus() {
		$track = self::getLatestTrack();
		if (!$track || !$track->exists()) return null;
		return $track->Status;
	}

	/**
	 * Resume the running track (or start a new one) and check up to ten of its
	 * unprocessed pages.
	 *
	 * @return int Number of tracking rows handled in this batch
	 */
	private function processNextBatch() {
		$track = self::getLatestTrack();

		if ($track && $track->Status == 'Running') {
			// if the script has already been started
			$this->updateJobInfo('Fetching pages to check');
			if ($track->CompletedPages == $track->TotalPages) {
				$track->Status = 'Completed';
				$track->write();
				$this->updateJobInfo('Setting to completed');
			}
		} else {
			// if the script is to be started
			$track = $this->startTrack();
		}

		$trackPages = BrokenExternalPageTrack::get()
			->filter(array(
				'TrackID' => $track->ID,
				'Processed' => 0
			))->limit(10);

		$handled = 0;
		foreach ($trackPages as $trackPage) {
			++$handled;

			// Mark the row first, and do so even when the page has since been
			// removed from Live, so every batch makes progress.
			$trackPage->Processed = 1;
			$trackPage->write();

			$page = Versioned::get_by_stage('SiteTree', 'Live')
				->filter('ID', $trackPage->PageID)
				->first();
			if (!$page) continue;

			++$this->totalPages;
			if ($this->checkPage($page)) {
				++$this->completedPages;
			}
		}

		// run this outside the foreach loop to stop it locking DB rows
		$this->updateJobInfo('Updating completed pages');
		$this->updateCompletedPages($track->ID);

		return $handled;
	}

	/**
	 * Create a new status record plus one tracking row per live page.
	 *
	 * @return BrokenExternalPageTrackStatus The newly written status record
	 */
	private function startTrack() {
		$pageIDs = Versioned::get_by_stage('SiteTree', 'Live')->column('ID');

		$track = BrokenExternalPageTrackStatus::create();
		$track->TotalPages = count($pageIDs);
		$track->write();
		$this->updateJobInfo('Creating new tracking object');

		foreach ($pageIDs as $pageID) {
			$trackPage = BrokenExternalPageTrack::create();
			$trackPage->PageID = $pageID;
			$trackPage->TrackID = $track->ID;
			$trackPage->write();
		}

		return $track;
	}

	/**
	 * Check every anchor element in the page's Content.
	 *
	 * @param SiteTree $page Live page record
	 * @return bool False when the content could not be parsed, true otherwise
	 */
	private function checkPage($page) {
		$htmlValue = Injector::inst()->create('HTMLValue', $page->Content);
		if (!$htmlValue->isValid()) {
			return false;
		}

		$links = $htmlValue->getElementsByTagName('a');
		if ($links) {
			foreach ($links as $link) {
				$this->checkLink($page, $htmlValue, $link);
			}
		}

		return true;
	}

	/**
	 * Request a single link and, when it is considered broken, record it, add
	 * the 'ss-broken' CSS class to the anchor and flag the page.
	 *
	 * @param SiteTree $page Page that owns the link
	 * @param object $htmlValue Parsed content the link belongs to
	 * @param DOMElement $link Anchor element to check
	 */
	private function checkLink($page, $htmlValue, $link) {
		$class = $link->getAttribute('class');
		$alreadyFlagged = (stripos($class, 'ss-broken') !== false);
		if ($alreadyFlagged && $page->HasBrokenLink == 1) return;

		$href = Director::makeRelative($link->getAttribute('href'));
		if ($href == 'admin/') return;

		// ignore SiteTree, anchor and assets links as they will be caught
		// by SiteTreeLinkTracking
		if (preg_match('/\[(file_link|sitetree_link),id=([0-9]+)\]/i', $href, $matches)) {
			return;
		}
		if (isset($href[0]) && $href[0] == '#') {
			return;
		}
		if (substr($href, 0, strlen(ASSETS_DIR) + 1) == ASSETS_DIR.'/') {
			return;
		}

		if (!$href || !function_exists('curl_init')) return;

		$httpCode = $this->requestHttpCode($href);

		// NOTE(review): the original comment here read "ignore empty hrefs and
		// internal links", yet the condition records hrefs starting with '/'
		// as broken. Behaviour is kept as-is; confirm the intent.
		if (($httpCode < 200 || $httpCode > 302) || ($href == '' || $href[0] == '/')) {
			$brokenLink = new BrokenExternalLink();
			$brokenLink->PageID = $page->ID;
			$brokenLink->Link = $href;
			$brokenLink->HTTPCode = $httpCode;
			$brokenLink->write();

			// set the broken link class, keeping any classes already present
			// and never adding the marker twice
			if (!$alreadyFlagged) {
				$class = trim($class . ' ss-broken');
			}
			$link->setAttribute('class', $class);
			$htmlValue->__call('saveHTML', array());

			$page->Content = $htmlValue->getContent();
			// $page is the record itself (not an extension), so write it
			// directly rather than through an 'owner' property.
			$page->write();

			if (!$page->HasBrokenLink) {
				// bypass the ORM as syncLinkTracking does not allow you
				// to update HasBrokenLink to true
				$query = "UPDATE \"SiteTree_Live\" SET \"HasBrokenLink\" = 1 ";
				$query .= "WHERE \"ID\" = " . (int)$page->ID;
				$result = DB::query($query);
				if (!$result) {
					// NOTE(review): debugMessage() is not defined in this
					// class; presumably inherited - verify.
					$this->debugMessage('Error updating HasBrokenLink');
				}
			}
		}
	}

	/**
	 * Fetch the URL with cURL (5s connect / 10s total timeout) and return the
	 * HTTP status code, mapping configured IgnoreCodes to 200.
	 *
	 * @param string $href URL to request
	 * @return int HTTP status code; 0 when no handle could be created
	 */
	private function requestHttpCode($href) {
		$handle = curl_init($href);
		if ($handle === false) return 0;

		curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 5);
		curl_setopt($handle, CURLOPT_TIMEOUT, 10);
		curl_exec($handle);
		$httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
		curl_close($handle);

		// do we have any whitelisted codes
		$ignoreCodes = Config::inst()->get('CheckExternalLinks', 'IgnoreCodes');
		// if the code is whitelisted set it to 200
		if (is_array($ignoreCodes) && in_array($httpCode, $ignoreCodes)) {
			return 200;
		}

		return $httpCode;
	}

	/**
	 * Store the number of processed tracking rows on the status record.
	 *
	 * @param int $trackID ID of the status record to update; when no record
	 *        with that ID exists the latest status record is used instead
	 * @return int Number of tracking rows of $trackID marked as processed
	 */
	private function updateCompletedPages($trackID = 0) {
		$noPages = BrokenExternalPageTrack::get()
			->filter(array('TrackID' => $trackID, 'Processed' => 1))->count();

		$track = BrokenExternalPageTrackStatus::get()->filter('ID', $trackID)->first();
		if (!$track) {
			$track = self::getLatestTrack();
		}
		if ($track) {
			$track->CompletedPages = $noPages;
			$track->write();
		}

		return $noPages;
	}

	/**
	 * Write a progress message to the JobInfo field of the latest status
	 * record; does nothing when no status record exists.
	 *
	 * @param string $message
	 */
	private function updateJobInfo($message) {
		$track = self::getLatestTrack();
		if (!$track || !$track->exists()) return null;
		$track->JobInfo = $message;
		$track->write();
	}
}
|