BUGFIX: Updated HTTP::findByTagAndAttribute() to be more versatile, especially when dealing with attributes containing special characters.

From: Andrew Short <andrewjshort@gmail.com>

git-svn-id: svn://svn.silverstripe.com/silverstripe/open/modules/sapphire/trunk@88469 467b73ca-7a2a-4603-9d3b-597d59a354a9
This commit is contained in:
Andrew Short 2009-10-11 00:06:55 +00:00 committed by Sam Minnee
parent bd5b134c37
commit 75b875ae39
2 changed files with 60 additions and 33 deletions

View File

@ -100,24 +100,30 @@ class HTTP {
return Convert::xml2raw($url); return Convert::xml2raw($url);
} }
static function findByTagAndAttribute($content, $attribs) { /**
$regExps = array(); * Search for all tags with a specific attribute, then return the value of that attribute in a flat array.
*
* @param string $content
* @param array $attributes an array of tags to attributes, for example "[a] => 'href', [div] => 'id'"
* @return array
*/
public static function findByTagAndAttribute($content, $attributes) {
$regexes = array();
foreach($attribs as $tag => $attrib) { foreach($attributes as $tag => $attribute) {
$tagPrefix = (is_numeric($tag)) ? '' : "$tag "; $regexes[] = "/<{$tag} [^>]*$attribute *= *([\"'])(.*?)\\1[^>]*>/i";
$regexes[] = "/<{$tag} [^>]*$attribute *= *([^ \"'>]+)/i";
$regExps[] = "/(<{$tagPrefix}[^>]*$attrib *= *\")([^\"]*)(\")/ie";
$regExps[] = "/(<{$tagPrefix}[^>]*$attrib *= *')([^']*)(')/ie";
$regExps[] = "/(<{$tagPrefix}[^>]*$attrib *= *)([^\"' ]*)( )/ie";
} }
if($regExps) { $result = array();
foreach($regExps as $regExp) {
$content = preg_replace($regExp, '$items[] = "$2"', $content);
}
}
return isset($items) ? $items : null; if($regexes) foreach($regexes as $regex) {
if(preg_match_all($regex, $content, $matches)) {
$result = array_merge_recursive($result, (isset($matches[2]) ? $matches[2] : $matches[1]));
}
}
return count($result) ? $result : null;
} }
static function getLinksIn($content) { static function getLinksIn($content) {

View File

@ -10,17 +10,38 @@ class HTTPTest extends SapphireTest {
/** /**
* Tests {@link HTTP::getLinksIn()} * Tests {@link HTTP::getLinksIn()}
*/ */
public function testGetLinksIn() { public function testGetLinksIn() {
$content = ' $content = '
<h2>My page</h2> <h2><a href="/">My Cool Site</a></h2>
<p>A boy went <a href="home/">home</a> to see his <span><a href="mother/">mother</a></span>.</p>
';
$links = HTTP::getLinksIn($content); <p>
A boy went <a href="home/">home</a> to see his <span><a href="mother/">mother</a></span>. This
involved a short <a href="$Journey">journey</a>, as well as some <a href="space travel">space travel</a>
and <a href=unquoted>unquoted</a> events, as well as a <a href=\'single quote\'>single quote</a> from
his <a href="/father">father</a>.
</p>
$this->assertTrue(is_array($links)); <p>
$this->assertTrue(count($links) == 2); There were also some elements with extra <a class=attribute href=\'attributes\'>attributes</a> which
} played a part in his <a href=journey"extra id="JourneyLink">journey</a>. HE ALSO DISCOVERED THE
<A HREF="CAPS LOCK">KEY</a>. Later he got his <a href="quotes \'mixed\' up">mixed up</a>.
</p>
';
$expected = array (
'/', 'home/', 'mother/', '$Journey', 'space travel', 'unquoted', 'single quote', '/father', 'attributes',
'journey', 'CAPS LOCK', 'quotes \'mixed\' up'
);
$result = HTTP::getLinksIn($content);
// Results don't necessarily come out in the order they are in the $content param.
sort($result);
sort($expected);
$this->assertTrue(is_array($result));
$this->assertEquals($expected, $result, 'Test that all links within the content are found.');
}
/** /**
* Tests {@link HTTP::setGetVar()} * Tests {@link HTTP::setGetVar()}