From cbc228f9a292199ce2e4c2fb0b14dbd4d28d53b2 Mon Sep 17 00:00:00 2001
From: Sam Minnee
Date: Fri, 3 Oct 2008 00:27:58 +0000
Subject: [PATCH] API CHANGE: Created CSVParser class and updated CSVBulkLoader to use it

git-svn-id: svn://svn.silverstripe.com/silverstripe/open/modules/sapphire/trunk@63563 467b73ca-7a2a-4603-9d3b-597d59a354a9
---
 dev/CSVParser.php                             | 234 ++++++++++++++++++
 dev/CsvBulkLoader.php                         |  89 ++-----
 tests/dev/CSVParserTest.php                   |  80 ++++++
 tests/{ => dev}/CsvBulkLoaderTest.php         |  18 +-
 tests/{ => dev}/CsvBulkLoaderTest.yml         |   0
 tests/{ => dev}/CsvBulkLoaderTest_Players.csv |   0
 ...est_PlayersWithCustomHeaderAndRelation.csv |   0
 .../CsvBulkLoaderTest_PlayersWithHeader.csv   |   3 +-
 .../CsvBulkLoaderTest_PlayersWithId.csv       |   0
 ...CsvBulkLoaderTest_PlayersWithIdUpdated.csv |   0
 10 files changed, 343 insertions(+), 81 deletions(-)
 create mode 100644 dev/CSVParser.php
 create mode 100644 tests/dev/CSVParserTest.php
 rename tests/{ => dev}/CsvBulkLoaderTest.php (87%)
 rename tests/{ => dev}/CsvBulkLoaderTest.yml (100%)
 rename tests/{ => dev}/CsvBulkLoaderTest_Players.csv (100%)
 rename tests/{ => dev}/CsvBulkLoaderTest_PlayersWithCustomHeaderAndRelation.csv (100%)
 rename tests/{ => dev}/CsvBulkLoaderTest_PlayersWithHeader.csv (58%)
 rename tests/{ => dev}/CsvBulkLoaderTest_PlayersWithId.csv (100%)
 rename tests/{ => dev}/CsvBulkLoaderTest_PlayersWithIdUpdated.csv (100%)

diff --git a/dev/CSVParser.php b/dev/CSVParser.php
new file mode 100644
index 000000000..701ae46be
--- /dev/null
+++ b/dev/CSVParser.php
@@ -0,0 +1,234 @@
+<?php
+/**
+ * Iterates over the rows of a CSV file, returning each row as a map of column name => column value.
+ * Column names are taken from the first row of the file, unless {@link provideHeaderRow()} has been called.
+ * Column names can be renamed on the way through with {@link mapColumns()}.
+ *
+ * Usage:
+ *
+ * <code>
+ * $parser = new CSVParser('myfile.csv');
+ * $parser->mapColumns(array(
+ *    'first name' => 'FirstName',
+ *    'lastname' => 'Surname',
+ *    'last name' => 'Surname',
+ * ));
+ * foreach($parser as $row) {
+ *    // $row is a map of column name => column value
+ *    $obj = new MyDataObject();
+ *    $obj->update($row);
+ *    $obj->write();
+ * }
+ * </code>
+ */
+class CSVParser extends Object implements Iterator {
+	protected $filename;
+	protected $fileHandle;
+
+	/**
+	 * Map of source columns to output columns
+	 * Once they get into this variable, all of the source columns are in lowercase
+	 */
+	protected $columnMap = array();
+
+	/**
+	 * The header row used to map data in the CSV file
+	 * To begin with, this is null.  Once it has been set, data will get returned from the CSV file
+	 */
+	protected $headerRow = null;
+
+	/**
+	 * A custom header row provided by the caller
+	 */
+	protected $providedHeaderRow = null;
+
+	/**
+	 * The data of the current row
+	 */
+	protected $currentRow = null;
+
+	/**
+	 * The current row number
+	 * 1 is the first data row in the CSV file; the header row, if it exists, is ignored
+	 */
+	protected $rowNum = 0;
+
+	/**
+	 * The character for separating columns
+	 */
+	protected $delimiter = ",";
+
+	/**
+	 * The character for quoting columns
+	 */
+	protected $enclosure = '"';
+
+	/**
+	 * Open a CSV file for parsing.
+	 * You can use the object returned in a foreach loop to extract the data.
+	 * The file itself is not opened until the first row is requested.
+	 * @param $filename The name of the file.  If it doesn't start with "/", it is taken relative to the site's base dir
+	 * @param $delimiter The character for separating columns
+	 * @param $enclosure The character for quoting or enclosing columns
+	 */
+	function __construct($filename, $delimiter = ",", $enclosure = '"') {
+		if($filename['0'] != '/') $filename = Director::baseFolder() . '/' . $filename;
+		$this->filename = $filename;
+		// Honour the caller's settings; these were previously hard-coded (and assigned to a misspelled variable)
+		$this->delimiter = $delimiter;
+		$this->enclosure = $enclosure;
+	}
+
+	/**
+	 * Re-map columns in the CSV file.
+	 * This can be useful for identifying synonyms in the file.
+	 * The keys are matched case-insensitively against the header row; repeated calls add to the existing map.
+	 * For example:
+	 * <code>
+	 * $csv->mapColumns(array(
+	 *    'firstname' => 'FirstName',
+	 *    'last name' => 'Surname',
+	 * ));
+	 * </code>
+	 * @param $columnMap A map of source column name => output column name
+	 */
+	function mapColumns($columnMap) {
+		if($columnMap) {
+			$lowerColumnMap = array();
+			foreach($columnMap as $k => $v) {
+				$lowerColumnMap[strtolower($k)] = $v;
+			}
+			$this->columnMap = array_merge($this->columnMap, $lowerColumnMap);
+		}
+	}
+
+	/**
+	 * If your CSV file doesn't have a header row, then you can call this function to provide one.
+	 * If you call this function, then the first row of the CSV will be included in the data returned.
+	 * @param $headerRow An array of column names, in the order that the columns appear in the file
+	 */
+	function provideHeaderRow($headerRow) {
+		$this->providedHeaderRow = $headerRow;
+	}
+
+	/**
+	 * Open the CSV file for reading
+	 */
+	protected function openFile() {
+		ini_set('auto_detect_line_endings',1);
+		$this->fileHandle = fopen($this->filename,'r');
+
+		// A caller-supplied header takes the place of the file's first row, so that row is treated as data
+		if($this->providedHeaderRow) {
+			$this->headerRow = $this->remapHeader($this->providedHeaderRow);
+		}
+	}
+
+	/**
+	 * Close the CSV file and re-set all of the internal variables
+	 */
+	protected function closeFile() {
+		if($this->fileHandle) fclose($this->fileHandle);
+		$this->fileHandle = null;
+
+		$this->rowNum = 0;
+		$this->currentRow = null;
+		$this->headerRow = null;
+	}
+
+	/**
+	 * Get a header row from the CSV file
+	 */
+	protected function fetchCSVHeader() {
+		$srcRow = fgetcsv($this->fileHandle, 0, $this->delimiter, $this->enclosure);
+		// An empty file has no header row; fgetcsv() then returns false rather than an array
+		$this->headerRow = $srcRow ? $this->remapHeader($srcRow) : array();
+	}
+
+	/**
+	 * Map the contents of a header array using $this->columnMap
+	 * Names without an entry in the map are passed through unchanged
+	 * @param $header An array of column names
+	 * @return array The column names, in the same order, with any mapped names replaced
+	 */
+	protected function remapHeader($header) {
+		$mappedHeader = array();
+		foreach($header as $item) {
+			if(isset($this->columnMap[strtolower($item)])) $item = $this->columnMap[strtolower($item)];
+			$mappedHeader[] = $item;
+		}
+		return $mappedHeader;
+	}
+
+	/**
+	 * Get a row from the CSV file and update $this->currentRow;
+	 * When there are no more rows, the file is closed and $this->currentRow is set to null.
+	 * @return array|null The new current row
+	 */
+	protected function fetchCSVRow() {
+		if(!$this->fileHandle) $this->openFile();
+		// fopen() has already raised a warning if the file couldn't be opened; don't hand a dead handle to fgetcsv()
+		if(!$this->fileHandle) {
+			$this->currentRow = null;
+			return null;
+		}
+		if(!$this->headerRow) $this->fetchCSVHeader();
+
+		$this->rowNum++;
+
+		$srcRow = fgetcsv($this->fileHandle, 0, $this->delimiter, $this->enclosure);
+		if($srcRow) {
+			$row = array();
+			foreach($srcRow as $i => $value) {
+				// Allow escaping of quotes and commas in the data
+				$value = str_replace(
+					array('\\'.$this->enclosure,'\\'.$this->delimiter),
+					array($this->enclosure,$this->delimiter),$value);
+				if(array_key_exists($i, $this->headerRow)) {
+					// Columns with an empty heading are dropped from the row
+					if($this->headerRow[$i]) $row[$this->headerRow[$i]] = $value;
+				} else {
+					user_error("No heading for column $i on row $this->rowNum", E_USER_WARNING);
+				}
+			}
+
+			$this->currentRow = $row;
+		} else {
+			$this->closeFile();
+		}
+		return $this->currentRow;
+	}
+
+	/**
+	 * @ignore
+	 */
+	function __destruct() {
+		$this->closeFile();
+	}
+
+	//// ITERATOR FUNCTIONS
+
+	/**
+	 * @ignore
+	 */
+	function rewind() {
+		$this->closeFile();
+		$this->fetchCSVRow();
+	}
+
+	/**
+	 * @ignore
+	 */
+	function current() {
+		return $this->currentRow;
+	}
+
+	/**
+	 * @ignore
+	 */
+	function key() {
+		return $this->rowNum;
+	}
+
+	/**
+	 * @ignore
+	 */
+	function next() {
+		$this->fetchCSVRow();
+		return $this->currentRow;
+	}
+
+	/**
+	 * @ignore
+	 */
+	function valid() {
+		return $this->currentRow ? true : false;
+	}
+
+
+}
+
+?>
\ No newline at end of file
diff --git a/dev/CsvBulkLoader.php b/dev/CsvBulkLoader.php
index 60d29a22a..338af1f0e 100644
--- a/dev/CsvBulkLoader.php
+++ b/dev/CsvBulkLoader.php
@@ -27,7 +27,8 @@ class CsvBulkLoader extends BulkLoader {
 	public $enclosure = '"';
 	
 	/**
-	 * Identifies if the loaded file has a header row.
+	 * Identifies if the loaded file
+	 * has a header row.
 	 * If a {@link self::$columnMap} is passed, we assume
 	 * the file has no headerrow, unless explicitly noted.
* @@ -36,67 +37,26 @@ class CsvBulkLoader extends BulkLoader { public $hasHeaderRow = false; protected function processAll($filepath, $preview = false) { - ini_set('auto_detect_line_endings',1); - - $file = fopen($filepath, 'r'); - if(!$file) return false; - $results = new BulkLoader_Result(); - - if($this->hasHeaderRow && $this->columnMap) { - $columnRow = fgetcsv($file, 0, $this->delimiter, $this->enclosure); - $columnMap = array(); - foreach($columnRow as $k => $origColumnName) { - $origColumnName = trim($origColumnName); - if(isset($this->columnMap[$origColumnName])) { - $columnMap[$origColumnName] = $this->columnMap[$origColumnName]; - } else { - $columnMap[$origColumnName] = null; - } - - } - } elseif($this->columnMap) { - $columnMap = $this->columnMap; - } else { - // assuming that first row is column naming if no columnmap is passed - $columnRow = fgetcsv($file, 0, $this->delimiter, $this->enclosure); - $columnMap = array_combine($columnRow, $columnRow); - } - - $rowIndex = 0; - $rowIndex = 0; - while (($row = fgetcsv($file, 0, $this->delimiter, $this->enclosure)) !== FALSE) { - $rowIndex++; - - /* - // the columnMap should have the same amount of columns as each record row - if(count(array_keys($columnMap)) == count(array_values($row))) { - user_error("CsvBulkLoader::processAll(): Columns in row {$rowIndex} don't match the \$columnMap", E_USER_WARNING); - } - */ - - $indexedRow = array(); - foreach($columnMap as $origColumnName => $fieldName) { - // in case the row has less fields than the columnmap, - // ignore the "leftover" mappings - if(!isset($row[count($indexedRow)])) { - user_error("CsvBulkLoader::processAll(): Columns in row {$rowIndex} don't match the \$columnMap", E_USER_NOTICE); - continue; - } - - $indexedRow[$origColumnName] = $row[count($indexedRow)]; - } - - $this->processRecord($indexedRow, $columnMap, $results); + + $csv = new CSVParser($filepath, $this->delimiter, $this->enclosure); + + // ColumnMap has two uses, depending on whether 
hasHeaderRow is set + if($this->columnMap) { + if($this->hasHeaderRow) $csv->mapColumns($this->columnMap); + else $csv->provideHeaderRow($this->columnMap); } - fclose($file); + foreach($csv as $row) { + $this->processRecord($row, array(), $results, $preview); + } return $results; } /** * @todo Better messages for relation checks and duplicate detection + * Note that columnMap isn't used */ protected function processRecord($record, $columnMap, &$results, $preview = false) { $class = $this->objectClass; @@ -108,9 +68,7 @@ class CsvBulkLoader extends BulkLoader { // first run: find/create any relations and store them on the object // we can't combine runs, as other columns might rely on the relation being present $relations = array(); - foreach($record as $origColumnName => $val) { - $fieldName = $columnMap[$origColumnName]; - + foreach($record as $fieldName => $val) { // don't bother querying of value is not set if($this->isNullValue($val)) continue; @@ -128,6 +86,7 @@ class CsvBulkLoader extends BulkLoader { $obj->setComponent($relationName, $relationObj); $obj->{"{$relationName}ID"} = $relationObj->ID; $obj->write(); + } elseif(strpos($fieldName, '.') !== false) { // we have a relation column with dot notation list($relationName,$columnName) = split('\.', $fieldName); @@ -143,24 +102,13 @@ class CsvBulkLoader extends BulkLoader { $id = ($preview) ? 
0 : $obj->write(); // second run: save data - foreach($record as $origColumnName => $val) { - $fieldName = $columnMap[$origColumnName]; - + foreach($record as $fieldName => $val) { if($this->isNullValue($val, $fieldName)) continue; if($obj->hasMethod("import{$fieldName}")) { $obj->{"import{$fieldName}"}($val, $record); - } elseif(strpos($fieldName, '.') !== false) { - // we have a relation column - list($relationName,$columnName) = split('\.', $fieldName); - $relationObj = $obj->getComponent($relationName); - $relationObj->{$columnName} = $val; - $relationObj->write(); - $obj->flushCache(); // avoid relation caching confusion - //} elseif($obj->hasField($fieldName) || $obj->hasMethod($fieldName)) { } else { - // plain old value setter - $obj->{$fieldName} = $val; + $obj->update(array($fieldName => $val)); } } @@ -187,10 +135,9 @@ class CsvBulkLoader extends BulkLoader { * columns specified via {@link self::$duplicateChecks} * * @param array $record CSV data column - * @param array $columnMap * @return unknown */ - public function findExistingObject($record, $columnMap) { + public function findExistingObject($record) { // checking for existing records (only if not already found) foreach($this->duplicateChecks as $fieldName => $duplicateCheck) { if(is_string($duplicateCheck)) { diff --git a/tests/dev/CSVParserTest.php b/tests/dev/CSVParserTest.php new file mode 100644 index 000000000..7886a7ec9 --- /dev/null +++ b/tests/dev/CSVParserTest.php @@ -0,0 +1,80 @@ +assertEquals(array('FirstName','Biography','Birthday'), array_keys($record)); + $firstNames[] = $record['FirstName']; + $biographies[] = $record['Biography']; + $birthdays[] = $record['Birthday']; + } + + $this->assertEquals(array('John','Jane','Jamie','Järg'), $firstNames); + $this->assertEquals(array( + "He's a good guy", + "She is awesome.\nSo awesome that she gets multiple rows and \"escaped\" strings in her biography", + "Pretty old, with an escaped comma", + "Unicode FTW"), $biographies); + 
$this->assertEquals(array("31/01/1988","31/01/1982","31/01/1882","31/06/1982"), $birthdays);
+	}
+
+	function testParsingWithHeadersAndColumnMap() {
+		/* By default, a CSV file will be interpreted as having headers */
+		/* The fixture is named "Csv...", not "CSV..."; the exact case matters on case-sensitive filesystems */
+		$csv = new CSVParser('sapphire/tests/dev/CsvBulkLoaderTest_PlayersWithHeader.csv');
+
+		/* We can set up column remapping.  The keys are case-insensitive. */
+		$csv->mapColumns(array(
+			'FirstName' => '__fn',
+			'bIoGrApHy' => '__BG',
+		));
+
+		$firstNames = $birthdays = $biographies = array();
+		foreach($csv as $record) {
+			/* Each row in the CSV file will be keyed with the renamed columns.  Any unmapped column names will be left as-is. */
+			$this->assertEquals(array('__fn','__BG','Birthday'), array_keys($record));
+			$firstNames[] = $record['__fn'];
+			$biographies[] = $record['__BG'];
+			$birthdays[] = $record['Birthday'];
+		}
+
+		$this->assertEquals(array('John','Jane','Jamie','Järg'), $firstNames);
+		$this->assertEquals(array(
+			"He's a good guy",
+			"She is awesome.\nSo awesome that she gets multiple rows and \"escaped\" strings in her biography",
+			"Pretty old, with an escaped comma",
+			"Unicode FTW"), $biographies);
+		$this->assertEquals(array("31/01/1988","31/01/1982","31/01/1882","31/06/1982"), $birthdays);
+	}
+
+	function testParsingWithExplicitHeaderRow() {
+		/* If your CSV file doesn't have a header row */
+		/* The fixture is named "Csv...", not "CSV..."; the exact case matters on case-sensitive filesystems */
+		$csv = new CSVParser('sapphire/tests/dev/CsvBulkLoaderTest_PlayersWithHeader.csv');
+
+		$csv->provideHeaderRow(array('__fn','__bio','__bd'));
+
+		$firstNames = $birthdays = $biographies = array();
+		foreach($csv as $record) {
+			/* Each row in the CSV file will be keyed with the header row that you gave */
+			$this->assertEquals(array('__fn','__bio','__bd'), array_keys($record));
+			$firstNames[] = $record['__fn'];
+			$biographies[] = $record['__bio'];
+			$birthdays[] = $record['__bd'];
+		}
+
+		/* And the first row will be returned in the data */
+		$this->assertEquals(array('FirstName','John','Jane','Jamie','Järg'), $firstNames);
+		$this->assertEquals(array(
+			'Biography',
+			"He's a good guy",
+			"She is awesome.\nSo awesome that she gets multiple rows and \"escaped\" strings in her biography",
+			"Pretty old, with an escaped comma",
+			"Unicode FTW"), $biographies);
+		$this->assertEquals(array("Birthday","31/01/1988","31/01/1982","31/01/1882","31/06/1982"), $birthdays);
+	}
+
+}
\ No newline at end of file
diff --git a/tests/CsvBulkLoaderTest.php b/tests/dev/CsvBulkLoaderTest.php
similarity index 87%
rename from tests/CsvBulkLoaderTest.php
rename to tests/dev/CsvBulkLoaderTest.php
index b122670fe..1457d84b6 100644
--- a/tests/CsvBulkLoaderTest.php
+++ b/tests/dev/CsvBulkLoaderTest.php
@@ -5,14 +5,14 @@
  * @todo Test with columnn headers and custom mappings
  */
 class CsvBulkLoaderTest extends SapphireTest {
-	static $fixture_file = 'sapphire/tests/CsvBulkLoaderTest.yml';
+	static $fixture_file = 'sapphire/tests/dev/CsvBulkLoaderTest.yml';
 
 	/**
 	 * Test plain import with column auto-detection
 	 */
 	function testLoad() {
 		$loader = new CsvBulkLoader('CsvBulkLoaderTest_Player');
-		$filepath = Director::baseFolder() . '/sapphire/tests/CsvBulkLoaderTest_PlayersWithHeader.csv';
+		$filepath = Director::baseFolder() . 
'/sapphire/tests/dev/CsvBulkLoaderTest_PlayersWithHeader.csv'; $file = fopen($filepath, 'r'); $compareCount = $this->getLineCount($file); fgetcsv($file); // pop header row @@ -20,7 +20,7 @@ class CsvBulkLoaderTest extends SapphireTest { $results = $loader->load($filepath); // Test that right amount of columns was imported - $this->assertEquals($results->Count(), $compareCount-1, 'Test correct count of imported data'); + $this->assertEquals(4, $results->Count(), 'Test correct count of imported data'); // Test that columns were correctly imported $obj = Dataobject::get_one("CsvBulkLoaderTest_Player", "FirstName = 'John'"); @@ -36,7 +36,7 @@ class CsvBulkLoaderTest extends SapphireTest { */ function testLoadWithColumnMap() { $loader = new CsvBulkLoader('CsvBulkLoaderTest_Player'); - $filepath = Director::baseFolder() . '/sapphire/tests/CsvBulkLoaderTest_Players.csv'; + $filepath = Director::baseFolder() . '/sapphire/tests/dev/CsvBulkLoaderTest_Players.csv'; $file = fopen($filepath, 'r'); $compareCount = $this->getLineCount($file); $compareRow = fgetcsv($file); @@ -49,7 +49,7 @@ class CsvBulkLoaderTest extends SapphireTest { $results = $loader->load($filepath); // Test that right amount of columns was imported - $this->assertEquals($results->Count(), $compareCount, 'Test correct count of imported data'); + $this->assertEquals(4, $results->Count(), 'Test correct count of imported data'); // Test that columns were correctly imported $obj = Dataobject::get_one("CsvBulkLoaderTest_Player", "FirstName = 'John'"); @@ -65,7 +65,7 @@ class CsvBulkLoaderTest extends SapphireTest { */ function testLoadWithCustomHeaderAndRelation() { $loader = new CsvBulkLoader('CsvBulkLoaderTest_Player'); - $filepath = Director::baseFolder() . '/sapphire/tests/CsvBulkLoaderTest_PlayersWithCustomHeaderAndRelation.csv'; + $filepath = Director::baseFolder() . 
'/sapphire/tests/dev/CsvBulkLoaderTest_PlayersWithCustomHeaderAndRelation.csv'; $file = fopen($filepath, 'r'); $compareCount = $this->getLineCount($file); fgetcsv($file); // pop header row @@ -89,7 +89,7 @@ class CsvBulkLoaderTest extends SapphireTest { $results = $loader->load($filepath); // Test that right amount of columns was imported - $this->assertEquals($results->Count(), $compareCount-1, 'Test correct count of imported data'); + $this->assertEquals(1, $results->Count(), 'Test correct count of imported data'); // Test of augumenting existing relation (created by fixture) $testTeam = DataObject::get_one('CsvBulkLoaderTest_Team', null, null, 'Created DESC'); @@ -115,7 +115,7 @@ class CsvBulkLoaderTest extends SapphireTest { function testLoadWithIdentifiers() { // first load $loader = new CsvBulkLoader('CsvBulkLoaderTest_Player'); - $filepath = Director::baseFolder() . '/sapphire/tests/CsvBulkLoaderTest_PlayersWithId.csv'; + $filepath = Director::baseFolder() . '/sapphire/tests/dev/CsvBulkLoaderTest_PlayersWithId.csv'; $loader->duplicateChecks = array( 'ExternalIdentifier' => 'ExternalIdentifier' ); @@ -126,7 +126,7 @@ class CsvBulkLoaderTest extends SapphireTest { $this->assertEquals($player->Biography, 'He\'s a good guy', 'test updating of duplicate imports within the same import works'); // load with updated data - $filepath = Director::baseFolder() . '/sapphire/tests/CsvBulkLoaderTest_PlayersWithIdUpdated.csv'; + $filepath = Director::baseFolder() . 
'/sapphire/tests/dev/CsvBulkLoaderTest_PlayersWithIdUpdated.csv'; $results = $loader->load($filepath); $player = DataObject::get_by_id('CsvBulkLoaderTest_Player', 1); diff --git a/tests/CsvBulkLoaderTest.yml b/tests/dev/CsvBulkLoaderTest.yml similarity index 100% rename from tests/CsvBulkLoaderTest.yml rename to tests/dev/CsvBulkLoaderTest.yml diff --git a/tests/CsvBulkLoaderTest_Players.csv b/tests/dev/CsvBulkLoaderTest_Players.csv similarity index 100% rename from tests/CsvBulkLoaderTest_Players.csv rename to tests/dev/CsvBulkLoaderTest_Players.csv diff --git a/tests/CsvBulkLoaderTest_PlayersWithCustomHeaderAndRelation.csv b/tests/dev/CsvBulkLoaderTest_PlayersWithCustomHeaderAndRelation.csv similarity index 100% rename from tests/CsvBulkLoaderTest_PlayersWithCustomHeaderAndRelation.csv rename to tests/dev/CsvBulkLoaderTest_PlayersWithCustomHeaderAndRelation.csv diff --git a/tests/CsvBulkLoaderTest_PlayersWithHeader.csv b/tests/dev/CsvBulkLoaderTest_PlayersWithHeader.csv similarity index 58% rename from tests/CsvBulkLoaderTest_PlayersWithHeader.csv rename to tests/dev/CsvBulkLoaderTest_PlayersWithHeader.csv index e77222b2c..9f46724a2 100644 --- a/tests/CsvBulkLoaderTest_PlayersWithHeader.csv +++ b/tests/dev/CsvBulkLoaderTest_PlayersWithHeader.csv @@ -1,5 +1,6 @@ "FirstName","Biography","Birthday" "John","He's a good guy","31/01/1988" -"Jane","She is awesome.\nSo awesome that she gets multiple rows and \"escaped\" strings in her biography","31/01/1982" +"Jane","She is awesome. 
+So awesome that she gets multiple rows and \"escaped\" strings in her biography","31/01/1982" "Jamie","Pretty old\, with an escaped comma","31/01/1882" "Järg","Unicode FTW","31/06/1982" \ No newline at end of file diff --git a/tests/CsvBulkLoaderTest_PlayersWithId.csv b/tests/dev/CsvBulkLoaderTest_PlayersWithId.csv similarity index 100% rename from tests/CsvBulkLoaderTest_PlayersWithId.csv rename to tests/dev/CsvBulkLoaderTest_PlayersWithId.csv diff --git a/tests/CsvBulkLoaderTest_PlayersWithIdUpdated.csv b/tests/dev/CsvBulkLoaderTest_PlayersWithIdUpdated.csv similarity index 100% rename from tests/CsvBulkLoaderTest_PlayersWithIdUpdated.csv rename to tests/dev/CsvBulkLoaderTest_PlayersWithIdUpdated.csv