I recently had to write a PHP CSV parser for a project, and found the built-in fgetcsv function fairly useless for real-world CSVs (where some fields are quoted and some aren’t for example, or where escaped delimiters or quotes are present inside the field text). This function should handle :
- Lines where some fields are quoted and some aren’t
- Lines where quoted fields have ’escaped’ quotes inside them
- Lines where non-quoted fields have ’escaped’ delimiters inside them
- Lines where the quote is ’escaped’ by another quote (the default behaviour for Excel CSV exports)
- Lines containing multi-byte strings
/**
* Split a line from a CSV file into fields. Should handle various field
* delimiters, escape characters and enclosing quotes for fields
*
* @param string $line Line to split for the CSV
* @param string $delimiter Field delimiter (defaults to ',')
* @param string $escaper Field escaper (defaults to '\')
* @param string $encloser Field encloser (defaults to '"')
*/
function splitCsvLine($line, $delimiter=',', $encloser='"', $escaper='\\') {
$fields = array();
if (is_string($delimiter) && is_string($encloser)) :
// Make sure the arguments are regex-safe
$reg_safe_delimiter = '\x' . dechex(ord($delimiter));
$reg_safe_encloser = '\x' . dechex(ord($encloser));
$reg_safe_escaper = '\x' . dechex(ord($escaper));
$line = trim($line);
// Replace any 'quote-escaped' quotes within fields. This is only really necessary to
// handle excel exports, which often escape double quotes with double quotes.
if ($encloser == $escaper) :
$line = mb_ereg_replace($reg_safe_escaper . $reg_safe_encloser, "__ESCAPED__ENCLOSER__", $line);
endif;
// Loop over the string and extract each field
$field_num = 0;
while(mb_strlen($line) > 0) :
if(mb_substr($line, 0, 1) == $encloser) :
// If this string starts with an encloser, look for the next (non-escaped) encloser
preg_match('/^' . $reg_safe_encloser . '((?:[^' . $reg_safe_encloser .
']|(?<='.$reg_safe_escaper.')' . $reg_safe_encloser .
')*)' . $reg_safe_encloser . $reg_safe_delimiter .
'?(.*)$/', $line, $matches);
$value = mb_ereg_replace($reg_safe_escaper . $reg_safe_encloser, $encloser, $matches[1]);
$line = trim($matches[2]);
$fields[$field_num++] = $value;
// Otherwise, look for the next (non-escaped) delimiter
else :
preg_match('/^((?:[^' . $reg_safe_delimiter .
']|(?<='.$reg_safe_escaper.')' . $reg_safe_delimiter .
')*)' . $reg_safe_delimiter .
'?(.*)$/', $line, $matches);
$value = mb_ereg_replace($reg_safe_escaper . $reg_safe_delimiter, $delimiter, $matches[1]);
$line = trim($matches[2]);
$fields[$field_num++] = $value;
endif;
endwhile;
if ($encloser == $escaper) :
for ($i=0; $i<count($fields); $i++) :
$fields[$i] = mb_ereg_replace("__ESCAPED__ENCLOSER__", $encloser, $fields[$i]);
endfor;
endif;
endif;
return $fields;
}