aheber
9/15/2017 - 3:35 PM

SFDC CSV Parser

CSV parser and helper class, with test code. Performance still needs improvement.

/*
 * Tests for CsvData: parses a small CSV with CsvParser, checks the raw rows,
 * then checks that CsvData exposes the same values keyed by header name.
 */
@isTest
public class CsvDataTest {

    /*
     * Input has a header line (with spaces after the commas) and one data line
     * whose middle field is quoted and contains a line break.
     */
    @isTest
    public static void testDataParser(){
        List<String> expectedHeaders = new List<String>{'hone', 'htwo', 'hthree'};
        List<String> expectedValues = new List<String>{'one', 'two\ntwo', 'three'};

        String csvText = 'hone, htwo, hthree\none,"two\ntwo",three';
        List<List<String>> parsed = CsvParser.Parse(csvText);

        // Raw parser output: header row first, then the single data row
        System.assertEquals(2, parsed.size());

        List<String> headerRow = parsed[0];
        System.assertEquals(3, headerRow.size());
        for(Integer col = 0; col < expectedHeaders.size(); col++){
            System.assertEquals(expectedHeaders[col], headerRow[col]);
        }

        List<String> valueRow = parsed[1];
        System.assertEquals(3, valueRow.size());
        for(Integer col = 0; col < expectedValues.size(); col++){
            System.assertEquals(expectedValues[col], valueRow[col]);
        }

        // Wrapped view: one data row, addressable by header name
        CsvData wrapped = new CsvData(parsed);
        System.assertEquals(1, wrapped.dataRows.size());
        System.assertEquals(3, wrapped.headers.size());

        Map<String, String> firstRecord = wrapped.dataRows[0];
        for(Integer col = 0; col < expectedHeaders.size(); col++){
            System.assertEquals(expectedValues[col], firstRecord.get(expectedHeaders[col]));
        }
    }
}
/*
 * Class to wrap lists of rows and columns, assumes headers are present and makes
 * data available as maps by header name. This class is not optimized for performance but for convenience.
 * If you need more performance please use the CsvParser class directly and work with the raw lists.
 *
 * Row handling, in order:
 *  - null or empty rows are skipped;
 *  - a row whose first cell starts with '#' is stored, re-joined with commas, in commentRows;
 *  - the first remaining row becomes the header row;
 *  - every later row becomes a Map of header name -> cell value in dataRows.
 */
public class CsvData {

    // Largest number of cells seen in the header row or any data row (comment rows are not counted)
    public Integer fieldCount {get;private set;}

    // Column names, taken from the first non-empty, non-comment row
    public List<String> headers {get;set;}
    // Comment rows, each re-joined into a single comma separated string
    public List<String> commentRows {get;set;}
    // One map per data row, keyed by header name
    public List<Map<String, String>> dataRows {get;set;}

    /*
     * Builds the header-keyed view from raw parsed rows.
     *
     * rows: rows of cells, e.g. the output of CsvParser.Parse. A null list is treated
     *       as empty and null rows inside the list are skipped.
     *
     * Cells beyond the number of headers are not added to the row map, and a row shorter
     * than the header row simply has no entry for the missing headers. If two headers
     * share a name, the later column's value overwrites the earlier one in the map.
     */
    public CsvData(List<List<String>> rows){
        headers = new List<String>();
        commentRows = new List<String>();
        dataRows = new List<Map<String, String>>();
        fieldCount = 0;

        List<List<String>> safeRows = (rows == null) ? new List<List<String>>() : rows;
        for(List<String> row : safeRows){
            if(row == null || row.isEmpty()){
                continue;
            }
            // Parser has dropped some data as it doesn't preserve comment lines but this is good enough for now
            if(row[0] != null && row[0].startsWith('#')){
                commentRows.add(String.join(row, ','));
                continue;
            }
            fieldCount = Math.max(fieldCount, row.size());

            if(headers.isEmpty()){
                // Copy so that later changes to the caller's row (or to headers) do not leak across
                headers = row.clone();
                continue;
            }

            Map<String, String> rowMap = new Map<String, String>();
            Integer mappedColumns = Math.min(headers.size(), row.size());
            for(Integer i = 0; i < mappedColumns; i++){
                rowMap.put(headers[i], row[i]);
            }
            dataRows.add(rowMap);
        }
    }
}
/*
 * Tests for CsvParser.Parse(String): line endings, quoting, escaped quotes,
 * embedded separators and line breaks, surrounding spaces, and a bulk input.
 */
@isTest
public class CsvParserTest {

    // Parses the text and checks the number of rows produced before handing them back
    private static List<List<String>> parseAndCount(String csvText, Integer expectedRows){
        List<List<String>> parsed = CsvParser.Parse(csvText);
        System.assertEquals(expectedRows, parsed.size());
        return parsed;
    }

    // Checks that a row holds exactly three cells with the given values
    private static void assertTriple(List<String> actual, String first, String second, String third){
        System.assertEquals(3, actual.size());
        System.assertEquals(first, actual[0]);
        System.assertEquals(second, actual[1]);
        System.assertEquals(third, actual[2]);
    }

    // Single line, no terminator
    @isTest
    public static void testSimpleParse(){
        List<List<String>> rows = parseAndCount('one,two,three', 1);
        assertTriple(rows[0], 'one', 'two', 'three');
    }

    // Single line terminated by LF
    @isTest
    public static void testEndingTwo(){
        List<List<String>> rows = parseAndCount('one,two,three\n', 1);
        assertTriple(rows[0], 'one', 'two', 'three');
    }

    // Single line terminated by CRLF
    @isTest
    public static void testEndingThree(){
        List<List<String>> rows = parseAndCount('one,two,three\r\n', 1);
        assertTriple(rows[0], 'one', 'two', 'three');
    }

    // Single line terminated by a lone CR
    @isTest
    public static void testEndingFour(){
        List<List<String>> rows = parseAndCount('one,two,three\r', 1);
        assertTriple(rows[0], 'one', 'two', 'three');
    }

    // Quotes around a field are removed
    @isTest
    public static void testQuotedParse(){
        List<List<String>> rows = parseAndCount('one,"two",three', 1);
        assertTriple(rows[0], 'one', 'two', 'three');
    }

    // A doubled quote inside a quoted field becomes one quote
    @isTest
    public static void testQuotedWQuoteParse(){
        List<List<String>> rows = parseAndCount('one,"two""two",three', 1);
        assertTriple(rows[0], 'one', 'two"two', 'three');
    }

    // A comma inside a quoted field does not split the field
    @isTest
    public static void testQuotedWCommaParse(){
        List<List<String>> rows = parseAndCount('one,"two,two",three', 1);
        assertTriple(rows[0], 'one', 'two,two', 'three');
    }

    // Two LF-separated lines, the last one unterminated
    @isTest
    public static void testMultiLineParse(){
        List<List<String>> rows = parseAndCount('one,two,three\nfour,five,six', 2);
        assertTriple(rows[0], 'one', 'two', 'three');
        assertTriple(rows[1], 'four', 'five', 'six');
    }

    // Mixed CRLF and LF terminators
    @isTest
    public static void testMultiLineParse2(){
        List<List<String>> rows = parseAndCount('one,two,three\r\nfour,five,six\n', 2);
        assertTriple(rows[0], 'one', 'two', 'three');
        assertTriple(rows[1], 'four', 'five', 'six');
    }

    // CR and LF characters inside a quoted field are preserved exactly
    @isTest
    public static void testMultiLineParseLiteral(){
        List<List<String>> rows = parseAndCount('one,"two\r\n\rtwo",three\r\nfour,five,six\n', 2);
        assertTriple(rows[0], 'one', 'two\r\n\rtwo', 'three');
        assertTriple(rows[1], 'four', 'five', 'six');
    }

    // Spaces around fields are dropped; spaces inside the quotes are kept
    @isTest
    public static void testSpaces(){
        List<List<String>> rows = parseAndCount(' one, " two\ntwo"  , three ', 1);
        assertTriple(rows[0], 'one', ' two\ntwo', 'three');
    }

    // A line break inside a quoted field does not start a new record
    @isTest
    public static void testBrokenLineParse(){
        List<List<String>> rows = parseAndCount('one,"two\ntwo",three', 1);
        assertTriple(rows[0], 'one', 'two\ntwo', 'three');
    }

    // 1000 copies of a record containing an embedded line break
    @isTest
    public static void testBulk(){
        String record = 'one,"two\ntwo",three\n';
        List<String> copies = new List<String>();
        while(copies.size() < 1000){
            copies.add(record);
        }
        String largeText = String.join(copies, '');

        List<List<String>> rows = parseAndCount(largeText, 1000);
        assertTriple(rows[0], 'one', 'two\ntwo', 'three');
    }
}
/*
 * CSV Parser ported from http://www.boyet.com/articles/csvparser.html
 * Takes in CSV data and expands it to rows of columns. Pair with the CsvData class
 * to get a more structured and easy to use, header-keyed view of the rows.
 *
 * Behavior notes:
 *  - CR, LF and CRLF all end a record; inside a quoted field they are kept literally.
 *  - Spaces and tabs around a field are dropped; spaces inside an unquoted field are kept.
 *  - A doubled quote inside a quoted field yields a single quote character.
 *  - Null or empty input produces zero rows.
 *  - The separator and quote characters come from CsvParserConfig (defaults ',' and '"').
 */
public class CsvParser {

    // Settings used by the instance Parse method; change before parsing to use another separator/quote
    public CsvParserConfig config;

    public CsvParser(){
        this.config = new CsvParserConfig();
    }

    /*
     * Parses CSV held in a Blob using the default configuration.
     * A null Blob yields an empty list of rows.
     */
    public static List<List<String>> Parse(Blob bytes){
        if (bytes == null) {
            return new List<List<String>>();
        }
        return Parse(bytes.toString());
    }

    /*
     * Parses CSV text using the default configuration and returns rows of cell values.
     * Null or empty text yields an empty list of rows.
     * Throws CsvParserTooMuchDataException when data follows a field without a separator,
     * and CsvParserNoTermQuoteException when a quoted field is never closed.
     */
    public static List<List<String>> Parse(String text){
        CsvParser parser = new CsvParser();

        DefaultCsvConsumer c = new DefaultCsvConsumer();
        DefaultStringCharTokenizer t = new DefaultStringCharTokenizer(parser.config, text);
        parser.Parse(t, c);
        return c.getRows();
    }

    /*
     * Parses everything the reader supplies and reports fields, record ends and
     * the end of file to the consumer.
     */
    public void Parse(ICharTokenizer reader, ICsvConsumer consumer) {
        parseCsvFile(reader, consumer);
    }

    // file := record* EOF
    private void parseCsvFile(ICharTokenizer reader, ICsvConsumer consumer) {
        while (reader.Peek() != config.EOF) {
            parseCsvRecord(reader, consumer);
        }
        consumer.SignalEndOfFile();
    }

    // record := stringList ('\n' | EOF)
    private void parseCsvRecord(ICharTokenizer reader, ICsvConsumer consumer) {
        parseCsvStringList(reader, consumer);
        String ch = reader.Read();
        if (ch == config.EOF) {
            // Push EOF back so the file loop sees it, and treat it as the end of this record
            reader.Unread(ch);
            ch = '\n';
        }
        if (ch != '\n') {
            throw new CsvParserTooMuchDataException('End of record was expected but more data exists.');
        }
        consumer.SignalEndOfRecord();
    }

    // stringList := rawString (separator rawString)*
    private void parseCsvStringList(ICharTokenizer reader, ICsvConsumer consumer) {
        String ch;
        do {
            parseRawString(reader, consumer);
            ch = reader.Read();
        } while (ch == config.Separator);
        reader.Unread(ch);
    }

    // True for the configured separator, a line break, or end of input
    private Boolean isFieldTerminator(String c) {
        return (c == config.Separator) || (c == '\n') || (c == config.EOF);
    }

    // True for the configured quote character
    private Boolean isQuote(String c) {
        return c == config.TextEnclusure;
    }

    // True for a space or tab that is not itself the configured separator (e.g. tab-separated data)
    private Boolean isSpace(String c) {
        return ((c == ' ') || (c == '\t')) && (c != config.Separator);
    }

    // Consumes a run of spaces/tabs, leaving the first other character unread
    private void parseOptionalSpaces(ICharTokenizer reader) {
        String ch;
        do {
            ch = reader.Read();
        } while (isSpace(ch));
        reader.Unread(ch);
    }

    // rawString := spaces? rawField spaces?
    private void parseRawString(ICharTokenizer reader, ICsvConsumer consumer) {
        parseOptionalSpaces(reader);
        parseRawField(reader, consumer);
        if (!isFieldTerminator(reader.Peek())) {
            parseOptionalSpaces(reader);
        }
    }

    // Reads one field (quoted, simple, or empty) and hands its value to the consumer
    private void parseRawField(ICharTokenizer reader, ICsvConsumer consumer) {
        String fieldValue = '';

        String ch = reader.Peek();
        if (!isFieldTerminator(ch)) {
            if (isQuote(ch)) {
                fieldValue = parseQuotedField(reader);
            } else {
                fieldValue = parseSimpleField(reader);
            }
        }
        consumer.ConsumeField(fieldValue);
    }

    // quotedField := quote escapedField quote
    private String parseQuotedField(ICharTokenizer reader) {
        reader.Read(); // read and discard initial quote

        String field = parseEscapedField(reader);

        String ch = reader.Read();
        if (!isQuote(ch)) {
            reader.Unread(ch);
            throw new CsvParserNoTermQuoteException('Quoted field has no terminating double quote');
        }
        return field;
    }

    // escapedField := subField (quote quote subField)*  -- each doubled quote becomes one quote
    private String parseEscapedField(ICharTokenizer reader) {
        String value = parseSubField(reader);
        String ch = reader.Read();
        while (processDoubleQuote(reader, ch)) {
            value += config.TextEnclusure;
            value += parseSubField(reader);
            ch = reader.Read();
        }
        reader.Unread(ch);

        return value;
    }

    // Reads literally (line breaks untouched) up to, but not including, the next quote or EOF
    private String parseSubField(ICharTokenizer reader) {
        String value = '';
        String ch = reader.ReadLiteral();
        while (!isQuote(ch) && (ch != config.EOF)) {
            value += ch;
            ch = reader.ReadLiteral();
        }
        reader.Unread(ch);
        return value;
    }

    // Characters that cannot start an unquoted field
    private Boolean isBadSimpleFieldChar(String c) {
        return isSpace(c) || isFieldTerminator(c) || isQuote(c);
    }

    /*
     * Reads an unquoted field up to the next terminator or quote character.
     * Spaces and tabs inside the field are kept; trailing ones are dropped.
     */
    private String parseSimpleField(ICharTokenizer reader) {
        String ch = reader.Read();
        if (isBadSimpleFieldChar(ch)) {
            reader.Unread(ch);
            return '';
        }

        String value = ch;
        // Whitespace is held back until a later non-space character proves it is interior
        String pendingSpaces = '';
        ch = reader.Read();
        while (!isFieldTerminator(ch) && !isQuote(ch)) {
            if (isSpace(ch)) {
                pendingSpaces += ch;
            } else {
                value += pendingSpaces + ch;
                pendingSpaces = '';
            }
            ch = reader.Read();
        }
        reader.Unread(ch);

        return value;
    }

    // If ch and the next character are both quotes, consumes the second one and returns true
    private Boolean processDoubleQuote(ICharTokenizer reader, String ch) {
        if (isQuote(ch) && isQuote(reader.Peek())) {
            reader.Read(); // discard second quote of double
            return true;
        }
        return false;
    }

    /*
     * Character source for the parser.
     * Peek/Read return the next character with line endings normalized to '\n';
     * ReadLiteral returns it unchanged; Unread pushes one character back.
     */
    public interface ICharTokenizer {
        String Peek();
        String Read();
        String ReadLiteral();
        void Unread(String text);
    }

    // Receiver for parse events: one call per field, per record end, and at end of file
    public interface ICsvConsumer {
        void SignalEndOfFile();
        void SignalEndOfRecord();
        void ConsumeField(String text);
    }

    // Thrown when a record has data where a separator or line end was expected
    public class CsvParserTooMuchDataException extends Exception {}

    // Thrown when a quoted field is not closed by a quote character
    public class CsvParserNoTermQuoteException extends Exception{}


    // Thrown when more than one character is pushed back onto a tokenizer
    public class CharTokenizerException extends Exception {}

    /*
     * Parser settings. Separator and TextEnclusure are compared against single
     * characters, so multi-character values will never match.
     */
    public class CsvParserConfig {
        public String EOF = null;
        public String TextEnclusure = '"';
        public String Separator = ',';
    }

    // Doesn't have the most respect for memory or performance, TODO optimize
    public class DefaultStringCharTokenizer implements ICharTokenizer {
        // Input split into one-character strings
        private List<String> s;
        private CsvParserConfig config;
        // Position of the next character to hand out
        private Integer index;
        // Single-slot push-back buffer
        private Boolean haveUnreadChar;
        private String unreadChar;

        public DefaultStringCharTokenizer(CsvParserConfig config, string s) {
            this.config = config;
            // Null or empty text becomes an empty character list so the parser sees EOF immediately
            this.s = String.isEmpty(s) ? new List<String>() : s.split('');
            index = 0;
            haveUnreadChar = false;
        }

        // When positioned on the CR of a CRLF pair, step onto the LF
        private void skipCrInCrLf() {
            if ((s[index] == '\r') && (index + 1 < s.size()) && (s[index + 1] == '\n')) {
                index++;
            }
        }

        // A lone CR is reported as LF
        private String mapCrToLf(String c) {
            if (c == '\r') {
                return '\n';
            }
            return c;
        }

        // Next character without consuming it, or EOF
        public String Peek() {
            if (haveUnreadChar) {
                return unreadChar;
            }
            if (index < s.size()) {
                return mapCrToLf(s[index]);
            }
            return config.EOF;
        }

        // Consumes and returns the next character with CR/CRLF normalized to LF, or EOF
        public String Read() {
            if (haveUnreadChar) {
                haveUnreadChar = false;
                return unreadChar;
            }
            if (index < s.size()) {
                skipCrInCrLf();
                return mapCrToLf(s[index++]);
            }
            return config.EOF;
        }

        // Consumes and returns the next character exactly as it appears in the input, or EOF
        public String ReadLiteral() {
            if (haveUnreadChar) {
                haveUnreadChar = false;
                return unreadChar;
            }
            if (index < s.size()) {
                return s[index++];
            }
            return config.EOF;
        }

        // Pushes one character back; it is returned by the next Peek/Read/ReadLiteral
        public void Unread(String c) {
            if (haveUnreadChar) {
                throw new CsvParser.CharTokenizerException('Unread() cannot accept more than one pushed back character');
            }
            haveUnreadChar = true;
            unreadChar = c;
        }

    }

    // Collects fields into rows and rows into a list of lists
    public class DefaultCsvConsumer implements ICsvConsumer {
        List<List<String>> rows = new List<List<String>>();
        // Row currently being filled
        List<String> row = new List<String>();

        public List<List<String>> getRows(){
            return rows;
        }

        public void ConsumeField(string s) {
            row.add(s);
        }

        public void SignalEndOfRecord() {
            rows.add(row);
            row = new List<String>();
        }

        public void SignalEndOfFile() {
            // capture end of row if needed
            if(!row.isEmpty()){
                SignalEndOfRecord();
            }
        }
    }
}