// CSV.cpp (C) 2008  adolfo@di-mare.com

#ifdef English_dox
/** \file   CSV.cpp
    \brief  Implementation for \c CSV.h.
    \author Adolfo Di Mare
    \date   2008
*/
#endif
#ifdef Spanish_dox
/** \file   CSV.cpp
    \brief  Implementación para \c CSV.h.
    \author Adolfo Di Mare
    \date   2008
*/
#endif

#include "CSV.h"

#include <cctype>   // std::isspace()
#include <istream>  // std::istream
#include <string>   // std::string
#define COMMA  ','
#define DQUOTE '"'
#define LF     '\n' // Line Feed
#define CR     '\r' // Carriage Return

// Actions for the finite automaton used to parse CSV input
// ========================================================
// [          ]  ==>  n=0; i=0; DATA[0] = "";
// [ csv=""   ]  ==>  ++n; DATA[n] = ""; ++i;
// [          ]  ==>  ++i;
// [ h+=      ]  ==>  DATA[n] += str[i]; ++i;
// [ h='""'   ]  ==>  DATA[n] = """"; ++i;
// [ END      ]  ==>  return n;
//
//           |  ','  '\n' |    '"'     |     l      |
//  delta()  |  comma+LF  |  d-quote   |   letter   |
// ----------+------------+------------+------------+
//    ==> 0  |     0      |     1      |     3      |
//     init  |   return   |            |  csv+=ch   |
// ----------+------------+------------+------------+
//        1  |     1      |     2      |     1      |
//  quoted(1)|  csv+=ch   |            |  csv+=ch   |
// ----------+------------+------------+------------+
//        2  |     0      |     1      |     3      |
// inquote(2)|   return   |  csv+=ch   |  csv='""'  |
// ----------+------------+------------+------------+
//        3  |     0      |     3      |     3      |
//   regular |   return   |  csv+=ch   |  csv+=ch   |
// ----------+------------+------------+------------+

namespace {
    // std::isspace() is only defined for EOF and values representable as
    // 'unsigned char'; a negative plain 'char' must be converted first.
    inline bool isSpaceChar( char ch ) {
        return std::isspace( static_cast<unsigned char>(ch) ) != 0;
    }
} // namespace

#ifdef English_dox
/** Scans input stream \c CIN and returns the next CSV value.
    - The retrieved value from \c CIN gets stored into \c csv.
    - Works with \c char, not tested for \c wchar_t.
    - Stops when \c CIN.fail() or when \c CIN.eof().
    - Will not remove any chars from the retrieved value.
    - A quoted field that is still open when the input ends is
      reported as non compliant.
    \return true when the CSV complies with RFC-4180.
*/
#endif
#ifdef Spanish_dox
/** Obtiene del flujo de entrada \c CIN el siguiente valor CSV.
    - El valor obtenido de \c CIN queda almacenado en \c csv.
    - Trabaja bien con \c char, no ha sido probado para \c wchar_t.
    - Para cuando \c CIN.fail() o cuando \c CIN.eof().
    - No elimina ningún caracter del valor obtenido.
    \return true cuando el campo CSV sigue la especificación RFC-4180.
*/
#endif
bool automataCSV( std::string& csv, std::istream& CIN ) {
    // Automaton states; the numbers match the rows of the table above.
    enum State { ST_INIT = 0, ST_QUOTED = 1, ST_INQUOTE = 2, ST_REGULAR = 3 };

    csv.clear();
    if ( CIN.fail() || CIN.eof() ) { // see http://www.horstmann.com/cpp/pitfalls.html
        return false;
    }

    State state       = ST_INIT;
    bool  trailing_CR = false; // true when the last char was CR
    bool  ret_val     = true;  // true while csv complies with RFC-4180
    char  ch;

    for (;;) {
        CIN.get(ch);
        if ( CIN.fail() || CIN.eof() ) {
            // Input exhausted. A field whose opening quote was never closed
            // is malformed: reporting it as correct made getNextCSV() strip
            // the "enclosing" quotes and lose the last data character.
            return ret_val && ( state != ST_QUOTED );
        }
        csv += ch; // every char read is kept, terminator included

        switch ( state ) {
        case ST_INIT: // first char of the field
            if ( ch == COMMA || ch == LF ) {
                return ret_val;             // empty field
            }
            if ( ch == DQUOTE ) {
                state = ST_QUOTED;          // opening quote
            }
            else {
                if ( ch == CR ) {
                    trailing_CR = true;
                }
                state = ST_REGULAR;         // CR or letter
            }
            break;

        case ST_QUOTED: // inside a quoted field: everything is data
            if ( ch == DQUOTE ) {
                state = ST_INQUOTE;         // closing quote or 1st of [""]
            }
            break;

        case ST_INQUOTE: // just after a quote seen inside a quoted field
            if ( ch == COMMA || ch == LF ) {
                return ret_val;             // the quote was the closing one
            }
            if ( trailing_CR ) {            // ["...""..."\r?...,] '?' after '\r'
                trailing_CR = false;
                ret_val = false;
                state = ST_REGULAR;
            }
            else if ( ch == CR ) {
                trailing_CR = true;         // expect LF next; state unchanged
            }
            else if ( ch == DQUOTE ) {
                state = ST_QUOTED;          // [""] is an escaped quote
            }
            else {                          // letter (error)
                ret_val = false;            // [," ... "" "3x,] ==> error condition ["3]
                state = ST_REGULAR;
            }
            break;

        case ST_REGULAR: // unquoted field
            if ( ch == COMMA || ch == LF ) {
                return ret_val;
            }
            // anything else is data: swallows DQUOTE's && CR's
            break;
        } // switch (state)
    } // for (;;)
}

void singleDQUOTE( std::string & str );

// Reads the next CSV field from 'CIN' into 'csv'.
// - A well formed field loses its terminator (comma, LF or CR+LF) and,
//   when quoted, its enclosing quotes; each [""] becomes ["].
// - A malformed field is stored as read, minus a trailing comma.
// Returns true only when a well formed field was terminated by LF.
bool getNextCSV( std::string& csv, std::istream& CIN ) {
    const bool correct = automataCSV( csv, CIN );
    bool ret_val = false; // true if ( csv[ csv.length()-1 ] == LF )

    if ( ! correct ) {
        // NOTE(review): a malformed field that ends in LF keeps the LF and is
        // not reported as end-of-line (getNextCSV_OLD() did) -- confirm intent.
        if ( ! csv.empty() && csv[ csv.length()-1 ] == COMMA ) {
            csv.erase( csv.length()-1 ); // removes trailing comma
        }
        return ret_val;
    }
    if ( csv.empty() ) {
        return ret_val;
    }

    const std::string::size_type last = csv.length()-1; // last char
    if ( csv[last] == COMMA ) {
        csv.erase( last ); // chop() trailing comma
    }
    else if ( csv[last] == LF ) {
        ret_val = true;
        csv.erase( last ); // chop() trailing LF
        if ( last > 0 && csv[last-1] == CR ) {
            csv.erase( last-1 ); // chop() trailing CR
        }
    }

    if ( ! csv.empty() && csv[0] == DQUOTE ) {
        singleDQUOTE( csv ); // transform [""] ==> ["]
    }
    return ret_val;
}

// Stores in 'res' the CSV representation of 'value'.
// - Each double quote is doubled.
// - The result is surrounded by double quotes when 'value' contains
//   whitespace, a comma or a double quote.
// - 'res' and 'value' may be the same object.
void setQuotedCSV( std::string& res , const std::string& value ) {
    std::string body; // built apart from 'res' so that aliasing is harmless
    body.reserve( value.length() );
    bool quote_surround = false;

    for ( std::string::const_iterator ch = value.begin(); ch != value.end(); ++ch ) {
        if ( isSpaceChar( *ch ) || *ch == COMMA ) {
            quote_surround = true;
        }
        else if ( *ch == DQUOTE ) {
            body += DQUOTE; // escape: ["] ==> [""]
            quote_surround = true;
        }
        body += *ch;
    }

    if ( quote_surround ) {
        res = DQUOTE + body + DQUOTE;
    }
    else {
        res.swap( body );
    }
}

// Removes leading and trailing whitespace from 'str'.
void trim( std::string & str ) {
    const std::string::size_type LEN = str.length();

    std::string::size_type first = 0; // skip leading whitespace
    while ( first < LEN && isSpaceChar( str[first] ) ) {
        ++first;
    }
    if ( first == LEN ) { // empty or whitespace only
        str.clear();
        return;
    }

    std::string::size_type last = LEN-1; // skip trailing whitespace
    while ( isSpaceChar( str[last] ) ) { // stops at str[first] at the latest
        --last;
    }

    // leave out leading and trailing whitespace
    str = str.substr( first, last-first+1 );
}

// Trims 'str'; when what remains both starts and ends with a double quote,
// removes those quotes and substitutes each [""] by ["].
void trimCSV( std::string & str ) {
    trim( str ); // 1) trim()
    if ( str.empty() ) {
        return;
    }

    // D-Quoted???
    const std::string::size_type N = str.length()-1;
    if ( str[0] != DQUOTE || str[N] != DQUOTE ) {
        return;
    }

    // Substitute each double DQUOTE's by a single DQUOTE
    singleDQUOTE( str );
}

#ifdef English_dox
/// Removes the enclosing (outermost) characters of \c str and substitutes
/// each double DQUOTE by a single DQUOTE within what remains.
#endif
#ifdef Spanish_dox
/// Sustituye cada letra DQUOTE doble por una sola letra DQUOTE en \c str.
#endif
void singleDQUOTE( std::string & str ) {
    const std::string::size_type LEN = str.length();
    if ( LEN < 2 ) { // nothing inside the enclosing characters
        str.clear();
        return;
    }

    // Only the interior str[1 .. LEN-2] is scanned. The opening quote used to
    // be paired with the first interior quote, which lost one quote whenever
    // the field held quotes only: [""""] became [] instead of ["].
    std::string res;
    res.reserve( LEN-2 );
    for ( std::string::size_type i = 1; i+1 < LEN; ++i ) {
        res += str[i];
        if ( str[i] == DQUOTE && i+2 < LEN && str[i+1] == DQUOTE ) {
            ++i; // do not copy the second DQUOTE of the pair
        }
    }
    str.swap( res );
}

// Removes the last char of 'str' when it is equal to 'ch'.
void chop( std::string & str , char ch ) {
    if ( str.empty() ) { // nothing to chop
        return;
    }
    const std::string::size_type N = str.length()-1;
    if ( str[N] == ch ) {
        str.erase( N ); // removed if it's the last
    }
}

#if 0

/// Test ==> \c rebuildDquote().
void test_CSV::test_rebuildDquote() {
    void rebuildDquote( std::string & str );
    {{  // test::rebuildDquote()
        std::string s;
        s = "\"" ;       rebuildDquote(s); // ["] ==> [""]
        assertTrue( s == "\"\"");
        s = "\" \" \"" ; rebuildDquote(s); // [" " "] ==> ["" "" ""]
        assertTrue( s == "\"\" \"\" \"\"");
        s = "3,4\"" ;    rebuildDquote(s); // [3,4"] ==> [3,4""]
        assertTrue( s == "3,4\"\"");
        s = " ," ;       rebuildDquote(s); // [ ,] ==> [ ,]
        assertTrue( s == " ,");
    }}
    {   // A61196-A76944
        std::string s = "\"2\",3, \r\n"; // ["2",3, \r\n]
        rebuildDquote(s);
        assertTrue( s == "\"\"2\"\",3, \r\n"); // ["2",3, \r\n] ==> [""2"",3, \r\n]
    }
}

#ifdef English_dox
/** Scans \c str substituting \c '"' by 2 double-quotes \c [""].
    - Local routine used in the implementation of \c getNextCSV().

    \dontinclude test_CSV.cpp
    \skipline    test::rebuildDquote()
    \until       }}
    \see         test_CSV::test_rebuildDquote()
*/
#endif
#ifdef Spanish_dox
/** Sustituye en \c str cada comilla doble \c '"' por 2 comillas dobles \c [""].
    - Rutina local usada en la implementación de \c getNextCSV().

    \dontinclude test_CSV.cpp
    \skipline    test::rebuildDquote()
    \until       }}
    \see         test_CSV::test_rebuildDquote()
*/
#endif
void rebuildDquote( std::string & str ) {
    std::string res;
    std::string::const_iterator ch;
    for ( ch = str.begin(); ch != str.end(); ++ch ) {
        res += *ch;
        if ( *ch == DQUOTE ) {
            res += DQUOTE;
        }
    }
    str = res;
}

// Previous implementation, kept for reference only (never compiled).
// The per-state transitions follow the table at the top of this file.
bool getNextCSV_OLD( std::string& csv, std::istream& CIN ) {
    csv.clear();
    if ( CIN.fail() ) { // see http://www.horstmann.com/cpp/pitfalls.html
        return false;
    }
    int state=0;
    char ch;
    bool trailing_CR = false; // true when the last char was CR
    for (;;) {
        CIN.get(ch);
        if ( CIN.fail() ) {
            return false;
        }
        switch (state) {
            case 0: { // init
                if ( ch == COMMA ) {
                    // csv += COMMA; // removes COMMA from result string
                    return false;
                }
                else if ( ch == LF ) {
                    // csv += LF; // removes LF from result string
                    return true;
                }
                else if ( ch == CR ) {
                    trailing_CR = true;
                    csv += CR;
                    state = 3;
                }
                else if ( ch == DQUOTE ) {
                    state = 1;
                }
                else { // letter
                    csv += ch;
                    state = 3;
                }
            }
            break;

            case 1: { // quote(1)
                if ( ch == DQUOTE ) {
                    state = 2;
                }
                else { // letter COMMA LF
                    csv += ch;
                    // state = 1;
                }
            }
            break;

            case 2: { // inquote(2)
                if ( ch == COMMA ) {
                    // state = 0;
                    return false;
                }
                else if ( ch == LF ) {
                    // state = 0;
                    return true;
                }
                else if ( trailing_CR ) { // ["...""..."\r?...,] '?' after '\r'
                    rebuildDquote( csv );
                    csv = DQUOTE + csv + DQUOTE + CR + ch;
                    trailing_CR = false;
                    state = 3;
                }
                else if ( ch == CR ) { // removes CR+LF at the end of line
                    trailing_CR = true;
                    // csv += CR; // removes trailing CR+LF
                    // state = 2;
                }
                else if ( ch == DQUOTE ) {
                    csv += DQUOTE;
                    state = 1;
                }
                else { // letter (error)
                    rebuildDquote( csv );
                    csv= DQUOTE + csv + DQUOTE + ch; // [," ... "" "3x,] ==> error condition ["3]
                    state = 3;                       // [" ... "" "3]    ==> rebuilt value
                }
            }
            break;

            case 3: { // regular
                if ( ch == COMMA ) {
                    return false;
                }
                else if ( ch == LF ) {
                    if ( trailing_CR ) {
                        csv = csv.substr( 0, csv.length()-1 ); // chop( csv , CR );
                    }
                    // state = 0;
                    // csv += LF;
                    return true;
                }
                else if ( ch == CR ) { // leaves CR at the end
                    trailing_CR = true; // mark to remove later
                    csv += CR;
                    // state = 3;
                }
                else { // letter
                    csv += ch;
                    // state = 3;
                    // swallows DQUOTE's && CR's
                }
            }
            break;
        } // switch (state)
    } // for (;;)
    return false;
}

#endif

#ifdef English_dox
/// Comma Separated Value (not used in this implementation).
#endif
#ifdef Spanish_dox
/// Comma Separated Value (no usado en esta implementación).
#endif
namespace csv { } // trick to include it into the Doxygen documentation

// Trick to force Doxygen to document these.
// - They are at the end of file to avoid trouble.
using namespace std;
using namespace csv;

// EOF: CSV.cpp