// CSV.cpp  (C) 2008 adolfo@di-mare.com

#ifdef English_dox
/** \file   CSV.cpp
    \brief  Implementation for \c CSV.h.
    \author Adolfo Di Mare <adolfo@di-mare.com>
    \date   2008
*/
#endif

#ifdef Spanish_dox
/** \file   CSV.cpp
    \brief  Implementación para \c CSV.h.
    \author Adolfo Di Mare <adolfo@di-mare.com>
    \date   2008
*/
#endif

#include "CSV.h"

// Characters with special meaning for the CSV routines in this file.
#define COMMA  ','  // Field separator
#define DQUOTE '"'  // Double quote: encloses a field and is doubled to escape itself
#define LF '\n' // Line Feed
#define CR '\r' // Carriage Return



//    Actions for the finite automaton used to parse CSV input
//    ========================================================
//    [              ] ==> n=0; i=0; DATA[0] = "";
//    [   csv=""     ] ==> ++n; DATA[n] = ""; ++i;
//    [              ] ==> ++i;
//    [   h+=        ] ==> DATA[n] += str[i]; ++i;
//    [  h='""'      ] ==> DATA[n] = """"; ++i;
//    [     END      ] ==> return n;
//
//            |  ',' '\n'  |    '"'     |     l      |
//    delta() |  comma+LF  |  d-quote   |   letter   |
//  ----------+------------+------------+------------+
//   ==>  0   |     0      |     1      |     3      |
//       init |   return   |            |  csv+=ch   |
//  ----------+------------+------------+------------+
//        1   |     1      |     2      |     1      |
//   quoted(1)|  csv+=ch   |            |  csv+=ch   |
//  ----------+------------+------------+------------+
//        2   |     0      |     1      |     3      |
//  inquote(2)|   return   |  csv+=ch   |  csv='""'  |
//  ----------+------------+------------+------------+
//        3   |     0      |     3      |     3      |
//    regular |   return   |  csv+=ch   |  csv+=ch   |
//  ----------+------------+------------+------------+

#ifdef English_dox
/** Scans input stream \c CIN and returns the next CSV value.
    - The retrieved value from \c CIN gets stored into \c csv.
    - Works with \c char, not tested for \c wchar_t.
    - Stops when \c CIN.fail() or when \c CIN.eof().
    - Will not remove any chars from the retrieved value.

    \return true when the CSV complies with RFC-4180.
*/
#endif
#ifdef Spanish_dox
/** Obtiene del flujo de entrada \c CIN el siguiente valor CSV.
    - El valor obtenido de \c CIN queda almacenado en \c csv.
    - Trabaja bien con \c char, no ha sido probado para \c wchar_t.
    - Para cuando \c CIN.fail() o cuando \c CIN.eof().
    - No elimina ningún caracter del valor obtenido.

    \return true cuando el campo CSV sigue la especificación RFC-4180.
*/
#endif
bool automataCSV( std::string& csv, std::istream& CIN ) {
    // Finite automaton that extracts the next raw CSV field from CIN.
    //
    // csv : output; receives every char consumed for this field, verbatim,
    //       including enclosing DQUOTE's and the terminating COMMA or LF
    //       (when there is one).
    // CIN : input stream; reading stops after the terminator or when the
    //       stream fails or reaches its end.
    //
    // Returns true when the scanned text is a well formed field:
    // - false when the stream was already failed or at EOF on entry.
    // - false when something other than COMMA, LF or CR+LF follows the
    //   closing DQUOTE of a quoted field.
    // - false when input ends inside a quoted field whose closing DQUOTE
    //   was never read.
    csv.clear();
    if ( CIN.fail() || CIN.eof() ) { // see http://www.horstmann.com/cpp/pitfalls.html
        return false;
    }

    // States of the automaton (same numbering as the transition table).
    enum {
        stInit    = 0, // nothing read yet
        stQuoted  = 1, // inside a quoted field, after the opening DQUOTE
        stInquote = 2, // just read a DQUOTE while inside a quoted field
        stRegular = 3  // unquoted field (or the rest of a malformed one)
    };

    int  state       = stInit;
    char ch;
    bool trailing_CR = false; // true when the last char after a closing DQUOTE was CR
    bool ret_val     = true;  // true while csv complies with RFC-4180

    for (;;) {
        CIN.get(ch);
        if ( CIN.fail() || CIN.eof() ) {
            // End of input. A quoted field still waiting for its closing
            // DQUOTE is truncated, hence not a valid field.
            if ( state == stQuoted ) {
                ret_val = false;
            }
            return ret_val;
        }
        csv += ch; // the terminator is kept too: the caller removes it

        switch (state) {
        case stInit: {
                if ( ch == COMMA || ch == LF ) { // empty field
                    return ret_val;
                }
                else if ( ch == DQUOTE ) {
                    state = stQuoted;
                }
                else { // any other char, CR included, starts a regular field
                    state = stRegular;
                }
            }
            break;

        case stQuoted: {
                // COMMA, LF and letters belong to the value; only DQUOTE
                // can end it (or start an escaped [""] pair).
                if ( ch == DQUOTE ) {
                    state = stInquote;
                }
            }
            break;

        case stInquote: {
                if ( ch == COMMA || ch == LF ) { // the DQUOTE closed the field
                    return ret_val;
                }
                else if ( trailing_CR ) { //  ["...""..."\r?...,] '?' after '\r'
                    trailing_CR = false;
                    ret_val = false;
                    state = stRegular;
                }
                else if ( ch == CR ) { // possibly the CR of a CR+LF line end
                    trailing_CR = true;
                }
                else if ( ch == DQUOTE ) { // [""] ==> escaped DQUOTE, still quoted
                    state = stQuoted;
                }
                else { // [," ... "" "3x,] ==> error condition ["3]
                    ret_val = false;
                    state = stRegular;
                }
            }
            break;

        case stRegular: {
                if ( ch == COMMA || ch == LF ) {
                    return ret_val;
                }
                // everything else is swallowed, DQUOTE's and CR's included
            }
            break;

        } // switch (state)
    } // for (;;)

    return ret_val; // not reached: the loop only exits through 'return'
}

void singleDQUOTE( std::string & str );

bool getNextCSV( std::string& csv, std::istream& CIN ) {
    // Reads the next CSV field from CIN and leaves its value in csv.
    //
    // - The field terminator (COMMA, LF or CR+LF) is removed from csv.
    // - When automataCSV() reports a well formed field that begins with
    //   DQUOTE, the enclosing DQUOTE's are removed and each [""] becomes ["].
    // - A malformed field keeps its raw text; only the terminator is removed.
    //
    // Returns true when the field was terminated by LF (last field in the
    // line) and false otherwise (COMMA, end of input, nothing read).
    const bool correct = automataCSV( csv, CIN );
    bool ret_val = false; // true if the field ended with LF

    if ( csv.empty() ) {
        return ret_val;
    }

    // Remove the terminator. This is done for malformed fields as well, so
    // the caller is always told when the end of the line was reached.
    std::string::size_type N = csv.length()-1; // index of the last char
    if ( csv[N] == COMMA ) {
        csv.erase(N); // chop() trailing comma
    }
    else if ( csv[N] == LF ) {
        ret_val = true;
        csv.erase(N); // chop() trailing LF
        if ( N>0 && csv[N-1] == CR ) {
            csv.erase(N-1); // chop() trailing CR
        }
    }

    // Only a well formed quoted field can be safely unquoted.
    if ( correct && !csv.empty() && csv[0] == DQUOTE ) {
        singleDQUOTE( csv ); // transform [""] ==> ["]
    }
    return ret_val;
}

void setQuotedCSV( std::string& res , const std::string& value ) {
    // Stores in res the text of value as it should be written to a CSV file.
    //
    // - Each DQUOTE in value is doubled.
    // - The result is enclosed in DQUOTE's when value contains whitespace,
    //   COMMA or DQUOTE; otherwise it is a plain copy of value.
    // - res and value may be the same string: the result is built in a
    //   local buffer and stored in res only at the end.
    std::string quoted;
    quoted.reserve( value.length()+2 );
    bool quote_surround = false;

    std::string::const_iterator ch;
    for ( ch = value.begin(); ch != value.end(); ++ch ) {
        // isspace() is undefined for negative values: go through unsigned char
        if ( isspace( static_cast<unsigned char>(*ch) ) || *ch == COMMA ) {
            quote_surround = true;
        }
        else if ( *ch == DQUOTE ) {
            quoted += DQUOTE; // escape: ["] ==> [""]
            quote_surround = true;
        }
        quoted += *ch;
    }

    if ( quote_surround ) {
        quoted = DQUOTE + quoted + DQUOTE;
    }
    res.swap( quoted );
}

void trim( std::string & str ) {
    // Removes leading and trailing whitespace from str, in place.
    // A string with nothing but whitespace becomes empty.
    const std::string::size_type LEN = str.length();

    // skip leading whitespace
    // isspace() is undefined for negative values: go through unsigned char
    std::string::size_type first = 0;
    while ( first < LEN && isspace( static_cast<unsigned char>(str[first]) ) ) {
        ++first;
    }
    if ( first == LEN ) { // empty or all whitespace
        str.clear();
        return;
    }

    // skip trailing whitespace; str[first] is not whitespace, so the
    // loop stops before 'last' can go below first+1
    std::string::size_type last = LEN; // one past the last char to keep
    while ( isspace( static_cast<unsigned char>(str[last-1]) ) ) {
        --last;
    }

    // keep only str[first .. last-1]
    str.erase( last );
    str.erase( 0, first );
}

void trimCSV( std::string & str ) {
    // Trims str and, when what remains is enclosed in DQUOTE's, removes
    // the enclosing DQUOTE's and turns each [""] into ["].
    trim( str ); // 1) trim()

    // D-Quoted???
    // At least 2 chars are needed to have an opening and a closing DQUOTE:
    // a lone ["] encloses nothing and is left untouched.
    const std::string::size_type LEN = str.length();
    if ( LEN < 2 ) {
        return;
    }
    if ( str[0] != DQUOTE || str[LEN-1] != DQUOTE ) {
        return;
    }

    // 2) Substitute each double DQUOTE's by a single DQUOTE
    singleDQUOTE( str );
}


#ifdef English_dox
/// Substitute each double DQUOTE's by a single DQUOTE within \c str.
#endif
#ifdef Spanish_dox
/// Sustituye cada letra DQUOTE doble por una sola letra DQUOTE en \c str.
#endif
void singleDQUOTE( std::string & str ) {
    // Unquotes a D-quoted CSV field, in place: removes the enclosing
    // (outermost) DQUOTE's and substitutes each double DQUOTE by a single one.
    //
    // - Expects str to begin with the opening DQUOTE and to end with the
    //   closing one. This is not verified: the first char of str and the
    //   last char of the result are dropped whatever they are.
    // - An empty str is left empty.
    if ( str.empty() ) {
        return;
    }

    std::string tmp;
    tmp.reserve( str.length() );

    // The opening DQUOTE is skipped before looking for [""] pairs, so it
    // is never taken as the first half of a pair: [""""] ==> ["].
    std::string::const_iterator from = str.begin();
    ++from;
    for ( ; from != str.end(); ++from ) {
        tmp.push_back( *from );
        if ( *from == DQUOTE ) { // already copied the first
            std::string::const_iterator next = from;
            ++next;
            if ( next == str.end() ) {
                break;
            }
            else if ( *next == DQUOTE ) {
                from = next; // don't copy the second DQUOTE
            }
        }
    }

    // Remove the closing DQUOTE
    if ( !tmp.empty() ) {
        tmp.erase( tmp.length()-1 );
    }
    str.swap( tmp );
}

void chop( std::string & str , char ch ) {
    // Drops the last char of str, but only when that char is ch.
    // An empty str is left as it is.
    const std::string::size_type len = str.size();
    if ( len != 0 && str[len-1] == ch ) {
        str.resize( len-1 );
    }
}


#if 0

/// Test ==> \c rebuildDquote().
void test_CSV::test_rebuildDquote() {
    // Checks that rebuildDquote() doubles every DQUOTE in a string and
    // leaves all the other chars untouched.
    // (This test sits inside the '#if 0' section, so it is not compiled.)

    // local declaration: rebuildDquote() is defined further down in this file
    void rebuildDquote( std::string & str );

    // The double braces delimit the fragment that the Doxygen comment of
    // rebuildDquote() extracts with \skipline ... \until }}
    {{  // test::rebuildDquote()
        std::string s;
        s =  "\"" ; rebuildDquote(s);       // ["] ==> [""]
        assertTrue( s == "\"\"");
        s =  "\" \" \"" ; rebuildDquote(s); // [" " "] ==> ["" "" ""]
        assertTrue( s == "\"\" \"\" \"\"");
        s =  "3,4\"" ; rebuildDquote(s);    // [3,4"] ==> [3,4""]
        assertTrue( s == "3,4\"\"");
        s =  " ," ; rebuildDquote(s);       // [ ,] ==> [ ,]
        assertTrue( s == " ,");
    }}
    {   // A61196-A76944
        // COMMA, blank, CR and LF must come out unchanged
        std::string s =  "\"2\",3, \r\n";        // ["2",3, \r\n]
        rebuildDquote(s);
        assertTrue( s ==  "\"\"2\"\",3, \r\n");  // ["2",3, \r\n] ==> [""2"",3, \r\n]
    }
}

#ifdef English_dox
/** Scans \c str substituting \c '"' by 2 double-quotes \c [""].
    - Local routine used in the implementation of \c getNextCSV().

    \dontinclude test_CSV.cpp
    \skipline    test::rebuildDquote()
    \until       }}
    \see         test_CSV::test_rebuildDquote()
*/
#endif
#ifdef Spanish_dox
/** Sustituye en \c str cada comilla doble \c '"' por 2 comillas dobles \c [""].
    - Rutina local usada en la implementación de \c getNextCSV().

    \dontinclude test_CSV.cpp
    \skipline    test::rebuildDquote()
    \until       }}
    \see         test_CSV::test_rebuildDquote()
*/
#endif
void rebuildDquote( std::string & str ) {
    // Rewrites str with every DQUOTE doubled: ["] ==> [""].
    // All the other chars are copied as they are.
    std::string doubled;
    const std::string::size_type LEN = str.length();
    for ( std::string::size_type i = 0; i < LEN; ++i ) {
        const char letter = str[i];
        const std::string::size_type copies = ( letter == DQUOTE ? 2 : 1 );
        doubled.append( copies, letter );
    }
    str = doubled;
}

// Previous implementation of getNextCSV(); it sits inside the '#if 0'
// section of this file, so it is not compiled.
//
// Reads the next CSV field from CIN into csv, one char at a time, using a
// 4 state automaton. Unlike automataCSV(), the value is cleaned while it is
// scanned: the terminating COMMA or LF and the enclosing DQUOTE's are not
// stored, and an escaped [""] pair is stored as a single DQUOTE.
//
// Returns true when the field was terminated by LF, and false when it was
// terminated by COMMA or when the stream failed (before or during the read).
bool getNextCSV_OLD( std::string& csv, std::istream& CIN ) {
    csv.clear();
    if ( CIN.fail() ) { // see http://www.horstmann.com/cpp/pitfalls.html
        return false;
    }
    int state=0; char ch;
    bool trailing_CR = false; // true when the last char was CR
    for (;;) {
        CIN.get(ch);
        if ( CIN.fail() ) {
            return false; // csv keeps whatever was stored so far
        }

        switch (state) {
        case 0: { // init
                if ( ch == COMMA ) {
                //  csv += COMMA; // removes COMMA from result string
                    return false;
                }
                else if ( ch == LF ) {
                //  csv += LF;    // removes LF from result string
                    return true;
                }
                else if ( ch == CR ) {
                    trailing_CR = true;
                    csv += CR;
                    state = 3;
                }
                else if ( ch == DQUOTE ) {     //            |  ',' '\n'  |    '"'     |     l      |
                    state = 1;                 //    delta() |  comma+LF  |  d-quote   |   letter   |
                }                              //  ----------+------------+------------+------------+
                else { // letter               //   ==>  0   |     0      |     1      |     3      |
                    csv += ch;                 //       init |   return   |            |  csv+=ch   |
                    state = 3;                 //  ----------+------------+------------+------------+
                }
            }
            break;

        case 1: { // quote(1)
                if ( ch == DQUOTE ) {          //            |  ',' '\n'  |    '"'     |     l      |
                    state = 2;                 //    delta() |  comma+LF  |  d-quote   |   letter   |
                }                              //  ----------+------------+------------+------------+
                else { // letter COMMA LF      //        1   |     1      |     2      |     1      |
                    csv += ch;                 //   quoted(1)|  csv+=ch   |            |  csv+=ch   |
                //  state = 1;                 //  ----------+------------+------------+------------+
                }
            }
            break;

        case 2: { // inquote(2)
                if ( ch == COMMA ) {
                //  state = 0;
                    return false;
                } else if ( ch == LF ) {
                //  state = 0;
                    return true;
                }
                else if ( trailing_CR ) { //  ["...""..."\r?...,] '?' after '\r'
                    // error: rebuild the raw text read so far and go on as a regular field
                    rebuildDquote( csv );
                    csv = DQUOTE + csv + DQUOTE + CR + ch;
                    trailing_CR = false;
                    state = 3;
                }
                else if ( ch == CR ) { // removes CR+LF at the end of line
                    trailing_CR = true;
                //  csv += CR; // removes trailing CR+LF
                //  state = 2;
                }
                else if ( ch == DQUOTE ) {     //            |  ',' '\n'  |    '"'     |     l      |
                    csv += DQUOTE;             //    delta() |  comma+LF  |  d-quote   |   letter   |
                    state = 1;                 //  ----------+------------+------------+------------+
                }                              //        2   |     0      |     1      |     3      |
                else { // letter (error)       //  inquote(2)|   return   |  csv+=ch   |  csv='""'  |
                    rebuildDquote( csv );      //  ----------+------------+------------+------------+
                    csv= DQUOTE + csv + DQUOTE + ch;  // [," ... "" "3x,] ==> error condition ["3]
                    state = 3;                        //  [" ... "" "3]  ==> rebuilt value
                }
            }
            break;

        case 3: { // regular
                if ( ch == COMMA ) {
                    return false;
                } else if ( ch == LF ) {
                    // NOTE(review): trailing_CR is not cleared when other chars
                    // follow the CR in this state, so the char removed here
                    // may not be a CR -- confirm before reusing this code.
                    if ( trailing_CR ) {
                        csv = csv.substr( 0, csv.length()-1 ); // chop( csv , CR );
                    }
                //  state = 0;
                //  csv += LF;
                    return true;
                }
                else if ( ch == CR ) {    // leaves CR at the end
                    trailing_CR = true;   // mark to remove later
                    csv += CR;
                //  state = 3;
                }                              //            |  ',' '\n'  |    '"'     |     l      |
                else { // letter               //    delta() |  comma+LF  |  d-quote   |   letter   |
                    csv += ch;                 //  ----------+------------+------------+------------+
                //  state = 3;                 //        3   |     0      |     3      |     3      |
                // swallows DQUOTE's && CR's   //    regular |   return   |  csv+=ch   |  csv+=ch   |
               }                               //  ----------+------------+------------+------------+
            }
            break;

        } // switch (state)
    } // for (;;)

    return false; // not reached: the loop only exits through 'return'
}

#endif

#ifdef English_dox
/// Comma Separated Value (not used in this implementation).
#endif
#ifdef Spanish_dox
/// Comma Separated Value (no usado en esta implementación).
#endif
// Empty namespace: nothing in this file is declared inside it.
namespace csv { } // trick to include it into the Doxygen documentation

// Trick to force Doxygen to document these.
// - They are at the end of file to avoid trouble: no code follows them,
//   so the directives cannot change how any name in this file is looked up.
using namespace std;
using namespace csv;

// EOF: CSV.cpp