Ram Laxman wrote:
I have a text file which has data in CSV format.
"empno","phonenumber","wardnumber"
12345,2234353,1000202
12326,2243653,1000098
I am a beginner at C/C++ programming.
I don't know how to tokenize the comma separated values. I used the strtok
function, reading line by line using fgets, but it gives some weird
behavior. It does not strip out the "" fully. Does anybody have sample
code for the same, so that it will be helpful for my reference?
Parsing is tricky. Consider these rules:
- \n is absolute. All lines must be unbroken
- "" precedes , - so commas inside strings are text, not delimiters
- quotes inside "" need an escape, either \" or ""
- escapes need escapes - \\ is \
Try this project to learn more:
http://c2.com/cgi/wiki?MsWindowsResourceLint
First, we express those rules (one by one) as test cases:
TEST_(TestCase, pullNextToken_c omma)
{
Source aSource("a , b\nc, \n d");
string
token = aSource.pullNex tToken(); CPPUNIT_ASSERT_ EQUAL("a", token);
token = aSource.pullNex tToken(); CPPUNIT_ASSERT_ EQUAL("b", token);
token = aSource.pullNex tToken(); CPPUNIT_ASSERT_ EQUAL("c", token);
token = aSource.pullNex tToken(); CPPUNIT_ASSERT_ EQUAL("d", token);
token = aSource.pullNex tToken(); CPPUNIT_ASSERT_ EQUAL("", token);
// EOF!
}
struct
TestTokens: TestCase
{
void
test_a_b_d(stri ng input)
{
Source aSource(input);
string
token = aSource.pullNex tToken(); CPPUNIT_ASSERT_ EQUAL("a", token);
token = aSource.pullNex tToken(); CPPUNIT_ASSERT_ EQUAL("b", token);
// token = aSource.pullNex tToken(); CPPUNIT_ASSERT_ EQUAL("c",
token);
token = aSource.pullNex tToken(); CPPUNIT_ASSERT_ EQUAL("d", token);
token = aSource.pullNex tToken(); CPPUNIT_ASSERT_ EQUAL("", token);
// EOF!
}
};
TEST_(TestToken s?, elideComments)
{
test_a_b_d("a b\n //c\n d");
test_a_b_d("a b\n//c \n d");
test_a_b_d("a b\n // c \"neither\" \n d");
test_a_b_d("a b\n // c \"neither\" \n d//");
test_a_b_d("//\na b\n // c \"neither\" \n d//");
test_a_b_d("//c\na b\n // c \"neither\" \n d//");
test_a_b_d("// c\na b\n // c \"neither\" \n d//");
test_a_b_d("//c \na b\n // c \"neither\" \n d//");
test_a_b_d("// \na b\n // c \"neither\" \n d//");
test_a_b_d(" // \na b\n // c \"neither\" \n d//");
}
TEST_(TestToken s?, elideStreamComm ents)
{
test_a_b_d("a b\n /*c*/\n d");
test_a_b_d("a b\n/*c*/ \n d");
test_a_b_d("a b\n /* c \"neither\" */\n d");
test_a_b_d("a b\n /* c \"neither\" \n */ d//");
test_a_b_d("//\na b\n /* c \"neither\" */ \n d/**/");
test_a_b_d("//c\na b\n // c \"neither\" \n d/* */");
test_a_b_d("/* c\n*/a b\n // c \"neither\" \n d//");
test_a_b_d("//c \na b\n // c \"neither\" \n d//");
test_a_b_d("// \na b\n // c \"neither\" \n d//");
test_a_b_d(" // \na b\n // c \"neither\" \n d//");
}
Those tests re-use the fixture test_a_b_d() to ensure that every one of
those strings parses into a, b, & d, skipping (for whatever reason) c.
You will need tests that show slightly different behaviors. But write your
tests one at a time. I wrote every single line you see here, essentially in
order, and got it to work before adding the next line. Don't write all your
tests at once, because when programming you should never go more than 1~10
edits before passing all tests.
Now here's the source of Source (which means "source of tokens"):
class
Source
{
public:
Source(string const & rc = ""):
m_rc(rc),
m_bot(0),
m_eot(0)
{}
void setResource(str ing const & rc) { m_rc = rc; }
size_type getBOT() { return m_bot; }
string const & getPriorToken() { return m_priorToken; }
string const & getCurrentToken () { return m_currentToken; }
string const &
pullNextToken()
{
m_priorToken = m_currentToken;
extractNextToke n();
return m_currentToken;
}
size_type
getLineNumber(s ize_type at)
{
size_type lineNumber = 1;
for(size_type idx(0); idx < at; ++idx)
if ('\n' == m_rc[idx])
++lineNumber;
return lineNumber;
}
string
getLine(size_ty pe at)
{
size_type bol = m_rc.rfind('\n' , at);
if (string::npos == bol) bol = 0; else ++bol;
size_type eol = m_rc.find('\n', at);
if (string::npos == eol) eol = m_rc.length(); else ++eol;
return m_rc.substr(bol , eol - bol);
}
private:
string const &
extractNextToke n()
{
char static const delims[] = " \t\n,";
m_bot = m_rc.find_first _not_of(delims, m_eot);
if (string::npos == m_bot)
m_currentToken = "";
else if (m_rc[m_bot] == '"')
m_currentToken = parseString();
else if (m_rc.substr(m_ bot, 2) == "//")
{
if (skipUntil("\n" ))
return extractNextToke n();
}
else if (m_rc.substr(m_ bot, 2) == "/*")
{
if (skipUntil("*/"))
return extractNextToke n();
}
/* else if (m_rc.substr(m_ bot, 1) == "#")
{
string line = getLine(m_bot);
size_type at(0);
while(isspace(l ine[at]) && at < line.size()) ++at;
if ('#' == line[at])
{
m_eot = m_bot + 1;
if (skipUntil("\n" ))
return extractNextToke n();
}
}*/
else
{
m_eot = m_rc.find_first _of(" \n,/", m_bot);
m_currentToken = m_rc.substr(m_b ot, m_eot - m_bot);
}
if ('#' == m_currentToken[0])
{
// assert(m_rc.sub str(m_bot, 1) == "#");
string line = getLine(m_bot);
size_type at(0);
while(isspace(l ine[at]) && at < line.size()) ++at;
if ('#' == line[at])
{
--m_eot;
if (skipUntil("\n" ))
return extractNextToke n();
}
}
return m_currentToken;
}
bool
skipUntil(char const * delimiter)
{
m_eot = m_rc.find(delim iter, m_eot + 1);
if (string::npos == m_eot)
{
m_currentToken = "";
return false;
}
m_eot += strlen(delimite r);
return true;
}
char
parseStringChar ()
{
if (m_rc[m_eot] == '\\')
{
m_eot += 1;
char escapee(m_rc[m_eot++]);
switch (escapee)
{
case 'n' : return '\n';
case 'r' : return '\r';
case 't' : return '\t';
case '0' : return '\0';
case '\\': return '\\';
case 'a' : return '\a';
default : // TODO \x, \v \b, \f
if (isdigit(escape e))
{
string slug = m_rc.substr(m_e ot - 1, 3);
return char(strtol(slu g.c_str(), NULL, 8));
}
else
//assert(false);
return escapee;
}
}
else if (m_rc[m_eot] == '"' && m_rc[m_eot+1] == '"')
m_eot++;
return m_rc[m_eot++];
}
string
parseString()
{
m_eot = m_bot + 1;
string z;
while ( m_eot < m_rc.length() &&
( m_rc[m_eot] != '"' ||
m_rc[m_eot + 1] == '"' ) )
z += parseStringChar ();
if (m_eot < m_rc.length())
m_eot += 1;
return z;
}
string m_rc;
size_type m_bot;
size_type m_eot;
string m_priorToken;
string m_currentToken;
};
That looks really ugly & long, because it hides so much behind such a narrow
interface. (I don't know if I copied all of it in, either.) But it
demonstrates (possibly) correct usage of std::string and std::vector.
Do not copy my source into your editor and try to run it. It will not parse
CSV. Start your project like this:
#include <assert.h>
#include <string>
#include <vector>
typedef std::vector<std ::string> strings_t;
strings_t parse(std::stri ng input)
{
strings_t result;
return result;
}
int main()
{
assert("a" == parse("a,b")[0]);
}
If that compiles, it >will< crash if you run it.
Now fix parse() so that it _only_ does not crash, and passes this test. Make
the implementation as stupid as you like.
Then add a test:
assert("a" == parse("a,b")[0]);
assert("b" == parse("a,b")[1]);
Keep going. Make the implementation just a little better after each test.
Write a set of tests for each of the parsing rules I listed. When the new
parse() function is full-featured, put it to work in your program.
All programs should be written by generating long lists of simple tests like
this. That keeps the bug count very low, and prevents wasting hours and
hours with a debugger.
--
Phlip
http://www.xpsd.org/cgi-bin/wiki?Tes...UserInterfaces