473,890 Members | 1,359 Online
Bytes | Software Development & Data Engineering Community
+ Post

Home Posts Topics Members FAQ

A simple parser

Hi guys

I have written this small parser to print out the functions defined in a
C file. This is an example of parsing in C, that I want to add to my
tutorial. Comments (and bug reports) are welcome.

-------------------------------------------------------------cut here

/* A simple scanner that will take a file of C source code and
print the names of all functions therein, in the following format:
"Function XXXX found line dddd .... ddddd"
Algorithm: it scans for a closing parenthesis immediately followed
by an opening brace. Comments can appear between the closing
parenthesis and the opening brace, but no other characters besides
white space. Functions must be defined with a prototype; K & R
syntax is not supported.
*/
#include <ctype.h>
#include <stdio.h>
#define MAXID 1024 // Size of IdBuffer: at most MAXID-1 characters of a
// name are stored, plus the terminating zero. Sorry Java guys...
static char IdBuffer[MAXID]; // Last identifier read at file scope; printed as the function name
static int line = 1; // Current line of the input file; counting starts at 1

// Wrapper around fgetc() that keeps the global line counter up to
// date: every newline read from f advances "line" by one.
// Returns the character read, or EOF, exactly as fgetc() does.
static int Fgetc(FILE *f)
{
    const int ch = fgetc(f);

    line += (ch == '\n') ? 1 : 0;
    return ch;
}

// Tells whether c may appear in a C identifier.
//   c     - a character value as returned by fgetc(), or EOF
//   start - non-zero to test for the FIRST character of an identifier,
//           where decimal digits are not allowed; zero to test for
//           any later character, where they are
// Returns 1 if c is acceptable, 0 otherwise.
static int IsIdentifier(int c, int start)
{
    // <ctype.h> functions are only defined for EOF and for values that
    // fit in an unsigned char. Reject everything else up front; this
    // covers EOF and negative values coming from a plain (signed) char.
    if (c != (unsigned char)c) {
        return 0;
    }
    // isalpha() instead of 'a'..'z' range tests: the letters are not
    // guaranteed to be contiguous (they are not in EBCDIC).
    if (c == '_' || isalpha(c)) {
        return 1;
    }
    if (start == 0 && isdigit(c)) {
        return 1;
    }
    return 0;
}

// Just prints the function name
static int PrintFunction(F ILE *f)
{
printf("Functio n %s found line %d ...",IdBuffer,l ine);
return Fgetc(f);
}

// Reads an identifier from f into IdBuffer.
//   c - the first character of the identifier, already read by the caller
//   f - the stream to read the remaining characters from
// At most MAXID-1 characters are stored; the rest of an over-long
// identifier is read and discarded so that the caller resumes after it.
// Returns the first character following the identifier, or EOF.
static int ReadId(char c, FILE *f)
{
    // Fgetc() returns an int: a char cannot tell EOF apart from a
    // valid byte, so the characters read here are kept in an int.
    int next;
    int i = 1;

    IdBuffer[0] = c;
    while ((next = Fgetc(f)) != EOF && IsIdentifier(next, 0)) {
        if (i < MAXID - 1) {
            IdBuffer[i++] = (char)next;
        }
    }
    IdBuffer[i] = 0;
    return next;
}
// Skips the rest of a string literal; the opening '"' has already
// been consumed by the caller. A backslash escapes the character
// after it, so an escaped quote does not end the string.
// Returns the character following the closing quote, or EOF if the
// input ends first.
static int ParseString(FILE *f)
{
    int c = Fgetc(f);

    while (c != EOF && c != '"') {
        if (c == '\\') {
            c = Fgetc(f); // the escaped character, skipped just below
        }
        if (c != EOF) {
            c = Fgetc(f);
        }
    }
    if (c == '"') {
        c = Fgetc(f);
    }
    return c;
}

// Skips the rest of a /* ... */ comment; the opening "/*" has already
// been consumed by the caller.
// Returns the character following the closing "*/", or EOF if the
// input ends before the comment is closed.
static int ParseComment(FILE *f)
{
    int prev = 0; // previous character; 0 so the '*' of "/*" cannot close the comment
    int c;

    while ((c = Fgetc(f)) != EOF) {
        if (prev == '*' && c == '/') {
            return Fgetc(f);
        }
        prev = c;
    }
    return EOF;
}
// Skips the rest of a "//" comment; the two slashes have already been
// consumed by the caller. A backslash hides the character after it,
// so a backslash-newline pair does not end the comment.
// Returns the character following the terminating newline, or EOF if
// the input ends first.
static int ParseCppComment(FILE *f)
{
    int ch = Fgetc(f);

    for (;;) {
        if (ch == EOF) {
            return EOF;
        }
        if (ch == '\n') {
            return Fgetc(f);
        }
        if (ch == '\\') {
            ch = Fgetc(f);
            if (ch == EOF) {
                return EOF;
            }
        }
        ch = Fgetc(f);
    }
}

// Skips white space and comments, starting with the character c that
// the caller has already read from f.
// Returns the first character that is neither white space nor part of
// a comment, or EOF when the input ends first. A '/' that does not
// start a comment is consumed and the character after it is returned,
// as main() does for a lone '/'.
static int SkipWhiteSpace(int c, FILE *f)
{
    // EOF is negative, so it must be tested before the "c <= ' '"
    // white space test or the loop would never end at end of file.
    while (c != EOF) {
        if (c <= ' ') {
            c = Fgetc(f);
        } else if (c == '/') {
            // Handled inside the loop so that a comment directly after
            // the ')' or directly after another comment is skipped too.
            c = Fgetc(f);
            if (c == '*') {
                c = ParseComment(f);
            } else if (c == '/') {
                c = ParseCppComment(f);
            } else {
                break;
            }
        } else {
            break;
        }
    }
    return c;
}

// Skips the rest of a character constant; the opening single quote
// has already been consumed by the caller. A backslash escapes the
// character after it, so '\'' is handled.
// Returns the character following the closing quote, or EOF if the
// input ends first.
static int ParseQuotedChar(FILE *f)
{
    int ch = Fgetc(f);

    while (ch != EOF && ch != '\'') {
        if (ch == '\\') {
            ch = Fgetc(f);
            if (ch == EOF) {
                break;
            }
        }
        ch = Fgetc(f);
    }
    return (ch == '\'') ? Fgetc(f) : ch;
}
// Entry point: scans the C source file named by argv[1] and prints one
// "Function NAME found line FIRST ... LAST" line for each function
// definition it recognizes.
// Returns 0 on success, 1 if no file name was given, 2 if the file
// cannot be opened.
int main(int argc,char *argv[])
{
    if (argc < 2) {
        // argc can be 0, in which case argv[0] is a null pointer
        fprintf(stderr, "Usage: %s <file.c>\n", argc > 0 ? argv[0] : "parser");
        return 1;
    }
    FILE *f = fopen(argv[1],"r");
    if (f == NULL) {
        fprintf(stderr, "Can't find %s\n", argv[1]);
        return 2;
    }
    int c = Fgetc(f);
    int level = 0;      // nesting depth of { }
    int parenlevel = 0; // nesting depth of ( )
    int inFunction = 0; // non-zero while inside a reported function body
    while (c != EOF) {
        // Note that each of the switches must advance the
        // character read so that we avoid an infinite loop.
        switch (c) {
        case '"':
            c = ParseString(f);
            break;
        case '/':
            c = Fgetc(f);
            if (c == '*')
                c = ParseComment(f);
            else if (c == '/')
                c = ParseCppComment(f);
            break;
        case '\'':
            c = ParseQuotedChar(f);
            break;
        case '{':
            level++;
            c = Fgetc(f);
            break;
        case '}':
            // The brace that brings us back to file scope closes the
            // function: complete the output line with its last line.
            if (level == 1 && inFunction) {
                printf(" %d\n",line);
                inFunction = 0;
            }
            if (level > 0) // tolerate unbalanced input
                level--;
            c = Fgetc(f);
            break;
        case '(':
            parenlevel++;
            c = Fgetc(f);
            break;
        case ')':
            if (parenlevel > 0) // tolerate unbalanced input
                parenlevel--;
            c = Fgetc(f);
            // A ')' at file scope followed (after white space and
            // comments) by '{' starts a function body.
            if ((parenlevel | level) == 0) {
                c = SkipWhiteSpace(c,f);
                if (c == '{') {
                    level++;
                    inFunction = 1;
                    c = PrintFunction(f);
                }
            }
            break;
        default:
            // Only identifiers at file scope are remembered: the last
            // one seen before the parameter list is the function name.
            if ((level | parenlevel) == 0 &&
                IsIdentifier(c, 1))
                c = ReadId(c,f);
            else c = Fgetc(f);
        }
    }
    fclose(f);
    return 0;
}
Oct 14 '06
121 6603


On Oct 15, 2:42 pm, Richard Heathfield <inva...@invalid.invalid>
wrote:
jacob navia said:
CBFalconer wrote:<snip>
3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
contiguous, in any specific order, etc.
Mmmm, it *could* be, but I have never found a machine where they aren't
contiguous...
That doesn't mean such machines don't exist. I've spent several years
working on such machines. Look up "EBCDIC" in Google.

Yeah. In fact, C99 gives us little guarantee about how characters are
portable. The following is found in C99 5.2.1:

A byte with all bits set to 0, called the null character,
shall exist in the basic execution character set; it is used to
terminate a character string.

Both the basic source and basic execution character sets shall have the
following members: the 26 uppercase letters of the Latin alphabet
A B C D E F G H I J K L M
N O P Q R S T U V W X Y Z
the 26 lowercase letters of the Latin alphabet
a b c d e f g h i j k l m
n o p q r s t u v w x y z
the 10 decimal digits
0 1 2 3 4 5 6 7 8 9
the following 29 graphic characters
! " # % & ' ( ) * + , - . / :
; < = > ? [ \ ] ^ _ { | } ~
the space character, and control characters representing horizontal
tab, vertical tab, and form feed. The representation of each member of
the source and execution basic
character sets shall fit in a byte. In both the source and execution
basic character sets, the value of each character after 0 in the above
list of decimal digits shall be one greater than
the value of the previous. In source files, there shall be some way of
indicating the end of each line of text; this International Standard
treats such an end-of-line indicator as if it
were a single new-line character. In the basic execution character set,
there shall be control characters representing alert, backspace,
carriage return, and new line.

Oct 15 '06 #11


"Keith Thompson" <ks***@mib.org> wrote in message
>
Every one of those errors is caused by two things: "//" comments and
mixed declarations and statements.
I decided that surely, slash slash comments were so widespread by now that I
could use them.
Only to have my code break on the next compiler, a parallel job.
--
www.personal.leeds.ac.uk/~bgy1mm
freeware games to download.

Oct 15 '06 #12
On Sun, 15 Oct 2006 06:42:06 +0000, Richard Heathfield
<in*****@invali d.invalidwrote:
>jacob navia said:
>CBFalconer wrote:

<snip>
>>3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
contiguous, in any specific order, etc.

Mmmm, it *could* be, but I have never found a machine where they aren't
contiguous.. .

That doesn't mean such machines don't exist. I've spent several years
working on such machines. Look up "EBCDIC" in Google.
<snip>
Also BCD, as used on a 14xx series system. (came before "EBCDIC") :-)
--
ArarghMail610 at [drop the 'http://www.' from ->] http://www.arargh.com
BCET Basic Compiler Page: http://www.arargh.com/basic/index.html

To reply by email, remove the garbage from the reply address.
Oct 15 '06 #13
On Sun, 15 Oct 2006 07:55:01 +0200, jacob navia wrote:
>CBFalconer wrote:
>3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
contiguous, in any specific order, etc.

Mmmm, it *could* be, but I have never found a machine where they aren't
contiguous.. .
What's wrong with isalpha and similar functions?

Best wishes,
Roland Pibinger
Oct 15 '06 #14
Roland Pibinger wrote:
On Sun, 15 Oct 2006 07:55:01 +0200, jacob navia wrote:
>>CBFalconer wrote:
>>>3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
contiguous , in any specific order, etc.

Mmmm, it *could* be, but I have never found a machine where they aren't
contiguous. ..


What's wrong with isalpha and similar functions?

Best wishes,
Roland Pibinger
Yes, I will change that as proposed by Chuck.
Oct 15 '06 #15
On Sun, 15 Oct 2006 00:27:43 +0200, jacob navia wrote:
>I have written this small parser to print out the functions defined in a
C file. This is an example of parsing in C, that I want to add to my
tutorial. Comments (and bug reports) are welcome.
[...]
>static char IdBuffer[MAXID]; // Buffer for remembering the function name
static int line = 1; // We start at line 1
Why statics? They make your code non-reusable. BTW, if you also made
FILE *f and int c static here you (almost) wouldn't have to pass any
arguments to your functions. IMO, non-const globals (statics) should
be avoided in a C tutorial (because they should be avoided in a C
program).
>static int Fgetc(FILE *f)
Why are the functions static? That's only confusing for a newbie.
>static int ParseComment(FILE *f) // Skips comments
{
int c = Fgetc(f);
restart:
while (c != '*') {
c = Fgetc(f);
if (c == EOF)
return EOF;
}
c = Fgetc(f);
if (c == '/')
return Fgetc(f);
else goto restart;
}
Hmm, goto in a C tutorial? You could add an exercise for the reader:
'Enhance the clarity of this function by rewriting it with one return
statement and without using goto'.

Best regards,
Roland Pibinger
Oct 15 '06 #16
Ar************* ****@NOT.AT.Ara rgh.com writes:
On Sun, 15 Oct 2006 06:42:06 +0000, Richard Heathfield
<in*****@invali d.invalidwrote:
>>jacob navia said:
>>CBFalconer wrote:

<snip>
>>>3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
contiguous , in any specific order, etc.
Mmmm, it *could* be, but I have never found a machine where they aren't
contiguous. ..

That doesn't mean such machines don't exist. I've spent several years
working on such machines. Look up "EBCDIC" in Google.
<snip>
Also BCD, as used on a 14xx series system. (came before "EBCDIC") :-)
BCD is worse: not only are they not contiguous, they aren't even in order.

Yours,

--
Jean-Marc
Oct 15 '06 #17
Roland Pibinger wrote:
On Sun, 15 Oct 2006 00:27:43 +0200, jacob navia wrote:
>>I have written this small parser to print out the functions defined in a
C file. This is an example of parsing in C, that I want to add to my
tutorial. Comments (and bug reports) are welcome.

[...]
>>static char IdBuffer[MAXID]; // Buffer for remembering the function name
static int line = 1; // We start at line 1


Why statics? They make your code non-reusable. BTW, if you also made
FILE *f and int c static here you (almost) wouldn't have to pass any
arguments to your functions. IMO, non-const globals (statics) should
be avoided in a C tutorial (because they should be avoided in a C
program).
If you make global variables visible by other parts of the
program this can lead to name conflicts.

Making global variables static limits their scope and
allows for code reuse. Actually I believe the
contrary is true. I think the code is more reusable
BECAUSE it will export just ONE function.
>
>>static int Fgetc(FILE *f)


Why are the functions static? That's only confusing for a newbie.
No. See above.
>
>>static int ParseComment(FILE *f) // Skips comments
{
int c = Fgetc(f);
restart:
while (c != '*') {
c = Fgetc(f);
if (c == EOF)
return EOF;
}
c = Fgetc(f);
if (c == '/')
return Fgetc(f);
else goto restart;
}


Hmm, goto in a C tutorial? You could add an exercise for the reader:
'Enhance the clarity of this function by rewriting it with one return
statement and without using goto'.

Best regards,
Roland Pibinger

I believe goto is not bad when used correctly. It is part of the
language anyway, and if used correctly it is perfectly OK.

Care to solve your problem?

I will add it to the tutorial.
Oct 15 '06 #18
Jean-Marc Bourguet wrote:
Ar************* ****@NOT.AT.Ara rgh.com writes:

>>On Sun, 15 Oct 2006 06:42:06 +0000, Richard Heathfield
<in*****@inva lid.invalidwrot e:

>>>jacob navia said:
CBFalcone r wrote:

<snip>

>3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
>contiguous , in any specific order, etc.
>

Mmmm, it *could* be, but I have never found a machine where they aren't
contiguous. ..

That doesn't mean such machines don't exist. I've spent several years
working on such machines. Look up "EBCDIC" in Google.
<snip>

Also BCD, as used on a 14xx series system. (came before "EBCDIC") :-)


BCD is worse, not only there are not contiguous, they aren't even in order.

Yours,
Well, EBCDIC was a 7 bit code, used for punched cards. The eighth bit
was there to signal the card reader that a character was in that column.

Using only 7 bits, the codes are continuous. When punched cards weren't
so much used (approx beginning of the 80s) IBM added foreign language
characters in those positions.

This may have anecdotal importance, but its practical impact is ...

Anyway Chuck was right and I changed that.
Oct 15 '06 #19
jacob navia <ja***@jacob.re mcomp.frwrites:
Jean-Marc Bourguet wrote:
>Ar************* ****@NOT.AT.Ara rgh.com writes:
>>>On Sun, 15 Oct 2006 06:42:06 +0000, Richard Heathfield
<in*****@inv alid.invalidwro te:
jacob navia said:
>CBFalcon er wrote:

<snip>

>>3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
>>contiguou s, in any specific order, etc.
>>
>
>Mmmm, it *could* be, but I have never found a machine where they aren't
>contiguous ...

That doesn't mean such machines don't exist. I've spent several years
working on such machines. Look up "EBCDIC" in Google.
<snip>

Also BCD, as used on a 14xx series system. (came before "EBCDIC") :-)
BCD is worse, not only there are not contiguous, they aren't even in
order.
BTW, BCD is a 6 bits code which would be unsuitable for C, there are no
lower case letters and the digits are contiguous but not in numerical
order (0 is after 9)
Well, EBCDIC was a 7 bit code, used for punched cards.
While it is true that EBCDIC was strongly constrained by punched cards,
EBCDIC was an 8-bit code since its inception. And the number of
code points defined was not constrained by punched-card considerations but
by keyboard, printer and typewriter ones.
The eighth bit
was there to signal the card reader that a character was in that column.
Using only 7 bits, the codes are continuous.
I wonder if you ever saw a table. Here is one, now just give me the bit to
ignore so that A-Z are contiguous:

0 1 2 3 4 5 6 7 8 9 A B C D E F
0 & - 0
1 / a j A J 1
2 b k s B K S 2
3 c l t C L T 3
4 d m u D M U 4
5 e n v E N V 5
6 f o w F O W 6
7 g p x G P X 7
8 h q y H Q Y 8
9 i r z I R Z 9
A NL NL
B . NU , NU
C < * % NU
D ( ) _ '
E + ; =
F | ~ ? NL

When punched cards weren't so much used (approx beginning of the 80es)
IBM added foreign language characters in those positions.
National characters were present since its inception. The table shows the reserved
position with NL (national lowercase) NU (national uppercase). The constraint
constraint
This has a maybe anectodical importance, but its practical impact is ...
EBCDIC is still alive on IBM mainframes. And I'd not be surprised if the
volume of data stored in EBCDIC was still greater than the volume of data
stored in codesets compatible with ISO-646.

Yours,

--
Jean-Marc
Oct 15 '06 #20

This thread has been closed and replies have been disabled. Please start a new discussion.

Similar topics

3
2223
by: Kenneth Downs | last post by:
Well, I'm coming to the end of a large and exhausting project, done in my new favorite language PHP, and its time for a diversion. I'm wondering if anyone has experience with writing simple parsers. I've never done it myself, but I know they are not as mysterious as they may seem, it's a matter of finding the tools. The idea is to take something like CSS format, except that it allows nesting, and turn it into associative arrays, such...
13
2303
by: Paulo Pinto | last post by:
Hi, does anyone know of a Python package that is able to load XML like the XML::Simple Perl package does? For those that don't know it, this package maps the XML file to a dictionary.
4
2457
by: Leif K-Brooks | last post by:
I'm writing a site with mod_python which will have, among other things, forums. I want to allow users to use some HTML (<em>, <strong>, <p>, etc.) on the forums, but I don't want to allow bad elements and attributes (onclick, <script>, etc.). I would also like to do basic validation (no overlapping elements like <strong><em>foo</em></strong>, no missing end tags). I'm not asking anyone to write a script for me, but does anyone have general...
8
6508
by: Dan | last post by:
Using XML::Simple in perl is extreemly slow to parse big XML files (can be up to 250M, taking ~1h). How can I increase my performance / reduce my memory usage? Is SAX the way forward?
4
11451
by: Greg B | last post by:
Well since getopt() doesn't seem to be compatible with Windows, and the free implementation of it for Windows that I found still had some annoying restrictions, I thought I'd whip up a simple parser myself. Just wanted to see if anyone could provide me with some constructive criticism :) any feedback would be greatly appreciated ----------------------------------------------------------------------------- #include "stdio.h" #include...
1
3081
by: steve smith | last post by:
Hi I have just downloaded the Borland C# Builder and the Micorsoft ..Net framework SDK v1.1 from the borland webist, and i am trying to get a simple program to run, however I keep getting errors, any ideas why this might be happening? Program i am running is: namespace ExamProblem { using System;
26
495
by: jacob navia | last post by:
Summary: I have changed (as proposed by Chuck) the code to use isalpha() instead of (c>='a' && c <= 'z') etc. I agree that EBCDIC exists :-) I eliminated the goto statement, obviously it is better in a tutorial to stick to structured programming whenever possible...
4
2751
by: =?Utf-8?B?SmFu?= | last post by:
In my application the user can configure automation-scripts by inserting different "actions" into a "procedure". These different procedure- and action-objects are all translated into C# code before execution. One "action" type is an expression-evaluator. At the moment the expression the user writes into the action is just inserted into the generated C# code unchanged. The problem is the variables in my system and in the "procedures";...
11
1355
by: Stef Mientki | last post by:
hello, I need to translate the following string a = '(0, 0, 0, 255), (192, 192, 192, 255), True, 8' into the following list or tuple b = Is there a simple way to to this. (Not needed now, but might need it in the future: even deeper nested
7
1204
by: bvdp | last post by:
Is there a simple/safe expression evaluator I can use in a python program. I just want to pass along a string in the form "1 + 44 / 3" or perhaps "1 + (-4.3*5)" and get a numeric result. I can do this with eval() but I really don't want to subject my users to the problems with that method. In this use I don't need python to worry about complex numbers, variables or anything else. Just do the math on a set of values. Would eval() with...
0
9826
by: Hystou | last post by:
Most computers default to English, but sometimes we require a different language, especially when relocating. Forgot to request a specific language before your computer shipped? No problem! You can effortlessly switch the default language on Windows 10 without reinstalling. I'll walk you through it. First, let's disable language synchronization. With a Microsoft account, language settings sync across devices. To prevent any complications,...
0
11236
Oralloy
by: Oralloy | last post by:
Hello folks, I am unable to find appropriate documentation on the type promotion of bit-fields when using the generalised comparison operator "<=>". The problem is that using the GNU compilers, it seems that the internal comparison operator "<=>" tries to promote arguments from unsigned to signed. This is as boiled down as I can make it. Here is my compilation command: g++-12 -std=c++20 -Wnarrowing bit_field.cpp Here is the code in...
0
10830
jinu1996
by: jinu1996 | last post by:
In today's digital age, having a compelling online presence is paramount for businesses aiming to thrive in a competitive landscape. At the heart of this digital strategy lies an intricately woven tapestry of website design and digital marketing. It's not merely about having a website; it's about crafting an immersive digital experience that captivates audiences and drives business growth. The Art of Business Website Design Your website is...
0
10468
tracyyun
by: tracyyun | last post by:
Dear forum friends, With the development of smart home technology, a variety of wireless communication protocols have appeared on the market, such as Zigbee, Z-Wave, Wi-Fi, Bluetooth, etc. Each protocol has its own unique characteristics and advantages, but as a user who is planning to build a smart home system, I am a bit confused by the choice of these technologies. I'm particularly interested in Zigbee because I've heard it does some...
0
9641
agi2029
by: agi2029 | last post by:
Let's talk about the concept of autonomous AI software engineers and no-code agents. These AIs are designed to manage the entire lifecycle of a software development projectóplanning, coding, testing, and deploymentówithout human intervention. Imagine an AI that can take a project description, break it down, write the code, debug it, and then launch it, all on its own.... Now, this would greatly impact the work of software developers. The idea...
0
5855
by: TSSRALBI | last post by:
Hello I'm a network technician in training and I need your help. I am currently learning how to create and manage the different types of VPNs and I have a question about LAN-to-LAN VPNs. The last exercise I practiced was to create a LAN-to-LAN VPN between two Pfsense firewalls, by using IPSEC protocols. I succeeded, with both firewalls in the same network. But I'm wondering if it's possible to do the same thing, with 2 Pfsense firewalls...
0
6061
by: adsilva | last post by:
A Windows Forms form does not have the event Unload, like VB6. What one acts like?
1
4682
by: 6302768590 | last post by:
Hai team i want code for transfer the data from one system to another through IP address by using C# our system has to for every 5mins then we have to update the data what the data is updated we have to send another system
3
3283
bsmnconsultancy
by: bsmnconsultancy | last post by:
In today's digital era, a well-designed website is crucial for businesses looking to succeed. Whether you're a small business owner or a large corporation in Toronto, having a strong online presence can significantly impact your brand's success. BSMN Consultancy, a leader in Website Development in Toronto offers valuable insights into creating effective websites that not only look great but also perform exceptionally well. In this comprehensive...

By using Bytes.com and it's services, you agree to our Privacy Policy and Terms of Use.

To disable or enable advertisements and analytics tracking please visit the manage ads & tracking page.