This program is long. I don't really want to bore everyone with the
details, but it handles weird cases like:
/\
* this is a comment *\
/
#define FOO ??/* this is not a comment */
char *a = /* this is a comment "\*/"this is a string"/*" another comment */;
I intend this program to be an example of how to write a kind of state
machine, not really an example of tight coding, but any comments would
be welcome.
Thanks,
-- James
--
/*
* cstripc: A C program to strip comments from C files.
* Usage:
* cstripc [file [...]]
* cstripc [-t]
*
* The '-t' option is used for testing. It prints some pointers to strings
* that are interlaced with comment characters.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*****************/
/**** GLOBALS ****/
/*****************/
static const char *progname;
static int debug_flag;
/**********************/
/**** MAIN PROGRAM ****/
/**********************/
/*
 * Forward declarations for the helpers main() calls; the definitions
 * follow later in this file.
 */
static void print_usage(void);
static void print_test(void);
static FILE *open_input_file(const char *filename);
static void close_input_file(FILE *infile);
static void parse_input_file(FILE *infile);
/*
 * Program entry point.
 *
 * Leading arguments that start with '-' (other than a lone "-") are
 * options: '-t' prints the test strings and exits, '-d' turns on state
 * tracing, anything else is an error.  The remaining arguments are input
 * files; with none, standard input is processed.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if any input file could not be
 * opened (the remaining files are still processed).
 */
int
main(int argc, char *argv[])
{
    int status = EXIT_SUCCESS;

    progname = argv[0];
    if (progname == NULL) {
        progname = "cstripc";
    }

    /* Option parsing: stop at the first non-option or at a lone "-". */
    while (argc > 1) {
        if ((*argv[1] != '-') || (strcmp(argv[1], "-") == 0)) {
            break;
        }
        if (strcmp(argv[1], "-t") == 0) {
            print_test();
            exit(EXIT_SUCCESS);
        } else if (strcmp(argv[1], "-d") == 0) {
            debug_flag = 1;
        } else {
            fprintf(stderr, "%s: Unrecognized option '%s'\n",
                    progname, argv[1]);
            print_usage();
            exit(EXIT_FAILURE);
        }
        --argc;
        ++argv;
    }

    /* No file arguments: filter standard input. */
    if (argc <= 1) {
        parse_input_file(stdin);
        return status;
    }

    while (argc > 1) {
        FILE *infile = open_input_file(argv[1]);

        if (infile == NULL) {
            /* open_input_file() already reported the problem. */
            status = EXIT_FAILURE;
        } else {
            parse_input_file(infile);
            close_input_file(infile);
        }
        --argc;
        ++argv;
    }
    return status;
}
/**************************/
/**** PRINT USAGE/TEST ****/
/**************************/
static const char *usage_string =
"%s: A C program to strip comments from C files.\n"
"Usage:\n"
" %s [file [...]]\n"
" %s [-t]\n"
"\n"
"The '-t' options is used for testing. It prints some pointers to strings\n"
"that are interlaced with comment characters.\n"
;
static void
print_usage(voi d)
{
fprintf(stderr, usage_string, progname, progname, progname);
}
/*
 * Test strings printed by the '-t' option.  These are tentative
 * definitions; a pointer left without an initializer stays null.
 */
static const char *a;
static const char *b;
static const char *c;

/* Print each non-null test string on its own line, in the order a, b, c. */
static void
print_test(void)
{
    const char *samples[3];
    size_t i;

    samples[0] = a;
    samples[1] = b;
    samples[2] = c;
    for (i = 0; i < sizeof samples / sizeof samples[0]; i++) {
        if (samples[i] != NULL) {
            puts(samples[i]);
        }
    }
}
/*******************************/
/**** OPEN/CLOSE INPUT FILE ****/
/*******************************/
/* Name most recently passed to open_input_file(); close_input_file() uses it in its diagnostic. */
static const char *input_file_name;

/*
 * Open an input file for reading.
 *
 * filename: path to open; "-" selects standard input; may be null.
 *
 * Returns the stream, stdin for "-", or a null pointer when filename is
 * null or the file cannot be opened (a diagnostic is printed to standard
 * error in the latter case).
 */
static FILE *
open_input_file(const char *filename)
{
    FILE *infile;

    input_file_name = filename;
    if (filename == NULL) {
        return NULL;
    }
    if (strcmp(filename, "-") == 0) {
        return stdin;
    }
    infile = fopen(filename, "r");
    if (infile == NULL) {
        fprintf(stderr, "%s: Could not open '%s' for reading.\n",
                progname, filename);
    }
    return infile;
}
static void
close_input_fil e(FILE *infile)
{
if (infile) {
if (infile != stdin) {
if (fclose(infile) == EOF)
fprintf(stderr, "%s, Could not close '%s'.\n",
progname, input_file_name );
} else {
clearerr(stdin) ;
}
}
}
/**************************/
/**** PARSE INPUT FILE ****/
/**************************/
typedef struct scan_state scan_state;
typedef struct scan_context scan_context;

/* Scanner context: the current state plus a buffer of pending characters. */
struct scan_context {
    scan_state *ss;     /* current state of the machine */
    char *sbuf;         /* pending characters, NUL-terminated; null until first append */
    unsigned sbufsz;    /* number of bytes allocated for sbuf */
    unsigned sbufcnt;   /* number of characters stored in sbuf, excluding the NUL */
};

/* One state of the machine. */
struct scan_state {
    /* Handler: consumes one input character and returns the next state. */
    scan_state *(*scan)(scan_context *ctx, int input);
    /* State name, printed when debugging is enabled. */
    const char *name;
};

/* Context every input file starts from; initialized further down in the file. */
static scan_context initial_scan_context;
static void
parse_input_fil e(FILE *infile)
{
int c;
scan_context ctx;
if (infile == 0) {
return;
}
ctx = initial_scan_co ntext;
while ((c = fgetc(infile)) != EOF) {
if (debug_flag) {
fprintf(stderr, "%s\n", ctx.ss->name);
}
ctx.ss = ctx.ss->scan(&ctx, c);
}
}
/***********************/
/**** STATE MACHINE ****/
/***********************/
/*
*
***************************************************************************
* Assume input is a syntactically correct C program.
*
* The basic algorithm is:
* Scan character by character:
* Treat trigraphs as a single character.
* If the sequence does not start a comment, emit the sequence.
* Otherwise,
* Scan character by character:
* Treat trigraphs as a single character.
* Treat the sequence '\\' '\n' as no character.
* If the sequence does not end a comment, continue consuming.
* Otherwise, emit a space, and loop back to top.
***************************************************************************
*
*/
#define SCAN_STATE_DEFI NE(name) \
static scan_state * name##_func(sca n_context *ctx, int input); \
static scan_state name##_state = { name##_func, #name }
SCAN_STATE_DEFI NE(normal);
SCAN_STATE_DEFI NE(normal_maybe _tri_1);
SCAN_STATE_DEFI NE(normal_maybe _tri_2);
SCAN_STATE_DEFI NE(string);
SCAN_STATE_DEFI NE(string_maybe _tri_1);
SCAN_STATE_DEFI NE(string_maybe _tri_2);
SCAN_STATE_DEFI NE(string_maybe _splice);
SCAN_STATE_DEFI NE(char);
SCAN_STATE_DEFI NE(char_maybe_t ri_1);
SCAN_STATE_DEFI NE(char_maybe_t ri_2);
SCAN_STATE_DEFI NE(char_maybe_s plice);
SCAN_STATE_DEFI NE(slash);
SCAN_STATE_DEFI NE(slash_maybe_ tri_1);
SCAN_STATE_DEFI NE(slash_maybe_ tri_2);
SCAN_STATE_DEFI NE(slash_maybe_ splice);
SCAN_STATE_DEFI NE(slashslash);
SCAN_STATE_DEFI NE(slashslash_m aybe_tri_1);
SCAN_STATE_DEFI NE(slashslash_m aybe_tri_2);
SCAN_STATE_DEFI NE(slashslash_m aybe_splice);
SCAN_STATE_DEFI NE(slashsplat);
SCAN_STATE_DEFI NE(slashsplat_s plat);
SCAN_STATE_DEFI NE(slashsplat_s plat_maybe_tri_ 1);
SCAN_STATE_DEFI NE(slashsplat_s plat_maybe_tri_ 2);
SCAN_STATE_DEFI NE(slashsplat_s plat_maybe_spli ce);
#define SCAN_STATE(name ) (&name##_stat e)
static scan_context initial_scan_co ntext = { SCAN_STATE(norm al), 0, 0, 0 };
static void sbuf_append_cha r(scan_context *ctx, int c);
static void sbuf_append_str ing(scan_contex t *ctx, char *s);
static void sbuf_clear(scan _context *ctx);
static void sbuf_emit(scan_ context *ctx);
static scan_state *
normal_func(sca n_context *ctx, int input)
{
switch (input) {
case '?': sbuf_emit(ctx);
sbuf_append_cha r(ctx, input);
return SCAN_STATE(norm al_maybe_tri_1) ;
case '"': sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(stri ng);
case '\'': sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(char );
case '/': sbuf_emit(ctx);
sbuf_append_cha r(ctx, input);
return SCAN_STATE(slas h);
default: sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(norm al);
}
}
static scan_state *
normal_maybe_tr i_1_func(scan_c ontext *ctx, int input)
{
switch (input) {
case '?': sbuf_append_cha r(ctx, input);
return SCAN_STATE(norm al_maybe_tri_2) ;
default: sbuf_emit(ctx);
return SCAN_STATE(norm al)->scan(ctx, input);
}
}
static scan_state *
normal_maybe_tr i_2_func(scan_c ontext *ctx, int input)
{
switch (input) {
case '?': putchar(input);
return SCAN_STATE(norm al_maybe_tri_2) ;
case '=':
case '(':
case ')':
case '<':
case '>':
case '!':
case '\'':
case '-':
case '/': sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(norm al);
default: sbuf_emit(ctx);
return SCAN_STATE(norm al)->scan(ctx, input);
}
}
static scan_state *
string_func(sca n_context *ctx, int input)
{
switch (input) {
case '?': sbuf_emit(ctx);
sbuf_append_cha r(ctx, input);
return SCAN_STATE(stri ng_maybe_tri_1) ;
case '"': sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(norm al);
case '\\': sbuf_emit(ctx);
sbuf_append_cha r(ctx, input);
return SCAN_STATE(stri ng_maybe_splice );
default: sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(stri ng);
}
}
static scan_state *
string_maybe_tr i_1_func(scan_c ontext *ctx, int input)
{
switch (input) {
case '?': sbuf_append_cha r(ctx, input);
return SCAN_STATE(stri ng_maybe_tri_2) ;
default: sbuf_emit(ctx);
return SCAN_STATE(stri ng)->scan(ctx, input);
}
}
static scan_state *
string_maybe_tr i_2_func(scan_c ontext *ctx, int input)
{
switch (input) {
case '?': putchar(input);
return SCAN_STATE(stri ng_maybe_tri_2) ;
case '/': sbuf_append_car (ctx, input);
return SCAN_STATE(stri ng_maybe_splice );
case '=':
case '(':
case ')':
case '<':
case '>':
case '!':
case '\'':
case '-': sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(stri ng);
default: sbuf_emit(ctx);
return SCAN_STATE(stri ng)->scan(ctx, input);
}
}
static scan_state *
string_maybe_sp lice_func(scan_ context *ctx, int input)
{
switch (input) {
case '\n':
default: sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(stri ng);
}
}
static scan_state *
char_func(scan_ context *ctx, int input)
{
switch (input) {
case '?': sbuf_emit(ctx);
sbuf_append_cha r(ctx, input);
return SCAN_STATE(char _maybe_tri_1);
case '\'': sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(norm al);
case '\\': sbuf_emit(ctx);
sbuf_append_cha r(ctx, input);
return SCAN_STATE(char _maybe_splice);
default: sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(char );
}
}
static scan_state *
char_maybe_tri_ 1_func(scan_con text *ctx, int input)
{
switch (input) {
case '?': sbuf_append_cha r(ctx, input);
return SCAN_STATE(char _maybe_tri_2);
default: sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(char )->scan(ctx, input);
}
}
static scan_state *
char_maybe_tri_ 2_func(scan_con text *ctx, int input)
{
switch (input) {
case '?': putchar(input);
return SCAN_STATE(char _maybe_tri_2);
case '/': sbuf_append_cha r(ctx, input);
return SCAN_STATE(char _maybe_splice);
case '=':
case '(':
case ')':
case '<':
case '>':
case '!':
case '\'':
case '-': sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(char );
default: sbuf_emit(ctx);
return SCAN_STATE(char )->scan(ctx, input);
}
}
static scan_state *
char_maybe_spli ce_func(scan_co ntext *ctx, int input)
{
switch (input) {
case '\n':
default: sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(char );
}
}
static scan_state *
slash_func(scan _context *ctx, int input)
{
switch (input) {
case '?': sbuf_append_cha r(ctx, input);
return SCAN_STATE(slas h_maybe_tri_1);
case '\\': sbuf_append_cha r(ctx, input);
return SCAN_STATE(slas h_maybe_splice) ;
case '/': sbuf_clear(ctx) ;
return SCAN_STATE(slas hslash);
case '*': sbuf_clear(ctx) ;
return SCAN_STATE(slas hsplat);
default: sbuf_emit(ctx);
return SCAN_STATE(norm al)->scan(ctx, input);
}
}
static scan_state *
slash_maybe_tri _1_func(scan_co ntext *ctx, int input)
{
switch (input) {
case '?': return SCAN_STATE(slas h_maybe_tri_2);
default: sbuf_emit(ctx);
return SCAN_STATE(norm al)->scan(ctx, input);
}
}
static scan_state *
slash_maybe_tri _2_func(scan_co ntext *ctx, int input)
{
switch (input) {
case '?': sbuf_emit(ctx);
sbuf_append_str ing(ctx, "??");
return SCAN_STATE(norm al_maybe_tri_2) ;
case '/': sbuf_append_cha r(ctx, '?');
sbuf_append_cha r(ctx, input);
return SCAN_STATE(slas h_maybe_splice) ;
case '=':
case '(':
case ')':
case '<':
case '>':
case '!':
case '\'':
case '-': sbuf_append_cha r(ctx, '?');
sbuf_append_cha r(ctx, input);
sbuf_emit(ctx);
return SCAN_STATE(norm al);
default: sbuf_append_cha r(ctx, '?');
sbuf_emit(ctx);
return SCAN_STATE(norm al)->scan(ctx, input);
}
}
static scan_state *
slash_maybe_spl ice_func(scan_c ontext *ctx, int input)
{
switch (input) {
case '\n': sbuf_append_cha r(ctx, input);
return SCAN_STATE(slas h);
default: sbuf_emit(ctx);
return SCAN_STATE(norm al)->scan(ctx, input);
}
}
/*
 * State "slashslash": inside a line comment.
 *
 * Comment text is dropped.  A '?' or backslash may lead to a line
 * splice that continues the comment on the next line.  A newline ends
 * the comment: a space and the newline are printed.
 */
static scan_state *
slashslash_func(scan_context *ctx, int input)
{
    (void)ctx;  /* unused */
    switch (input) {
    case '?':
        return SCAN_STATE(slashslash_maybe_tri_1);
    case '\\':
        return SCAN_STATE(slashslash_maybe_splice);
    case '\n':
        putchar(' ');
        putchar(input);
        return SCAN_STATE(normal);
    default:
        return SCAN_STATE(slashslash);
    }
}
static scan_state *
slashslash_mayb e_tri_1_func(sc an_context *ctx, int input)
{
switch (input) {
case '?': return SCAN_STATE(slas hslash_maybe_tr i_2);
default: return SCAN_STATE(slas hslash)->scan(ctx, input);
}
}
static scan_state *
slashslash_mayb e_tri_2_func(sc an_context *ctx, int input)
{
switch (input) {
case '?': return SCAN_STATE(slas hslash_maybe_tr i_2);
case '/': return SCAN_STATE(slas hslash_maybe_sp lice);
case '=':
case '(':
case ')':
case '<':
case '>':
case '!':
case '\'':
case '-': return SCAN_STATE(slas hslash);
default: return SCAN_STATE(slas hslash)->scan(ctx, input);
}
}
static scan_state *
slashslash_mayb e_splice_func(s can_context *ctx, int input)
{
switch (input) {
case '\n': return SCAN_STATE(slas hslash);
default: return SCAN_STATE(slas hslash)->scan(ctx, input);
}
}
/*
 * State "slashsplat": inside a block comment.
 *
 * Comment text is dropped; a '*' may be the first half of the closing
 * delimiter.
 */
static scan_state *
slashsplat_func(scan_context *ctx, int input)
{
    (void)ctx;  /* unused */
    if (input == '*') {
        return SCAN_STATE(slashsplat_splat);
    }
    return SCAN_STATE(slashsplat);
}
static scan_state *
slashsplat_spla t_func(scan_con text *ctx, int input)
{
switch (input) {
case '?': return SCAN_STATE(slas hsplat_splat_ma ybe_tri_1);
case '\\': return SCAN_STATE(slas hsplat_splat_ma ybe_splice);
case '/': putchar(' ');
return SCAN_STATE(norm al);
default: return SCAN_STATE(slas hsplat)->scan(ctx, input);
}
}
static scan_state *
slashsplat_spla t_maybe_tri_1_f unc(scan_contex t *ctx, int input)
{
switch (input) {
case '?': return SCAN_STATE(slas hsplat_splat_ma ybe_tri_2);
default: return SCAN_STATE(slas hsplat)->scan(ctx, input);
}
}
static scan_state *
slashsplat_spla t_maybe_tri_2_f unc(scan_contex t *ctx, int input)
{
switch (input) {
case '/': return SCAN_STATE(slas hsplat_splat_ma ybe_splice);
case '=':
case '(':
case ')':
case '<':
case '>':
case '!':
case '\'':
case '-': return SCAN_STATE(slas hsplat);
default: return SCAN_STATE(slas hsplat)->scan(ctx, input);
}
}
static scan_state *
slashsplat_spla t_maybe_splice_ func(scan_conte xt *ctx, int input)
{
switch (input) {
case '\n': return SCAN_STATE(slas hsplat_splat);
default: return SCAN_STATE(slas hsplat)->scan(ctx, input);
}
}
/*************************/
/**** BUFFER HANDLING ****/
/*************************/
static void
sbuf_append_cha r(scan_context *ctx, int c)
{
if (ctx->sbuf == 0) {
ctx->sbuf = malloc(ctx->sbufsz = 128);
} else if (ctx->sbufcnt == ctx->sbufsz) {
char *p = realloc(ctx->sbuf, ctx->sbufsz *= 2);
if (p == 0) {
fprintf(stderr, "%s: memory allocation failure\n", progname);
exit(EXIT_FAILU RE);
}
ctx->sbuf = p;
}
ctx->sbuf[ctx->sbufcnt++] = c;
ctx->sbuf[ctx->sbufcnt] = '\0';
}
static void
sbuf_append_str ing(scan_contex t *ctx, char *s)
{
while (*s != '\0') {
sbuf_append_cha r(ctx, *s++);
}
}
static void
sbuf_clear(scan _context *ctx)
{
ctx->sbufcnt = 0;
if (ctx->sbuf) {
ctx->sbuf[ctx->sbufcnt] = '\0';
}
}
static void
sbuf_emit(scan_ context *ctx)
{
if (ctx->sbuf == 0 || ctx->sbufcnt == 0) {
return;
}
printf("%s", ctx->sbuf);
sbuf_clear(ctx) ;
}
/********************/
/**** TEST CASES ****/
/********************/
/* a comment */
/\
* a comment split */
/\
\
* a comment split twice */
/*
block comment
*/
/* comment, trailing delimiter split *\
/
/* comment, trailing delimiter split twice *\
\
/
/* comment, trailing delimiter split once, and again by trigraph *\
??/
/
static const char *a = /* comment in code line "*/"Hello, "/**/"World!";
static const char *b = /\
* comment on code line split */ "Hello, " /\
\
* comment on code line split twice */ "World!";
/* Trigraph test: the slash after the two question marks belongs to the backslash trigraph, so no comment starts on the next line. */
#define FOO ??/* this does not start a comment */
/* Line-comment test cases, compiled only for C99 and later; the standard version macro is spelled __STDC_VERSION__. */
#if defined(__STDC__) && (__STDC__ == 1)
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
//*** MORE TEST CASES ***//
/\
/ // comment split
/\
\
/ // comment split twice
static const char *c = // // comment on code line
"Hello, " /\
/ // comment on code line split
"World!" /\
\
/ // comment on code line split twice.
;
#define BAR ??// this does not start a comment
// This is a // comment \
on two lines
#else
static const char *c = "STDC without STD_VERSION";
#endif
#endif