This program follows from the section 6.5 of K&R2 where authors created a
doubly-linked list using a binary-tree based approach. The only thing I
have rewritten myself is the getword function. I am using it because there
are many concepts involved in this program that I want to understand like:
1.) degree of efficiency of this program as compared to sort using hash-table
based approach.
2.) Keeping the program well structured so that anyone can have a look and
in some amount of time he can draw a map of the design and the
key-concepts involved in a program.
3.) Any other advice you think is important to be given to a beginning C
programmer :)
/* K&R2, exercise 6.4
*
* write a program that prints the distinct words in its input sorted
* alphabetically. Precede each word by its count.
*
* ( exercise follows the section on linked lists, so I think it is the general
* idea that I should use linked list to solve the problem )
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
enum MAXSIZE { ARRSIZE = 1000, WORDSIZE = 30 };
int getword( char * , int );
struct tnode *add_to_tree( struct tnode *, char * );
void print_tree( struct tnode * );
struct tnode *tree_alloc( void );
char *strdup( char * );
int main( void )
{
struct tnode *root_node;
char word[ARRSIZE];
root_node = NULL;
while( getword( word, WORDSIZE ) )
{
root_node = add_to_tree( root_node, word );
}
print_tree( root_node );
return 0;
}
struct tnode {
char *word;
int count;
struct tnode *left;
struct tnode *right;
};
/* A program that takes a single word as input. If the input word
* contains more than 30 letters then only the extra letters will be
* discarded . For general purpose usage of English it does not make any
* sense to use a word larger than this size. Nearly every general purpose
* word can be expressed in a word with less than or equal to 30 letters.
*
* I am only considering pure words like "usenet". Words containing anything
* other than the 26 alphabets of English will simply be discarded e.g. like
* "usen@t" or "usenet3".
*
*/
int getword( char *word, int max_length )
{
int c;
char *w = word;
while( isspace( c = getchar() ) )
{
;
}
while( --max_length )
{
if( isalpha( c ) )
{
*w++ = c;
}
else if( isspace( c ) || c == EOF )
{
*w = '\0';
break;
}
else
{
return 0;
}
c = getchar();
}
/* I can simply ignore the if condition and directly write the '\0'
onto the last element because in worst case it will only rewrite
the '\n' that is put in there by else if clause.
or in else if clause, I could replace break with return word[0].
I thought these 2 ideas will be either inefficient or
a bad programming practice, so I did not do it.
*/
if( *w != '\0' )
{
*w = '\0';
}
return word[0];
}
/* adds the word onto the tree */
struct tnode *add_to_tree( struct tnode *tree, char *pc )
{
int match;
if( tree == NULL )
{
tree = tree_alloc();
tree->word = strdup( pc );
tree->count = 1;
tree->left = tree->right = NULL;
}
else if( 0 == (match = strcmp( pc, tree->word )) )
{
++tree->count;
}
else if( match 0 )
{
tree->right = add_to_tree( tree->right, pc );
}
else if( match < 0 )
{
tree->left = add_to_tree( tree->left, pc );
}
return tree;
}
/* prints the tree */
void print_tree( struct tnode *tree )
{
if ( tree )
{
print_tree(tree->left);
printf("%30s -- %4d\n", tree->word, tree->count);
print_tree(tree->right);
}
}
/* allocate memory for a node */
struct tnode *tree_alloc( void )
{
return malloc( sizeof( struct tnode ) );
}
char *strdup( char *pc )
{
char *pc2;
pc2 = malloc( strlen(pc) + 1 );
if( pc )
{
strcpy( pc2, pc );
}
return pc2;
}
============= OUTPUT ===============
[arnuld@raj C]$ gcc -ansi -pedantic -Wall -Wextra test.c
[arnuld@raj C]$ ./a.out
like this usenet and that of usenet is this
and -- 1
is -- 1
like -- 1
of -- 1
that -- 1
this -- 2
usenet -- 2
[arnuld@raj C]$
--
http://lispmachine.wordpress.com/
my email ID is @ the above address