467,223 Members | 1,377 Online
Bytes | Developer Community
Ask Question

Home New Posts Topics Members FAQ

Post your question to a community of 467,223 developers. It's quick & easy.

Function to remove an item from a linked list failing ~1 out of 10,000 times

Hi all,
I'm having some trouble with a linked list function and was wondering if
anyone could shed any light on it. Basically I have a singly-linked list
which stores pid numbers of a process's children - when a child is fork()ed
its pid is added to the linked list. I then have a SIGCHLD handler which is
supposed to remove the pid from the list when a child exits. The problem I'm
having is that very very occasionally and seemingly unpredictably, my
function to remove an item from the list is just failing silently to do so;
a child exits, the signal handler gets run, but the child's pid does not get
removed from the list. I'm 99% sure this is a problem with my linked list
code and not my signal handling code so I hope this is not off-topic for
comp.lang.c.
Anyway, here's my code. First the definitions for my structures:

struct queue_node {
char ipaddr[IPSIZE+1];
int pid;
struct queue_node *next;
};

struct queue_list {
struct queue_node *head;
int elements;
};

struct queue_list.head points to the first node in the list and I refer to
my list by passing a pointer to a struct queue_list as an argument to
functions.

Here's my qdeletepid() function, this function removes a node from anywhere
in the list if its pid field matches the pid passed to the function:

int qdeletepid (struct queue_list *pqueue, int pid) {
struct queue_node *lcur;
struct queue_node *lprev;
if (pqueue->elements == 0) {
listop=FALSE;
return(0);
}
if (pqueue->elements == 1) {
lcur=pqueue->head;
if (lcur->pid == pid) {
pqueue->head=NULL;
pqueue->elements=0;
free(lcur);
}
listop=FALSE;
return(0);
}
lcur=pqueue->head;
lprev=NULL;
while (lcur!=NULL) {
if (lcur->pid == pid) {
if (lprev==NULL) {
pqueue->head=NULL;
pqueue->elements=0;
} else {
lprev->next=lcur->next;
pqueue->elements=pqueue->elements-1;
}
free(lcur);
}
lprev=lcur;
lcur=lcur->next;
}
return(0);
}

----------
For completeness, here's my qaddpid() function:

int qaddpid (struct queue_list *nqueue, char *ip, int pid) {
struct queue_node *new;
struct queue_node *cur;
struct queue_node *prev;
listop=TRUE;
new= (struct queue_node *) malloc(sizeof(struct queue_node));
if (new==NULL) {
syslog(LOG_NOTICE,"Failed to malloc");
return(1);
}
new->next=NULL;
snprintf(new->ipaddr,IPSIZE+1,"%s",ip);
new->pid=pid;
prev=NULL;
cur=nqueue->head;
while (cur != NULL) {
prev=cur;
cur=cur->next;
}
if (prev!=NULL) {
new->next=prev->next;
prev->next=new;
} else {
nqueue->head=new;
}
nqueue->elements++;
return(0);
}

------
And here's the function I have set to handle sigchld:

void childhandle (int signum) {
pid_t cpid=0;
while ((cpid=waitpid(0,NULL,WNOHANG)) > 0)
qdeletepid(&queue,cpid);
}
--
Any ideas why qdeletepid is failing to remove nodes from the list very
occasionally? As this happens only rarely (but the script forks a lot of
children constantly 24/7 so over the course of a couple of days the list
gets progressively bigger and bigger) it's very difficult to debug without
knowing the reason the problem is occurring.

Help very much appreciated.

~Kieran Simkin
Digital Crocus
http://digital-crocus.com/
Nov 14 '05 #1
  • viewed: 2299
Share:
7 Replies
"Kieran Simkin" <ki****@digital-crocus.com> wrote in message
news:Rq6nc.43$3p1.34@newsfe1-win...
Hi all,
I'm having some trouble with a linked list function and was wondering if
anyone could shed any light on it. Basically I have a singly-linked list
which stores pid numbers of a process's children - when a child is fork()ed
its pid is added to the linked list. I then have a SIGCHLD handler which is
supposed to remove the pid from the list when a child exits. The problem I'm
having is that very very occasionally and seemingly unpredictably, my
function to remove an item from the list is just failing silently to do so;
a child exits, the signal handler gets run, but the child's pid does not get
removed from the list. I'm 99% sure this is a problem with my linked list
code and not my signal handling code so I hope this is not off-topic for
comp.lang.c.
Anyway, here's my code. First the definitions for my structures:

struct queue_node {
char ipaddr[IPSIZE+1];
int pid;
struct queue_node *next;
};

struct queue_list {
struct queue_node *head;
int elements;
};

struct queue_list.head points to the first node in the list and I refer to
my list by passing a pointer to a struct queue_list as an argument to
functions.

Here's my qdeletepid() function, this function removes a node from anywhere
in the list if its pid field matches the pid passed to the function:

int qdeletepid (struct queue_list *pqueue, int pid) {
struct queue_node *lcur;
struct queue_node *lprev;
if (pqueue->elements == 0) {
listop=FALSE;
return(0);
}
if (pqueue->elements == 1) {
lcur=pqueue->head;
if (lcur->pid == pid) {
pqueue->head=NULL;
pqueue->elements=0;
free(lcur);
}
listop=FALSE;
return(0);
}
lcur=pqueue->head;
lprev=NULL;
while (lcur!=NULL) {
if (lcur->pid == pid) {
if (lprev==NULL) {
pqueue->head=NULL;
pqueue->elements=0;
} else {
lprev->next=lcur->next;
pqueue->elements=pqueue->elements-1;
}
free(lcur);
}
lprev=lcur;
lcur=lcur->next;
}
return(0);
}

----------
For completeness, here's my qaddpid() function:

int qaddpid (struct queue_list *nqueue, char *ip, int pid) {
struct queue_node *new;
struct queue_node *cur;
struct queue_node *prev;
listop=TRUE;
new= (struct queue_node *) malloc(sizeof(struct queue_node));
if (new==NULL) {
syslog(LOG_NOTICE,"Failed to malloc");
return(1);
}
new->next=NULL;
snprintf(new->ipaddr,IPSIZE+1,"%s",ip);
new->pid=pid;
prev=NULL;
cur=nqueue->head;
while (cur != NULL) {
prev=cur;
cur=cur->next;
}
if (prev!=NULL) {
new->next=prev->next;
prev->next=new;
} else {
nqueue->head=new;
}
nqueue->elements++;
return(0);
}

------
And here's the function I have set to handle sigchld:

void childhandle (int signum) {
pid_t cpid=0;
while ((cpid=waitpid(0,NULL,WNOHANG)) > 0)
qdeletepid(&queue,cpid);
}
--
Any ideas why qdeletepid is failing to remove nodes from the list very
occasionally? As this happens only rarely (but the script forks a lot of
children constantly 24/7 so over the course of a couple of days the list
gets progressively bigger and bigger) it's very difficult to debug without
knowing the reason the problem is occurring.

Help very much appreciated.

~Kieran Simkin
Digital Crocus
http://digital-crocus.com/

How do you know what the child pid is? Does only the
child know what it's pid is or does the parent know
it? If the parent gets the pid and adds it to the list,
perhaps the signal handler is running before the parent
has a chance to add the pid to the list. That is, the
delete pid routine runs *before* the add pid routine,
because the child is finished before the parent has
a chance to do the add, this causing the child-end
handler to run first.
Nov 14 '05 #2

On Sat, 8 May 2004, xarax wrote:
[quoted an entire OT post and then wrote:]
How do you know what the child pid is? Does only the
child know what it's pid is or does the parent know
it? If the parent gets the pid and adds it to the list,
perhaps the signal handler is running before the parent
has a chance to add the pid to the list. That is, the
delete pid routine runs *before* the add pid routine,
because the child is finished before the parent has
a chance to do the add, this causing the child-end
handler to run first.
This is why we encourage people to post Unix questions
to Unix newsgroups. comp.lang.c is a C newsgroup, and
focuses on standard C questions. When you ask off-topic
Unix questions here, you get wrong answers.
"Kieran Simkin" <ki****@digital-crocus.com> wrote...

int qdeletepid (struct queue_list *pqueue, int pid) {
struct queue_node *lcur;
struct queue_node *lprev;
if (pqueue->elements == 0) {
listop=FALSE;
Should you perhaps be setting 'listop' to 'TRUE' somewhere
at the top of this function? I assume you're not forcing the
caller to do it every time he calls this function...
return(0);
}
if (pqueue->elements == 1) {
lcur=pqueue->head;
if (lcur->pid == pid) {
pqueue->head=NULL;
pqueue->elements=0;
free(lcur);
}
listop=FALSE;
return(0);
}
lcur=pqueue->head;
lprev=NULL;
while (lcur!=NULL) {
if (lcur->pid == pid) {
if (lprev==NULL) {
pqueue->head=NULL;
pqueue->elements=0;
This is amazingly, incredibly broken. You meant
pqueue->head = lcur->next;
pqueue->elements -= 1;
} else {
lprev->next=lcur->next;
pqueue->elements=pqueue->elements-1;
}
free(lcur);
....and here you probably meant to insert a
return 0;
The parentheses around 0 are unnecessary, by the way. 0 is a
number all by itself.
}
lprev=lcur;
lcur=lcur->next;
}
return(0);
}

----------
For completeness, here's my qaddpid() function:

int qaddpid (struct queue_list *nqueue, char *ip, int pid) {
struct queue_node *new;
struct queue_node *cur;
struct queue_node *prev;
listop=TRUE;
new= (struct queue_node *) malloc(sizeof(struct queue_node));
Since you're programming in (completely un-C++-portable) C, lose the
cast. Simpler code is less buggy code.
new = malloc(sizeof *new);
And check for NULL while you're at it.
if (new==NULL) {
syslog(LOG_NOTICE,"Failed to malloc");
return(1);
}
new->next=NULL;
snprintf(new->ipaddr,IPSIZE+1,"%s",ip);
Lose the magic number. Code with fewer magic numbers is less buggy
code.
snprintf(new->ipaddr, sizeof new->ipaddr, "%s", ip);
and you might even consider whether this is a job for
strncpy(new->ipaddr, ip, sizeof new->ipaddr);
(but read the documentation first!).
new->pid=pid;
prev=NULL;
cur=nqueue->head;
while (cur != NULL) {
prev=cur;
cur=cur->next;
}
if (prev!=NULL) {
new->next=prev->next;
....which is
new->next = cur;
prev->next=new;
} else {
nqueue->head=new;
....which is a memory leak and an utter trashing of your
entire list. If you're having trouble with linked lists,
read a tutorial. [HINT: For a singly-linked list, insertion
does not require a 'prev' pointer.]
}
nqueue->elements++;
return(0);
}

------
And here's the function I have set to handle sigchld:

void childhandle (int signum) {
pid_t cpid=0;
while ((cpid=waitpid(0,NULL,WNOHANG)) > 0)
qdeletepid(&queue,cpid);
}
--
Any ideas why qdeletepid is failing to remove nodes from the list very
occasionally? As this happens only rarely (but the script forks a lot of
children constantly 24/7 so over the course of a couple of days the list
gets progressively bigger and bigger) it's very difficult to debug without
knowing the reason the problem is occurring.


Your code is terribly borken. Also, I would be very surprised if
the Unixy part of your code didn't have at least two major bugs that
only show up non-deterministically due to faulty signal handling
code. Ask in comp.unix.programmer or whatever the canonical Linux
group is.

-Arthur

Nov 14 '05 #3
Kieran Simkin <ki****@digital-crocus.com> wrote:
I'm having some trouble with a linked list function and was wondering if
anyone could shed any light on it. Basically I have a singly-linked list
which stores pid numbers of a process's children - when a child is fork()ed
its pid is added to the linked list. I then have a SIGCHLD handler which is
supposed to remove the pid from the list when a child exits. The problem I'm
having is that very very occasionally and seemingly unpredictably, my
function to remove an item from the list is just failing silently to do so;
a child exits, the signal handler gets run, but the child's pid does not get
removed from the list. I'm 99% sure this is a problem with my linked list
code and not my signal handling code so I hope this is not off-topic for
comp.lang.c.
Anyway, here's my code. First the definitions for my structures:
There are some issues with the code for deleting the element,
see below. But I fear that the problem can also be related to
deleting the element in a signal handler. There's at least one
thing that could cause problems: you're not allowed to use
free() in a signal handler - if the signal happens while you're
executing somewhere within the innards of one of the memory
allocation functions (and that might happen even when you do
innocently looking calls of e.g. printf() things can go badly
wrong by messing up the internal structures of the malloc
related functions.

<OT>
Another thing is that, at least under some operating systems and
conditions, signals get blocked while you're in the signal handler
and that could lead to signals getting lost. So I would recommend
to rewrite your code in a way that you don't call wait() or
waitpid() in the signal handler but instead do that regularly from
code running in "normal" context, i.e. whenever you have time or
whenever the correctness of the list is of importance, so that you
can delete the elements not used anymore in a context where you
are allowed to use free() and while there's is no danger of losing
signals.
</OT>
int qdeletepid (struct queue_list *pqueue, int pid) {
Why does that need a return value when you only return 0 (and don't
do anything with the return value?
struct queue_node *lcur;
struct queue_node *lprev;
if (pqueue->elements == 0) {
listop=FALSE;
return(0);
}
if (pqueue->elements == 1) {
lcur=pqueue->head;
if (lcur->pid == pid) {
pqueue->head=NULL;
pqueue->elements=0;
free(lcur);
}
listop=FALSE;
return(0);
}
Why do you need this check for the number of elements being 1 - as far
as I can see that's also handled by the following.
lcur=pqueue->head;
lprev=NULL;
while (lcur!=NULL) {
if (lcur->pid == pid) {
if (lprev==NULL) {
pqueue->head=NULL;
pqueue->elements=0;
} else {
lprev->next=lcur->next;
pqueue->elements=pqueue->elements-1;
}
free(lcur);
And now you should return immediately - or do you expect to have the
same pid twice in your list?
}
lprev=lcur;
That's dangerous, if you _don't_ return after having deleted the
element, lprev will know point to the deleteted and free()ed element!
lcur=lcur->next;
And here in that case you would dereference the already free()ed element,
and that's a big no-no.
}
return(0);
}

Regards, Jens
--
\ Jens Thoms Toerring ___ Je***********@physik.fu-berlin.de
\__________________________ http://www.toerring.de
Nov 14 '05 #4
"Arthur J. O'Dwyer" wrote:
On Sat, 8 May 2004, xarax wrote:

[quoted an entire OT post and then wrote:]

How do you know what the child pid is? Does only the
child know what it's pid is or does the parent know
it? If the parent gets the pid and adds it to the list,
perhaps the signal handler is running before the parent
has a chance to add the pid to the list. That is, the
delete pid routine runs *before* the add pid routine,
because the child is finished before the parent has
a chance to do the add, this causing the child-end
handler to run first.


This is why we encourage people to post Unix questions
to Unix newsgroups. comp.lang.c is a C newsgroup, and
focuses on standard C questions. When you ask off-topic
Unix questions here, you get wrong answers.


The query had a C coding core, which is independant of the *IX
timing problems. I was going to take a look at the code, but see
others have done so and I won't bother. It could do with further
factoring into primitives, IMO.

The rarity of the problem seems to point to timing, though.

--
Chuck F (cb********@yahoo.com) (cb********@worldnet.att.net)
Available for consulting/temporary embedded and systems.
<http://cbfalconer.home.att.net> USE worldnet address!
Nov 14 '05 #5
In article <news:Rq6nc.43$3p1.34@newsfe1-win>
Kieran Simkin <ki****@digital-crocus.com> writes:
.... Basically I have a singly-linked list
which [gets things added to the end, and removed from the middle]


In addition to what others have said, your code is what I would
consider to be unnecessarily complicated. If you have access to
the BSD <sys/queue.h> macros, you could look at the various kinds
of lists supported therein. The following is from the queue(3)
manual page:

...
These macros define and operate on three types of data structures: lists,
tail queues, and circular queues. All three structures support the fol-
lowing functionality:
1. Insertion of a new entry at the head of the list.
2. Insertion of a new entry after any element in the list.
3. Insertion of a new entry before any element in the list.
4. Removal of any entry in the list.
5. Forward traversal through the list.

Lists are the simplest of the three data structures and support only the
above functionality.

Tail queues add the following functionality:
1. Entries can be added at the end of a list.
2. They may be traversed backwards, from tail to head.

However:
1. All list insertions and removals must specify the head of the
list.
2. Each head entry requires two pointers rather than one.
3. Reverse traversal is 50% slower than circular queues.
4. Code size is about 15% greater and operations run about 20%
slower than lists.
...

Whether or not you have and choose to use the BSD queue macros,
one fundamental (but somewhat C-specific) trick to simplifying
removal-from-a-list is to use pointers that point to pointers. If
your list structure consists of, e.g.:

struct list {
struct list *next;
some_type some_value;
/* more values as needed */
};

then we can add to the end of the list with code like this:

struct list *list_head;

...
struct list *new_item = make_new_list_item(values);
struct list **pp, *p;
for (pp = &list_head; (p = *pp) != NULL; pp = &p->next)
continue; /* nothing to do here */
new_item->next = p; /* i.e., new_item->next = NULL; */
*pp = new_item;

If the list is to be sorted in some way, replace the "continue"
with a test for whether new_item comes before item p:

for (pp = &list_head; (p = *pp) != NULL; pp = &p->next)
if (ARG1_SHOULD_GO_BEFORE_ARG2(new_item, p))
break;

The "new_item->next = p" line then makes the new item come before
p (by having p come after it), while the "*pp = new_item" line
accomplishes the actual insertion (by making whatever pointed to
p before now point to new_item). No special code is required for
operation on the list head, because "pp" can point to the list
head just as easily as it can point to the "next" field of an
existing list item.

List removal works similarly:

for (pp = &list_head; (p = *pp) != NULL; pp = &p->next) {
if (THIS_IS_THE_ITEM_TO_REMOVE(p)) {
/* remove p from the list */
*pp = p->next;

/* push p onto the front of a free-list */
p->next = freelist;
freelist = p;
break;
}
}
/* optional: */
if (p == NULL)
... item was not found on the list ...

Again, there are no special cases here -- removal from the list
head or tail or middle is all the same.

This method cannot be used in languages that do not allow the use
of pointers pointing to ordinary variables. It relies on the fact
that *pp and list_head name the same underlying object in memory
on the first trip through the loop (but different objects -- one
"sub-part of an actual list item" -- on subsequent trips).
--
In-Real-Life: Chris Torek, Wind River Systems
Salt Lake City, UT, USA (4039.22'N, 11150.29'W) +1 801 277 2603
email: forget about it http://web.torek.net/torek/index.html
Reading email is like searching for food in the garbage, thanks to spammers.
Nov 14 '05 #6
"CBFalconer" <cb********@yahoo.com> wrote in message
news:40***************@yahoo.com...
"Arthur J. O'Dwyer" wrote:
On Sat, 8 May 2004, xarax wrote:

[quoted an entire OT post and then wrote:]

How do you know what the child pid is? Does only the
child know what it's pid is or does the parent know
it? If the parent gets the pid and adds it to the list,
perhaps the signal handler is running before the parent
has a chance to add the pid to the list. That is, the
delete pid routine runs *before* the add pid routine,
because the child is finished before the parent has
a chance to do the add, this causing the child-end
handler to run first.


This is why we encourage people to post Unix questions
to Unix newsgroups. comp.lang.c is a C newsgroup, and
focuses on standard C questions. When you ask off-topic
Unix questions here, you get wrong answers.


The query had a C coding core, which is independant of the *IX
timing problems. I was going to take a look at the code, but see
others have done so and I won't bother. It could do with further
factoring into primitives, IMO.

The rarity of the problem seems to point to timing, though.


Which is what I asking in the first place. No need to
exclaim that my "answer" is wrong. Since it's a multi-threading
question without any implementation specification, fixing
broken code doesn't solve the apparent timing issue.
Nov 14 '05 #7
Great, thanks to everyone who offered advice. I guess this question was more
off-topic than I thought, sorry about that.
The problem of child processes returning before the parent has a chance to
register them as having been started didn't even occur to me and although I
don't think that was the problem in this case, I've rectified it so it
doesn't come back to haunt me.

After taking everyone's advice in and making a few modifications to my list
manipulation functions I'm much happier with my understand of how linked
lists work (this would be the first time I've used any kind of linked list
in a program). However, that alone didn't resolve the problem. It turned out
as mentioned by Jens that using free() inside of a signal handler was
causing the problem, I've changed my code to instead just call waitpid()
when it's important to have accurate child information and all works
perfectly :)
Again my thanks.
~Kieran Simkin
Digital Crocus
http://digital-crocus.com/

"Kieran Simkin" <ki****@digital-crocus.com> wrote in message
news:Rq6nc.43$3p1.34@newsfe1-win...
Hi all,
I'm having some trouble with a linked list function and was wondering if
anyone could shed any light on it. Basically I have a singly-linked list
which stores pid numbers of a process's children - when a child is fork()ed its pid is added to the linked list. I then have a SIGCHLD handler which is supposed to remove the pid from the list when a child exits. The problem I'm having is that very very occasionally and seemingly unpredictably, my
function to remove an item from the list is just failing silently to do so; a child exits, the signal handler gets run, but the child's pid does not get removed from the list. I'm 99% sure this is a problem with my linked list
code and not my signal handling code so I hope this is not off-topic for
comp.lang.c.
Anyway, here's my code. First the definitions for my structures:

struct queue_node {
char ipaddr[IPSIZE+1];
int pid;
struct queue_node *next;
};

struct queue_list {
struct queue_node *head;
int elements;
};

struct queue_list.head points to the first node in the list and I refer to
my list by passing a pointer to a struct queue_list as an argument to
functions.

Here's my qdeletepid() function, this function removes a node from anywhere in the list if its pid field matches the pid passed to the function:

int qdeletepid (struct queue_list *pqueue, int pid) {
struct queue_node *lcur;
struct queue_node *lprev;
if (pqueue->elements == 0) {
listop=FALSE;
return(0);
}
if (pqueue->elements == 1) {
lcur=pqueue->head;
if (lcur->pid == pid) {
pqueue->head=NULL;
pqueue->elements=0;
free(lcur);
}
listop=FALSE;
return(0);
}
lcur=pqueue->head;
lprev=NULL;
while (lcur!=NULL) {
if (lcur->pid == pid) {
if (lprev==NULL) {
pqueue->head=NULL;
pqueue->elements=0;
} else {
lprev->next=lcur->next;
pqueue->elements=pqueue->elements-1;
}
free(lcur);
}
lprev=lcur;
lcur=lcur->next;
}
return(0);
}

----------
For completeness, here's my qaddpid() function:

int qaddpid (struct queue_list *nqueue, char *ip, int pid) {
struct queue_node *new;
struct queue_node *cur;
struct queue_node *prev;
listop=TRUE;
new= (struct queue_node *) malloc(sizeof(struct queue_node));
if (new==NULL) {
syslog(LOG_NOTICE,"Failed to malloc");
return(1);
}
new->next=NULL;
snprintf(new->ipaddr,IPSIZE+1,"%s",ip);
new->pid=pid;
prev=NULL;
cur=nqueue->head;
while (cur != NULL) {
prev=cur;
cur=cur->next;
}
if (prev!=NULL) {
new->next=prev->next;
prev->next=new;
} else {
nqueue->head=new;
}
nqueue->elements++;
return(0);
}

------
And here's the function I have set to handle sigchld:

void childhandle (int signum) {
pid_t cpid=0;
while ((cpid=waitpid(0,NULL,WNOHANG)) > 0)
qdeletepid(&queue,cpid);
}
--
Any ideas why qdeletepid is failing to remove nodes from the list very
occasionally? As this happens only rarely (but the script forks a lot of
children constantly 24/7 so over the course of a couple of days the list
gets progressively bigger and bigger) it's very difficult to debug without
knowing the reason the problem is occurring.

Help very much appreciated.

~Kieran Simkin
Digital Crocus
http://digital-crocus.com/

Nov 14 '05 #8

This discussion thread is closed

Replies have been disabled for this discussion.

Similar topics

23 posts views Thread by Stan Cook | last post: by
reply views Thread by Adict | last post: by
By using this site, you agree to our Privacy Policy and Terms of Use.