In article <vu************@corp.supernews.com> aku <ak*@europe.com> wrote:

I'm looking for the absolute fastest way

to count the nr of bits that are set to "1" ...

<http://groups.google.com/groups?selm=602%40shrike.AUSTIN.LOCKHEED.COM>

Someone reformatted at least some of the code in that, so here is

bct.c (nonportabilities and all).

#ifndef lint

static char rcsid[] = "$Id: bct.c,v 1.5 90/10/13 08:44:12 chris Exp $";

#endif

/*

* bct - bitcount timing

*

* Runs a bunch of different functions-to-count-bits-in-a-longword

* (many assume 32 bits per long) with a timer around each loop, and

* tries to calcuate the time used.

*/

#include <stdio.h>

#include <sys/types.h>

#ifdef FD_SETSIZE

# define USE_GETRUSAGE

# include <sys/time.h>

# include <sys/resource.h>

#else

# include <sys/times.h>

#endif

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))

/*

* This function is used to calibrate the timing mechanism.

* This way we can subtract the loop and call overheads.

*/

int

calibrate(n)

register unsigned long n;

{

return 0;

}

/*

* This function counts the number of bits in a long.

* It is limited to 63 bit longs, but a minor mod can cope with 511 bits.

*

* It is so magic, an explanation is required:

* Consider a 3 bit number as being

* 4a+2b+c

* if we shift it right 1 bit, we have

* 2a+b

* subtracting this from the original gives

* 2a+b+c

* if we shift the original 2 bits right we get

* a

* and so with another subtraction we have

* a+b+c

* which is the number of bits in the original number.

* Suitable masking allows the sums of the octal digits in a

* 32 bit number to appear in each octal digit. This isn't much help

* unless we can get all of them summed together.

* This can be done by modulo arithmetic (sum the digits in a number by molulo

* the base of the number minus one) the old "casting out nines" trick they

* taught in school before calculators were invented.

* Now, using mod 7 wont help us, because our number will very likely have

* more than 7 bits set. So add the octal digits together to get base64

* digits, and use modulo 63.

* (Those of you with 64 bit machines need to add 3 octal digits together to

* get base512 digits, and use mod 511.)

*

* This is HACKMEM 169, as used in X11 sources.

*/

int

t0_hackmemmod(n)

register unsigned long n;

{

register unsigned long tmp;

tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);

return ((tmp + (tmp >> 3)) & 030707070707) % 63;

}

/*

* This is the same as the above, but does not use the % operator.

* Most modern machines have clockless division, and so the modulo is as

* expensive as, say, an addition.

*/

int

t1_hackmemloop(n)

register unsigned long n;

{

register unsigned long tmp;

tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);

tmp = (tmp + (tmp >> 3)) & 030707070707;

while (tmp > 63)

tmp = (tmp & 63) + (tmp >> 6);

return tmp;

}

/*

* Above, without using while loop. It takes at most 5 iterations of the

* loop, so we just do all 5 in-line. The final result is never 63

* (this is assumed above as well).

*/

int

t2_hackmemunrolled(n)

register unsigned long n;

{

register unsigned long tmp;

tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);

tmp = (tmp + (tmp >> 3)) & 030707070707;

tmp = (tmp & 63) + (tmp >> 6);

tmp = (tmp & 63) + (tmp >> 6);

tmp = (tmp & 63) + (tmp >> 6);

tmp = (tmp & 63) + (tmp >> 6);

tmp = (tmp & 63) + (tmp >> 6);

return (tmp);

}

/*

* This function counts the bits in a long.

*

* It removes the lsb and counting the number of times round the loop.

* The expression (n & -n) yields the lsb of a number,

* but it only works on 2's compliment machines.

*/

int

t3_rmlsbsub(n)

register unsigned long n;

{

register int count;

for (count = 0; n; n -= (n & -n))

count++;

return count;

}

int

t4_rmlsbmask(n)

register unsigned long n;

{

register int count;

for (count = 0; n; count++)

n &= n - 1; /* take away lsb */

return (count);

}

/*

* This function counts the bits in a long.

*

* It works by shifting the number down and testing the bottom bit.

*/

int

t5_testlsb(n)

register unsigned long n;

{

register int count;

for (count = 0; n; n >>= 1)

if (n & 1)

count++;

return count;

}

/*

* This function counts the bits in a long.

*

* It works by shifting the number left and testing the top bit.

* On many machines shift is expensive, so it uses a cheap addition instead.

*/

int

t6_testmsb(n)

register unsigned long n;

{

register int count;

for (count = 0; n; n += n)

if (n & ~(~(unsigned long)0 >> 1))

count++;

return count;

}

int

t7_testsignandshift(n)

register unsigned long n;

{

register int count;

for (count = 0; n; n <<= 1)

if ((long)n < 0)

count++;

return (count);

}

/*

* This function counts the bits in a long.

*

* It works by masking each bit.

* This is the second most intuitively obvious method,

* and is independent of the number of bits in the long.

*/

int

t8_testeachbit(n)

register unsigned long n;

{

register int count;

register unsigned long mask;

count = 0;

for (mask = 1; mask; mask += mask)

if (n & mask)

count++;

return count;

}

/*

* This function counts the bits in a long.

*

* It works by masking each bit.

* This is the most intuitively obvious method,

* but how do you a priori know how many bits in the long?

* (except for ''sizeof(long) * CHAR_BITS'' expression)

*/

int

t9_testeachbit1shl(n)

register unsigned long n;

{

register int count;

register int bit;

count = 0;

for (bit = 0; bit < 32; ++bit)

if (n & ((unsigned long)1 << bit))

count++;

return count;

}

static char nbits[256] = {

0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,

1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,

};

static int inbits[256] = {

0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,

1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,

};

int

tA_tableshift(n)

register unsigned long n;

{

return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] +

nbits[(n >> 16) & 0xff] + nbits[n >> 24]);

}

int

tB_tableuchar(n) unsigned long n;

{

register unsigned char *p = (unsigned char *)&n;

return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]);

}

int

tC_tableshiftcast(n)

register unsigned long n;

{

return nbits[(unsigned char) n] +

nbits[(unsigned char) (n >> 8)] +

nbits[(unsigned char) (n >> 16)] +

nbits[(unsigned char) (n >> 24)];

}

int

tD_itableshift(n)

register unsigned long n;

{

return (inbits[n & 0xff] + inbits[(n >> 8) & 0xff] +

inbits[(n >> 16) & 0xff] + inbits[n >> 24]);

}

int

tE_itableuchar(n)

unsigned long n;

{

register unsigned char *p = (unsigned char *)&n;

return (inbits[p[0]] + inbits[p[1]] + inbits[p[2]] + inbits[p[3]]);

}

int

tF_itableshiftcast(n)

register unsigned long n;

{

return inbits[(unsigned char) n] +

inbits[(unsigned char) (n >> 8)] +

inbits[(unsigned char) (n >> 16)] +

inbits[(unsigned char) (n >> 24)];

}

/*

* Explanation:

* First we add 32 1-bit fields to get 16 2-bit fields.

* Each 2-bit field is one of 00, 01, or 10 (binary).

* We then add all the two-bit fields to get 8 4-bit fields.

* These are all one of 0000, 0001, 0010, 0011, or 0100.

*

* Now we can do something different, becuase for the first

* time the value in each k-bit field (k now being 4) is small

* enough that adding two k-bit fields results in a value that

* still fits in the k-bit field. The result is four 4-bit

* fields containing one of {0000,0001,...,0111,1000} and four

* more 4-bit fields containing junk (sums that are uninteresting).

* Pictorially:

* n = 0aaa0bbb0ccc0ddd0eee0fff0ggg0hhh

* n>>4 = 00000aaa0bbb0ccc0ddd0eee0fff0ggg

* sum = 0aaaWWWWiiiiXXXXjjjjYYYYkkkkZZZZ

* where W, X, Y, and Z are the interesting sums (each at most 1000,

* or 8 decimal). Masking with 0x0f0f0f0f extracts these.

*

* Now we can change tactics yet again, because now we have:

* n = 0000WWWW0000XXXX0000YYYY0000ZZZZ

* n>>8 = 000000000000WWWW0000XXXX0000YYYY

* so sum = 0000WWWW000ppppp000qqqqq000rrrrr

* where p and r are the interesting sums (and each is at most

* 10000, or 16 decimal). The sum `q' is junk, like i, j, and

* k above; but it is not necessarry to discard it this time.

* One more fold, this time by sixteen bits, gives

* n = 0000WWWW000ppppp000qqqqq000rrrrr

* n>>16 = 00000000000000000000WWWW000ppppp

* so sum = 0000WWWW000ppppp000sssss00tttttt

* where s is at most 11000 and t is it most 100000 (32 decimal).

*

* Now we have t = r+p = (Z+Y)+(X+W) = ((h+g)+(f+e))+((d+c)+(b+a)),

* or in other words, t is the number of bits set in the original

* 32-bit longword. So all we have to do is return the low byte

* (or low 6 bits, but `low byte' is typically just as easy if not

* easier).

*

* This technique is also applicable to 64 and 128 bit words, but

* 256 bit or larger word sizes require at least one more masking

* step.

*/

int

tG_sumbits(n)

register unsigned long n;

{

n = (n & 0x55555555) + ((n >> 1) & 0x55555555);

n = (n & 0x33333333) + ((n >> 2) & 0x33333333);

n = (n + (n >> 4)) & 0x0f0f0f0f;

n += n >> 8;

n += n >> 16;

return (n & 0xff);

}

/*

* build a long random number.

* The standard rand() returns at least a 15 bit number.

* We use the top 9 of 15 (since the lower N bits of the usual rand()

* repeat with a period of 2^N).

*/

unsigned long

bigrand()

{

#define randbits() ((unsigned long)((rand() >> 6) & 0777))

register unsigned long r;

r = randbits() << 27;

r |= randbits() << 18;

r |= randbits() << 9;

r |= randbits();

return (r);

}

/*

* Run the test many times.

* You will need a running time in the 10s of seconds

* for an accurate result.

*

* To give a fair comparison, the random number generator

* is seeded the same for each measurement.

*

* Return value is seconds per iteration.

*/

#ifndef REPEAT

#if defined(mips) || defined(sparc)

#define REPEAT (1L<<20)

#else

#define REPEAT (1L<<18)

#endif

#endif

double

measure(func)

register int (*func)();

{

#ifdef USE_GETRUSAGE

struct rusage ru0, ru1;

register long j;

srand(1);

(void) getrusage(RUSAGE_SELF, &ru0);

for (j = 0; j < REPEAT; ++j)

func(bigrand());

(void) getrusage(RUSAGE_SELF, &ru1);

ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec;

if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) {

ru1.ru_utime.tv_usec += 1000000;

ru1.ru_utime.tv_sec--;

}

return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) /

(double)REPEAT);

#else

register long j;

struct tms start;

struct tms finish;

srand(1);

times(&start);

for (j = 0; j < REPEAT; ++j)

func(bigrand());

times(&finish);

return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);

#endif

}

struct table {

char *name;

int (*func)();

double measurement;

int trial;

};

struct table table[] =

{

{ "hackmemmod", t0_hackmemmod },

{ "hackmemloop", t1_hackmemloop },

{ "hackmemunrolled", t2_hackmemunrolled },

{ "rmlsbsub", t3_rmlsbsub },

{ "rmlsbmask", t4_rmlsbmask },

{ "testlsb", t5_testlsb },

{ "testmsb", t6_testmsb },

{ "testsignandshift", t7_testsignandshift },

{ "testeachbit", t8_testeachbit },

{ "testeachbit1shl", t9_testeachbit1shl },

{ "tableshift", tA_tableshift },

{ "tableuchar", tB_tableuchar },

{ "tableshiftcast", tC_tableshiftcast },

{ "itableshift", tD_itableshift },

{ "itableuchar", tE_itableuchar },

{ "itableshiftcast", tF_itableshiftcast },

{ "sumbits", tG_sumbits },

};

main(argc, argv)

int argc;

char **argv;

{

double calibration, v, best;

int j, k, m, verbose;

verbose = argc > 1;

/*

* run a few tests to make sure they all agree

*/

srand(getpid());

for (j = 0; j < 10; ++j) {

unsigned long n;

int bad;

n = bigrand();

for (k = 0; k < SIZEOF(table); ++k)

table[k].trial = table[k].func(n);

bad = 0;

for (k = 0; k < SIZEOF(table); ++k)

for (m = 0; m < SIZEOF(table); ++m)

if (table[k].trial != table[m].trial)

bad = 1;

if (bad) {

printf("wrong: %08lX", n);

for (k = 0; k < SIZEOF(table); ++k)

printf(" %3d", table[k].trial);

printf("\n");

}

}

/*

* calibrate the timing mechanism

*/

calibration = measure(calibrate);

if (verbose)

printf("calibration: %g\n", calibration);

/*

* time them all, keeping track of the best (smallest)

*/

for (j = 0; j < SIZEOF(table); ++j) {

v = measure(table[j].func) - calibration;

if (verbose)

printf("%s: %g\n", table[j].name, v);

table[j].measurement = v;

if (!j || v < best)

best = v;

}

(void) printf("%-24s %-14sratio\n", "function", "time");

for (j = 0; j < SIZEOF(table); ++j) {

(void) printf("%-24s %#10.8g %6.3f\n",

table[j].name,

table[j].measurement,

table[j].measurement / best);

}

exit(0);

}

--

In-Real-Life: Chris Torek, Wind River Systems

Salt Lake City, UT, USA (40°39.22'N, 111°50.29'W) +1 801 277 2603

email: forget about it

http://web.torek.net/torek/index.html
Reading email is like searching for food in the garbage, thanks to spammers.