"Kitty" <No spam> wrote:
Hi, everyone.
Given a vector<int>, what is the fastest way to find out whether there is
a repeated element in it? The result is just "true" or "false". Thanks.
Kitty
Sorry for the long post. I have implemented about 10 methods and some test
code to actually find out the fastest. After playing around with the test
code, I am of the opinon that there are only two serious contestants as far
as worst case runtime is concerned. Their average speed appears also to be
fine.
a) A non-generic method based upon partition exchange: put the even numbers
to the left of the odd ones and during the same pass shift down by one bit.
Then recurse on the left and right segment. This is *has_dublicates_a*, and
*has_dublicate_b* is a small modification.
b) A generic method based on Mussers modification of quick-sort. This is
*has_dublicates_f*.
Both methods have a memory overhead of N and a runtime of O(N*log(N)). The
memory-overhead is due to copying the vector beforehand. Predictably,
method (b) has some problems on sorted sequences. The pure quick-sort idea
implemented in has_dublicates_e will just bail out on these.
If someone sees undefined behavior, any kind of bad code, or others ways to
improve this, I would love to hear about it.
Best
Kai-Uwe Bux
#include <vector>
#include <set>
#include <algorithm>
#include <iostream>
typedef std::vector< unsigned long > Uvector;
typedef unsigned long u_long;
class has_dublicates_notification {};
// this uses an even/odd partitioning scheme:
// (works for unsigned integer types)
void has_dublicates_helper_a ( u_long* from, u_long* to, u_long max ) {
// WARNING, range is closed: [from,to]
if ( to <= from ) {
return;
}
if ( max < static_cast< unsigned long >( to - from ) ) {
throw( has_dublicates_notification() );
}
u_long* low = from;
u_long* high = to;
while ( true ) {
#if 0
// before inner loop optimization
while ( ( low < high ) && ( *low % 2 == 0 ) ) {
*low >>= 1;
++low;
}
while ( ( low < high ) && ( *high % 2 != 0 ) ) {
*high >>= 1;
--high;
}
#else
// optimized inner loop using sentinells.
unsigned long dummy = *high;
*high = 1; // odd sentinel
while ( *low % 2 == 0 ) {
*low >>= 1;
++low;
}
*high = dummy;
dummy = *low;
*low = 0; // even sentinell
while ( *high % 2 != 0 ) {
*high >>= 1;
--high;
}
*low = dummy;
#endif
// either ( low == high ) or ( *high is even )
if ( low < high ) {
// *low is odd and *high is even
std::swap( *low, *high );
} else {
break;
}
}
// low == high;
if ( *low % 2 == 0 ) {
*low >>= 1;
has_dublicates_helper_a( from, low, max >> 1 );
has_dublicates_helper_a( high+1, to, max >>1 );
} else {
*low >>= 1;
has_dublicates_helper_a( from, low-1, max >> 1 );
has_dublicates_helper_a( high, to, max >> 1 );
}
}
template < typename ConstIter >
bool has_dublicates_a ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::vector< value_type > vector_type;
vector_type u_vect ( from, to );
try {
has_dublicates_helper_a( &u_vect[0], &u_vect[ u_vect.size()-1 ],
std::numeric_limits< unsigned long >::max() );
return( false );
}
catch ( has_dublicates_notification ) {
return( true );
}
}
// this uses an even/odd partitioning scheme:
// (works for unsigned integer types)
// same as _a, but not using throw
bool has_dublicates_helper_aa ( u_long* from, u_long* to, u_long max ) {
// WARNING, range is closed: [from,to]
if ( to <= from ) {
return( false );
}
if ( max < static_cast< unsigned long >( to - from ) ) {
return( true );
}
u_long* low = from;
u_long* high = to;
while ( true ) {
#if 0
// before inner loop optimization
while ( ( low < high ) && ( *low % 2 == 0 ) ) {
*low >>= 1;
++low;
}
while ( ( low < high ) && ( *high % 2 != 0 ) ) {
*high >>= 1;
--high;
}
#else
// optimized inner loop using sentinells.
unsigned long dummy = *high;
*high = 1; // odd sentinel
while ( *low % 2 == 0 ) {
*low >>= 1;
++low;
}
*high = dummy;
dummy = *low;
*low = 0; // even sentinell
while ( *high % 2 != 0 ) {
*high >>= 1;
--high;
}
*low = dummy;
#endif
// either ( low == high ) or ( *high is even )
if ( low < high ) {
// *low is odd and *high is even
std::swap( *low, *high );
} else {
break;
}
}
// low == high;
if ( *low % 2 == 0 ) {
*low >>= 1;
return(
has_dublicates_helper_aa( from, low, max >> 1 )
||
has_dublicates_helper_aa( high+1, to, max >>1 )
);
} else {
*low >>= 1;
return(
has_dublicates_helper_aa( from, low-1, max >> 1 )
||
has_dublicates_helper_aa( high, to, max >> 1 )
);
}
}
template < typename ConstIter >
bool has_dublicates_aa ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::vector< value_type > vector_type;
vector_type u_vect ( from, to );
return( has_dublicates_helper_aa( &u_vect[0], &u_vect[ u_vect.size()-1 ],
std::numeric_limits< unsigned long >::max() ) );
}
// this also uses the even/odd partitioning scheme,
// but a different stop criterion.
// (works for unsigned integer types)
void has_dublicates_helper_b ( u_long* from, u_long* to ) {
// WARNING, range is closed: [from,to]
if ( to <= from ) {
return;
}
// so to > from
if ( *from == *to ) {
throw( has_dublicates_notification() );
}
u_long* low = from;
u_long* high = to;
while ( true ) {
#if 0
// before inner loop optimization
while ( ( low < high ) && ( *low % 2 == 0 ) ) {
*low >>= 1;
++low;
}
while ( ( low < high ) && ( *high % 2 != 0 ) ) {
*high >>= 1;
--high;
}
#else
// optimized inner loop using sentinells.
unsigned long dummy = *high;
*high = 1; // odd sentinel
while ( *low % 2 == 0 ) {
*low >>= 1;
++low;
}
*high = dummy;
dummy = *low;
*low = 0; // even sentinell
while ( *high % 2 != 0 ) {
*high >>= 1;
--high;
}
*low = dummy;
#endif
// either ( low == high ) or ( *high is even )
if ( low < high ) {
// *low is odd and *high is even
std::swap( *low, *high );
} else {
break;
}
}
// low == high;
if ( *low % 2 == 0 ) {
*low >>= 1;
has_dublicates_helper_b( from, low );
has_dublicates_helper_b( high+1, to );
} else {
*low >>= 1;
has_dublicates_helper_b( from, low-1 );
has_dublicates_helper_b( high, to );
}
}
template < typename ConstIter >
bool has_dublicates_b ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::vector< value_type > vector_type;
vector_type u_vect ( from, to );
try {
has_dublicates_helper_b( &u_vect[0], &u_vect[ u_vect.size()-1 ] );
return( false );
}
catch ( has_dublicates_notification ) {
return( true );
}
}
// this is the most straight forward:
template < typename ConstIter >
bool has_dublicates_c ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::vector< value_type > vector_type;
vector_type v ( from, to );
std::sort( v.begin(), v.end() );
return( std::adjacent_find( v.begin(), v.end() ) != v.end() );
}
// this modifies heap sort:
// (unfortunately this is slow, if someone can make this fast,
// I would love to see that.)
template < typename ConstIter >
bool has_dublicates_d ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::vector< value_type > vector_type;
typedef typename vector_type::size_type size_type;
vector_type vect ( from, to );
/*
| This one is a little tough to grasp.
| We are searching for a dublicate. As long asa we do not
| find one, we organize the initial segment as a left-slanted heap,
| i.e., we think of the natural numbers as labelling a binary tree
| wherein K is the parent of 2K+1 and 2K+2. The heap invariant
| is that a root is always strictly bigger than it children and that
| every node in the left subtree is strictly bigger than every node
| in the right subtree.
*/
size_type high = 1;
size_type last = vect.size();
while ( high < last ) {
size_type bad_node = high;
while ( bad_node > 0 ) {
// THROUGHOUT THIS LOOP:
// The tree satisfies:
// a) left-slanted heap condition everywhere but possibly 'bad_node'
// b) the subtree headed by bad_node is a left-slanted heap.
// c) if 'bad_node' is a leaf, it does not have a right sibling.
// d)
if ( bad_node % 2 == 0 ) {
// 'bad_node' is a right child of its parent:
size_type left = bad_node - 1;
size_type right_most_child_in_left = left;
while ( right_most_child_in_left < high/2 ) {
right_most_child_in_left = 2*right_most_child_in_left + 2;
}
size_type parent = left / 2;
if ( vect[right_most_child_in_left] < vect[bad_node] ) {
std::swap( vect[right_most_child_in_left], vect[bad_node] );
bad_node = right_most_child_in_left;
// we are back to a leaf position:
} else if ( vect[right_most_child_in_left] == vect[bad_node] ) {
// found doublicate
return( true );
} else {
bad_node = parent;
}
} else {
// 'bad_node' is a left child.
size_type parent = bad_node / 2;
if ( vect[bad_node] == vect[parent] ) {
return( true );
} else if ( vect[parent] < vect[bad_node] ) {
std::swap( vect[parent], vect[bad_node] );
}
bad_node = parent;
}
}
++ high;
}
return( false );
}
// this modifies quicksort:
template < typename ConstIter >
void has_dublicates_helper_e ( ConstIter from, ConstIter to ) {
ConstIter low = from;
ConstIter high = to;
while ( low < high ) {
while ( *low < *high ) {
++low;
}
if ( low < high ) {
if ( *low == *high ) {
throw( has_dublicates_notification() );
}
std::swap( *low, *high );
}
while ( *low < *high ) {
--high;
}
if ( low < high ) {
if ( *low == *high ) {
throw( has_dublicates_notification() );
}
std::swap( *low, *high );
}
}
if ( from < low ) {
has_dublicates_helper_e( from, low-1 );
}
if ( high < to ) {
has_dublicates_helper_e( high+1, to );
}
}
template < typename ConstIter >
bool has_dublicates_e ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::vector< value_type > vector_type;
vector_type vect ( from, to );
try {
if ( vect.size() > 1 ) {
has_dublicates_helper_e( vect.begin(), vect.end() - 1 );
}
return( false );
}
catch ( has_dublicates_notification ) {
return( true );
}
}
// this modifies quicksort:
// (same as _e, but not using throw)
template < typename ConstIter >
bool has_dublicates_helper_ee ( ConstIter from, ConstIter to ) {
ConstIter low = from;
ConstIter high = to;
while ( low < high ) {
while ( *low < *high ) {
++low;
}
if ( low < high ) {
if ( *low == *high ) {
return( true );
}
std::swap( *low, *high );
}
while ( *low < *high ) {
--high;
}
if ( low < high ) {
if ( *low == *high ) {
return( true );
}
std::swap( *low, *high );
}
}
if ( ( from < low ) && has_dublicates_helper_ee( from, low-1 ) ) {
return( true );
}
if ( ( high < to ) && has_dublicates_helper_ee( high+1, to ) ) {
return( true );
}
return( false );
}
template < typename ConstIter >
bool has_dublicates_ee ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::vector< value_type > vector_type;
vector_type vect ( from, to );
return( has_dublicates_helper_ee( vect.begin(), vect.end() - 1 ) );
}
// this modifies introsort:
// (guaranteed O( N *log(N) )
template < typename ConstIter >
void has_dublicates_helper_f ( ConstIter from, ConstIter to,
unsigned short depth ) {
// WARNING: the range is [from,to].
if ( depth == 0 ) {
// Mussers self-inspection trick to ensure O( N * log(N) ).
std::stable_sort( from, to + 1 );
if ( std::adjacent_find( from, to+1 ) != to+1 ) {
throw( has_dublicates_notification() );
}
return;
}
ConstIter low = from;
ConstIter high = to;
while ( low < high ) {
while ( *low < *high ) {
++low;
}
if ( low < high ) {
if ( *low == *high ) {
throw( has_dublicates_notification() );
}
std::swap( *low, *high );
}
while ( *low < *high ) {
--high;
}
if ( low < high ) {
if ( *low == *high ) {
throw( has_dublicates_notification() );
}
std::swap( *low, *high );
}
}
if ( from < low ) {
has_dublicates_helper_f( from, low-1, depth - 1 );
}
if ( high < to ) {
has_dublicates_helper_f( high+1, to, depth - 1 );
}
}
template < typename ConstIter >
bool has_dublicates_f ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::vector< value_type > vector_type;
vector_type vect ( from, to );
unsigned short log = 0;
typename vector_type::size_type v_size = vect.size();
while ( v_size > 0 ) {
++ log;
v_size >>=1;
}
try {
has_dublicates_helper_f( vect.begin(), vect.end() - 1, 2*log );
return( false );
}
catch ( has_dublicates_notification ) {
return( true );
}
}
// this modifies introsort:
// (guaranteed O( N *log(N) )
// (same as _f, but not using throw)
template < typename ConstIter >
bool has_dublicates_helper_ff ( ConstIter from, ConstIter to,
unsigned short depth ) {
// WARNING: the range is [from,to].
if ( depth == 0 ) {
// Mussers self-inspection trick to ensure O( N * log(N) ).
std::stable_sort( from, to + 1 );
return ( std::adjacent_find( from, to+1 ) != to+1 );
}
ConstIter low = from;
ConstIter high = to;
while ( low < high ) {
while ( *low < *high ) {
++low;
}
if ( low < high ) {
if ( *low == *high ) {
return( true );
}
std::swap( *low, *high );
}
while ( *low < *high ) {
--high;
}
if ( low < high ) {
if ( *low == *high ) {
return( true );
}
std::swap( *low, *high );
}
}
if ( ( from < low ) && has_dublicates_helper_ff( from, low-1, depth - 1
) ) {
return( true );
}
if ( ( high < to ) && has_dublicates_helper_ff( high+1, to, depth - 1 ) )
{
return( true );
}
return( false );
}
template < typename ConstIter >
bool has_dublicates_ff ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::vector< value_type > vector_type;
vector_type vect ( from, to );
unsigned short log = 0;
typename vector_type::size_type v_size = vect.size();
while ( v_size > 0 ) {
++ log;
v_size >>=1;
}
return( has_dublicates_helper_ff( vect.begin(), vect.end() - 1, 2*log )
);
}
// this uses a set, but only checks at the very end:
// (the worst of all O( N * log( N ) )
template < typename ConstIter >
bool has_dublicates_g ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::set< value_type > set_type;
typedef typename set_type::size_type size_type;
size_type i = 0;
set_type s;
for ( ConstIter iter = from; iter != to; ++iter ) {
++ i;
s.insert( *iter );
}
if ( s.size() != i ) {
return( true );
}
return( false );
}
// this uses a has table of sets:
// (implemented for unsigned integer types only)
template < typename ConstIter >
bool has_dublicates_h ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::set< value_type > set_type;
typedef typename set_type::size_type size_type;
typedef set_type set_array [ 256 ];
typedef size_type size_array [ 256 ];
size_array sizes;
set_array sets;
for ( unsigned long i = 0; i < 256; ++i ) {
sizes[i] = 0;
}
for ( ConstIter iter = from; iter != to; ++iter ) {
unsigned int hash = *iter % 256;
++sizes[hash];
sets[hash].insert( *iter );
if ( sizes[hash] != sets[hash].size() ) {
return( true );
}
}
return( false );
}
// this one is quadratic:
// but does not need to copy the range
template < typename ConstIter >
bool has_dublicates_i ( ConstIter from, ConstIter to ) {
for ( ConstIter high = from; high < to; ++high ) {
for ( ConstIter low = from; low < high; ++low ) {
if ( *low == *high ) {
return( true );
}
}
}
return( false );
}
// this uses a set:
template < typename ConstIter >
bool has_dublicates_j ( ConstIter from, ConstIter to ) {
typedef typename std::iterator_traits< ConstIter >::value_type value_type;
typedef typename std::set< value_type > set_type;
typedef typename set_type::size_type size_type;
size_type i = 0;
set_type s;
for ( ConstIter iter = from; iter != to; ++iter ) {
++ i;
s.insert( *iter );
// this test could also be done after the loop:
if ( s.size() != i ) {
return( true );
}
}
return( false );
}
#include <ctime>
#include <cstdlib>
class RandomNumberGenerator {
public:
RandomNumberGenerator ( unsigned long int seed )
{
std::srand( seed );
}
RandomNumberGenerator ( void )
{}
unsigned long lower ( void ) const {
return ( 0 );
}
unsigned long upper ( void ) const {
return ( RAND_MAX );
}
unsigned long operator() ( void ) {
return( std::rand() );
}
unsigned long int operator() ( unsigned long int bound ) {
unsigned long int_width = RAND_MAX / bound;
unsigned long max_valid = int_width * bound;
unsigned long seed;
do {
seed = std::rand();
} while ( seed >= max_valid );
return( seed / int_width );
}
}; // RandomNumberGeneratorBase
typedef std::vector< unsigned long > u_vector;
void print_measurements ( const u_vector & vect ) {
{
std::clock_t start = std::clock();
std::cout << "_a: " << has_dublicates_a( vect.begin(), vect.end() ) <<
" ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
{
std::clock_t start = std::clock();
std::cout << "_aa: " << has_dublicates_aa( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
{
std::clock_t start = std::clock();
std::cout << "_b: " << has_dublicates_b( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
/*
{
std::clock_t start = std::clock();
std::cout << "_c: " << has_dublicates_c( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
*/
/*
{
std::clock_t start = std::clock();
std::cout << "_d: " << has_dublicates_d( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
*/
/*
{
std::clock_t start = std::clock();
std::cout << "_e: " << has_dublicates_e( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
{
std::clock_t start = std::clock();
std::cout << "_ee: " << has_dublicates_ee( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
*/
{
std::clock_t start = std::clock();
std::cout << "_f: " << has_dublicates_f( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
{
std::clock_t start = std::clock();
std::cout << "_ff: " << has_dublicates_ff( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
/*
{
std::clock_t start = std::clock();
std::cout << "_g: " << has_dublicates_g( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
*/
/*
{
std::clock_t start = std::clock();
std::cout << "_h: " << has_dublicates_h( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
*/
/*
{
std::clock_t start = std::clock();
std::cout << "_i: " << has_dublicates_i( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
*/
/*
{
std::clock_t start = std::clock();
std::cout << "_i: " << has_dublicates_j( vect.begin(), vect.end() )
<< " ";
std::clock_t stop = std::clock();
std::cout << stop - start << "\n";
}
*/
}
int main( int argn, char ** args ){
u_vector vect;
unsigned long size = std::atoi( args[1] );
unsigned long max = std::atoi( args[2] ) + 1;
unsigned long seed = std::atoi( args[3] );
RandomNumberGenerator R ( seed );
for ( unsigned long i = 0; i < size; ++i ) {
vect.push_back( R(max) );
}
std::sort ( vect.begin(), vect.end() );
std::cout << "random, sorte up:\n";
print_measurements( vect );
std::reverse( vect.begin(), vect.end() );
std::cout << "random, sorted down:\n";
print_measurements( vect );
std::random_shuffle( vect.begin(), vect.end() );
std::cout << "random:\n";
print_measurements( vect );
vect.clear();
for ( unsigned long i = 0; i < size; ++i ) {
vect.push_back( i );
}
std::cout << "unique, sorted up\n";
print_measurements( vect );
std::reverse( vect.begin(), vect.end() );
std::cout << "unique, sorted down:\n";
print_measurements( vect );
std::random_shuffle( vect.begin(), vect.end() );
std::cout << "unique, random:\n";
print_measurements( vect );
}
/*
usage:
a.out <length> <random_number_range> <random_number_seed>
*/