Sierra Toolkit Version of the Day
RadixSort.hpp
00001 /*------------------------------------------------------------------------*/
00002 /*                 Copyright 2010 Sandia Corporation.                     */
00003 /*  Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive   */
00004 /*  license for use of this work by or on behalf of the U.S. Government.  */
00005 /*  Export of this program may require a license from the                 */
00006 /*  United States Government.                                             */
00007 /*------------------------------------------------------------------------*/
00008 
00009 // Portions of this source code file are:
00010 // Copyright (c) 2010, Victor J. Duvanenko.
00011 // All rights reserved.
00012 // Used with permission.
00013 //
00014 // Redistribution and use in source and binary forms, with or without
00015 // modification, are permitted provided that the following conditions
00016 // are met:
00017 //   * Redistributions of source code must retain the above copyright
00018 //     notice, this list of conditions and the following disclaimer.
00019 //   * Redistributions in binary form must reproduce the above copyright
00020 //     notice, this list of conditions and the following disclaimer in the
00021 //     documentation and/or other materials provided with the distribution.
00022 //   * The name of Victor J. Duvanenko may not be used to endorse or
00023 //     promote products derived from this software without specific prior
00024 //     written permission.
00025 //
00026 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER "AS IS" AND ANY
00027 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00029 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE
00030 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00031 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00032 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
00033 // BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00034 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
00035 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
00036 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00037 
00038 #ifndef STK_UTIL_UTIL_RADIXSORT_HPP
00039 #define STK_UTIL_UTIL_RADIXSORT_HPP
00040 
00041 //
00042 // Internal implementation
00043 //
00044 namespace { // anonymous namespace
00045 
00046 // Swap that does not check for self-assignment.
00047 template< class T >
00048 inline void swap_impl( T& a, T& b )
00049 {
00050   T tmp = a;
00051   a     = b;
00052   b     = tmp;
00053 }
00054 
00055 // insertion sort similar to STL with no self-assignment
00056 template <class T>
00057 inline void insertion_sort_impl( T* a, size_t a_size )
00058 {
00059   for ( size_t i = 1; i < a_size; ++i )
00060   {
00061     if ( a[ i ] < a[ i - 1 ] )    // no need to do (j > 0) compare for the first iteration
00062     {
00063       T currentElement = a[ i ];
00064       a[ i ] = a[ i - 1 ];
00065       size_t j;
00066       for ( j = i - 1; j > 0 && currentElement < a[ j - 1 ]; --j )
00067       {
00068         a[ j ] = a[ j - 1 ];
00069       }
00070       a[ j ] = currentElement;  // always necessary work/write
00071     }
00072     // Perform no work at all if the first comparison fails - i.e. never assign an element to itself!
00073   }
00074 }
00075 
00076 // Recursive implementation of radix sort for unsigned integer types only.
00077 //
00078 //    PowerOfTwoRadix - must be a power of 2, 256 is suggested for current hardware as of 03/20/2011.
00079 //    Log2ofPowerOfTwoRadix - for example log( 256 ) = 8
00080 //    Threshold - length below which array sections are sorted with an insertion sort
00081 //    a - pointer to start of an array of integer type
00082 //    last - one less than the length of the a array
00083 //    bitMask - controls how many and which bits we process at a time
00084 //    shiftRightAmount - sizeof(T) * 8 - Log2ofPowerOfTwoRadix
00085 template< class T, unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold >
00086 inline void radix_sort_unsigned_impl( T* a, long last, T bitMask, unsigned long shiftRightAmount )
00087 {
00088   unsigned long count[ PowerOfTwoRadix ];
00089   for( unsigned long i = 0; i < PowerOfTwoRadix; ++i )     count[ i ] = 0;
00090   for ( long _current = 0; _current <= last; ++_current )  count[ (unsigned long)(( a[ _current ] & bitMask ) >> shiftRightAmount ) ]++;  // Scan the array and count the number of times each value appears
00091 
00092   long startOfBin[ PowerOfTwoRadix + 1 ], endOfBin[ PowerOfTwoRadix ], nextBin = 1;
00093   startOfBin[ 0 ] = endOfBin[ 0 ] = 0;    startOfBin[ PowerOfTwoRadix ] = -1;     // sentinel
00094   for( unsigned long i = 1; i < PowerOfTwoRadix; ++i )
00095     startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count[ i - 1 ];
00096 
00097   for ( long _current = 0; _current <= last; )
00098   {
00099     unsigned long digit;
00100     T _current_element = a[ _current ]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
00101     while( endOfBin[ digit = (unsigned long)(( _current_element & bitMask ) >> shiftRightAmount )] != _current )  swap_impl( _current_element, a[ endOfBin[ digit ]++ ] );
00102     a[ _current ] = _current_element;
00103 
00104     endOfBin[ digit ]++;
00105     while( endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] )  ++nextBin; // skip over empty and full bins, when the end of the current bin reaches the start of the next bin
00106     _current = endOfBin[ nextBin - 1 ];
00107   }
00108   bitMask >>= Log2ofPowerOfTwoRadix;
00109   if ( bitMask != 0 )           // end recursion when all the bits have been processes
00110   {
00111     if ( shiftRightAmount >= Log2ofPowerOfTwoRadix )  shiftRightAmount -= Log2ofPowerOfTwoRadix;
00112     else                        shiftRightAmount  = 0;
00113 
00114     for( unsigned long i = 0; i < PowerOfTwoRadix; ++i )
00115     {
00116       long numberOfElements = endOfBin[ i ] - startOfBin[ i ];
00117       if ( numberOfElements >= Threshold )    // endOfBin actually points to one beyond the bin
00118         radix_sort_unsigned_impl< T, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &a[ startOfBin[ i ]], numberOfElements - 1, bitMask, shiftRightAmount );
00119       else if ( numberOfElements >= 2 )
00120         insertion_sort_impl( &a[ startOfBin[ i ]], numberOfElements );
00121     }
00122   }
00123 }
00124 
00125 } // end anonymous namespace
00126 
00127 //
00128 // Public API
00129 //
00130 namespace stk {
00131 namespace util {
00132 
00153 template< class T >
00154 void radix_sort_unsigned( T* a, size_t a_size )
00155 {
00156   const long Threshold                      =  25;  // smaller array sections sorted by insertion sort
00157   const unsigned long PowerOfTwoRadix       = 256;  // Radix - must be a power of 2
00158   const unsigned long Log2ofPowerOfTwoRadix =   8;  // log( 256 ) = 8
00159 
00160   if ( a_size >= (size_t)Threshold ) {
00161     if (a_size > (size_t)std::numeric_limits<long>::max()) {
00162       std::ostringstream msg ;
00163       msg << "stk::utility::radix_sort() exceeded allowable array size (";
00164       msg << a_size << " < " << std::numeric_limits<long>::max() << ")";
00165       throw std::runtime_error( msg.str() );
00166     }
00167     unsigned long shiftRightAmount = sizeof(T) * 8 - Log2ofPowerOfTwoRadix;
00168     T bitMask = (T)( ((T)( PowerOfTwoRadix - 1 )) << shiftRightAmount );  // bitMask controls/selects how many and which bits we process at a time
00169     radix_sort_unsigned_impl< T, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( a, a_size - 1, bitMask, shiftRightAmount );
00170   }
00171   else
00172     insertion_sort_impl( a, a_size );
00173 }
00174 
00175 } // namespace util
00176 } // namespace stk
00177 
00178 
00179 //--------------------------------------
00180 // PARALLEL RADIX SORT using TBB library
00181 //--------------------------------------
00182 
00183 /* TODO: this code works with TBB, it just needs minor modifications and some renaming
00184 template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, class T >
00185 class RadixInPlaceOperation_3
00186 {
00187   T* my_a;                    // a local copy to the input array to provide a pointer to each parallel task
00188   long*  my_startOfBin;
00189   long*  my_endOfBin;
00190   T  my_bitMask;                  // a local copy of the bitMask
00191   unsigned long my_shiftRightAmount;
00192   static const unsigned long Threshold_P = 10000;   // threshold when to switch between parallel and non-parallel implementations
00193 public:
00194   static const unsigned long numberOfBins = PowerOfTwoRadix;
00195   unsigned long my_count[ numberOfBins ];       // the count for this task
00196 
00197   RadixInPlaceOperation_3(  T a[], long* startOfBin, long* endOfBin, T bitMask, unsigned long shiftRightAmount ) :
00198                 my_a( a ), my_startOfBin( startOfBin ), my_endOfBin( endOfBin ),
00199                 my_bitMask( bitMask ), my_shiftRightAmount( shiftRightAmount )
00200   {}
00201   void operator()( const blocked_range< long >& r ) const
00202   {
00203     for( long i = r.begin(); i != r.end(); ++i )
00204     {
00205       long numOfElements = my_endOfBin[ i ] - my_startOfBin[ i ];
00206       if ( numOfElements >= Threshold_P )
00207         _RadixSort_Unsigned_PowerOf2Radix_3< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &my_a[ my_startOfBin[ i ]], numOfElements - 1, my_bitMask, my_shiftRightAmount );
00208       else if ( numOfElements >= Threshold )
00209         _RadixSort_Unsigned_PowerOf2Radix_1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &my_a[ my_startOfBin[ i ]], numOfElements - 1, my_bitMask, my_shiftRightAmount );
00210       else if ( numOfElements >= 2 )
00211         insertion_sort_impl( &my_a[ my_startOfBin[ i ]], numOfElements );
00212     }
00213   }
00214 };
00215 
00216 template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, class T >
00217 inline void _RadixSort_Unsigned_PowerOf2Radix_3( T* a, long last, T bitMask, unsigned long shiftRightAmount )
00218 {
00219   const unsigned long numberOfBins = PowerOfTwoRadix;
00220 
00221   CountingType_1< PowerOfTwoRadix, T >  count( a, bitMask, shiftRightAmount );  // contains the count array, which is initialized to all zeros
00222   // Scan the array and count the number of times each value appears
00223   parallel_reduce( blocked_range< unsigned long >( 0, last + 1, 1000 ), count );
00224 
00225   long startOfBin[ numberOfBins ], endOfBin[ numberOfBins ], nextBin;
00226   startOfBin[ 0 ] = endOfBin[ 0 ] = nextBin = 0;
00227   for( unsigned long i = 1; i < numberOfBins; i++ )
00228     startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count.my_count[ i - 1 ];
00229 
00230   for ( long _current = 0; _current <= last; )
00231   {
00232     unsigned long digit;
00233     T tmp = a[ _current ];  // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
00234     while ( true ) {
00235       digit = (unsigned long)(( tmp & bitMask ) >> shiftRightAmount );  // extract the digit we are sorting based on
00236       if ( endOfBin[ digit ] == _current )
00237         break;
00238       swap_impl( tmp, a[ endOfBin[ digit ] ] );
00239       endOfBin[ digit ]++;
00240     }
00241     a[ _current ] = tmp;
00242 
00243     endOfBin[ digit ]++;          // leave the element at its location and grow the bin
00244     _current++;               // advance the current pointer to the next element
00245     while( _current >= startOfBin[ nextBin ] && nextBin < numberOfBins )
00246       nextBin++;
00247     while( endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] && nextBin < numberOfBins )
00248       nextBin++;
00249     if ( _current < endOfBin[ nextBin - 1 ] )
00250        _current = endOfBin[ nextBin - 1 ];
00251   }
00252   bitMask >>= Log2ofPowerOfTwoRadix;
00253   if ( bitMask != 0 )           // end recursion when all the bits have been processes
00254   {
00255     if ( shiftRightAmount >= Log2ofPowerOfTwoRadix )  shiftRightAmount -= Log2ofPowerOfTwoRadix;
00256     else                        shiftRightAmount  = 0;
00257 
00258     RadixInPlaceOperation_3< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold, T >  radixOp( a, startOfBin, endOfBin, bitMask, shiftRightAmount );
00259     parallel_for( blocked_range< long >( 0, numberOfBins ), radixOp );
00260   }
00261 }
00262 
00263 template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, class T >
00264 inline void _RadixSort_Unsigned_PowerOf2Radix_1( T* a, long last, T bitMask, unsigned long shiftRightAmount )
00265 {
00266   const unsigned long numberOfBins = PowerOfTwoRadix;
00267   unsigned long count[ numberOfBins ];
00268 // Counting occurrence of digits within the array (related to Counting Sort)
00269   for( unsigned long i = 0; i < numberOfBins; i++ )
00270     count[ i ] = 0;
00271 
00272   for ( long _current = 0; _current <= last; _current++ ) // Scan the array and count the number of times each value appears
00273   {
00274     unsigned long digit = (unsigned long)(( a[ _current ] & bitMask ) >> shiftRightAmount );  // extract the digit we are sorting based on
00275     count[ digit ]++;
00276   }
00277 // Moving array elements into their bins
00278   long startOfBin[ numberOfBins ], endOfBin[ numberOfBins ], nextBin;
00279   startOfBin[ 0 ] = endOfBin[ 0 ] = nextBin = 0;
00280   for( unsigned long i = 1; i < numberOfBins; i++ )
00281     startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count[ i - 1 ];
00282 
00283   for ( long _current = 0; _current <= last; )
00284   {
00285     unsigned long digit;
00286     T tmp = a[ _current ];  // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
00287     while ( true ) {
00288       digit = (unsigned long)(( tmp & bitMask ) >> shiftRightAmount );  // extract the digit we are sorting based on
00289       if ( endOfBin[ digit ] == _current )
00290         break;
00291       swap_impl( tmp, a[ endOfBin[ digit ] ] );
00292       endOfBin[ digit ]++;
00293     }
00294     a[ _current ] = tmp;
00295 
00296     endOfBin[ digit ]++;          // leave the element at its location and grow the bin
00297     _current++;               // advance the current pointer to the next element
00298     while( _current >= startOfBin[ nextBin ] && nextBin < numberOfBins )
00299       nextBin++;
00300     while( endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] && nextBin < numberOfBins )
00301       nextBin++;
00302     if ( _current < endOfBin[ nextBin - 1 ] )
00303        _current = endOfBin[ nextBin - 1 ];
00304   }
00305 // Recursion for each bin
00306   bitMask >>= Log2ofPowerOfTwoRadix;
00307   if ( bitMask != 0 )           // end recursion when all the bits have been processes
00308   {
00309     if ( shiftRightAmount >= Log2ofPowerOfTwoRadix )  shiftRightAmount -= Log2ofPowerOfTwoRadix;
00310     else                        shiftRightAmount  = 0;
00311 
00312     for( unsigned long i = 0; i < numberOfBins; i++ )
00313     {
00314       long numberOfElements = endOfBin[ i ] - startOfBin[ i ];
00315       if ( numberOfElements >= Threshold ) {    // endOfBin actually points to one beyond the bin
00316         _RadixSort_Unsigned_PowerOf2Radix_1< T, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &a[ startOfBin[ i ]], numberOfElements - 1, bitMask, shiftRightAmount );
00317     }
00318     else if ( numberOfElements >= 2 ) {
00319     insertion_sort_impl( &a[ startOfBin[ i ]], numberOfElements );
00320     }
00321     }
00322   }
00323 }
00324 
00325 // Top-level template function
00326 template< class T >
00327 inline void RadixSortParallel( T* a, unsigned long a_size )
00328 {
00329   if ( a_size < 2 ) return;
00330 
00331   const unsigned long Threshold             =  32;  // Threshold of when to switch to using Insertion Sort
00332   const unsigned long PowerOfTwoRadix       = 256;  // Radix - must be a power of 2
00333   const unsigned long Log2ofPowerOfTwoRadix =   8;
00334   unsigned long shiftRightAmount = sizeof( T ) * 8 - Log2ofPowerOfTwoRadix;   // Create bit-mask and shift right amount
00335   T bitMask = (T)( ((T)( PowerOfTwoRadix - 1 )) << shiftRightAmount );  // bitMask controls/selects how many and which bits we process at a time
00336 
00337   if ( a_size >= Threshold )
00338     _RadixSort_Unsigned_PowerOf2Radix_3< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( a, a_size - 1, bitMask, shiftRightAmount );
00339   else
00340     insertion_sort_impl( a, a_size );
00341 }
00342 
00343 */
00344 
00345 
00346 
00347 #endif /* STK_UTIL_UTIL_RADIXSORT_HPP */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends