I was trying to optimize the Radix Sort code, because I felt there was room for it as traditional codes in books and on web seem a direct copy of one another and also they work very slow as they take an arbitrary number such as 10 for modulo operation. I have optimized the code as far as I could go, maybe I might have missed some optimization techniques. In that case please enlighten me.
Motivation for optimization:
http://codercorner.com/RadixSortRevisited.htm
http://stereopsis.com/radix.html
I was unable to implement all the optimizations in the articles, mostly it was beyond my skills and understanding and lack of sufficient time, if you can feel free to implement them.
EDIT 4
This Java version of Radix Sort calculates all histograms in 1 read and does not need to fill array Z with zeros after every LSB sort along with the usual ability to skip sorting and jump to next LSB sorting if all previous LSB's are same. As usual this is only for 32-bit integers but a 64-bit version can be created from it.
protected static int[] DSC(int A[])// Sorts in descending order
{
int tmp[] = new int[A.length] ;
int Z[] = new int[1024] ;
int i, Jump, Jump2, Jump3, Jump4, swap[] ;
Jump = A[0] & 255 ;
Z[Jump] = 1 ;
Jump2 = ((A[0] >> 8) & 255) + 256 ;
Z[Jump2] = 1 ;
Jump3 = ((A[0] >> 16) & 255) + 512 ;
Z[Jump3] = 1 ;
Jump4 = (A[0] >> 24) + 768 ;
Z[Jump4] = 1 ;
// Histograms creation
for (i = 1 ; i < A.length; ++i)
{
++Z[A[i] & 255] ;
++Z[((A[i] >> 8) & 255) + 256] ;
++Z[((A[i] >> 16) & 255) + 512] ;
++Z[(A[i] >> 24) + 768] ;
}
// 1st LSB Byte Sort
if( Z[Jump] != A.length )
{
Z[0] = A.length - Z[0];
for (i = 1; i < 256; ++i)
{
Z[i] = Z[i - 1] - Z[i];
}
for (i = 0; i < A.length; ++i)
{
tmp[Z[A[i] & 255]++] = A[i];
}
swap = A ; A = tmp ; tmp = swap ;
}
// 2nd LSB Byte Sort
if( Z[Jump2] != A.length )
{
Z[256] = A.length - Z[256];
for (i = 257; i < 512; ++i)
{
Z[i] = Z[i - 1] - Z[i];
}
for (i = 0; i < A.length; ++i)
{
tmp[Z[((A[i] >> 8) & 255) + 256]++] = A[i];
}
swap = A ; A = tmp ; tmp = swap ;
}
// 3rd LSB Byte Sort
if( Z[Jump3] != A.length )
{
Z[512] = A.length - Z[512];
for (i = 513; i < 768; ++i)
{
Z[i] = Z[i - 1] - Z[i];
}
for (i = 0; i < A.length; ++i)
{
tmp[Z[((A[i] >> 16) & 255) + 512]++] = A[i];
}
swap = A ; A = tmp ; tmp = swap ;
}
// 4th LSB Byte Sort
if( Z[Jump4] != A.length )
{
Z[768] = A.length - Z[768];
for (i = 769; i < Z.length; ++i)
{
Z[i] = Z[i - 1] - Z[i];
}
for (i = 0; i < A.length; ++i)
{
tmp[Z[(A[i] >> 24) + 768]++] = A[i];
}
return tmp ;
}
return A ;
}
The Java version ran faster with != sign than == sign
if( Z[Jump] != A.length )
{
// lines of code
}...
but in C the below version was on average, 25% faster (with equalto sign) than its counterpart with != sign. Your hardware might react differently.
if( Z[Jump] == A.length );
else
{
// lines of code
}...
Below is the C code ( "long" on my machine is 32 bits )
long* Radix_2_ac_long(long *A, size_t N, long *Temp)// Sorts in ascending order
{
size_t Z[1024] = {0};
long *swp;
size_t i, Jump, Jump2, Jump3, Jump4;
// Sort-circuit set-up
Jump = *A & 255;
Z[Jump] = 1;
Jump2 = ((*A >> 8) & 255) + 256;
Z[Jump2] = 1;
Jump3 = ((*A >> 16) & 255) + 512;
Z[Jump3] = 1;
Jump4 = (*A >> 24) + 768;
Z[Jump4] = 1;
// Histograms creation
for(i = 1 ; i < N ; ++i)
{
++Z[*(A+i) & 255];
++Z[((*(A+i) >> 8) & 255) + 256];
++Z[((*(A+i) >> 16) & 255) + 512];
++Z[(*(A+i) >> 24) + 768];
}
// 1st LSB byte sort
if( Z[Jump] == N );
else
{
for( i = 1 ; i < 256 ; ++i )
{
Z[i] = Z[i-1] + Z[i];
}
for( i = N-1 ; i < N ; --i )
{
*(--Z[*(A+i) & 255] + Temp) = *(A+i);
}
swp = A;
A = Temp;
Temp = swp;
}
// 2nd LSB byte sort
if( Z[Jump2] == N );
else
{
for( i = 257 ; i < 512 ; ++i )
{
Z[i] = Z[i-1] + Z[i];
}
for( i = N-1 ; i < N ; --i )
{
*(--Z[((*(A+i) >> 8) & 255) + 256] + Temp) = *(A+i);
}
swp = A;
A = Temp;
Temp = swp;
}
// 3rd LSB byte sort
if( Z[Jump3] == N );
else
{
for( i = 513 ; i < 768 ; ++i )
{
Z[i] = Z[i-1] + Z[i];
}
for( i = N-1 ; i < N ; --i )
{
*(--Z[((*(A+i) >> 16) & 255) + 512] + Temp) = *(A+i);
}
swp = A;
A = Temp;
Temp = swp;
}
// 4th LSB byte sort
if( Z[Jump4] == N );
else
{
for( i = 769 ; i < 1024 ; ++i )
{
Z[i] = Z[i-1] + Z[i];
}
for( i = N-1 ; i < N ; --i )
{
*(--Z[(*(A+i) >> 24) + 768] + Temp) = *(A+i);
}
return Temp;
}
return A;
}
EDIT 5
The sort now handles negative numbers too. Only some minor/negligible tweaks to the code did it. It runs a little slower as a result but the effect is not significant. Coded in C, below ( "long" on my system is 32 bits )
long* Radix_Sort(long *A, size_t N, long *Temp)
{
size_t Z[1024] = {0};
long *swp;
size_t Jump, Jump2, Jump3, Jump4;
long i;
// Sort-circuit set-up
Jump = *A & 255;
Z[Jump] = 1;
Jump2 = ((*A >> 8) & 255) + 256;
Z[Jump2] = 1;
Jump3 = ((*A >> 16) & 255) + 512;
Z[Jump3] = 1;
Jump4 = ((*A >> 24) & 255) + 768;
Z[Jump4] = 1;
// Histograms creation
for(i = 1 ; i < N ; ++i)
{
++Z[*(A+i) & 255];
++Z[((*(A+i) >> 8) & 255) + 256];
++Z[((*(A+i) >> 16) & 255) + 512];
++Z[((*(A+i) >> 24) & 255) + 768];
}
// 1st LSB byte sort
if( Z[Jump] == N );
else
{
for( i = 1 ; i < 256 ; ++i )
{
Z[i] = Z[i-1] + Z[i];
}
for( i = N-1 ; i >= 0 ; --i )
{
*(--Z[*(A+i) & 255] + Temp) = *(A+i);
}
swp = A;
A = Temp;
Temp = swp;
}
// 2nd LSB byte sort
if( Z[Jump2] == N );
else
{
for( i = 257 ; i < 512 ; ++i )
{
Z[i] = Z[i-1] + Z[i];
}
for( i = N-1 ; i >= 0 ; --i )
{
*(--Z[((*(A+i) >> 8) & 255) + 256] + Temp) = *(A+i);
}
swp = A;
A = Temp;
Temp = swp;
}
// 3rd LSB byte sort
if( Z[Jump3] == N );
else
{
for( i = 513 ; i < 768 ; ++i )
{
Z[i] = Z[i-1] + Z[i];
}
for( i = N-1 ; i >= 0 ; --i )
{
*(--Z[((*(A+i) >> 16) & 255) + 512] + Temp) = *(A+i);
}
swp = A;
A = Temp;
Temp = swp;
}
// 4th LSB byte sort and negative numbers sort
if( Z[Jump4] == N );
else
{
for( i = 897 ; i < 1024 ; ++i )// -ve values frequency starts after index 895, i.e at 896 ( 896 = 768 + 128 ), goes upto 1023
{
Z[i] = Z[i-1] + Z[i];
}
Z[768] = Z[768] + Z[1023];
for( i = 769 ; i < 896 ; ++i )
{
Z[i] = Z[i-1] + Z[i];
}
for( i = N-1 ; i >= 0 ; --i )
{
*(--Z[((*(A+i) >> 24) & 255) + 768] + Temp) = *(A+i);
}
return Temp;
}
return A;
}
EDIT 6
Below is the pointer optimized version ( accesses array locations via pointers ) that takes on average, approximately 20% less time to sort than the one above. It also uses 4 separate arrays for faster address calculation ( "long" on my system is 32 bits ).
long* Radix_Sort(long *A, size_t N, long *Temp)
{
long Z1[256] ;
long Z2[256] ;
long Z3[256] ;
long Z4[256] ;
long T = 0 ;
while(T != 256)
{
*(Z1+T) = 0 ;
*(Z2+T) = 0 ;
*(Z3+T) = 0 ;
*(Z4+T) = 0 ;
++T;
}
size_t Jump, Jump2, Jump3, Jump4;
// Sort-circuit set-up
Jump = *A & 255 ;
Z1[Jump] = 1;
Jump2 = (*A >> 8) & 255 ;
Z2[Jump2] = 1;
Jump3 = (*A >> 16) & 255 ;
Z3[Jump3] = 1;
Jump4 = (*A >> 24) & 255 ;
Z4[Jump4] = 1;
// Histograms creation
long *swp = A + N;
long *i = A + 1;
for( ; i != swp ; ++i)
{
++Z1[*i & 255];
++Z2[(*i >> 8) & 255];
++Z3[(*i >> 16) & 255];
++Z4[(*i >> 24) & 255];
}
// 1st LSB byte sort
if( Z1[Jump] == N );
else
{
swp = Z1+256 ;
for( i = Z1+1 ; i != swp ; ++i )
{
*i = *(i-1) + *i;
}
swp = A-1;
for( i = A+N-1 ; i != swp ; --i )
{
*(--Z1[*i & 255] + Temp) = *i;
}
swp = A;
A = Temp;
Temp = swp;
}
// 2nd LSB byte sort
if( Z2[Jump2] == N );
else
{
swp = Z2+256 ;
for( i = Z2+1 ; i != swp ; ++i )
{
*i = *(i-1) + *i;
}
swp = A-1;
for( i = A+N-1 ; i != swp ; --i )
{
*(--Z2[(*i >> 8) & 255] + Temp) = *i;
}
swp = A;
A = Temp;
Temp = swp;
}
// 3rd LSB byte sort
if( Z3[Jump3] == N );
else
{
swp = Z3 + 256 ;
for( i = Z3+1 ; i != swp ; ++i )
{
*i = *(i-1) + *i;
}
swp = A-1;
for( i = A+N-1 ; i != swp ; --i )
{
*(--Z3[(*i >> 16) & 255] + Temp) = *i;
}
swp = A;
A = Temp;
Temp = swp;
}
// 4th LSB byte sort and negative numbers sort
if( Z4[Jump4] == N );
else
{
swp = Z4 + 256 ;
for( i = Z4+129 ; i != swp ; ++i )
{
*i = *(i-1) + *i;
}
*Z4 = *Z4 + *(Z4+255) ;
swp = Z4 + 128 ;
for( i = Z4+1 ; i != swp ; ++i )
{
*i = *(i-1) + *i;
}
swp = A - 1;
for( i = A+N-1 ; i != swp ; --i )
{
*(--Z4[(*i >> 24) & 255] + Temp) = *i;
}
return Temp;
}
return A;
}
See Question&Answers more detail:
os