Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
339 views
in Technique[技术] by (71.8m points)

c - Radix Sort Optimization

I was trying to optimize the Radix Sort code, because I felt there was room for it as traditional codes in books and on web seem a direct copy of one another and also they work very slow as they take an arbitrary number such as 10 for modulo operation. I have optimized the code as far as I could go, maybe I might have missed some optimization techniques. In that case please enlighten me.

Motivation for optimization:
http://codercorner.com/RadixSortRevisited.htm
http://stereopsis.com/radix.html
I was unable to implement all the optimizations in the articles, mostly it was beyond my skills and understanding and lack of sufficient time, if you can feel free to implement them.

EDIT 4
This Java version of Radix Sort calculates all histograms in 1 read and does not need to fill array Z with zeros after every LSB sort along with the usual ability to skip sorting and jump to next LSB sorting if all previous LSB's are same. As usual this is only for 32-bit integers but a 64-bit version can be created from it.

protected static int[] DSC(int A[])// Sorts in descending order
{
    int tmp[] = new int[A.length] ;
    int Z[] = new int[1024] ;
    int i, Jump, Jump2, Jump3, Jump4, swap[] ;

    Jump = A[0] & 255 ;
    Z[Jump] = 1 ;
    Jump2 = ((A[0] >> 8) & 255) + 256 ;
    Z[Jump2] = 1 ;
    Jump3 = ((A[0] >> 16) & 255) + 512 ;
    Z[Jump3] = 1 ;
    Jump4 = (A[0] >> 24) + 768 ;
    Z[Jump4] = 1 ;

    // Histograms creation
    for (i = 1 ; i < A.length; ++i)
    {
        ++Z[A[i] & 255] ;
        ++Z[((A[i] >> 8) & 255) + 256] ;
        ++Z[((A[i] >> 16) & 255) + 512] ;
        ++Z[(A[i] >> 24) + 768] ;
    }

    // 1st LSB Byte Sort
    if( Z[Jump] != A.length )
    {
        Z[0] = A.length - Z[0];
        for (i = 1; i < 256; ++i)
        {
            Z[i] = Z[i - 1] - Z[i];
        }
        for (i = 0; i < A.length; ++i)
        {
            tmp[Z[A[i] & 255]++] = A[i];
        }
        swap = A ; A = tmp ; tmp = swap ;
    }

    // 2nd LSB Byte Sort
    if( Z[Jump2] != A.length )
    {
        Z[256] = A.length - Z[256];
        for (i = 257; i < 512; ++i)
        {
            Z[i] = Z[i - 1] - Z[i];
        }
        for (i = 0; i < A.length; ++i)
        {
            tmp[Z[((A[i] >> 8) & 255) + 256]++] = A[i];
        }
        swap = A ; A = tmp ; tmp = swap ;
    }

    // 3rd LSB Byte Sort
    if( Z[Jump3] != A.length )
    {
        Z[512] = A.length - Z[512];
        for (i = 513; i < 768; ++i)
        {
            Z[i] = Z[i - 1] - Z[i];
        }
        for (i = 0; i < A.length; ++i)
        {
            tmp[Z[((A[i] >> 16) & 255) + 512]++] = A[i];
        }
        swap = A ; A = tmp ; tmp = swap ;
    }

    // 4th LSB Byte Sort
    if( Z[Jump4] != A.length )
    {
        Z[768] = A.length - Z[768];
        for (i = 769; i < Z.length; ++i)
        {
            Z[i] = Z[i - 1] - Z[i];
        }
        for (i = 0; i < A.length; ++i)
        {
            tmp[Z[(A[i] >> 24) + 768]++] = A[i];
        }
        return tmp ;
    }
    return A ;
}

The Java version ran faster with != sign than == sign

if( Z[Jump] != A.length )
{
    // lines of code
}...

but in C the below version was on average, 25% faster (with equalto sign) than its counterpart with != sign. Your hardware might react differently.

if( Z[Jump] == A.length );
else
{
    // lines of code
}...

Below is the C code ( "long" on my machine is 32 bits )

long* Radix_2_ac_long(long *A, size_t N, long *Temp)// Sorts in ascending order
{
    size_t Z[1024] = {0};
    long *swp;
    size_t i, Jump, Jump2, Jump3, Jump4;

    // Sort-circuit set-up
    Jump = *A & 255;
    Z[Jump] = 1;
    Jump2 = ((*A >> 8) & 255) + 256;
    Z[Jump2] = 1;
    Jump3 = ((*A >> 16) & 255) + 512;
    Z[Jump3] = 1;
    Jump4 = (*A >> 24) + 768;
    Z[Jump4] = 1;

    // Histograms creation
    for(i = 1 ; i < N ; ++i)
    {
        ++Z[*(A+i) & 255];
        ++Z[((*(A+i) >> 8) & 255) + 256];
        ++Z[((*(A+i) >> 16) & 255) + 512];
        ++Z[(*(A+i) >> 24) + 768];
    }

    // 1st LSB byte sort
    if( Z[Jump] == N );
    else
    {
        for( i = 1 ; i < 256 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[*(A+i) & 255] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 2nd LSB byte sort
    if( Z[Jump2] == N );
    else
    {
        for( i = 257 ; i < 512 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[((*(A+i) >> 8) & 255) + 256] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 3rd LSB byte sort
    if( Z[Jump3] == N );
    else
    {
        for( i = 513 ; i < 768 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[((*(A+i) >> 16) & 255) + 512] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 4th LSB byte sort
    if( Z[Jump4] == N );
    else
    {
        for( i = 769 ; i < 1024 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[(*(A+i) >> 24) + 768] + Temp) = *(A+i);
        }
        return Temp;
    }
    return A;
}

EDIT 5
The sort now handles negative numbers too. Only some minor/negligible tweaks to the code did it. It runs a little slower as a result but the effect is not significant. Coded in C, below ( "long" on my system is 32 bits )

long* Radix_Sort(long *A, size_t N, long *Temp)
{
    size_t Z[1024] = {0};
    long *swp;
    size_t Jump, Jump2, Jump3, Jump4;
    long i;

    // Sort-circuit set-up
    Jump = *A & 255;
    Z[Jump] = 1;
    Jump2 = ((*A >> 8) & 255) + 256;
    Z[Jump2] = 1;
    Jump3 = ((*A >> 16) & 255) + 512;
    Z[Jump3] = 1;
    Jump4 = ((*A >> 24) & 255) + 768;
    Z[Jump4] = 1;

    // Histograms creation
    for(i = 1 ; i < N ; ++i)
    {
        ++Z[*(A+i) & 255];
        ++Z[((*(A+i) >> 8) & 255) + 256];
        ++Z[((*(A+i) >> 16) & 255) + 512];
        ++Z[((*(A+i) >> 24) & 255) + 768];
    }

    // 1st LSB byte sort
    if( Z[Jump] == N );
    else
    {
        for( i = 1 ; i < 256 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i >= 0 ; --i )
        {
            *(--Z[*(A+i) & 255] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 2nd LSB byte sort
    if( Z[Jump2] == N );
    else
    {
        for( i = 257 ; i < 512 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i >= 0 ; --i )
        {
            *(--Z[((*(A+i) >> 8) & 255) + 256] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 3rd LSB byte sort
    if( Z[Jump3] == N );
    else
    {
        for( i = 513 ; i < 768 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i >= 0 ; --i )
        {
            *(--Z[((*(A+i) >> 16) & 255) + 512] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 4th LSB byte sort and negative numbers sort
    if( Z[Jump4] == N );
    else
    {
        for( i = 897 ; i < 1024 ; ++i )// -ve values frequency starts after index 895, i.e at 896 ( 896 = 768 + 128 ), goes upto 1023
        {
            Z[i] = Z[i-1] + Z[i];
        }
        Z[768] = Z[768] + Z[1023];
        for( i = 769 ; i < 896 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i >= 0 ; --i )
        {
            *(--Z[((*(A+i) >> 24) & 255) + 768] + Temp) = *(A+i);
        }
        return Temp;
    }
    return A;
}

EDIT 6
Below is the pointer optimized version ( accesses array locations via pointers ) that takes on average, approximately 20% less time to sort than the one above. It also uses 4 separate arrays for faster address calculation ( "long" on my system is 32 bits ).

long* Radix_Sort(long *A, size_t N, long *Temp)
{
    long Z1[256] ;
    long Z2[256] ;
    long Z3[256] ;
    long Z4[256] ;
    long T = 0 ;
    while(T != 256)
    {
        *(Z1+T) = 0 ;
        *(Z2+T) = 0 ;
        *(Z3+T) = 0 ;
        *(Z4+T) = 0 ;
        ++T;
    }
    size_t Jump, Jump2, Jump3, Jump4;

    // Sort-circuit set-up
    Jump = *A & 255 ;
    Z1[Jump] = 1;
    Jump2 = (*A >> 8) & 255 ;
    Z2[Jump2] = 1;
    Jump3 = (*A >> 16) & 255 ;
    Z3[Jump3] = 1;
    Jump4 = (*A >> 24) & 255 ;
    Z4[Jump4] = 1;

    // Histograms creation
    long *swp = A + N;
    long *i = A + 1;
    for( ; i != swp ; ++i)
    {
        ++Z1[*i & 255];
        ++Z2[(*i >> 8) & 255];
        ++Z3[(*i >> 16) & 255];
        ++Z4[(*i >> 24) & 255];
    }

    // 1st LSB byte sort
    if( Z1[Jump] == N );
    else
    {
        swp = Z1+256 ;
        for( i = Z1+1 ; i != swp ; ++i )
        {
            *i = *(i-1) + *i;
        }
        swp = A-1;
        for( i = A+N-1 ; i != swp ; --i )
        {
            *(--Z1[*i & 255] + Temp) = *i;
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 2nd LSB byte sort
    if( Z2[Jump2] == N );
    else
    {
        swp = Z2+256 ;
        for( i = Z2+1 ; i != swp ; ++i )
        {
            *i = *(i-1) + *i;
        }
        swp = A-1;
        for( i = A+N-1 ; i != swp ; --i )
        {
            *(--Z2[(*i >> 8) & 255] + Temp) = *i;
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 3rd LSB byte sort
    if( Z3[Jump3] == N );
    else
    {
        swp = Z3 + 256 ;
        for( i = Z3+1 ; i != swp ; ++i )
        {
            *i = *(i-1) + *i;
        }
        swp = A-1;
        for( i = A+N-1 ; i != swp ; --i )
        {
            *(--Z3[(*i >> 16) & 255] + Temp) = *i;
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }

    // 4th LSB byte sort and negative numbers sort
    if( Z4[Jump4] == N );
    else
    {
        swp = Z4 + 256 ;
        for( i = Z4+129 ; i != swp ; ++i )
        {
            *i = *(i-1) + *i;
        }
        *Z4 = *Z4 + *(Z4+255) ;
        swp = Z4 + 128 ;
        for( i = Z4+1 ; i != swp ; ++i )
        {
            *i = *(i-1) + *i;
        }
        swp = A - 1;
        for( i = A+N-1 ; i != swp ; --i )
        {
            *(--Z4[(*i >> 24) & 255] + Temp) = *i;
        }
        return Temp;
    }
    return A;
}
See Question&Answers more detail:os

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Answer

0 votes
by (71.8m points)

The edit 4 version is good enough if the original and temp arrays fit in cache. If the array size is much greater than cache size, most of the overhead is due to the random order writes to the arrays. A hybrid msb/lsb radix sort can avoid this issue. For example split the array into 256 bins according to the most significant byte, then do a lsb radix sort on each of the 256 bins. The idea here is that a pair (original and temp) of bins will fit within the cache, where random order writes are not an issue (for most cache implementations).

For a 8MB cache, the goal is for each of the bins to be < 4MB in size = 1 million 32 bit integers if the integers evenly distribute into the bins. This strategy would work for array size up to 256 million 32 bit integers. For larger arrays, the msb phase could split up the array into 1024 bins, for up to 1 billion 32 bit integers. On my system, sorting 16,777,216 (2^24) 32 bit integers with a classic 8,8,8,8 lsb radix sort took 0.45 seconds, while the hybrid 8 msb : 8,8,8 lsb took 0.24 seconds.

// split array into 256 bins according to most significant byte
void RadixSort(uint32_t * a, size_t count)
{
size_t aIndex[260] = {0};               // count / array
uint32_t * b = new uint32_t [count];    // allocate temp array
size_t i;
    for(i = 0; i < count; i++)          // generate histogram
        aIndex[1+((size_t)(a[i] >> 24))]++;
    for(i = 2; i < 257; i++)            // convert to indices
        aIndex[i] += aIndex[i-1];
    for(i = 0; i < count; i++)          //  sort by msb
        b[aIndex[a[i]>>24]++] = a[i];
    for(i = 256; i; i--)                // restore aIndex
        aIndex[i] = aIndex[i-1];
    aIndex[0] = 0;
    for(i = 0; i < 256; i++)            // radix sort the 256 bins
        RadixSort3(&b[aIndex[i]], &a[aIndex[i]], aIndex[i+1]-aIndex[i]);
    delete[] b;
}

// sort a bin by 3 least significant bytes
void RadixSort3(uint32_t * a, uint32_t *b, size_t count)
{
size_t mIndex[3][256] = {0};            // count / matrix
size_t i,j,m,n;
uint32_t u;
    if(count == 0)
        return;
    for(i = 0; i < count; i++){         // generate histograms
        u = a[i];
        for(j = 0; j < 3; j++){
            mIndex[j][(size_t)(u & 0xff)]++;
            u >>= 8;
        }       
    }
    for(j = 0; j < 3; j++){             // convert to indices
        m = 0;
        for(i = 0; i < 256; i++){
            n = mIndex[j][i];
            mIndex[j][i] = m;
            m += n;
        }       
    }
    for(j = 0; j < 3; j++){             // radix sort
        for(i = 0; i < count; i++){     //  sort by current lsb
            u = a[i];
            m = (size_t)(u>>(j<<3))&0xff;
            b[mIndex[j][m]++] = u;
        }
        std::swap(a, b);                //  swap ptrs
    }
}

Example code for classic lsb radix sorts:

Example C++ lsb radix sort using 8,8,8,8 bit fields:

typedef unsigned int uint32_t;

void RadixSort(uint32_t * a, size_t count)
{
size_t mIndex[4][256] = {0};            // count / index matrix
uint32_t * b = new uint32_t [count];    // allocate temp array
size_t i,j,m,n;
uint32_t u;
    for(i = 0; i < count; i++){         // generate histograms
        u = a[i];
        for(j = 0; j < 4; j++){
            mIndex[j][(size_t)(u & 0xff)]++;
            u >>= 8;
        }       
    }
    for(j = 0; j < 4; j++){             // convert to indices
        m = 0;
        for(i = 0; i < 256; i++){
            n = mIndex[j][i];
            mIndex[j][i] = m;
            m += n;
        }       
    }
    for(j = 0; j < 4; j++){             // radix sort
        for(i = 0; i < count; i++){     //  sort by current lsb
            u = a[i];
            m = (size_t)(u>>(j<<3))&0xff;
            b[mIndex[j][m]++] = u;
        }
        std::swap(a, b);                //  swap ptrs
    }
    delete[] b;
}

Example C++ code using 16,16 bit fields:

typedef unsigned int uint32_t;

uint32_t * RadixSort(uint32_t * a, size_t count)
{
size_t mIndex[2][65536] = {0};          // count / index matrix
uint32_t * b = new uint32_t [count];    // allocate temp array
size_t i,j,m,n;
uint32_t u;
    for(i = 0; i < count; i++){         // generate histograms
        u = a[i];
        for(j = 0; j < 2; j++){
            mIndex[j][(size_t)(u & 0xffff)]++;
            u >>= 16;
        }
    }
    for(j = 0; j < 2; j++){             // convert to indices
        m = 0;
        for(i = 0; i < 65536; i++){
            n = mIndex[j][i];
            mIndex[j][i] = m;
            m += n;
        }
    }       
    for(j = 0; j < 2; j++){             // radix sort
        for(i = 0; i < count; i++){     //  sort by current lsb
            u = a[i];
            m = (size_t)(u>>(j<<4))&0xffff;
            b[mIndex[j][m]++] = u;
        }
        std::swap(a, b);                //  swap ptrs
    }
    delete[] b;
    return(a);
}

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...