Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
486 views
in Technique[技术] by (71.8m points)

c - Best assembly or compilation for minimum of three values

I'm looking at code generated by GCC-4.8 for x86_64 and wondering if there is a better (faster) way to compute the minimum of three values.

Here's an excerpt from Python's collections module that computes the minimum of m, rightindex+1, and leftindex:

    ssize_t m = n;
    if (m > rightindex + 1)
        m = rightindex + 1;
    if (m > leftindex)
        m = leftindex;

GCC generates serially dependent code with CMOVs:

leaq    1(%rbp), %rdx
cmpq    %rsi, %rdx
cmovg   %rsi, %rdx
cmpq    %rbx, %rdx
cmovg   %rbx, %rdx

Is there faster code that can take advantage of processor out-of-order parallel execution by removing the data dependencies? I'm wondering if there are known tricks for computing the minimum of multiple values without using conditionals or predicated instructions. Am also wondering if there are some saturating arithmetic intrinsics that would help in this situation.

EDITS:

See Question&Answers more detail:os

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Answer

0 votes
by (71.8m points)

Here are benchmark results for the methods suggested. Processor is Intel Core-i7 2600K running at 4GHz. Units are nanoseconds per loop (smaller is better). Measurements include pseudo-random test data generation and function call overhead, in addition to the actual min3 code under test.

                          gcc cmov      conditional      classical
                          sequence      branches         branchless

pseudo-random data        2.24          6.31             2.39
fixed data                2.24          2.99             2.39

Benchmark source code

functions.s: the 3 solutions evaluated:

//----------------------------------------------------------------------------

.text
.intel_syntax noprefix

//-----------------------------------------------------------------------------
// uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_a
min3_a:
   mov   rax, rcx
   cmp   rax, rdx
   cmovg rax, rdx
   cmp   rax, r8
   cmovg rax, r8
   retq

//-----------------------------------------------------------------------------
// uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_b
min3_b:
   mov   rax, rcx
   cmp   rax, rdx
   jle   skip1
   mov   rax, rdx
skip1:
   cmp   rax, r8
   jle   skip2
   mov   rax, r8
skip2:
   retq

//-----------------------------------------------------------------------------
// uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_c
min3_c:
   sub   rdx, rcx
   sbb   rax, rax
   and   rax, rdx
   add   rcx, rax
   sub   r8,  rcx
   sbb   rax, rax
   and   rax, r8
   add   rax, rcx
   retq

//-----------------------------------------------------------------------------

min3test.c, main program (mingw64/windows):

#define __USE_MINGW_ANSI_STDIO 1
#include <windows.h>
#include <stdio.h>
#include <stdint.h>

 uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8);
 uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8);
 uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8);

//-----------------------------------------------------------------------------
//
//  queryPerformanceCounter - similar to QueryPerformanceCounter, but returns
//                            count directly.

uint64_t queryPerformanceCounter (void)
    {
    LARGE_INTEGER int64;
    QueryPerformanceCounter (&int64);
    return int64.QuadPart;
    }

//-----------------------------------------------------------------------------
//
// queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns  count direcly.

uint64_t queryPerformanceFrequency (void)
    {
    LARGE_INTEGER int64;

    QueryPerformanceFrequency (&int64);
    return int64.QuadPart;
    }

//----------------------------------------------------------------------------
//
// lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation
//
static uint64_t lfsr64gpr (uint64_t data, uint64_t mask)
   {
   uint64_t carryOut = data >> 63;
   uint64_t maskOrZ = -carryOut; 
   return (data << 1) ^ (maskOrZ & mask);
   }

//---------------------------------------------------------------------------

uint64_t min3 (uint64_t a, uint64_t b, uint64_t c)
    {
    uint64_t smallest;

    smallest = a;
    if (smallest > b) smallest = b;
    if (smallest > c) smallest = c;
    return smallest;
    }

//---------------------------------------------------------------------------

static void runtests (uint64_t pattern, uint64_t mask)
    {
    uint64_t startCount, elapsed, index, loops = 800000000;
    double ns;

    startCount = queryPerformanceCounter ();
    for (index = 0; index < loops; index++)
        {
        pattern = lfsr64gpr (pattern, mask);
        min3_a (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
        }
    elapsed = queryPerformanceCounter () - startCount;
    ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
    printf ("%7.2f ns
", ns);

    startCount = queryPerformanceCounter ();
    for (index = 0; index < loops; index++)
        {
        pattern = lfsr64gpr (pattern, mask);
        min3_b (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
        }
    elapsed = queryPerformanceCounter () - startCount;
    ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
    printf ("%7.2f ns
", ns);

    startCount = queryPerformanceCounter ();
    for (index = 0; index < loops; index++)
        {
        pattern = lfsr64gpr (pattern, mask);
        min3_c (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
        }
    elapsed = queryPerformanceCounter () - startCount;
    ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
    printf ("%7.2f ns
", ns);
    }

//---------------------------------------------------------------------------

int main (void)
    {
    uint64_t index, pattern, mask;
    uint64_t count_a = 0, count_b = 0, count_c = 0;

    mask = 0xBEFFFFFFFFFFFFFF;
    pattern = 1;

    // sanity check the asm functions
    for (index = 0; index < 1000000; index++)
        {
        uint64_t expected, result_a, result_b, result_c;
        uint64_t pattern1 = (pattern >>  0) & 0xFFFFF;
        uint64_t pattern2 = (pattern >> 20) & 0xFFFFF;
        uint64_t pattern3 = (pattern >> 40) & 0xFFFFF;
        pattern = lfsr64gpr (pattern, mask);
        expected = min3 (pattern1, pattern2, pattern3);
        result_a = min3_a (pattern1, pattern2, pattern3);
        result_b = min3_b (pattern1, pattern2, pattern3);
        result_c = min3_c (pattern1, pattern2, pattern3);
        if (result_a != expected) printf ("min3_a fail, %llu %llu %llu %llu %llu
", expected, result_a, pattern1, pattern2, pattern3);
        if (result_b != expected) printf ("min3_b fail, %llu %llu %llu %llu %llu
", expected, result_b, pattern1, pattern2, pattern3);
        if (result_c != expected) printf ("min3_c fail, %llu %llu %llu %llu %llu
", expected, result_c, pattern1, pattern2, pattern3);
        if (expected == pattern1) count_a++;
        if (result_b == pattern2) count_b++;
        if (result_c == pattern3) count_c++;
        }
    printf ("pseudo-random distribution: %llu, %llu, %llu
", count_a, count_b, count_c);


    // raise our priority to increase measurement accuracy
    SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);

    printf ("using pseudo-random data
");
    runtests (1, mask);
    printf ("using fixed data
");
    runtests (0, mask);
    return 0;
    }

//---------------------------------------------------------------------------

build command line:

gcc -Wall -Wextra -O3 -omin3test.exe min3test.c functions.s

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...