Suppose a1
, b1
, c1
, and d1
point to heap memory and my numerical code has the following core loop.
(假设a1
, b1
, c1
和d1
指向堆内存,而我的数字代码具有以下核心循环。)
const int n = 100000;
for (int j = 0; j < n; j++) {
a1[j] += b1[j];
c1[j] += d1[j];
}
This loop is executed 10,000 times via another outer for
loop.
(此循环通过另一个外部for
循环执行了10,000次。)
To speed it up, I changed the code to: (为了加快速度,我将代码更改为:)
for (int j = 0; j < n; j++) {
a1[j] += b1[j];
}
for (int j = 0; j < n; j++) {
c1[j] += d1[j];
}
Compiled on MS Visual C++ 10.0 with full optimization and SSE2 enabled for 32-bit on a Intel Core 2 Duo (x64), the first example takes 5.5 seconds and the double-loop example takes only 1.9 seconds.
(在具有完全优化功能的MS Visual C ++ 10.0上编译,并在Intel Core 2 Duo(x64)上为32位启用了SSE2 ,第一个示例花费5.5秒,而双循环示例仅花费1.9秒。)
My question is: (Please refer to the my rephrased question at the bottom) (我的问题是:(请参阅底部的我改写的问题))
PS: I am not sure, if this helps:
(PS:我不确定,这是否有帮助:)
Disassembly for the first loop basically looks like this (this block is repeated about five times in the full program):
(第一个循环的反汇编基本上是这样的(在整个程序中此块重复了大约五次):)
movsd xmm0,mmword ptr [edx+18h]
addsd xmm0,mmword ptr [ecx+20h]
movsd mmword ptr [ecx+20h],xmm0
movsd xmm0,mmword ptr [esi+10h]
addsd xmm0,mmword ptr [eax+30h]
movsd mmword ptr [eax+30h],xmm0
movsd xmm0,mmword ptr [edx+20h]
addsd xmm0,mmword ptr [ecx+28h]
movsd mmword ptr [ecx+28h],xmm0
movsd xmm0,mmword ptr [esi+18h]
addsd xmm0,mmword ptr [eax+38h]
Each loop of the double loop example produces this code (the following block is repeated about three times):
(双循环示例的每个循环都会生成此代码(以下块重复大约三遍):)
addsd xmm0,mmword ptr [eax+28h]
movsd mmword ptr [eax+28h],xmm0
movsd xmm0,mmword ptr [ecx+20h]
addsd xmm0,mmword ptr [eax+30h]
movsd mmword ptr [eax+30h],xmm0
movsd xmm0,mmword ptr [ecx+28h]
addsd xmm0,mmword ptr [eax+38h]
movsd mmword ptr [eax+38h],xmm0
movsd xmm0,mmword ptr [ecx+30h]
addsd xmm0,mmword ptr [eax+40h]
movsd mmword ptr [eax+40h],xmm0
The question turned out to be of no relevance, as the behavior severely depends on the sizes of the arrays (n) and the CPU cache.
(事实证明这个问题无关紧要,因为行为严重取决于阵列(n)的大小和CPU缓存。)
So if there is further interest, I rephrase the question: (因此,如果有进一步的兴趣,我重新提出一个问题:)
Could you provide some solid insight into the details that lead to the different cache behaviors as illustrated by the five regions on the following graph?
(您能否对导致不同缓存行为的细节提供深入的了解,如下图的五个区域所示?)
It might also be interesting to point out the differences between CPU/cache architectures, by providing a similar graph for these CPUs.
(通过为这些CPU提供类似的图形来指出CPU /缓存体系结构之间的差异也可能很有趣。)
PPS: Here is the full code.
(PPS:这是完整的代码。)
It uses TBB Tick_Count
for higher resolution timing, which can be disabled by not defining the TBB_TIMING
Macro: (它使用TBB Tick_Count
获得更高分辨率的时序,可以通过不定义TBB_TIMING
宏来禁用它:)
#include <iostream>
#include <iomanip>
#include <cmath>
#include <string>
//#define TBB_TIMING
#ifdef TBB_TIMING
#include <tbb/tick_count.h>
using tbb::tick_count;
#else
#include <time.h>
#endif
using namespace std;
//#define preallocate_memory new_cont
enum { new_cont, new_sep };
double *a1, *b1, *c1, *d1;
void allo(int cont, int n)
{
switch(cont) {
case new_cont:
a1 = new double[n*4];
b1 = a1 + n;
c1 = b1 + n;
d1 = c1 + n;
break;
case new_sep:
a1 = new double[n];
b1 = new double[n];
c1 = new double[n];
d1 = new double[n];
break;
}
for (int i = 0; i < n; i++) {
a1[i] = 1.0;
d1[i] = 1.0;
c1[i] = 1.0;
b1[i] = 1.0;
}
}
void ff(int cont)
{
switch(cont){
case new_sep:
delete[] b1;
delete[] c1;
delete[] d1;
case new_cont:
delete[] a1;
}
}
double plain(int n, int m, int cont, int loops)
{
#ifndef preallocate_memory
allo(cont,n);
#endif
#ifdef TBB_TIMING
tick_count t0 = tick_count::now();
#else
clock_t start = clock();
#endif
if (loops == 1) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++){
a1[j] += b1[j];
c1[j] += d1[j];
}
}
} else {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
a1[j] += b1[j];
}
for (int j = 0; j < n; j++) {
c1[j] += d1[j];
}
}
}
double ret;
#ifdef TBB_TIMING
tick_count t1 = tick_count::now();
ret = 2.0*double(n)*double(m)/(t1-t0).seconds();
#else
clock_t end = clock();
ret = 2.0*double(n)*double(m)/(double)(end - start) *double(CLOCKS_PER_SEC);
#endif
#ifndef preallocate_memory
ff(cont);
#endif
return ret;
}
void main()
{
freopen("C:\test.csv", "w", stdout);
char *s = " ";
string na[2] ={"new_cont", "new_sep"};
cout << "n";
for (int j = 0; j < 2; j++)
for (int i = 1; i <= 2; i++)
#ifdef preallocate_memory
cout << s << i << "_loops_" << na[preallocate_memory];
#else
cout << s << i << "_loops_" << na[j];
#endif
cout << endl;
long long nmax = 1000000;
#ifdef preallocate_memory
allo(preallocate_memory, nmax);
#endif
for (long long n = 1L; n < nmax; n = max(n+1, long long(n*1.2)))
{
const long long m = 10000000/n;
cout << n;
for (int j = 0; j < 2; j++)
for (int i = 1; i <= 2; i++)
cout << s << plain(n, m, j, i);
cout << endl;
}
}
(It shows FLOP/s for different values of n
.)
((它显示n
不同值的FLOP / s。))
ask by Johannes Gerer translate from so