CUDA – 关于 Visual Profiler 结果中“分支”（branch）和“发散分支”（divergent branch）的困惑（2）

我使用 NVIDIA Visual Profiler 来分析我的代码。测试内核如下：

// Divergence micro-benchmarks for the NVIDIA Visual Profiler.
//
// Every kernel computes the global thread index `tid` and stores a value
// derived from it into a[tid] when the predicate holds, and into b[tid]
// otherwise. Within a group the kernels differ only in the predicate
// (tid < 0, tid == 0, tid >= 0, tid > 0); the groups differ only in the
// stored expression.
//
// Launch layout: 1-D grid of 1-D blocks. There is no bounds check, so `a`
// and `b` must each hold at least gridDim.x * blockDim.x floats.
//
// NOTE: the source text is deliberately left token-for-token unchanged,
// because the profiler counters reported for these kernels depend on the
// exact code the compiler sees.

//////////////////////////////////////////////////////////////// Group 1
// Stored value: tid itself (implicit int -> float conversion).

static __global__ void gpu_test_divergency_0(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}

static __global__ void gpu_test_divergency_1(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)   // true for exactly one thread of the whole grid
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}

static __global__ void gpu_test_divergency_2(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}

static __global__ void gpu_test_divergency_3(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)    // false only for the thread with tid == 0
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}

//////////////////////////////////////////////////////////////// Group 2
// Stored value: tid plus an integer constant (integer add, then int -> float).

static __global__ void gpu_test_divergency_4(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}

static __global__ void gpu_test_divergency_5(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}

static __global__ void gpu_test_divergency_6(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}

static __global__ void gpu_test_divergency_7(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}

//////////////////////////////////////////////////////////////// Group 3
// Stored value: tid plus a floating-point constant. The literals 1.0 / 2.0
// have type double, so the sum is evaluated in double precision and narrowed
// to float by the store. They are intentionally NOT written as 1.0f / 2.0f:
// that difference from Group 2 is what this group measures.

static __global__ void gpu_test_divergency_8(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}

static __global__ void gpu_test_divergency_9(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}

static __global__ void gpu_test_divergency_10(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}

static __global__ void gpu_test_divergency_11(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}

当我用 <<<1, 32>>> 启动测试内核时，从 profiler 得到的结果如下：

 gpu_test_divergency_0  : Branch Efficiency = 100%   branch = 1   divergent branch = 0
 gpu_test_divergency_1  : Branch Efficiency = 100%   branch = 1   divergent branch = 0
 gpu_test_divergency_2  : Branch Efficiency = 100%   branch = 1   divergent branch = 0
 gpu_test_divergency_3  : Branch Efficiency = 100%   branch = 1   divergent branch = 0
 gpu_test_divergency_4  : Branch Efficiency = 100%   branch = 3   divergent branch = 0
 gpu_test_divergency_5  : Branch Efficiency = 100%   branch = 3   divergent branch = 0
 gpu_test_divergency_6  : Branch Efficiency = 100%   branch = 2   divergent branch = 0
 gpu_test_divergency_7  : Branch Efficiency = 100%   branch = 3   divergent branch = 0
 gpu_test_divergency_8  : Branch Efficiency = 100%   branch = 3   divergent branch = 0
 gpu_test_divergency_9  : Branch Efficiency = 75%    branch = 4   divergent branch = 1
 gpu_test_divergency_10 : Branch Efficiency = 100%   branch = 2   divergent branch = 0
 gpu_test_divergency_11 : Branch Efficiency = 75%    branch = 4   divergent branch = 1

当我用 <<<1, 64>>> 启动测试内核时，从 profiler 得到的结果如下：

 gpu_test_divergency_0  : Branch Efficiency = 100%    branch = 2   divergent branch = 0
 gpu_test_divergency_1  : Branch Efficiency = 100%    branch = 2   divergent branch = 0
 gpu_test_divergency_2  : Branch Efficiency = 100%    branch = 2   divergent branch = 0
 gpu_test_divergency_3  : Branch Efficiency = 100%    branch = 2   divergent branch = 0
 gpu_test_divergency_4  : Branch Efficiency = 100%    branch = 6   divergent branch = 0
 gpu_test_divergency_5  : Branch Efficiency = 100%    branch = 6   divergent branch = 0
 gpu_test_divergency_6  : Branch Efficiency = 100%    branch = 4   divergent branch = 0
 gpu_test_divergency_7  : Branch Efficiency = 100%    branch = 5   divergent branch = 0
 gpu_test_divergency_8  : Branch Efficiency = 100%    branch = 6   divergent branch = 0
 gpu_test_divergency_9  : Branch Efficiency = 85.7%   branch = 7   divergent branch = 1
 gpu_test_divergency_10 : Branch Efficiency = 100%    branch = 4   divergent branch = 0
 gpu_test_divergency_11 : Branch Efficiency = 83.3%   branch = 6   divergent branch = 1

我在 Linux 上使用计算能力（compute capability）2.0 的“GeForce GTX 570”显卡和 NVIDIA Visual Profiler v4.2。根据文档：

“branch（分支）” – “执行内核的线程所经过的分支数。如果 warp 中至少有一个线程经过了该分支，则该计数器加 1。”

“divergent branch（发散分支）” – “一个 warp 内发散分支的数目。如果 warp 中至少有一个线程通过依赖于数据的条件分支发生发散（即走了不同的执行路径），则该计数器加 1。”

但是我对结果非常困惑。为什么各个测试组的“分支”数量不一样？为什么只有第三个测试组的“发散分支”数看起来是正确的？

@JackOLantern：我是在 Release 模式下编译的，并按你的方法做了反汇编。“gpu_test_divergency_4”的结果与你的完全相同，但“gpu_test_divergency_0”的结果不同：

  Function : _Z21gpu_test_divergency_0PfS_ /*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100]; /*0008*/ /*0x94001c042c000000*/ S2R R0, SR_CTAid_X; /*0010*/ /*0x84009c042c000000*/ S2R R2, SR_Tid_X; /*0018*/ /*0x20009ca320044000*/ IMAD R2, R0, c [0x0] [0x8], R2; /*0020*/ /*0xfc21dc23188e0000*/ ISETP.LT.AND P0, pt, R2, RZ, pt; /*0028*/ /*0x0920de0418000000*/ I2F.F32.S32 R3, R2; /*0030*/ /*0x9020204340004000*/ @!P0 ISCADD R0, R2, c [0x0] [0x24], 0x2; /*0038*/ /*0x8020804340004000*/ @P0 ISCADD R2, R2, c [0x0] [0x20], 0x2; /*0040*/ /*0x0000e08590000000*/ @!P0 ST [R0], R3; /*0048*/ /*0x0020c08590000000*/ @P0 ST [R2], R3; /*0050*/ /*0x00001de780000000*/ EXIT;

我想，正如你所说，类型转换指令（这里是 I2F）不会增加额外的分支。

但是我看不出这些反汇编代码和 Profiler 结果之间的关系。我从另一篇帖子（ https://devtalk.nvidia.com/default/topic/463316/branch-divergent-branches/ ）了解到，发散分支是根据线程（warp）在 SM 上的实际运行情况统计的。所以我想，我们无法仅凭这些反汇编代码推断出实际运行时每个分支的发散情况。我说得对吗？

后续 – 使用投票（vote）内在函数检查线程发散

我认为检查 warp 内线程发散的最好方法是使用投票内在函数，特别是 __ballot 和 __popc。Shane Cook 的《CUDA Programming》（Morgan Kaufmann 出版）一书中对 __ballot 和 __popc 有很好的解释。

__ballot的原型如下

 unsigned int __ballot(int predicate);

如果某个线程的谓词不为零，__ballot 返回值的第 N 位就会被置位，其中 N 是该线程在 warp 内的编号（即 threadIdx.x % 32）。warp 内每个线程得到的都是同一个完整的 32 位掩码。

另一方面，__popc 返回其 32 位参数中被置位的位数。

所以，把 __ballot、__popc 和 atomicAdd 结合起来使用，就可以检查一个 warp 是否发散。

为此，我设置了下面的代码

 #include <cuda.h>
 #include <cuda_runtime.h>

 #include <stdio.h>
 #include <stdlib.h>
 #include <iostream>
 #include <vector>

 #ifdef _WIN32
 #include <conio.h>   // getch(): only used to keep the console window open on Windows
 #endif

 // Warp width assumed throughout this program (lane = threadIdx.x % 32,
 // warp = threadIdx.x / 32).
 #define WARP_SIZE 32u

 // Aborts with file/line and the CUDA error string when a runtime call fails,
 // so that an earlier failure is not silently carried into later calls.
 #define CUDA_CHECK(call)                                                    \
     do {                                                                    \
         cudaError_t err_ = (call);                                          \
         if (err_ != cudaSuccess) {                                          \
             fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                     cudaGetErrorString(err_));                              \
             exit(EXIT_FAILURE);                                             \
         }                                                                   \
     } while (0)

 // Software stand-in for __ballot on devices that lack it (compute
 // capability < 2.0).
 //
 // Returns a word with ONLY the calling thread's lane bit set when
 // `predicate` is non-zero, and 0 otherwise. Unlike the real __ballot it does
 // not combine the votes of the other lanes; the caller accumulates the
 // per-thread results itself.
 __device__ unsigned int __ballot_non_atom(int predicate)
 {
     // 1u (not 1): shifting a signed 1 into bit 31 would overflow for lane 31.
     if (predicate != 0) return (1u << (threadIdx.x % WARP_SIZE));
     else return 0;
 }

 // Counts, for every warp of the grid, how many threads satisfy the predicate
 // under test (here: tid > 2).
 //
 // Launch layout: 1-D grid of 1-D blocks.
 // d_ballot            : one zero-initialised counter per warp, indexed by
 //                       warp_in_block + blockIdx.x * Num_Warps_per_Block.
 // Num_Warps_per_Block : number of warps in a block, rounded up.
 __global__ void gpu_test_divergency_0(unsigned int* d_ballot, int Num_Warps_per_Block)
 {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     const unsigned int warp_num = threadIdx.x >> 5;
     unsigned int* warp_counter = &d_ballot[warp_num + blockIdx.x * Num_Warps_per_Block];

     // Each thread contributes a popcount of 0 or 1 (its own lane bit); the
     // atomicAdd sums these into the number of lanes voting "true".
     atomicAdd(warp_counter, __popc(__ballot_non_atom(tid > 2)));

     // Alternative for compute capability >= 2.0. __ballot hands the complete
     // warp mask to every lane, so only ONE lane per warp may add the
     // popcount; letting all lanes add it would inflate the count 32 times.
     // (On Volta and newer use __ballot_sync(0xffffffff, tid > 2) instead.)
     //
     // const unsigned int votes = __ballot(tid > 2);
     // if ((threadIdx.x % WARP_SIZE) == 0) atomicAdd(warp_counter, __popc(votes));
 }

 // Launches the test kernel and reports, per warp, whether all of its threads
 // agreed on the predicate (not divergent) or not (divergent).
 int main(int argc, char *argv[])
 {
     (void)argc;
     (void)argv;

     const unsigned int Num_Threads_per_Block = 64;
     const unsigned int Num_Blocks_per_Grid   = 1;

     // Round up so that a trailing partial warp still gets its own counter;
     // the kernel indexes d_ballot with exactly this per-block warp count.
     const unsigned int Num_Warps_per_Block = (Num_Threads_per_Block + WARP_SIZE - 1) / WARP_SIZE;
     const unsigned int Num_Warps_per_Grid  = Num_Warps_per_Block * Num_Blocks_per_Grid;
     const size_t ballot_bytes = Num_Warps_per_Grid * sizeof(unsigned int);

     // Host copy of the counters, zero-initialised; released automatically.
     std::vector<unsigned int> h_ballot(Num_Warps_per_Grid, 0u);

     unsigned int* d_ballot = NULL;
     CUDA_CHECK(cudaMalloc((void**)&d_ballot, ballot_bytes));
     CUDA_CHECK(cudaMemcpy(d_ballot, &h_ballot[0], ballot_bytes, cudaMemcpyHostToDevice));

     gpu_test_divergency_0<<<Num_Blocks_per_Grid, Num_Threads_per_Block>>>(d_ballot, Num_Warps_per_Block);
     CUDA_CHECK(cudaGetLastError());   // launch-configuration errors

     // Blocking copy: also waits for the kernel and reports execution errors.
     CUDA_CHECK(cudaMemcpy(&h_ballot[0], d_ballot, ballot_bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaFree(d_ballot));

     for (unsigned int i = 0; i < Num_Warps_per_Grid; i++)
     {
         // Number of threads that actually exist in this warp (fewer than
         // WARP_SIZE only for the last warp of a block whose size is not a
         // multiple of WARP_SIZE).
         const unsigned int first_thread = (i % Num_Warps_per_Block) * WARP_SIZE;
         const unsigned int remaining    = Num_Threads_per_Block - first_thread;
         const unsigned int lanes        = (remaining < WARP_SIZE) ? remaining : WARP_SIZE;

         if ((h_ballot[i] == 0) || (h_ballot[i] == lanes))
             std::cout << "Warp " << i << " IS NOT divergent- Predicate true for " << h_ballot[i] << " threads\n";
         else
             std::cout << "Warp " << i << " IS divergent - Predicate true for " << h_ballot[i] << " threads\n";
     }

 #ifdef _WIN32
     getch();
 #endif
     return EXIT_SUCCESS;
 }

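请注意，我目前是在计算能力 1.2 的显卡上运行代码，所以在上面的示例中使用了 __ballot_non_atom，它是 __ballot 的非内在（软件模拟）替代，因为 __ballot 仅在计算能力 >= 2.0 时可用。换句话说，如果你的显卡计算能力 >= 2.0，可以改用内核函数中被注释掉的 __ballot 版本。注意：__ballot 会把整个 warp 的掩码返回给每一个线程，因此改用 __ballot 时，每个 warp 只应由一个线程执行 atomicAdd，否则计数会被放大 32 倍。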

使用上面的代码，只需更改内核函数中的谓词，就可以测试前面所有内核函数中的情形。

先前的回答

我在 Release 模式下针对计算能力 2.0 编译了代码，用 -keep 保留中间文件，并用 cuobjdump 工具生成了以下两个内核的反汇编：

 // Stores the global thread index, converted to float, into a[tid] when
 // tid < 0 and into b[tid] otherwise.
 // Launch layout: 1-D grid of 1-D blocks; no bounds check is performed.
 static __global__ void gpu_test_divergency_0(float *a, float *b)
 {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     if (tid < 0)
         a[tid] = tid;
     else
         b[tid] = tid;
 }

和

 // Stores tid + 1 into a[tid] when tid < 0 and tid + 2 into b[tid] otherwise.
 // The additions are integer additions; the result is converted to float by
 // the store.
 // Launch layout: 1-D grid of 1-D blocks; no bounds check is performed.
 static __global__ void gpu_test_divergency_4(float *a, float *b)
 {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     if (tid < 0)
         a[tid] = tid + 1;
     else
         b[tid] = tid + 2;
 }

结果如下

 gpu_test_divergency_0 /*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */ /*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */ /*0010*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */ /*0018*/ IMAD R2, R0, c[0x0][0x8], R2; /* 0x2004400020009ca3 */ /*0020*/ ISETP.LT.AND P0, PT, R2, RZ, PT; /* 0x188e0000fc21dc23 */ /*0028*/ I2F.F32.S32 R0, R2; /* 0x1800000009201e04 */ /*0030*/ @!P0 ISCADD R3, R2, c[0x0][0x24], 0x2; /* 0x400040009020e043 */ /*0038*/ @P0 ISCADD R2, R2, c[0x0][0x20], 0x2; /* 0x4000400080208043 */ /*0040*/ @!P0 ST [R3], R0; /* 0x9000000000302085 */ /*0048*/ @P0 ST [R2], R0; /* 0x9000000000200085 */ /*0050*/ EXIT ; /* 0x8000000000001de7 */

和

 gpu_test_divergency_4 /*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */ /*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */ R0 = BlockIdx.x /*0010*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */ R2 = ThreadIdx.x /*0018*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */ R0 = R0 * c + R2 /*0020*/ ISETP.LT.AND P0, PT, R0, RZ, PT; /* 0x188e0000fc01dc23 */ If statement /*0028*/ @P0 BRA.U 0x58; /* 0x40000000a00081e7 */ Branch 1 - Jump to 0x58 /*0030*/ @!P0 IADD R2, R0, 0x2; /* 0x4800c0000800a003 */ Branch 2 - R2 = R0 + 2 /*0038*/ @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2; /* 0x4000400090002043 */ Branch 2 - Calculate gmem address /*0040*/ @!P0 I2F.F32.S32 R2, R2; /* 0x180000000920a204 */ Branch 2 - R2 = R2 after int to float cast /*0048*/ @!P0 ST [R0], R2; /* 0x900000000000a085 */ Branch 2 - gmem store /*0050*/ @!P0 BRA.U 0x78; /* 0x400000008000a1e7 */ Branch 2 - Jump to 0x78 (exit) /*0058*/ @P0 IADD R2, R0, 0x1; /* 0x4800c00004008003 */ Branch 1 - R2 = R0 + 1 /*0060*/ @P0 ISCADD R0, R0, c[0x0][0x20], 0x2; /* 0x4000400080000043 */ Branch 1 - Calculate gmem address /*0068*/ @P0 I2F.F32.S32 R2, R2; /* 0x1800000009208204 */ Branch 1 - R2 = R2 after int to float cast /*0070*/ @P0 ST [R0], R2; /* 0x9000000000008085 */ Branch 1 - gmem store /*0078*/ EXIT ; /* 0x8000000000001de7 */

根据上面的反汇编，我预期你的分支发散测试结果应该是一样的。

你是在 Debug 模式还是 Release 模式下编译的？