CUDA固定内存从设备中刷新

CUDA 5，设备功能3.5，VS 2012,64位Win 2012 Server。CUDA固定内存从设备中刷新

线程之间没有共享内存访问，每个线程都是独立的。

我使用零拷贝的固定内存。在主机上，只有当我在主机上发出cudaDeviceSynchronize时，我才能读取设备写入的固定内存。

我希望能够到：

水冲到锁定的存储，一旦设备已经更新了它。

不会阻止设备线程（可能由异步复制）

我打过电话__threadfence_system和__threadfence每个设备的写入后，但没有刷新。

下面是一个完整的示例代码CUDA演示我的问题：

#include <conio.h> 
#include <cstdio> 
#include "cuda.h" 
#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

__global__ void Kernel(volatile float* hResult) 
{ 
    int tid = threadIdx.x + blockIdx.x * blockDim.x; 

    printf("Kernel %u: Before Writing in Kernel\n", tid); 
    hResult[tid] = tid + 1; 
    __threadfence_system(); 
    // expecting that the data is getting flushed to host here! 
    printf("Kernel %u: After Writing in Kernel\n", tid); 
    // time waster for-loop (sleep) 
    for (int timeWater = 0; timeWater < 100000000; timeWater++); 
} 

void main() 
{ 
    size_t blocks = 2; 
    volatile float* hResult; 
    cudaHostAlloc((void**)&hResult,blocks*sizeof(float),cudaHostAllocMapped); 
    Kernel<<<1,blocks>>>(hResult); 
    int filledElementsCounter = 0; 
    // naiive thread implementation that can be impelemted using 
    // another host thread 
    while (filledElementsCounter < blocks) 
    { 
     // blocks until the value changes, this moves sequentially 
     // while threads have no order (fine for this sample). 
     while(hResult[filledElementsCounter] == 0); 
     printf("%f\n", hResult[filledElementsCounter]);; 
     filledElementsCounter++; 
    } 
    cudaFreeHost((void *)hResult); 
    system("pause"); 
}

目前该样品没有被从设备读取，除非我发出cudaDeviceSynchronize将无限期地等待。下面的作品样本，但它是不我希望，因为它违背了异步复制的目的是什么：

void main() 
{ 
    size_t blocks = 2; 
    volatile float* hResult; 
    cudaHostAlloc((void**)&hResult, blocks*sizeof(float), cudaHostAllocMapped); 
    Kernel<<<1,blocks>>>(hResult); 
    cudaError_t error = cudaDeviceSynchronize(); 
    if (error != cudaSuccess) { throw; } 
    for(int i = 0; i < blocks; i++) 
    { 
     printf("%f\n", hResult[i]); 
    } 
    cudaFreeHost((void *)hResult); 
    system("pause"); 
}

来源

2013-05-07 Adam

你解决了这个问题吗？您是否尝试使用动态并行机制将数据写入CPU主机的内存？在内核函数中使用'cudaMemcpyAsync（uva_host_ptr，device_ptr，size）;'，如以下链接所示：http://on-demand.gputechconf.com/gtc/2012/presentations/S0338-GTC2012-CUDA-Programming- Model.pdf – Alex 2013-10-13 21:34:50

您不能直接通过主机指针到内核。如果使用cudaHostAlloc和cudaHostAllocMapped标志分配主机内存，则首先必须检索映射主机内存的设备指针，然后才能在内核中使用它。使用cudaHostGetDevicePointer获取映射主机内存的设备指针。

float* hResult, *dResult; 
cudaHostAlloc((void**)&hResult, blocks*sizeof(float), cudaHostAllocMapped); 
cudaHostGetDevicePointer(&dResult,hResult); 
Kernel<<<1,blocks>>>(dResult);

来源

2013-05-07 11:16:04 sgarizvi

当你说“你不能通过”，你的意思是解决我的脸红问题，或者你的意思是一般？因为当我用'cudaDeviceSynchronize'替换我的while循环时，我可以在不使用'cudaMemcpy'的情况下访问hResult中的数据。我仍然无法看到您建议的解决方案如何解决冲洗问题。我是否一直在dResult上执行'cudaMemcpyAsync'，直到找到它里面的东西？ – Adam 2013-05-07 11:25:23

其实我指出了一个会导致未定义行为的一般错误。刷新问题可能是由于内核中的'printf'语句引起的。因为内核中的'printf'在内核完成执行后会转储它的输出。 – sgarizvi 2013-05-07 11:29:57

我在问题中增加了另一个示例，它是可以工作但同步的问题。你是否告诉我第二个样本有未定义的行为？它正在工作，即使我删除了内核'printf' – Adam 2013-05-07 11:47:13

调用__threadfence_system()将确保写入对系统可见继续之前，但你的CPU会被缓存h_result变量，因此你只是在一个无限循环旋转的旧值。尝试将h_result标记为volatile。

来源

2013-05-07 11:58:35 Tom

我已经更新了上面的示例，并添加了__threadfence_system（）和volatile，因为添加volatile是一个好主意。但是，我仍然阻止不能读取任何东西。 – Adam 2013-05-07 12:33:37

我用你的代码上播放一个CentOS 6.2 CUDA 5.5和特斯拉M2090，可以断定这一点：

它不会在您的系统的问题必须是驱动的问题，我建议你吃TCC司机。

我附上我的代码，运行良好，做你想做的。内核结束之前，这些值出现在主机端。正如你所看到的，我添加了一些计算代码来防止由于编译器优化而删除for循环。我添加了一个流和一个回调，在流中的所有工作完成后执行。程序输出12很长一段时间没有任何操作，直到stream finished...被打印到控制台。

#include <iostream> 
#include "cuda.h" 
#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#define SEC_CUDA_CALL(val)   checkCall ((val), #val, __FILE__, __LINE__) 

bool checkCall(cudaError_t result, char const* const func, const char *const file, int const line) 
{ 
    if (result != cudaSuccess) 
    { 
      std::cout << "CUDA (runtime api) error: " << func << " failed! " << cudaGetErrorString(result) << " (" << result << ") " << file << ":" << line << std::endl; 
    } 
    return result != cudaSuccess; 
} 

class Callback 
{ 
public: 
    static void CUDART_CB dispatch(cudaStream_t stream, cudaError_t status, void *userData); 

private: 
    void call(); 
}; 

void CUDART_CB Callback::dispatch(cudaStream_t stream, cudaError_t status, void *userData) 
{ 
    Callback* cb = (Callback*) userData; 
    cb->call(); 
} 

void Callback::call() 
{ 
    std::cout << "stream finished..." << std::endl; 
} 



__global__ void Kernel(volatile float* hResult) 
{ 
    int tid = threadIdx.x + blockIdx.x * blockDim.x; 

    hResult[tid] = tid + 1; 
    __threadfence_system(); 
    float A = 0; 
    for (int timeWater = 0; timeWater < 100000000; timeWater++) 
    { 
     A = sin(cos(log(hResult[0] * hResult[1]))) + A; 
     A = sqrt(A); 
    } 
} 

int main(int argc, char* argv[]) 
{ 
    size_t blocks = 2; 
    volatile float* hResult; 
    SEC_CUDA_CALL(cudaHostAlloc((void**)&hResult,blocks*sizeof(float),cudaHostAllocMapped)); 

    cudaStream_t stream; 
    SEC_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 
    Callback obj; 
    Kernel<<<1,blocks,NULL,stream>>>(hResult); 
    SEC_CUDA_CALL(cudaStreamAddCallback(stream, Callback::dispatch, &obj, 0)); 

    int filledElementsCounter = 0; 

    while (filledElementsCounter < blocks) 
    { 
     while(hResult[filledElementsCounter] == 0); 
     std::cout << hResult[filledElementsCounter] << std::endl; 
     filledElementsCounter++; 
    } 

    SEC_CUDA_CALL(cudaStreamDestroy(stream)); 
    SEC_CUDA_CALL(cudaFreeHost((void *)hResult)); 
}

没有调用返回错误，并且cuda-memcheck没有发现任何问题。这按预期工作。你应该真的尝试TCC驱动程序。

来源

2013-10-15 12:27:10

谢谢！但是可能你的意思是'Kernel <<< 1，threads'而不是'Kernel <<< 1，blocks'？我可以从http://www.nvidia.com/object/software-for-tesla-products.html下载TCC驱动程序。但是，我可以将它用于nVidia Quadro Mobile吗？或者我必须使用什么来解决此问题使用GPU nVidia ** Quadro（开普勒GK107/GK106）**？ – Alex 2013-10-15 14:43:28

代码是从原始问题复制的，但是第二个参数是针对线程的。我对TCC驱动程序没有期望，但我认为它也适用于Quadros。看看这里：http://stackoverflow.com/questions/19098650/does-the-cuda-tcc-driver-work-with-geforce-cards-on-windows – 2013-10-15 20:25:49

CUDA固定内存从设备中刷新

回答

相关问题