NVIDIA CUDA

Posted on 2020/09/27, 10:43 AM By admin22

NVIDIAのGPGPUプラットホームCUDA(Compute Unified Device Architecture)。Jetson Nanoで試してみました。

Jetson Nano

文法はCを部分的に拡張したもので、GPU側で動くプログラムが書けるようになっています。
ビルドは、専用のコンパイラでCと同様に実行します。

nvcc -o sample sample.cu

参考) Tutorial 01: Say Hello to CUDA
https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial01/

まずはCのサンプルプログラムを実行したものを確認し、それをCUDAに置き換えます。

#include<stdio.h>
#include<stdlib.h>

#define N 10000000

/*
 * Element-wise sum of two float arrays.
 *
 *   out : destination, receives a[k] + b[k] for every k in [0, n)
 *   a,b : the two input arrays
 *   n   : number of elements to process; nothing is written when n <= 0
 */
void vector_add(float *out, float *a, float *b, int n) {
    float *dst = out;
    float *lhs = a;
    float *rhs = b;

    /* Walk the three arrays front to back, one element per pass. */
    for (int remaining = n; remaining > 0; --remaining) {
        *dst = *lhs + *rhs;
        ++dst;
        ++lhs;
        ++rhs;
    }
}

/*
 * Host-only reference run: fills two N-element arrays with 1.0f and 2.0f,
 * adds them with vector_add(), and prints the first ten sums.
 *
 * Returns 0 on success, 1 if any host allocation fails.
 */
int main(){
    float *a, *b, *out;

    a   = (float*)malloc(sizeof(float) * N);
    b   = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    /* Three buffers of N floats are requested; stop instead of writing through NULL. */
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "malloc failed\n");
        free(a);   /* free(NULL) is a no-op, so partial success is handled too */
        free(b);
        free(out);
        return 1;
    }

    for(int i = 0; i < N; i++){
        a[i] = 1.0f; b[i] = 2.0f;
    }

    vector_add(out, a, b, N);

    /* Print only the first ten results as a spot check. */
    for(int i=0;i<10;i++){
        printf("%f ", out[i]);
    }
    printf("\n");

    free(a);
    free(b);
    free(out);
    return 0;
}

#include<stdio.h>

#include<stdlib.h>

#define N 10000000

/*
 * Element-wise sum of two float arrays: out[i] = a[i] + b[i] for i in [0, n).
 * Nothing is written when n <= 0.
 */
void vector_add(float *out, float *a, float *b, int n) {
    for(int i = 0; i < n; i++){
        out[i] = a[i] + b[i];
    }
}

/*
 * Host-only reference run: fills two N-element arrays with 1.0f and 2.0f,
 * adds them with vector_add(), and prints the first ten sums.
 *
 * Returns 0 on success, 1 if any host allocation fails.
 */
int main(){
    float *a, *b, *out;

    a   = (float*)malloc(sizeof(float) * N);
    b   = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    /* Three buffers of N floats are requested; stop instead of writing through NULL. */
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "malloc failed\n");
        free(a);   /* free(NULL) is a no-op, so partial success is handled too */
        free(b);
        free(out);
        return 1;
    }

    for(int i = 0; i < N; i++){
        a[i] = 1.0f; b[i] = 2.0f;
    }

    vector_add(out, a, b, N);

    /* Print only the first ten results as a spot check. */
    for(int i=0;i<10;i++){
        printf("%f ", out[i]);
    }
    printf("\n");

    free(a);
    free(b);
    free(out);
    return 0;
}

CPU->GPUにメモリコピーをして、GPU側で実行したものをCPUに戻します。

#include<stdio.h>

#define N 10000000

// Element-wise vector addition on the device: out[i] = a[i] + b[i] for i in [0, n).
//
// Launch layout: any 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by the total number of launched threads, so every element
// is handled by exactly one thread regardless of the launch configuration.
// With <<<1,1>>> the single thread visits all n elements in ascending order.
// No shared memory is used. Nothing is written when n <= 0.
//
// out, a and b are dereferenced on the device and must each hold at least n floats.
__global__ void vector_add(float *out, float *a, float *b, int n) {
    // Widen to size_t so blockIdx.x * blockDim.x cannot overflow 32-bit arithmetic.
    const size_t count  = n > 0 ? (size_t)n : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
        out[i] = a[i] + b[i];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `err` is not cudaSuccess, 0 otherwise.
static int cuda_failed(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return 0;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

// Adds two N-element vectors on the GPU: allocates host and device buffers,
// copies the inputs host -> device, launches vector_add with a single thread,
// copies the result device -> host, and prints the first ten sums.
//
// Returns 0 on success, 1 if a host allocation or any CUDA call fails.
int main(){
    const size_t bytes = sizeof(float) * N;
    float *a = NULL, *b = NULL, *out = NULL;
    float *d_a = NULL, *d_b = NULL, *d_out = NULL; // device-side buffers
    int status = 1;                                // stays 1 unless every step succeeds

    a   = (float*)malloc(bytes);
    b   = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host malloc failed\n");
        goto cleanup;
    }

    for(int i = 0; i < N; i++){
        a[i] = 1.0f; b[i] = 2.0f;
    }

    if (cuda_failed(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)")) goto cleanup;
    if (cuda_failed(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)")) goto cleanup;
    if (cuda_failed(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)")) goto cleanup;

    if (cuda_failed(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")) goto cleanup;
    if (cuda_failed(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)")) goto cleanup;

    vector_add<<<1,1>>>(d_out, d_a, d_b, N);
    // A kernel launch returns no status; launch errors are read back with cudaGetLastError().
    if (cuda_failed(cudaGetLastError(), "vector_add launch")) goto cleanup;

    // Blocking copy on the default stream: it completes after the kernel and
    // also reports errors raised while the kernel was running.
    if (cuda_failed(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(out)")) goto cleanup;

    // Print only the first ten results as a spot check.
    for(int i=0;i<10;i++){
        printf("%f ", out[i]);
    }
    printf("\n");
    status = 0;

cleanup:
    // Single exit path; cudaFree(NULL) and free(NULL) are both no-ops.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);
    return status;
}

#include<stdio.h>

#define N 10000000

// Element-wise vector addition on the device: out[i] = a[i] + b[i] for i in [0, n).
//
// Launch layout: any 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by the total number of launched threads, so every element
// is handled by exactly one thread regardless of the launch configuration.
// With <<<1,1>>> the single thread visits all n elements in ascending order.
// No shared memory is used. Nothing is written when n <= 0.
//
// out, a and b are dereferenced on the device and must each hold at least n floats.
__global__ void vector_add(float *out, float *a, float *b, int n) {
    // Widen to size_t so blockIdx.x * blockDim.x cannot overflow 32-bit arithmetic.
    const size_t count  = n > 0 ? (size_t)n : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
        out[i] = a[i] + b[i];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `err` is not cudaSuccess, 0 otherwise.
static int cuda_failed(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return 0;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

// Adds two N-element vectors on the GPU: allocates host and device buffers,
// copies the inputs host -> device, launches vector_add with a single thread,
// copies the result device -> host, and prints the first ten sums.
//
// Returns 0 on success, 1 if a host allocation or any CUDA call fails.
int main(){
    const size_t bytes = sizeof(float) * N;
    float *a = NULL, *b = NULL, *out = NULL;
    float *d_a = NULL, *d_b = NULL, *d_out = NULL; // device-side buffers
    int status = 1;                                // stays 1 unless every step succeeds

    a   = (float*)malloc(bytes);
    b   = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host malloc failed\n");
        goto cleanup;
    }

    for(int i = 0; i < N; i++){
        a[i] = 1.0f; b[i] = 2.0f;
    }

    if (cuda_failed(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)")) goto cleanup;
    if (cuda_failed(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)")) goto cleanup;
    if (cuda_failed(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)")) goto cleanup;

    if (cuda_failed(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")) goto cleanup;
    if (cuda_failed(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)")) goto cleanup;

    vector_add<<<1,1>>>(d_out, d_a, d_b, N);
    // A kernel launch returns no status; launch errors are read back with cudaGetLastError().
    if (cuda_failed(cudaGetLastError(), "vector_add launch")) goto cleanup;

    // Blocking copy on the default stream: it completes after the kernel and
    // also reports errors raised while the kernel was running.
    if (cuda_failed(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(out)")) goto cleanup;

    // Print only the first ten results as a spot check.
    for(int i=0;i<10;i++){
        printf("%f ", out[i]);
    }
    printf("\n");
    status = 0;

cleanup:
    // Single exit path; cudaFree(NULL) and free(NULL) are both no-ops.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);
    return status;
}

プロファイルコマンドで実行した結果。

sudo /usr/local/cuda/bin/nvprof ./sample-c
3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000
======== Warning: No CUDA application was profiled, exiting

sudo /usr/local/cuda/bin/nvprof ./sample-cuda
==11896== NVPROF is profiling process 11896, command: ./sample-cuda
==11896== Warning: Unified Memory Profiling is not supported on the underlying platform. System requirements for unified memory can be found at: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000
==11896== Profiling application: ./sample-cuda
==11896== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 97.01% 2.03638s 1 2.03638s 2.03638s 2.03638s vector_add(float*, float*, float*, int)
1.66% 34.757ms 2 17.378ms 16.505ms 18.252ms [CUDA memcpy HtoD]
1.33% 27.948ms 1 27.948ms 27.948ms 27.948ms [CUDA memcpy DtoH]
API calls: 83.78% 2.10600s 3 702.00ms 16.941ms 2.07038s cudaMemcpy
16.15% 405.83ms 3 135.28ms 23.077ms 358.74ms cudaMalloc
0.06% 1.5414ms 3 513.79us 493.29us 535.21us cudaFree
0.00% 115.68us 1 115.68us 115.68us 115.68us cudaLaunchKernel
0.00% 111.36us 97 1.1480us 625ns 26.667us cuDeviceGetAttribute
0.00% 9.1150us 1 9.1150us 9.1150us 9.1150us cuDeviceTotalMem
0.00% 6.4580us 3 2.1520us 1.1980us 3.4370us cuDeviceGetCount
0.00% 2.9690us 2 1.4840us 1.2500us 1.7190us cuDeviceGet
0.00% 1.8230us 1 1.8230us 1.8230us 1.8230us cuDeviceGetName
0.00% 833ns 1 833ns 833ns 833ns cuDeviceGetUuid

このプログラムはカーネルを <<<1,1>>>(1ブロック・1スレッド)で起動しており、GPUの並列性を使っていないため、実行完了にかかった時間はCUDAの方が長いくらいです。速度のテストにはなりませんが、CPU-GPU間のデータのやりとりが確認できました。

About admin22

Categories: 未分類タグ: CUDA

About admin22

Recent Posts

Tag

Links

Archive