Оглавление
§ Просто вычисления
Это всего лишь справка по тому, как сделать минимальный cuda-файл и скомпилировать его. На этом все.
nvcc cuda.cu -o cuda
./cuda
И теперь сам код:
1#include <cuda_runtime_api.h>
2#include <stdio.h>
3
// Prints the number of CUDA devices and, for each device, a selection of
// cudaDeviceProp fields to stdout.
//
// Takes no parameters and returns nothing. If the device count cannot be
// queried, the CUDA error string is written to stderr and nothing else is
// printed. If the properties of a single device cannot be queried, that device
// is reported on stderr and skipped.
void get_device_info()
{
    int deviceCount = 0;

    cudaError_t status = cudaGetDeviceCount(&deviceCount);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return;
    }

    printf("Device count: %d\n\n", deviceCount);

    for (int i = 0; i < deviceCount; i++)
    {
        cudaDeviceProp deviceProp;

        status = cudaGetDeviceProperties(&deviceProp, i);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                    i, cudaGetErrorString(status));
            continue;
        }

        printf("Device name: %s\n", deviceProp.name);

        // The size_t fields are printed with %zu: casting them to int
        // truncates (e.g. global memory of 2 GiB or more does not fit in int).
        printf("Total global memory: %zu\n", deviceProp.totalGlobalMem);
        printf("Shared memory per block: %zu\n", deviceProp.sharedMemPerBlock);
        printf("Registers per block: %d\n", deviceProp.regsPerBlock);
        printf("Warp size: %d\n", deviceProp.warpSize);
        printf("Memory pitch: %zu\n", deviceProp.memPitch);
        printf("Max threads per block: %d\n", deviceProp.maxThreadsPerBlock);

        printf("Max threads dimensions: x = %d, y = %d, z = %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);

        printf("Max grid size: x = %d, y = %d, z = %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);

        printf("Clock rate: %d\n", deviceProp.clockRate);
        printf("Total constant memory: %zu\n", deviceProp.totalConstMem);
        printf("Compute capability: %d.%d\n", deviceProp.major, deviceProp.minor);
        printf("Texture alignment: %zu\n", deviceProp.textureAlignment);
        printf("Device overlap: %d\n", deviceProp.deviceOverlap);
        printf("Multiprocessor count: %d\n", deviceProp.multiProcessorCount);

        printf("Kernel execution timeout enabled: %s\n",
               deviceProp.kernelExecTimeoutEnabled ? "true" : "false");
    }
}
49
// Writes 1 into every element of data[0 .. length-1].
//
// Indexing is one-dimensional: each thread handles the single element at
// blockIdx.x * blockDim.x + threadIdx.x, so the launch must supply at least
// `length` threads in total along x for the whole array to be covered.
//
// data   - array of at least `length` ints that the kernel writes to
// length - number of elements to set; zero or negative values write nothing
__global__ void SomeKernel(int* data, int length)
{
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Check the sign first: comparing the unsigned index directly against a
    // negative int would convert it to a huge unsigned value and defeat the guard.
    if (length > 0 && idx < static_cast<unsigned int>(length))
    {
        data[idx] = 1;
    }
}
59
60
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is an error, false when it is cudaSuccess.
static bool cuda_failed(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return false;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Demo entry point: prints device information, fills a host array with an
// alternating 2/1 pattern, copies it to the device, runs SomeKernel over it,
// copies the result back and prints it as rows of `w` values.
//
// Returns 0 on success and 1 if a host allocation or any CUDA call fails.
int main()
{
    get_device_info();

    const int w = 16;           // threads per block, and values per printed row
    const int h = 16;           // number of blocks
    const int length = w * h;
    const size_t bytes = (size_t)length * sizeof(int);

    int* hostData = (int*)malloc(bytes);
    if (hostData == NULL)
    {
        fprintf(stderr, "malloc of %zu bytes failed\n", bytes);
        return 1;
    }

    // Alternating 2, 1, 2, 1, ... pattern. Written per element so that no
    // index past the end is touched, whatever the parity of `length`.
    for (int i = 0; i < length; i++)
    {
        hostData[i] = (i % 2 == 0) ? 2 : 1;
    }

    int* deviceData = NULL;
    int exitCode = 1;

    if (!cuda_failed(cudaMalloc((void**)&deviceData, bytes), "cudaMalloc") &&
        !cuda_failed(cudaMemcpy(deviceData, hostData, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)"))
    {
        dim3 threads = dim3(w);
        dim3 blocks = dim3(h);

        SomeKernel<<<blocks, threads>>>(deviceData, length);

        // A kernel launch returns no status itself; launch errors are read
        // via cudaGetLastError(). The cudaMemcpy that follows returns any
        // error raised while the kernel was executing.
        if (!cuda_failed(cudaGetLastError(), "SomeKernel launch") &&
            !cuda_failed(cudaMemcpy(hostData, deviceData, bytes, cudaMemcpyDeviceToHost),
                         "cudaMemcpy (device to host)"))
        {
            for (int i = 0; i < length; i++)
            {
                if (i % w == 0) printf("\n");
                printf("%d\t", hostData[i]);
            }

            printf("\n");
            exitCode = 0;
        }
    }

    // Release both buffers on every path; deviceData is still NULL here if
    // the device allocation failed.
    cudaFree(deviceData);
    free(hostData);

    return exitCode;
}
§ С использованием SDL1.2
В этом коде создаётся SDL-окно, но с ним ничего не происходит: пример лишь показывает, что CUDA и SDL можно использовать в одной программе.
1#include <stdio.h>
2#include <SDL.h>
3
// For every index i in [0, n): y[i] = cos(sin(a * x[i]) + y[i]).
//
// Indexing is one-dimensional: each thread handles the single element at
// blockIdx.x * blockDim.x + threadIdx.x; threads whose index is >= n do nothing.
//
// n - number of elements to process
// a - scalar multiplied with each x[i]
// x - input array, read only
// y - array that is both read and overwritten with the result
__global__
void saxpy(int n, float a, float *x, float *y)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n)
    {
        return;
    }

    const float inner = sinf(a * x[idx]);
    y[idx] = cosf(inner + y[idx]);
}
10
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is an error, false when it is cudaSuccess.
static bool cuda_step_failed(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return false;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Demo entry point: opens an SDL 1.2 window, then times (using SDL_GetTicks)
// filling two host arrays, copying them to the device, running the saxpy
// kernel, and copying the result back. Each duration is printed to stdout.
//
// Returns 0 on success and 1 if SDL initialisation, a host allocation or any
// CUDA call fails.
int main(void)
{
    const int N = 1 << 27;
    const size_t bytes = (size_t)N * sizeof(float);

    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) != 0)
    {
        fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
        return 1;
    }
    SDL_EnableUNICODE(1);

    // The window is only shown, never drawn to, so a failure here is reported
    // but does not stop the computation.
    SDL_Surface* screen = SDL_SetVideoMode(320, 200, 32, SDL_HWSURFACE | SDL_DOUBLEBUF);
    if (screen == NULL)
    {
        fprintf(stderr, "SDL_SetVideoMode failed: %s\n", SDL_GetError());
    }
    SDL_WM_SetCaption("CUDA NVCC", 0);

    float *x = NULL, *y = NULL, *d_x = NULL, *d_y = NULL;
    int exitCode = 1;

    // Single-pass block: `break` jumps to the shared cleanup below.
    do
    {
        x = (float*) malloc(bytes);
        y = (float*) malloc(bytes);
        if (x == NULL || y == NULL)
        {
            fprintf(stderr, "malloc of %zu bytes failed\n", bytes);
            break;
        }

        if (cuda_step_failed(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)")) break;
        if (cuda_step_failed(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)")) break;

        Uint32 t1 = SDL_GetTicks();
        printf("START\n");

        for (int i = 0; i < N; i++) {
            x[i] = 1.0f;
            y[i] = 2.0f;
        }

        // Tick differences are Uint32, hence %u rather than %d.
        Uint32 t2 = SDL_GetTicks();
        printf("FILL :: %u\n", t2 - t1);

        if (cuda_step_failed(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice),
                             "cudaMemcpy(d_x)")) break;
        if (cuda_step_failed(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice),
                             "cudaMemcpy(d_y)")) break;

        Uint32 t3 = SDL_GetTicks();
        printf("COPY x 2 :: %u\n", t3 - t2);

        saxpy<<<(N+255) / 256, 256>>>(N, 2.0f, d_x, d_y);

        // A kernel launch returns no status itself; launch errors are read
        // via cudaGetLastError(). The synchronize call waits for the kernel
        // so the timing covers it, and returns any execution error.
        if (cuda_step_failed(cudaGetLastError(), "saxpy launch")) break;
        if (cuda_step_failed(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) break;

        Uint32 t4 = SDL_GetTicks();
        printf("COMPUTE :: %u\n", t4 - t3);

        if (cuda_step_failed(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost),
                             "cudaMemcpy(y)")) break;

        Uint32 t5 = SDL_GetTicks();
        printf("COPY BACK :: %u\n", t5 - t4);

        exitCode = 0;
    } while (0);

    // Shared cleanup for every path; pointers that were never allocated are
    // still NULL at this point.
    cudaFree(d_x);
    cudaFree(d_y);
    free(x);
    free(y);
    SDL_Quit();

    return exitCode;
}
Команда для компиляции:
nvcc main.cu `sdl-config --cflags --libs` -lSDL -lm -o main