M4rK5: Mi primer programa en GPU -CUDA

Después de unas lecturas para unos trabajos que haré, aquí está mi primer programa en GPU: aún es una simple multiplicación de matrices con hilos en CUDA. Las versiones mejoradas ya las subiré.

#include <stdio.h>

#include <assert.h>

#include <iostream>

using namespace std;

// Forward declaration of the error helper defined after main(): it reads the
// CUDA runtime's last-error state via cudaGetLastError() and, if an error is
// recorded, prints it to stderr prefixed with `msg` and exits the process.

void checkCUDAError(const char *msg);

// Part 3 of 5: implement the kernel

// Naive square integer matrix product: d_c = d_a * d_b.
//
// All three matrices are indexed as row-major width x width arrays of int
// (element (r, c) lives at index r * width + c).
//
// Launch layout: a 2D grid of 2D blocks. The x dimension of the launch maps
// to matrix rows and the y dimension maps to matrix columns. The launch must
// cover at least width threads in each of those two dimensions; surplus
// threads fall outside the matrix and return without touching memory.
// No shared memory is used. The pointers must be dereferenceable from device
// code.
//
// Parameters:
//   d_a, d_b - input matrices (only read)
//   d_c      - output matrix (every in-range element is overwritten)
//   width    - number of rows/columns of each matrix
__global__ void MatrixSimple(int *d_a ,int *d_b,int *d_c,int width )
{
    // Global coordinates must be derived from the block dimensions, not from
    // the matrix width: the two only coincide when a single block spans the
    // whole matrix.
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    // The launch rarely divides the matrix evenly; surplus threads must not
    // read or write outside the width x width arrays.
    if (row >= width || col >= width)
    {
        return;
    }

    // Dot product of row `row` of d_a with column `col` of d_b.
    int pvalue = 0;
    for (int i = 0; i < width; ++i)
    {
        pvalue += d_a[row * width + i] * d_b[i * width + col];
    }

    d_c[row * width + col] = pvalue;
}

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

// Host driver: multiplies two 8x8 integer matrices filled with ones on the
// GPU, prints the 8x8 product, and verifies that every element equals the
// matrix width (the dot product of two all-ones vectors of that length).
//
// Returns 0 and prints "Correct!" when the result verifies; returns 1 on a
// host allocation failure or a wrong result. Any CUDA runtime error is
// reported by checkCUDAError, which terminates the process.
int main( int argc, char** argv)
{
    (void) argc;
    (void) argv;

    // Matrix edge length; the kernel is launched with one width x width block.
    const int width = 8;
    const int numElements = width * width;
    const size_t memSize = numElements * sizeof(int);

    // Part 1 of 5: allocate host and device memory
    int *h_a = (int *) malloc(memSize);
    int *h_b = (int *) malloc(memSize);
    int *h_c = (int *) malloc(memSize);
    if (h_a == NULL || h_b == NULL || h_c == NULL)
    {
        fprintf(stderr, "Host allocation failed.\n");
        free(h_a);
        free(h_b);
        free(h_c);
        return 1;
    }

    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    cudaMalloc((void**) &d_a, memSize);
    checkCUDAError("cudaMalloc d_a");
    cudaMalloc((void**) &d_b, memSize);
    checkCUDAError("cudaMalloc d_b");
    cudaMalloc((void**) &d_c, memSize);
    checkCUDAError("cudaMalloc d_c");

    // Both inputs are all ones, so every output element must equal `width`.
    for (int n = 0; n < numElements; n++)
    {
        h_a[n] = h_b[n] = 1;
    }

    // Part 2 of 5: host to device copy of the inputs
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy h_a -> d_a");
    cudaMemcpy(d_b, h_b, memSize, cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy h_b -> d_b");

    // Part 3 of 5: configure and launch the kernel
    dim3 dimGrid(1, 1);
    dim3 dimBlock(width, width);
    MatrixSimple<<< dimGrid, dimBlock >>>(d_a, d_b, d_c, width);

    // Launch-configuration errors are only visible after the launch itself.
    checkCUDAError("kernel launch");

    // Block until the device has completed; faults raised while the kernel
    // runs surface here. cudaDeviceSynchronize replaces the deprecated
    // cudaThreadSynchronize.
    cudaDeviceSynchronize();
    checkCUDAError("kernel execution");

    // Part 4 of 5: device to host copy of the result
    cudaMemcpy(h_c, d_c, memSize, cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpy");

    // Part 5 of 5: print and verify the product
    bool resultOk = true;
    for (int i = 0; i < width; i++)
    {
        for (int j = 0; j < width; j++)
        {
            cout << h_c[width * i + j] << " ";
            if (h_c[width * i + j] != width)
            {
                resultOk = false;
            }
        }
        cout << endl;
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    if (!resultOk)
    {
        fprintf(stderr, "Incorrect result: expected every element to be %d.\n", width);
        return 1;
    }

    // Reaching this point means the result verified and no CUDA call failed.
    printf("Correct!\n");
    return 0;
}

// Aborts the program if the CUDA runtime has a pending error.
//
// Reads the last error recorded by the runtime with cudaGetLastError() (which
// also resets it to cudaSuccess). If an error was recorded, prints
// "Cuda error: <msg>: <description>." to stderr and exits with status -1;
// otherwise returns normally.
//
// Parameters:
//   msg - caller-supplied label naming the operation that was just attempted
void checkCUDAError(const char *msg)
{
    const cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        return;
    }

    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
    exit(-1);
}

Hasta la prox!!!

M4rK5

lunes, 8 de noviembre de 2010

Mi primer programa en GPU -CUDA

No hay comentarios: