Instalar CUDA GPU NVidia en Ubuntu 10.10

En este tutorial en español describiré cómo pude instalar CUDA de NVIDIA con una tarjeta modelo 9500GT. El sistema operativo en el que lo instalé fue Ubuntu 10.10.

Paso 1: (Descargando CUDA)
De este link descarga los siguientes paquetes:
Developer Driver for Linux (260.19.14)
Cuda Toolkit
GPU Computing SDK code sample
Abre un terminal en la carpeta donde descargaste los paquetes y, para cada uno de ellos, ejecuta: chmod 0777 nombre_paquete

Paso 2: Modifiquemos la lista negra de módulos y removamos el driver nvidia por defecto

En un terminal, en modo superusuario, hagamos lo siguiente:

#nano /etc/modprobe.d/blacklist.conf :
blacklist vga16fb
blacklist rivafb
blacklist nvidiafb
blacklist rivatv

Agreguemos las líneas "blacklist" anteriores al final del archivo y guardemos los cambios.
Luego ejecutemos lo siguiente:
#apt-get purge nvidia-*
#reboot -n
Paso 3: Modo X , logueo y parada de servicio gdm

nos logueamos y ejecutamos $ sudo service gdm stop

Paso 4: Instalando devdriver
$ sudo sh <paquete_devdriver>.run   (el archivo "Developer Driver" descargado en el Paso 1)
Aceptamos todas las licencias.
Cuando pregunte por la compatibilidad de OpenGL, respondemos "Yes".
Luego preguntará si se desea ejecutar nvidia-xconfig automáticamente para configurar el archivo de X. Responde "Yes".

Paso 5: Instalemos el toolkit de CUDA

Una vez instalado el driver de Nvidia , procedemos a instalar el cudatoolkit. Ejecutando lo siguiente:

$ sudo sh <paquete_cudatoolkit>.run   (el archivo "Cuda Toolkit" descargado en el Paso 1)
Deja la ruta por defecto , presiona Enter.

Paso 6: Modifiquemos variables del ambiente

Agrega variables del sistema operativo. En un terminal haz lo siguiente:
$sudo nano /etc/environment
Agreguemos ":/usr/local/cuda/bin" antes del último símbolo " (la comilla que cierra el valor de PATH). Guardamos el archivo y salimos del editor.

En el archivo .bashrc que está en tu home, agregamos lo siguiente:
export CUDA_HOME="/usr/local/cuda"
export PATH=${CUDA_HOME}/bin:${PATH}

# nano /etc/ (creamos nuevo archivo)


Paso 7: Instalamos "GPU Computing SDK"

Instalemos gpucomputing : $ sh <paquete_gpucomputingsdk>.run   (el archivo "GPU Computing SDK" descargado en el Paso 1)
Enter para instalar el path por defecto.
Dirigimos la ruta de CUDA en este caso esta por defecto en /usr/local/cuda . Presionamos enter para confirmar

Instalamos los paquetes necesarios
$ sudo apt-get install g++ freeglut3-dev libxi-dev
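Rompemos el enlace de libGL.so y lo volvemos a crear (los nombres de archivo se perdieron en el texto original; verifica en /usr/lib cuál es la biblioteca instalada por el driver antes de ejecutar estos comandos):
$ sudo rm -f /usr/lib/libGL.so
$ sudo ln -s /usr/lib/libGL.so.1 /usr/lib/libGL.so
Creamos un enlace para libXmu:

$ sudo ln -s /usr/lib/libXmu.so.6 /usr/lib/libXmu.so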

Compilemos GPU COmputing

$ cd ~/NVIDIA_GPU_Computing_SDK/C
$ make

Los ejecutables se ubican en ~/NVIDIA_GPU_Computing_SDK/C/bin/linux/release . Uno de sus ejemplos es esta simulación del humo de cigarro.

Espero les sirva este post!!! xD.

Julia SeT in CPU and GPU

While reading the book "CUDA by Example", I found a simple problem: the "Julia Set" (definition of Julia Set).


/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 */

#include "../common/book.h"
#include "../common/cpu_bitmap.h"

#define DIM 1000

// Minimal complex number: `r` holds the real part and `i` the imaginary part.
// Provides only what the Julia iteration needs: squared magnitude, product
// and sum. All operations are read-only on their operands, hence `const`.
struct cuComplex {
    float r;
    float i;

    // Builds the complex number a + b*i.
    cuComplex( float a, float b ) : r(a), i(b) {}

    // Squared magnitude r*r + i*i (no square root is taken).
    float magnitude2( void ) const { return r * r + i * i; }

    // Complex product: (r + i*j)(a.r + a.i*j).
    cuComplex operator*(const cuComplex& a) const {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }

    // Component-wise complex sum.
    cuComplex operator+(const cuComplex& a) const {
        return cuComplex(r+a.r, i+a.i);
    }
};

// Decides whether pixel (x, y) of a DIM x DIM image belongs to the set.
//
// The pixel is mapped to a point `a` in the complex plane (centred on the
// image, scaled by 1.5), then a = a*a + c is iterated up to 200 times with
// c = -0.8 + 0.156i.
//
// Returns 0 as soon as the squared magnitude exceeds 1000 (the point
// diverges), or 1 if all 200 iterations stay below that threshold.
int julia( int x, int y ) {
    const float scale = 1.5f;
    float jx = scale * (float)(DIM/2 - x)/(DIM/2);
    float jy = scale * (float)(DIM/2 - y)/(DIM/2);

    cuComplex c(-0.8f, 0.156f);
    cuComplex a(jx, jy);

    for (int i=0; i<200; i++) {
        a = a * a + c;
        if (a.magnitude2() > 1000)
            return 0;
    }

    return 1;
}

// CPU "kernel": fills a DIM x DIM image, 4 bytes per pixel, row by row.
//
// ptr must point to at least DIM*DIM*4 writable bytes. For every pixel,
// byte 1 is set to 255 when julia(x, y) returns 1 and to 0 otherwise,
// bytes 0 and 2 are cleared, and byte 3 is set to 255
// (presumably an RGBA layout, i.e. green on opaque black -- confirm against
// the bitmap class that consumes the buffer).
void kernel( unsigned char *ptr ){
    for (int y=0; y<DIM; y++) {
        for (int x=0; x<DIM; x++) {
            int offset = x + y * DIM;

            int juliaValue = julia( x, y );
            ptr[offset*4 + 0] = 0;
            ptr[offset*4 + 1] = 255 * juliaValue;
            ptr[offset*4 + 2] = 0;
            ptr[offset*4 + 3] = 255;
        }
    }
}

// Entry point of the CPU version: allocates a DIM x DIM bitmap, lets
// kernel() fill its pixel buffer, then hands the bitmap over for display.
int main( void ) {
    CPUBitmap bitmap( DIM, DIM );
    unsigned char *ptr = bitmap.get_ptr();

    kernel( ptr );

    // NOTE(review): the listing ended right after kernel(); the display call
    // below is the one CPUBitmap offers in the book's cpu_bitmap.h -- confirm
    // against the header actually in ../common.
    bitmap.display_and_exit();
    return 0;
}



/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 */

#include "../common/book.h"
#include "../common/cpu_bitmap.h"

#define DIM 1000

// Minimal complex number usable from both host and device code: `r` holds
// the real part and `i` the imaginary part.
//
// The constructor must be callable on the device because operator* and
// operator+ construct their results there; without an execution-space
// qualifier it is host-only and recent nvcc versions reject the device
// calls. Everything is marked __host__ __device__ so existing host-side
// construction keeps working.
struct cuComplex {
    float r;
    float i;

    // Builds the complex number a + b*i.
    __host__ __device__ cuComplex( float a, float b ) : r(a), i(b) {}

    // Squared magnitude r*r + i*i (no square root is taken).
    __host__ __device__ float magnitude2( void ) const {
        return r * r + i * i;
    }

    // Complex product.
    __host__ __device__ cuComplex operator*(const cuComplex& a) const {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }

    // Component-wise complex sum.
    __host__ __device__ cuComplex operator+(const cuComplex& a) const {
        return cuComplex(r+a.r, i+a.i);
    }
};

// Device function: decides whether pixel (x, y) of a DIM x DIM image belongs
// to the set.
//
// The pixel is mapped to a point `a` in the complex plane (centred on the
// image, scaled by 1.5), then a = a*a + c is iterated up to 200 times with
// c = -0.8 + 0.156i.
//
// Returns 0 as soon as the squared magnitude exceeds 1000, otherwise 1.
// Float literals are used throughout so no double-precision arithmetic is
// generated in device code.
__device__ int julia( int x, int y ) {
    const float scale = 1.5f;
    float jx = scale * (float)(DIM/2 - x)/(DIM/2);
    float jy = scale * (float)(DIM/2 - y)/(DIM/2);

    cuComplex c(-0.8f, 0.156f);
    cuComplex a(jx, jy);

    for (int i=0; i<200; i++) {
        a = a * a + c;
        if (a.magnitude2() > 1000)
            return 0;
    }

    return 1;
}

// Kernel: computes one pixel per block.
//
// Launch layout: a 2D grid in which blockIdx.(x, y) is the pixel coordinate;
// threadIdx is not used, so each block is expected to hold a single thread.
// ptr must be a device buffer of at least gridDim.x * gridDim.y * 4 bytes.
// For each pixel, byte 0 is 255 when julia(x, y) returns 1 and 0 otherwise,
// bytes 1 and 2 are cleared and byte 3 is set to 255.
__global__ void kernel( unsigned char *ptr ) {
    // map from blockIdx to pixel position
    int x = blockIdx.x;
    int y = blockIdx.y;
    int offset = x + y * gridDim.x;

    // now calculate the value at that position
    int juliaValue = julia( x, y );
    ptr[offset*4 + 0] = 255 * juliaValue;
    ptr[offset*4 + 1] = 0;
    ptr[offset*4 + 2] = 0;
    ptr[offset*4 + 3] = 255;
}

// globals needed by the update routine
// Holds the pointer to the bitmap buffer allocated on the device; main()
// stores the cudaMalloc result here and passes the struct to CPUBitmap.
struct DataBlock {
    unsigned char *dev_bitmap;
};

// Entry point of the GPU version: allocates the image on the device, runs
// one block per pixel, copies the result back into the host bitmap and
// releases the device buffer.
int main( void ) {
    DataBlock data;
    CPUBitmap bitmap( DIM, DIM, &data );
    unsigned char *dev_bitmap;

    HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap, bitmap.image_size() ) );
    data.dev_bitmap = dev_bitmap;

    // DIM x DIM blocks of one thread each: blockIdx is the pixel coordinate.
    dim3 grid(DIM,DIM);
    kernel<<<grid,1>>>( dev_bitmap );
    // A launch returns no status; a bad configuration only shows up here.
    HANDLE_ERROR( cudaGetLastError() );

    // cudaMemcpy needs the byte count as third argument (it was missing);
    // the copy also blocks until the kernel has finished.
    HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
                              bitmap.image_size(),
                              cudaMemcpyDeviceToHost ) );

    HANDLE_ERROR( cudaFree( dev_bitmap ) );

    // NOTE(review): the listing ended after cudaFree; the display call below
    // is the one CPUBitmap offers in the book's cpu_bitmap.h -- confirm
    // against the header actually in ../common.
    bitmap.display_and_exit();
    return 0;
}


GPU programming is really interesting. Why? Because the future of computing is parallel computing!

See you :P

Get the Properties of Your GPU Device // Obtén las propiedades de tu dispositivo GPU

Si deseas obtener información de tu tarjeta o tarjetas lo puedes hacer de la siguiente manera:

#include <iostream>

using namespace std;

// Prints the properties of every CUDA device reported by the runtime:
// general information, memory information and multiprocessor information.
// Returns 0 on success and 1 if the device count cannot be obtained.
int main()
{
    cudaDeviceProp prop;

    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count); /*count number of devices*/
    if (err != cudaSuccess)
    {
        // Without this check `count` would be read uninitialised on failure.
        cerr<<"cudaGetDeviceCount failed: "<<cudaGetErrorString(err)<<endl;
        return 1;
    }

    for(int i=0;i<count;i++)
    {
        err = cudaGetDeviceProperties(&prop,i); /*get properties of device*/
        if (err != cudaSuccess)
        {
            cerr<<"cudaGetDeviceProperties failed: "<<cudaGetErrorString(err)<<endl;
            continue;
        }

        cout<<"General Information for device----"<<i<<endl;
        cout<<"Name "<<prop.name<<endl;
        cout<<"Compute capability "<<prop.major<<". "<<prop.minor<<endl;
        cout<<"Clock Rate "<<prop.clockRate<<endl;
        cout<<"Device copy overlap: ";
        if(prop.deviceOverlap) cout<<"Enabled\n";
        else cout<<"Disabled\n";

        // NOTE(review): the label and condition of this second
        // Enabled/Disabled pair were lost in the listing; reconstructed as the
        // kernel execution timeout flag -- confirm against the original post.
        cout<<"Kernel execution timeout: ";
        if(prop.kernelExecTimeoutEnabled) cout<<"Enabled\n";
        else cout<<"Disabled\n";

        cout<<"---Memory Information for device "<<i<<endl;
        cout<<"Total global Mem: "<<prop.totalGlobalMem<<endl;
        cout<<"Total COnstant Mem: "<<prop.totalConstMem<<endl;
        cout<<"Max mem pitch: "<<prop.memPitch<<endl;
        cout<<"Texture ALignment "<<prop.textureAlignment;
        cout<<"\n -------MP Information for devices "<<i;
        cout<<" \nMultiprocessor count : "<<prop.multiProcessorCount<<endl;

        cout<<"Shared mem per mp: "<<prop.sharedMemPerBlock<<endl;
        cout<<"Registers per mp: "<<prop.regsPerBlock<<endl;
        cout<<"Threads in warp: "<<prop.warpSize<<endl;
        cout<<"Max Threads per block "<<prop.maxThreadsPerBlock<<endl;
        cout<<"Max Threads dimensions: ("<<prop.maxThreadsDim[0]<<","<<prop.maxThreadsDim[1]
            <<","<<prop.maxThreadsDim[2]<<")"<<endl;

        cout<<"Max grid dimensions: ("<<prop.maxGridSize[0]<<","<<prop.maxGridSize[1]
            <<","<<prop.maxGridSize[2]<<")"<<endl;
    }
    return 0;
}

see you!!! xD

Multiplicación de Matrices Multibloque con CUDA

Ahora veremos la multiplicación de matrices, pero usando un nuevo concepto, los TILES, con el cual podemos hacer multiplicación de matrices multibloque.

A continuación, el siguiente código aplica ese concepto:

#include <stdio.h>
#include <assert.h>
#include <iostream>

#define TILE_WIDTH 4
using namespace std;

// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char *msg);

// Part 3 of 5: implement the kernel
// Kernel: multi-block integer matrix product d_c = d_a * d_b for square,
// row-major width x width matrices.
//
// Launch layout: 2D grid of 2D blocks whose side is TILE_WIDTH
// (blockDim.x == blockDim.y == TILE_WIDTH); each thread produces one output
// element. The grid must cover at least width x width threads; threads
// that fall outside the matrix return without touching memory.
// No shared memory is used.
__global__ void MatrixTiles(int *d_a ,int *d_b,int *d_c,int width )
{
    int row=blockIdx.x*TILE_WIDTH+threadIdx.x;
    int col=blockIdx.y*TILE_WIDTH+threadIdx.y;

    // Guard the grid tail so an oversized launch cannot write out of bounds.
    if (row >= width || col >= width)
        return;

    int pvalue=0;
    // each thread computes one element of the block sub-matrix
    // NOTE(review): the loop body and the final store were lost in the
    // listing; reconstructed as the usual row-by-column dot product.
    for(int i=0;i<width;i++)
        pvalue += d_a[row*width+i] * d_b[i*width+col];

    d_c[row*width+col]=pvalue;
}

// Program main
// Program main: multiplies two 8x8 integer matrices on the GPU with a 2x2
// grid of TILE_WIDTH x TILE_WIDTH thread blocks and prints the product.
// Returns 0 on success, 1 if host memory cannot be allocated.
int main( int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // pointer for host memory
    int *h_a,*h_b,*h_c;

    // pointer for device memory
    int *d_a,*d_b,*d_c;

    // define grid and block size
    int numBlocks = 4;
    int numThreadsPerBlock = 16;
    int numElements = numBlocks * numThreadsPerBlock;   // 64 ints = one 8x8 matrix
    int width = TILE_WIDTH*2;                           // matrix side

    // Part 1 of 5: allocate host and device memory
    size_t memSize = numElements * sizeof(int);
    h_a = (int *) malloc(memSize);
    h_b = (int *) malloc(memSize);
    h_c = (int *) malloc(memSize);
    if (!h_a || !h_b || !h_c)
    {
        fprintf(stderr, "Host allocation failed.\n");
        free(h_a);
        free(h_b);
        free(h_c);
        return 1;
    }

    cudaMalloc((void**) &d_a,memSize );
    cudaMalloc((void**) &d_b,memSize );
    cudaMalloc((void**) &d_c,memSize );
    checkCUDAError("cudaMalloc");

    // NOTE(review): the original initialisation statement was lost in the
    // listing; both inputs are filled with 1 here, so every element of the
    // product equals `width`.
    for(int n=0;n<numElements;n++)
    {
        h_a[n]=1;
        h_b[n]=1;
        h_c[n]=0;
    }

    // Part 2 of 5: host to device copy
    cudaMemcpy( d_a,h_a ,memSize,cudaMemcpyHostToDevice);
    cudaMemcpy( d_b,h_b ,memSize,cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy host to device");

    // Part 3 of 5: configure and launch kernel
    dim3 dimGrid( numBlocks/2,numBlocks/2 );
    dim3 dimBlock( numThreadsPerBlock/TILE_WIDTH,numThreadsPerBlock/TILE_WIDTH);
    MatrixTiles<<< dimGrid ,dimBlock >>>(d_a,d_b,d_c,width);

    // check if kernel execution generated an error (must come after the launch)
    checkCUDAError("kernel execution");

    // Part 4 of 5: device to host copy
    // cudaMemcpy blocks until the device has completed
    cudaMemcpy( h_c,d_c ,memSize,cudaMemcpyDeviceToHost);

    // Check for any CUDA errors
    checkCUDAError("cudaMemcpy device to host");

    for(int i=0;i<width;i++)
    {
        for(int j=0;j<width;j++)
            cout<<h_c[width*i+j]<<" ";
        cout<<endl;
    }

    // Part 5 of 5: release resources
    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

// Reports the last CUDA runtime error, if any, and terminates the program.
//
// msg: short label for the operation just attempted; printed before the
//      runtime's own error string.
// Reads (and clears) the error state with cudaGetLastError(). On success the
// function returns silently; on failure it prints to stderr and exits with
// EXIT_FAILURE, which is what the callers' "if the program makes it this
// far" comments rely on.
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }
}

El siguiente post será para mejorar esto. Los tiles se usan porque cada bloque en CUDA tiene un tamaño máximo, que en esta tarjeta es de 512 threads.

Mi primer programa en GPU -CUDA

Después de unas lecturas para unos trabajos que haré, mi primer programa en GPU es aún una simple multiplicación de matrices con hilos en CUDA. Las versiones mejoradas ya las subiré.

#include <stdio.h>
#include <assert.h>
#include <iostream>

using namespace std;

// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char *msg);

// Part 3 of 5: implement the kernel
// Kernel: integer matrix product d_c = d_a * d_b for square, row-major
// width x width matrices, one output element per thread.
//
// Launch layout: 2D blocks; the launch in this file uses a single
// width x width block. The global index is built from blockDim rather than
// from `width`, so launches with several blocks also map correctly; threads
// outside the matrix return without touching memory.
// No shared memory is used.
__global__ void MatrixSimple(int *d_a ,int *d_b,int *d_c,int width )
{
    int row=blockIdx.x*blockDim.x+threadIdx.x;
    int col=blockIdx.y*blockDim.y+threadIdx.y;

    // Guard the grid tail so an oversized launch cannot write out of bounds.
    if (row >= width || col >= width)
        return;

    int pvalue=0;
    // each thread computes one element of the result matrix
    // NOTE(review): the loop body and the final store were lost in the
    // listing; reconstructed as the usual row-by-column dot product.
    for(int i=0;i<width;i++)
        pvalue += d_a[row*width+i] * d_b[i*width+col];

    d_c[row*width+col]=pvalue;
}

// Program main
// Program main: multiplies two 8x8 integer matrices on the GPU with a single
// 8x8 thread block and prints the product.
// Returns 0 on success, 1 if host memory cannot be allocated.
int main( int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // pointer for host memory
    int *h_a,*h_b,*h_c;

    // pointer for device memory
    int *d_a,*d_b,*d_c;

    // define grid and block size
    int numBlocks = 1;
    int numThreadsPerBlock = 64;
    int numElements = numBlocks * numThreadsPerBlock;   // 64 ints = one 8x8 matrix
    int width = numThreadsPerBlock/8;                   // matrix side

    // Part 1 of 5: allocate host and device memory
    size_t memSize = numElements * sizeof(int);
    h_a = (int *) malloc(memSize);
    h_b = (int *) malloc(memSize);
    h_c = (int *) malloc(memSize);
    if (!h_a || !h_b || !h_c)
    {
        fprintf(stderr, "Host allocation failed.\n");
        free(h_a);
        free(h_b);
        free(h_c);
        return 1;
    }

    cudaMalloc((void**) &d_a,memSize );
    cudaMalloc((void**) &d_b,memSize );
    cudaMalloc((void**) &d_c,memSize );
    checkCUDAError("cudaMalloc");

    // NOTE(review): the original initialisation statement was lost in the
    // listing; both inputs are filled with 1 here, so every element of the
    // product equals `width`.
    for(int n=0;n<numElements;n++)
    {
        h_a[n]=1;
        h_b[n]=1;
        h_c[n]=0;
    }

    // Part 2 of 5: host to device copy
    cudaMemcpy( d_a,h_a ,memSize,cudaMemcpyHostToDevice);
    cudaMemcpy( d_b,h_b ,memSize,cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy host to device");

    // Part 3 of 5: configure and launch kernel
    dim3 dimGrid( numBlocks );
    dim3 dimBlock( numThreadsPerBlock/8,numThreadsPerBlock/8);
    MatrixSimple<<< dimGrid ,dimBlock >>>(d_a,d_b,d_c,width);

    // check if kernel execution generated an error (must come after the launch)
    checkCUDAError("kernel execution");

    // Part 4 of 5: device to host copy
    // cudaMemcpy blocks until the device has completed
    cudaMemcpy( h_c,d_c ,memSize,cudaMemcpyDeviceToHost);

    // Check for any CUDA errors
    checkCUDAError("cudaMemcpy device to host");

    for(int i=0;i<width;i++)
    {
        for(int j=0;j<width;j++)
            cout<<h_c[width*i+j]<<" ";
        cout<<endl;
    }

    // Part 5 of 5: release resources
    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

// Reports the last CUDA runtime error, if any, and terminates the program.
//
// msg: short label for the operation just attempted; printed before the
//      runtime's own error string.
// Reads (and clears) the error state with cudaGetLastError(). On success the
// function returns silently; on failure it prints to stderr and exits with
// EXIT_FAILURE, which is what the callers' "if the program makes it this
// far" comments rely on.
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }
}

Hasta la prox!!!