viernes, 19 de noviembre de 2010

Installing NVIDIA CUDA (GPU) on Ubuntu 10.10

In this tutorial I describe how I installed NVIDIA CUDA with a GeForce 9500GT card. The operating system I installed it on was Ubuntu 10.10.

Step 1: Downloading CUDA
From this link, download the following packages:
Developer Driver for Linux (260.19.14)
Cuda Toolkit
GPU Computing SDK code sample
In the folder where you downloaded each package, open a terminal and run: chmod 0777 package_name

Step 2: Blacklist the framebuffer modules and remove the default nvidia packages

In a terminal, as superuser, do the following:

#nano /etc/modprobe.d/blacklist.conf

Add the following lines at the end of the file:
blacklist vga16fb
blacklist rivafb
blacklist nvidiafb
blacklist rivatv

Then run the following:
#apt-get purge nvidia-*
#reboot -n

Step 3: Switch to a text console, log in, and stop the gdm service

Press Ctrl+Alt+F5 to switch to a virtual console.

Log in and run: $ sudo service gdm stop

Step 4: Installing the developer driver
$ sudo sh devdriver_3.2_linux_32_260.19.21.run
Accept all the licenses.
When asked about the OpenGL compatibility libraries, answer "Yes".
It will then ask whether nvidia-xconfig should run automatically to update the X configuration file. Answer "Yes".

Step 5: Install the CUDA toolkit

Once the NVIDIA driver is installed, install the CUDA toolkit by running the following:

$ sudo sh cudatoolkit_3.2.16_linux_32_ubuntu10.04.run
Leave the default path and press Enter.

Step 6: Set the environment variables

Add the CUDA paths to the operating system environment. In a terminal do the following:
$ sudo nano /etc/environment
Add ":/usr/local/cuda/bin" before the closing " character. Save the file and exit the editor.

In the .bashrc file in your home directory, add the following:
export CUDA_HOME="/usr/local/cuda"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}":${CUDA_HOME}/lib
export PATH=${CUDA_HOME}/bin:${PATH}


# nano /etc/ld.so.conf.d/cuda.conf (create this new file with the following lines)
/usr/local/cuda/lib64
/usr/local/cuda/lib

Reload the dynamic linker cache:

#ldconfig
Step 7: Install the "GPU Computing SDK"

Install the SDK: $ sh gpucomputingsdk_3.2.16_linux.run
Press Enter to accept the default installation path.
It will ask for the CUDA install location, which in this case is the default /usr/local/cuda. Press Enter to confirm.

Install the required packages:
$ sudo apt-get install g++ freeglut3-dev libxi-dev
Re-create the libGL.so symlink so it points to the NVIDIA library:
$ sudo rm -f /usr/lib/libGL.so
$ sudo ln -s /usr/lib/libGL.so.260.19.21 /usr/lib/libGL.so
Create a symlink for libXmu:

$sudo ln -s /usr/lib/libXmu.so.6 /usr/lib/libXmu.so

Compile the GPU Computing SDK samples:

$ cd ~/NVIDIA_GPU_Computing_SDK/C
$ make

The resulting executables are placed in ~/NVIDIA_GPU_Computing_SDK/C/bin/linux/release. One of the examples is a simulation of cigarette smoke.
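As a quick sanity check that the driver and toolkit can talk to each other, you can also build a tiny program that only asks the runtime how many CUDA devices it sees. This is just a minimal sketch of mine (test.cu and the printed text are my own example, not part of the SDK):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    // ask the CUDA runtime how many devices the driver exposes
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("CUDA devices found: %d\n", count);   // should be at least 1 for the 9500GT
    return 0;
}

Compile it with nvcc (for example: nvcc test.cu -o test) and run it; if it reports at least one device, the installation is working.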





I hope this post is useful to you!!! xD




sábado, 13 de noviembre de 2010

Julia Set on the CPU and the GPU

Reading the book CUDA by Example, I found a simple problem: rendering the Julia set (see the definition of the Julia set).
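For reference, the iteration that both versions below compute is the standard quadratic Julia map, with the constant c and the escape test hard-coded in the source: starting from the complex number z_0 that corresponds to a pixel,

z_{n+1} = z_n^2 + c,   with c = -0.8 + 0.156i

and the pixel is colored if the squared magnitude of z_n has not exceeded 1000 after 200 iterations.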


CPU



/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/


#include "../common/book.h"
#include "../common/cpu_bitmap.h"

#define DIM 1000

struct cuComplex {
    float r;
    float i;
    cuComplex( float a, float b ) : r(a), i(b) {}
    float magnitude2( void ) { return r * r + i * i; }
    cuComplex operator*(const cuComplex& a) {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }
    cuComplex operator+(const cuComplex& a) {
        return cuComplex(r+a.r, i+a.i);
    }
};

int julia( int x, int y ) {
    const float scale = 1.5;
    float jx = scale * (float)(DIM/2 - x)/(DIM/2);
    float jy = scale * (float)(DIM/2 - y)/(DIM/2);

    cuComplex c(-0.8, 0.156);
    cuComplex a(jx, jy);

    int i = 0;
    for (i=0; i<200; i++) {
        a = a * a + c;
        if (a.magnitude2() > 1000)
            return 0;
    }

    return 1;
}

void kernel( unsigned char *ptr ){
    for (int y=0; y<DIM; y++) {
        for (int x=0; x<DIM; x++) {
            int offset = x + y * DIM;

            int juliaValue = julia( x, y );
            ptr[offset*4 + 1] = 255 * juliaValue;
            ptr[offset*4 + 0] = 0;
            ptr[offset*4 + 2] = 0;
            ptr[offset*4 + 3] = 255;
        }
    }
}

int main( void ) {
    CPUBitmap bitmap( DIM, DIM );
    unsigned char *ptr = bitmap.get_ptr();

    kernel( ptr );

    bitmap.display_and_exit();
}


GPU





/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/


#include "../common/book.h"
#include "../common/cpu_bitmap.h"

#define DIM 1000

struct cuComplex {
    float r;
    float i;
    cuComplex( float a, float b ) : r(a), i(b) {}
    __device__ float magnitude2( void ) {
        return r * r + i * i;
    }
    __device__ cuComplex operator*(const cuComplex& a) {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }
    __device__ cuComplex operator+(const cuComplex& a) {
        return cuComplex(r+a.r, i+a.i);
    }
};

__device__ int julia( int x, int y ) {
    const float scale = 1.5;
    float jx = scale * (float)(DIM/2 - x)/(DIM/2);
    float jy = scale * (float)(DIM/2 - y)/(DIM/2);

    cuComplex c(-0.8, 0.156);
    cuComplex a(jx, jy);

    int i = 0;
    for (i=0; i<200; i++) {
        a = a * a + c;
        if (a.magnitude2() > 1000)
            return 0;
    }

    return 1;
}

__global__ void kernel( unsigned char *ptr ) {
    // map from blockIdx to pixel position
    int x = blockIdx.x;
    int y = blockIdx.y;
    int offset = x + y * gridDim.x;

    // now calculate the value at that position
    int juliaValue = julia( x, y );
    ptr[offset*4 + 0] = 255 * juliaValue;
    ptr[offset*4 + 1] = 0;
    ptr[offset*4 + 2] = 0;
    ptr[offset*4 + 3] = 255;
}

// globals needed by the update routine
struct DataBlock {
    unsigned char *dev_bitmap;
};

int main( void ) {
    DataBlock data;
    CPUBitmap bitmap( DIM, DIM, &data );
    unsigned char *dev_bitmap;

    HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap, bitmap.image_size() ) );
    data.dev_bitmap = dev_bitmap;

    dim3 grid(DIM,DIM);
    kernel<<<grid,1>>>( dev_bitmap );

    HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
                              bitmap.image_size(),
                              cudaMemcpyDeviceToHost ) );

    HANDLE_ERROR( cudaFree( dev_bitmap ) );

    bitmap.display_and_exit();
}




The GPU really is very interesting. Why? Because the future of computing is parallel computing!!!!!


See you :P

Get the Properties of Your GPU Device

If you want to get information about your card (or cards), you can do it as follows:






#include <iostream>
#include <cuda_runtime.h>   // not strictly needed when compiling with nvcc

using namespace std;

int main()
{
    cudaDeviceProp prop;

    int count;
    cudaGetDeviceCount(&count);            /* count the number of devices */
    for(int i=0;i<count;i++)
    {
        cudaGetDeviceProperties(&prop,i);  /* get the properties of device i */
        cout<<"---General Information for device "<<i<<endl;
        cout<<"Name: "<<prop.name<<endl;
        cout<<"Compute capability: "<<prop.major<<"."<<prop.minor<<endl;
        cout<<"Clock rate: "<<prop.clockRate<<endl;
        cout<<"Device copy overlap: ";
        if(prop.deviceOverlap)
            cout<<"Enabled\n";
        else cout<<"Disabled\n";

        cout<<"Kernel execution timeout: ";
        if(prop.kernelExecTimeoutEnabled)
            cout<<"Enabled\n";
        else cout<<"Disabled\n";

        cout<<"---Memory Information for device "<<i<<endl;
        cout<<"Total global mem: "<<prop.totalGlobalMem<<endl;
        cout<<"Total constant mem: "<<prop.totalConstMem<<endl;
        cout<<"Max mem pitch: "<<prop.memPitch<<endl;
        cout<<"Texture alignment: "<<prop.textureAlignment<<endl;

        cout<<"---MP Information for device "<<i<<endl;
        cout<<"Multiprocessor count: "<<prop.multiProcessorCount<<endl;
        cout<<"Shared mem per block: "<<prop.sharedMemPerBlock<<endl;
        cout<<"Registers per block: "<<prop.regsPerBlock<<endl;
        cout<<"Threads in warp: "<<prop.warpSize<<endl;
        cout<<"Max threads per block: "<<prop.maxThreadsPerBlock<<endl;
        cout<<"Max thread dimensions: ("<<prop.maxThreadsDim[0]<<","<<prop.maxThreadsDim[1]
            <<","<<prop.maxThreadsDim[2]<<")\n";
        cout<<"Max grid dimensions: ("<<prop.maxGridSize[0]<<","<<prop.maxGridSize[1]
            <<","<<prop.maxGridSize[2]<<")\n";
    }
    return 0;
}


see you!!! xD

Multi-block Matrix Multiplication with CUDA

Now matrix multiplication again, but using a new concept, TILES, with which we can do matrix multiplication across multiple blocks.









Below is the code that applies this concept:




#include <stdio.h>
#include <assert.h>
#include <iostream>

#define TILE_WIDTH 4
using namespace std;

// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char *msg);

// Part 3 of 5: implement the kernel
__global__ void MatrixTiles(int *d_a ,int *d_b,int *d_c,int width )
{
    // each thread computes one element of the output matrix
    int row=blockIdx.x*TILE_WIDTH+threadIdx.x;
    int col=blockIdx.y*TILE_WIDTH+threadIdx.y;

    int pvalue=0;
    for(int i=0;i<width;i++)
    {
        pvalue+=(d_a[row*width+i]*d_b[i*width+col]);
    }
    d_c[row*width+col]=pvalue;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
    // pointers for host memory
    int *h_a,*h_b,*h_c;

    // pointers for device memory
    int *d_a,*d_b,*d_c;

    // define grid and block size
    int numBlocks = 4;
    int numThreadsPerBlock = 16;

    // Part 1 of 5: allocate host and device memory
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *) malloc(memSize);
    cudaMalloc((void**) &d_a,memSize );

    h_b = (int *) malloc(memSize);
    cudaMalloc((void**) &d_b,memSize );

    h_c = (int *) malloc(memSize);
    cudaMalloc((void**) &d_c,memSize );

    // fill both input matrices with 1s
    for(int n=0;n<numThreadsPerBlock*numBlocks;n++)
    {
        h_a[n]=h_b[n]=1;
    }

    // Part 2 of 5: configure and launch the kernel
    dim3 dimGrid( numBlocks/2,numBlocks/2 );
    dim3 dimBlock( numThreadsPerBlock/TILE_WIDTH,numThreadsPerBlock/TILE_WIDTH);

    // host to device copy of the inputs
    cudaMemcpy( d_a,h_a ,memSize,cudaMemcpyHostToDevice);
    cudaMemcpy( d_b,h_b ,memSize,cudaMemcpyHostToDevice);

    // launch the kernel (the matrix width here is TILE_WIDTH*2 = 8)
    MatrixTiles<<< dimGrid ,dimBlock >>>(d_a,d_b,d_c,TILE_WIDTH*2);

    // check if kernel execution generated an error
    checkCUDAError("kernel execution");

    // block until the device has completed
    cudaThreadSynchronize();

    // Part 4 of 5: device to host copy
    cudaMemcpy( h_c,d_c ,memSize,cudaMemcpyDeviceToHost);

    // Check for any CUDA errors
    checkCUDAError("cudaMemcpy");

    // Part 5 of 5: print the 8x8 result; with all-1 inputs every element should be 8
    for(int i=0;i<8;i++)
    {
        for(int j=0;j<8;j++)
            cout<<h_c[8*i+j]<<" ";
        cout<<endl;
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors. Good work!
    printf("Correct!\n");

    return 0;
}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(-1);
    }
}




The next post will improve on this. Tiles are used because each CUDA block is limited in size, to 512 threads on this hardware, so a large matrix has to be split across many blocks.
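To make the launch configuration explicit, here is a minimal sketch (N is a hypothetical name for the matrix width, and I assume it is a multiple of TILE_WIDTH): each block computes one TILE_WIDTH x TILE_WIDTH tile of the output, so the number of threads per block stays well below the 512 limit no matter how large the matrix is.

int N = 8;                                       // matrix width; assumed to be a multiple of TILE_WIDTH
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);           // 4 x 4 = 16 threads per block, far below 512
dim3 dimGrid(N / TILE_WIDTH, N / TILE_WIDTH);    // one block per output tile
MatrixTiles<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);

With N = 8 this is exactly the configuration used in main() above (a 2x2 grid of 4x4 blocks).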

lunes, 8 de noviembre de 2010

My first GPU program - CUDA


After some reading for a few projects I will be working on, here is my first GPU program. It is still just a simple matrix multiplication using threads in CUDA; I will upload improved versions later.




#include <stdio.h>
#include <assert.h>
#include <iostream>


using namespace std;

// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char *msg);

// Part 3 of 5: implement the kernel
__global__ void MatrixSimple(int *d_a ,int *d_b,int *d_c,int width )
{
    // each thread computes one element of the output matrix
    int row=blockIdx.x*width+threadIdx.x;
    int col=blockIdx.y*width+threadIdx.y;
    int pvalue=0;
    for(int i=0;i<width;i++)
    {
        pvalue+=(d_a[row*width+i]*d_b[i*width+col]);
    }
    d_c[row*width+col]=pvalue;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
    // pointers for host memory
    int *h_a,*h_b,*h_c;

    // pointers for device memory
    int *d_a,*d_b,*d_c;

    // define grid and block size
    int numBlocks = 1;
    int numThreadsPerBlock = 64;

    // Part 1 of 5: allocate host and device memory
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *) malloc(memSize);
    cudaMalloc((void**) &d_a,memSize );
    h_b = (int *) malloc(memSize);
    cudaMalloc((void**) &d_b,memSize );
    h_c = (int *) malloc(memSize);
    cudaMalloc((void**) &d_c,memSize );

    // fill both input matrices with 1s
    for(int n=0;n<numThreadsPerBlock*numBlocks;n++)
    {
        h_a[n]=h_b[n]=1;
    }

    // Part 2 of 5: configure and launch the kernel (one block of 8x8 threads)
    dim3 dimGrid( numBlocks );
    dim3 dimBlock( numThreadsPerBlock/8,numThreadsPerBlock/8);

    // host to device copy of the inputs
    cudaMemcpy( d_a,h_a ,memSize,cudaMemcpyHostToDevice);
    cudaMemcpy( d_b,h_b ,memSize,cudaMemcpyHostToDevice);

    // launch the kernel (the matrix width is numThreadsPerBlock/8 = 8)
    MatrixSimple<<< dimGrid ,dimBlock >>>(d_a,d_b,d_c,numThreadsPerBlock/8);

    // check if kernel execution generated an error
    checkCUDAError("kernel execution");

    // block until the device has completed
    cudaThreadSynchronize();

    // Part 4 of 5: device to host copy
    cudaMemcpy( h_c,d_c ,memSize,cudaMemcpyDeviceToHost);

    // Check for any CUDA errors
    checkCUDAError("cudaMemcpy");

    // Part 5 of 5: print the 8x8 result; with all-1 inputs every element should be 8
    for(int i=0;i<8;i++)
    {
        for(int j=0;j<8;j++)
            cout<<h_c[8*i+j]<<" ";
        cout<<endl;
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors. Good work!
    printf("Correct!\n");

    return 0;
}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(-1);
    }
}


See you next time!!!