NVIDIA GPUs
From UF HPC Wiki
Contents |
Introduction
The UF HPC Center has acquired two nVidia Teslas. Each Tesla S1070 1U chassis contains four (4) nVidia Tesla (C1060) GPUs and 16 GB of available RAM (4 GB per GPU). The GPUs are hosted by four (4) independent host nodes (tesla1, tesla2, tesla3, tesla4) such that each node has access to two GPU devices. Each host has four (4) Intel E5462 cores running at 2.8 GHz, 16 GB of RAM, and the same software environment as the rest of the HPC Center cluster nodes. In essence, each of the four nodes is a Phase III cluster compute host with attached GPUs.
We currently have no production codes that support the GPUs, although there is a beta version (2.7b1) of NAMD with limited GPU support. Therefore, these nodes and the associated GPUs are available for interactive testing and development. You need only log in to one of the four hosts via ssh from submit. The CUDA software tools are available in /opt/cuda and /opt/cuda-sdk.
CUDA is the nVidia Compute Unified Device Architecture environment. It is a software development environment that allows you to port and develop applications which will utilize the parallel processing capabilities of the nVidia GPUs. See the following links for more information.
nVidia GPU Comparison
| GPU | Processors | GFlops<sup>1</sup> |
|---|---|---|
| C870 | 128 | 519 |
| C1060 | 240 | 936 |
- 1. Single-Precision
Other Resources
Example: CUBLAS vs. Optimized BLAS
Note that double-precision linear algebra is a less-than-ideal application for the GPUs. Still, it is a functional example of using one of the existing CUDA libraries.
#include <stdio.h>
#include <stdlib.h>
#include <acml.h>
#include <math.h>
#include <time.h>
#include "cublas.h"
void resuse(char *str);
/*
 * timeDiff - elapsed time in seconds from *t1 to *t2.
 *
 * t1: starting timestamp
 * t2: ending timestamp
 *
 * Returns (t2 - t1) as a double; the value is negative if t2 precedes t1.
 * The seconds and nanoseconds fields are differenced separately before
 * being converted to floating point, so the nanosecond resolution is not
 * lost when the absolute timestamps are large.
 */
double timeDiff(const struct timespec *t1, const struct timespec *t2)
{
    double secs  = (double)(t2->tv_sec - t1->tv_sec);
    double nsecs = (double)(t2->tv_nsec - t1->tv_nsec);

    return secs + nsecs / 1.0e9;
}
/* Set every element of the host matrices A and B to 1.0 and clear C to 0.0. */
static void initMatrices(double *a, double *b, double *c, size_t count)
{
    size_t n;

    for (n = 0; n < count; n++) {
        a[n] = 1.0;
        b[n] = 1.0;
        c[n] = 0.0;
    }
}

/*
 * With A and B filled with ones, every element of C = A * B must equal dim.
 * Spot-check the first and last elements and report any mismatch.  The
 * comparison uses the absolute difference so that values that are too large
 * are caught as well as values that are too small.
 */
static void verifyResult(const double *c, int dim)
{
    size_t last = (size_t)dim * (size_t)dim - 1;

    if (fabs((double)dim - c[0]) > 1.0e-5 ||
        fabs((double)dim - c[last]) > 1.0e-5) {
        printf("Error: Incorrect Results!\n");
        printf("C[%2d,%2d] = %10.4f\n", 1, 1, c[0]);
        printf("C[%2d,%2d] = %10.4f\n", dim, dim, c[last]);
    }
}

/*
 * Benchmark a dim x dim double-precision matrix multiply twice: first with
 * the host BLAS dgemm(), then with cublasDgemm() on the GPU.  The GPU timing
 * covers device allocation, the host-to-device copies, the multiply itself
 * and the copy of the result back to the host.
 *
 * Returns EXIT_SUCCESS when both runs complete, EXIT_FAILURE on a host
 * allocation failure or any CUBLAS error.  All host and device resources
 * are released on every exit path.
 */
int main(void)
{
    int dim = 9100;
    size_t count = (size_t)dim * (size_t)dim;
    int status;
    int rc = EXIT_FAILURE;
    int cublasReady = 0;            /* set once cublasInit() has succeeded */
    double *psa = NULL, *psb = NULL, *psc = NULL;   /* host matrices   */
    double *pda = NULL, *pdb = NULL, *pdc = NULL;   /* device matrices */
    char CblasNoTrans = 'N';
    double alpha = 1.0;
    double beta = 0.0;
    double deltaT = 0.0;
    double gflopCnt = 2.0 * dim * dim * dim / 1.0e9;
    struct timespec t1;
    struct timespec t2;

    psa = malloc(count * sizeof(*psa));
    psb = malloc(count * sizeof(*psb));
    psc = malloc(count * sizeof(*psc));
    if (psa == NULL || psb == NULL || psc == NULL) {
        fprintf(stderr, "!!!! host memory allocation error\n");
        goto cleanup;
    }

    printf("Initializing Matrices...");
    clock_gettime(CLOCK_MONOTONIC, &t1);
    initMatrices(psa, psb, psc, count);
    clock_gettime(CLOCK_MONOTONIC, &t2);
    deltaT = timeDiff(&t1, &t2);
    printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
    fflush(stdout);

    /* Host BLAS run */
    printf("Starting parallel DGEMM...");
    fflush(stdout);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    dgemm(CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, psa, dim, psb, dim, beta, psc, dim);
    clock_gettime(CLOCK_MONOTONIC, &t2);
    deltaT = timeDiff(&t1, &t2);
    printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
    printf(" ");
    printf("GFlOP Rate = %8.4f\n", gflopCnt/deltaT);
    verifyResult(psc, dim);

    /* Initialize CUDA */
    printf("Initializing CUDA...");
    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        goto cleanup;
    }
    cublasReady = 1;
    printf("Done.\n");

    /* Re-initialize A, B and C so the GPU run starts from the same state */
    printf("Re-initializing Matrices...");
    clock_gettime(CLOCK_MONOTONIC, &t1);
    initMatrices(psa, psb, psc, count);
    clock_gettime(CLOCK_MONOTONIC, &t2);
    deltaT = timeDiff(&t1, &t2);
    printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
    fflush(stdout);

    /* Allocate device memory for the matrices */
    printf("Starting CUDA DGEMM...");
    clock_gettime(CLOCK_MONOTONIC, &t1);
    status = cublasAlloc((int)count, sizeof(*pda), (void**) &pda);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        goto cleanup;
    }
    status = cublasAlloc((int)count, sizeof(*pdb), (void**) &pdb);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (B)\n");
        goto cleanup;
    }
    status = cublasAlloc((int)count, sizeof(*pdc), (void**) &pdc);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (C)\n");
        goto cleanup;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector((int)count, sizeof(*psa), psa, 1, pda, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write A)\n");
        goto cleanup;
    }
    status = cublasSetVector((int)count, sizeof(*psb), psb, 1, pdb, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write B)\n");
        goto cleanup;
    }
    status = cublasSetVector((int)count, sizeof(*psc), psc, 1, pdc, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write C)\n");
        goto cleanup;
    }

    /* Clear last error */
    cublasGetError();

    /* Perform the multiply using cublas */
    cublasDgemm('n', 'n', dim, dim, dim, alpha, pda, dim, pdb, dim, beta, pdc, dim);
    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! kernel execution error.\n");
        goto cleanup;
    }

    /* Read the result back */
    status = cublasGetVector((int)count, sizeof(*psc), pdc, 1, psc, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (read C)\n");
        goto cleanup;
    }
    clock_gettime(CLOCK_MONOTONIC, &t2);
    deltaT = timeDiff(&t1, &t2);
    printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
    printf(" ");
    printf("GFlOP Rate = %8.4f\n", gflopCnt/deltaT);
    verifyResult(psc, dim);

    rc = EXIT_SUCCESS;

cleanup:
    /* Device buffers may only be released while the CUBLAS context is up. */
    if (cublasReady) {
        if (pda != NULL)
            cublasFree(pda);
        if (pdb != NULL)
            cublasFree(pdb);
        if (pdc != NULL)
            cublasFree(pdc);
        cublasShutdown();
    }
    free(psa);
    free(psb);
    free(psc);
    return rc;
}
/*---------------------------------------------------------------------------*
* resuse.c *
* *
* resuse_(desc) *
* char *desc; *
*---------------------------------------------------------------------------*
* This routine makes use of the getrusage system call to collect and *
* display resource utilization from one point in a program to another. *
Thus, it works somewhat like a stopwatch in the sense that the first call *
* initializes, or starts, the resource collection over the interval. The *
* second call stops the collection and displays the results accumulated *
* since the previous call. This means, of course, that two calls are *
* required for each section of code to be monitored. *
*
* The underscore at the end of the routine name is there so that the routine*
* may be called as an integer valued FORTRAN function name RESUSE(), under *
* both the SunOS and Ultrix f77 compilers. AIX inter-language calling *
* conventions are different so the routine must be referenced as RESUSE_() *
* under AIX (RISC/6000) FORTRAN (xlf).
*
* From a FORTRAN routine:
*
* INT = RESUSE("Some String")
*
* fortran code to be timed
*
* INT = RESUSE("Some String")
*
* Just ignore the returned value.
*---------------------------------------------------------------------------*/
#include <sys/time.h>
#include <sys/resource.h>
/* Non-zero when the next call should start an interval; toggled by every call. */
static int first_call = 1;
/* Resource usage snapshot recorded by the previous call. */
static struct rusage initial;

/*
 * Seconds elapsed between two timeval stamps.  The fields are differenced
 * before the conversion to floating point so that microsecond resolution is
 * kept even when the absolute times are large.
 */
static double elapsedSecs(const struct timeval *from, const struct timeval *to)
{
    return (double)(to->tv_sec - from->tv_sec) +
           (double)(to->tv_usec - from->tv_usec) / 1000000.0;
}

/*
 * resuse - stopwatch-style resource usage report based on getrusage().
 *
 * str: label printed in the banner of each message.
 *
 * Calls come in pairs.  The first call of a pair prints a "Collecting"
 * banner and records a snapshot; the second prints the CPU time, page
 * fault, swap, context switch and block I/O deltas accumulated since that
 * snapshot.  The call after that starts a new interval.
 */
void resuse(char *str)
{
    struct rusage final;

    getrusage(RUSAGE_SELF, &final);

    if ( ! first_call )
    {
        double usr = elapsedSecs(&initial.ru_utime, &final.ru_utime);
        double sys = elapsedSecs(&initial.ru_stime, &final.ru_stime);
        /* The rusage counters are long; keep them long to avoid truncation. */
        long pgminor = final.ru_minflt  - initial.ru_minflt;
        long pgmajor = final.ru_majflt  - initial.ru_majflt;
        long nswap   = final.ru_nswap   - initial.ru_nswap;
        long inblock = final.ru_inblock - initial.ru_inblock;
        long oublock = final.ru_oublock - initial.ru_oublock;
        long nvcsw   = final.ru_nvcsw   - initial.ru_nvcsw;
        long nivcsw  = final.ru_nivcsw  - initial.ru_nivcsw;

        printf("=============================================================\n");
        printf("%s: Resource Usage Data...\n", str);
        printf("-------------------------------------------------------------\n");
        printf("User Time (secs) : %10.3f\n", usr);
        printf("System Time (secs) : %10.3f\n", sys);
        printf("Total Time (secs) : %10.3f\n", usr + sys);
        printf("Minor Page Faults : %10ld\n", pgminor);
        printf("Major Page Faults : %10ld\n", pgmajor);
        printf("Swap Count : %10ld\n", nswap);
        printf("Voluntary Context Switches : %10ld\n", nvcsw);
        printf("Involuntary Context Switches: %10ld\n", nivcsw);
        printf("Block Input Operations : %10ld\n", inblock);
        printf("Block Output Operations : %10ld\n", oublock);
        printf("=============================================================\n");
    }
    else
    {
        printf("=============================================================\n");
        printf("%s: Collecting Resource Usage Data\n", str);
        printf("=============================================================\n");
    }

    /* Alternate between "start" and "report", and remember this snapshot. */
    first_call = !first_call;
    initial = final;
}
Makefile for Building CUBLAS Example
# Build the CUBLAS vs. optimized BLAS benchmark (cuda_bm).

CC          = icc
INCS        = -I/opt/intel/acml/410/ifort64_mp/include
CFLAGS      = -O3 -mp $(INCS)

# Host BLAS alternatives; LIBS selects the one that is linked.
LIB_ACML    = -L/opt/intel/acml/410/ifort64/lib -lacml -lacml_mv
LIB_ACML_MP = -L/opt/intel/acml/410/ifort64_mp/lib -lacml_mp -lacml_mv
LIB_GOTO    = -L/opt/lib/goto -lgoto_opteron64p-r1.02 -lpthread

LIBS        = $(LIB_ACML_MP) -lrt -lpthread
#LIBS       = $(LIB_ACML_MP) $(LIB_PSC) -lrt
#LIBS       = $(LIB_GOTO)

CUDA_INCS   = -I/usr/local/cuda/include $(INCS)
CUDA_LIBS   = -L/usr/local/cuda/lib -lcublas $(LIBS)
CUDA_CFLAGS = -O0 -g

.PHONY: clean

cuda_bm: cuda_bm.c resuse.o
	$(CC) $(CUDA_CFLAGS) $(CUDA_INCS) -o cuda_bm cuda_bm.c resuse.o $(CUDA_LIBS)

clean:
	rm -f *.o core.* bm cuda_bm
