Using CUDA

Kashif Rasul
and Raqibul Hassan

l a b s

• Intro to Mathematica and its API
• CUDA + Mathematica
• Some examples
Mathematica intro
• Mathematica is a modular
  computational system in which the
  kernel is separate from the front end
  which handles the interaction with the
  user.

• The most common way to work is to use
  interactive documents called notebooks
  which mix text input and output as well
  as graphics and other material.
Structure of
• An important aspect of Mathematica is
  that it can also interact with other
  programs.

• This is achieved through MathLink,
  a standardised API for two-way
  communication with the kernel.

• MathLink allows external programs both
  to call Mathematica, and to be called by
  Mathematica.

• We will use MathLink to let Mathematica
  call CUDA functions inside an external
  program.
Simple example
:Function:        addtwo
:Pattern:         AddTwo[i_Integer,j_Integer]
:Arguments:       { i, j }
:ArgumentTypes:   {Integer,Integer}
:ReturnType:      Integer

#include <mathlink.h>

int addtwo( int i, int j)
  return i+j;

int main(int argc, char* argv[])
  return MLMain(argc, argv);
mprep & gcc

$ mprep addtwo.tm -o addtwotm.c

$ gcc -I${INCDIR} addtwotm.c addtwo.c
  -L${LIBDIR} -lMLi3 -lstdc++ -o addtwo
In[3]:=   SetDirectory
            " Applications SystemFiles Links MathLink DeveloperKit

Out[3]=     Applications SystemFiles Links MathLink DeveloperKit

 In[4]:=   link      Install ". addtwo"
Out[4]=    LinkObject
             Applications SystemFiles Links MathLink DeveloperKit
              PrebuiltExamples addtwo, 524, 8

 In[5]:=   LinkPatterns link
Out[5]=     AddTwo i_Integer, j_Integer

 In[6]:=   ? AddTwo

           AddTwo x , y gives the sum of two machine integers x and y.

 In[7]:=   AddTwo 2, 3
Out[7]=    5

 In[8]:=   AddTwo[2^31 - 1, 1]
Out[8]=    -2 147 483 648

 In[9]:=   Uninstall link
Out[9]=     Applications SystemFiles Links MathLink DeveloperKit
             PrebuiltExamples addtwo
        Template file
• When a MathLink template file is
  processed, two basic things are done:
 •   :Pattern: & :Arguments: specifications
     are used to generate a Mathematica
     definition
 •   :Function:, :ArgumentTypes:
     & :ReturnType: specifications are used
     to generate C source code

Mathematica specification   C specification

       Integer                  int
         Real                 double
     IntegerList            int*, long
       RealList            double*, long
        String                 char*
        Symbol                 char*
        Manual                 void
              Lists & Arrays
:Begin:                          int sumList(int *a, long alen)
:Function:     sumList           {
:Pattern:      SumList[a_List]      int i, tot=0;
:Arguments:    {a}
:ArgumentTypes:{IntegerList}         for(i=0; i<alen; i++)
:ReturnType:   Integer                  tot += a[i];
                                     return tot;
Manual ArgumentTypes
                 :Function:     sumList
                 :Pattern:      SumList[a:{___Integer}]
                 :Arguments:    {a}
                 :ReturnType:   Integer

int sumList(void) {                       int sumList(void) {
  int n, i;                                 int n;
  int a[MAX];                               int *a;

  MLCheckFunction(stdlink, "List", &n);       MLGetInteger32List(stdlink, &a, &n);
  for (i=0; i<n; i++)                         MLReleaseInteger32List(stdlink, a, n);
    MLGetInteger32(stdlink, a+i);             ...
...                                       }
Array of arb. depth
#include <mathlink.h>

/* read an array of double-precision floating-point numbers from a link */
void f(MLINK lp)
    double *data;
    int *dims;
    char **heads;
    int d; /* stores the rank of the array */

    if(! MLGetRealArray(lp, &data, &dims, &heads, &d))
            /* unable to read the array from lp */
    /* ... */
    MLReleaseRealArray(lp, data, dims, heads, d);
Handling Complex
                    In[1]:=   Head[2 + 3 I]
                   Out[1]=    Complex

If you pass a list of complex numbers to your external program,
then MLGetReal64Array() will create a two-dimensional array
containing a sequence of pairs of real and imaginary parts. In this
case, heads[0] will be "List" while heads[1] will be "Complex".

    //get an array of floating-point numbers of any depth
Summary of API
//get a list of integers, allocating the memory needed to store it
//get a list of floating-point numbers
//release the memory associated with a list of integers
MLReleaseInteger32List(stdlink,int*a,int n);
//release the memory associated with a list of floating-point numbers
MLReleaseReal64List(stdlink,double*a,int n);

//get an array of integers of any depth
//get an array of floating-point numbers of any depth
//release memory associated with an integer array
MLReleaseInteger32Array(stdlink,int*a,int*dims,char**heads,int d);
//release memory associated with a floating-point array
MLReleaseReal32Array(stdlink,float*a,int*dims,char**heads,int d);
Manual ReturnType
                                   void bits(int i)
                                     int a[32], k;
:Function:     bits                    for(k=0; k<32; k++) {
:Pattern:      ToBits[i_Integer]         a[k] = i%2;
:Arguments:    {i}                       i >>= 1;
:ArgumentTypes:{Integer}                 if (i==0) break;
:ReturnType:   Manual                  }
                                       if (k<32) k++;

                                                          a, k);
General array
     int a[8][16][100];
     int dims[] = {8, 16, 100};

     MLPutInteger32Array(stdlink, a, dims, NULL, 3);

     int ***a;

     MLPutFunction(stdlink, "List", n1);
     for (i=0; i<n1; i++) {
       MLPutFunction(stdlink, "List", n2);
       for (j=0; j<n2; j++) {
         MLPutInteger32List(stdlink, a[i][j], n3);
Unknown length
 In[10]:=     {Sequence[1, Sequence[4, Sequence[]]]}
Out[10]=      {1, 4}

            MLPutFunction(stdlink, "List", 1);

            while( condition )
              /* generate an element */
              MLPutFunction(stdlink, "Sequence", 2);
              MLPutInteger32(stdlink, i );

            MLPutFunction(stdlink, "Sequence", 0);
Return Complex
// Complex data type
typedef float2 Complex;

Complex* h_convolved_signal;

// Return transformed signal to Mathematica as a Complex List
for (long i = 0; i < n; i++) {
Return Complex
 In[4]:=   list = Table[RandomReal[], {12}]
Out[4]=    {0.389421, 0.222396, 0.434636, 0.0886136, 0.233102, 0.941771,
           0.928712, 0.764119, 0.791473, 0.381426, 0.757661, 0.44273}

 In[5]:=   Map[Function[{x}, Apply[Complex, x]], Partition[list, 2]]

Out[5]=    {0.389421 + 0.222396 I, 0.434636 + 0.0886136 I, 0.233102 + 0.941771 I,
           0.928712 + 0.764119 I, 0.791473 + 0.381426 I, 0.757661 + 0.44273 I}

           // Return transformed signal to Mathematica as a Complex List
           MLPutFunction(stdlink, "Map", 2);
           MLPutFunction(stdlink, "Function", 2);
           MLPutFunction(stdlink, "List", 1);
           MLPutSymbol(stdlink, "x");
           MLPutFunction(stdlink, "Apply", 2);
           MLPutSymbol(stdlink, "Complex");
           MLPutSymbol(stdlink, "x");
           MLPutFunction(stdlink, "Partition", 2);
           MLPutFunction(stdlink, "Times", 2);
           MLPutReal(stdlink, norm);
           MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n);
           MLPutInteger(stdlink, 2);
Error & Interrupt
if(! MLPutInteger(stdlink, 10))              if(! MLPutReal64(stdlink, 3.22))
{                                            {
  /* check the possible errors */              /* unable to send 3.22 to lp */
  switch(MLError(stdlink))                     printf("MathLink Error: %s\n",
  {                                                   MLErrorMessage(stdlink));
    case MLEDEAD:                              MLClearError(stdlink);
      /* the link died unexpectedly */       }
    case MLECLOSED:
      /* the other side closed the link */
    case MLEOK:
      /* no error occurred */                while(len--)
      break;                                 {
    default:                                   sum += *list++;
    /* ... */                                  /* check for the abort */
  }                                            if(MLAbort) return (double)0;
}                                            }
Running on remote
           $ ./addtwo -linkcreate -linkprotocol TCPIP
           Link created on: 63166@,63167@

 In[5]:=   Install LinkConnect "63166,63167",
             LinkProtocol "TCPIP"

Out[5]=    LinkObject 63166,63167, 1110, 8

 In[6]:=   AddTwo 2, 3
Out[6]=    5
Mathematica + CUDA
#include <cutil_inline.h>

int main(int argc, char **argv)
    // use command-line specified CUDA device,
    // otherwise use device with highest Gflops/s
    if(cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
         cutilDeviceInit(argc, argv);
         cudaSetDevice( cutGetMaxGflopsDeviceId() );

    return MLMain(argc, argv);
# Add source files here
EXECUTABLE := cuFourier
# CUDA source files (compiled with cudacc)
CUFILES     :=
# CUDA dependency files
# CU_DEPS       :=
# C/C++ source files (compiled with gcc / c++)
# CCFILES       :=
# Additional libraries needed by the project
USECUFFT        := 1
# MathLink Template files
TMFILES     :=

# Rules and targets

include ../../common/
FindCUDA +
FindMathLink via CMake

 • CMake
 • FindCUDA

 • FindMathLink



double to float
#include <cutil_inline.h>
// General check for CUDA GPU SM Capabilities
//inline bool cutilDrvCudaCapabilities(int major_version, int minor_version);

char **heads;
int *dims;
int rank;
float *h_float;
double *h_double;

if (cutilDrvCudaCapabilities( 1,3 ))
     MLGetReal64Array(stdlink, &h_double, &dims, &heads, &rank);
     MLGetReal32Array(stdlink, &h_float, &dims, &heads, &rank);

• Follow the usual routine of sending data
  to the MathLink app

• Return result back to Mathematica
In[1]:=   ListLinePlot Abs Fourier RandomReal 1, 200   ^2




Out[1]= 0.15



                       50          100        150           200
Clone mathematica_cuda

$ git clone

$ cd mathematica_cuda/src

$ mkdir cuFourier

$ mate cuFourier

:Function:     cuFourier1D
:Pattern:      CUFourier1D[ a:{__?NumericQ} ]
:Arguments:    { a }
:ArgumentTypes:{ RealList }
:ReturnType:   Manual
// includes system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes cuda
#include <cufft.h>
#include <cutil_inline.h>

// includes mathlink
#include <mathlink.h>

// Complex data type
typedef float2 Complex;

// Showing the use of CUFFT for fast convolution using FFT.
extern "C" void cuFourier1D(double*, long);
// Main program
int main(int argc, char *argv[])
    // use command-line specified CUDA device, otherwise use device
    // with highest Gflops/s
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
         cutilDeviceInit(argc, argv);
         cudaSetDevice( cutGetMaxGflopsDeviceId() );

    return MLMain(argc, argv);
void cuFourier1D (double *h_A, long n)
    double norm = 1.0/sqrt((double) n);
    long mem_size = sizeof(Complex) * n;

   // Allocate host memory for the signal
   Complex* h_signal = (Complex*)malloc(mem_size);

   // Initialize the memory for the signal
   for (long i = 0; i < n; ++i) {
       h_signal[i].x = (float)h_A[i];
       h_signal[i].y = 0.0f;

   // Allocate device memory for signal
   Complex* d_signal;
   cutilSafeCall(cudaMalloc((void**)&d_signal, mem_size));
   // Copy host memory to device
   cutilSafeCall(cudaMemcpy(d_signal, h_signal, mem_size,
// CUFFT plan
cufftHandle plan;
cufftSafeCall(cufftPlan1d(&plan, n, CUFFT_C2C, 1));

// Transform signal
cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal,
                                 (cufftComplex *)d_signal,

// Copy device memory to host
Complex* h_convolved_signal = h_signal;
cutilSafeCall(cudaMemcpy(h_convolved_signal, d_signal,
                         mem_size, cudaMemcpyDeviceToHost));

// Release d_signal

// Destroy CUFFT context
// Return transformed signal to Mathematica as a Complex List
    MLPutFunction(stdlink, "Map", 2);
    MLPutFunction(stdlink, "Function", 2);
    MLPutFunction(stdlink, "List", 1);
    MLPutSymbol(stdlink, "x");
    MLPutFunction(stdlink, "Apply", 2);
    MLPutSymbol(stdlink, "Complex");
    MLPutSymbol(stdlink, "x");
    MLPutFunction(stdlink, "Partition", 2);
    MLPutFunction(stdlink, "Times", 2);
    MLPutReal(stdlink, norm);
    MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n);
    MLPutInteger(stdlink, 2);

    // Cleanup memory

# Build script for project

# Add source files here
EXECUTABLE := cuFourier
# CUDA source files (compiled with cudacc)
CUFILES     :=
# Additional libraries needed by the project
USECUFFT        := 1

# MathLink Template files
TMFILES     :=

# Rules and targets
include ../../common/
In[35]:=   link
              " Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin
                 release cuFourier"
Out[35]=    LinkObject
              Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin
               release cuFourier, 605, 9

 In[36]:=   LinkPatterns link
Out[36]=     CUFourier1D a : __ ?NumericQ

 In[37]:=   ListLinePlot Abs CUFourier1D RandomReal 1, 200    ^2





                         50          100        150          200

 In[38]:=   Uninstall link
Out[38]=     Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin
              release cuFourier
Image Deconvolution
  for Life Sciences

• Confocal and Widefield microscopy
  3D or 4D images

• Multichannel (3 or more channels)
• Comes in a wide variety of formats
Bio-Formats Java lib.

• Standalone Java library for reading and
  writing life science image formats

• Get both the pixels and metadata
• Licensed under GPL
Java + Mathematica:
Needs "JLink`"

 ' usr local Wolfram Mathematica 7.0 SystemFiles Java Linux x86 64 bin java' classpath
   " usr local Wolfram Mathematica 7.0 SystemFiles Links JLink JLink.jar"
    Xmx256m   Djava.system.class.loader com.wolfram.jlink.JLinkSystemClassLoader
    Djava.util.prefs.PreferencesFactory com.wolfram.jlink.DisabledPreferencesFactory
   com.wolfram.jlink.Install init " tmp m000001207601", 4, 4

ReinstallJava ClassPath   " home kashif Dropbox BioFormats Java loci_tools.jar"
 ' usr local Wolfram Mathematica 7.0 SystemFiles Java Linux x86 64 bin java' classpath
   " usr local Wolfram Mathematica 7.0 SystemFiles Links JLink JLink.jar"
    Xmx256m   Djava.system.class.loader com.wolfram.jlink.JLinkSystemClassLoader
    Djava.util.prefs.PreferencesFactory com.wolfram.jlink.DisabledPreferencesFactory
   com.wolfram.jlink.Install init " tmp m000002207601", 8, 4
Reading LIF images
reader    JavaNew "loci.formats.ImageReader"      LoadJavaClass "loci.formats.FormatTools"

« JavaObject loci.formats.ImageReader »           JavaClass loci.formats.FormatTools,

                                                  bpp   FormatTools`getBytesPerPixel pixelType
reader setId " media cdrom xyz 1ch by2 MT1.lif"
reader getSeriesCount
                                                  reader getSizeX
reader setSeries 0
                                                  reader getSizeY
sizeC    reader getSizeC                          512
                                                  reader getSizeZ
pixelType    reader getPixelType                  90


num     reader getImageCount
Reading pixel volume
 LoadJavaClass "loci.common.DataTools"
 JavaClass loci.common.DataTools,

      Table DataTools`makeDataArray
        reader openBytes z, 0, 0, reader getSizeX   , reader getSizeY   , bpp, False, True ,
        z, 0, reader getSizeZ    1    ;

unflatten e_, d__ ? IntegerQ               && Positive     &   :
 Fold Partition, e, Take d ,              1, 2, 1      ; Length e                Times d

array   unflatten volume, reader getSizeX                    , reader getSizeY         ,
    reader getSizeZ     ;
View a slice
Image array   165, All, All   255
Image deconvled
                  165, All, All
Wiener Deconv.
:Function:      wienerDeconvolve
:Pattern:       WienerDeconvolve[nx_Integer, ny_Integer, nz_Integer,
                                 epsilon_Real, sigma_Real, inImage:{___Real}]
:Arguments:     { nx, ny, nz, epsilon, sigma, inImage }
:ArgumentTypes: { Integer, Integer, Integer, Real, Real, Manual }
:ReturnType:    Manual

void wienerDeconvolve(int nx, int ny, int nz, double epsilon, double sigma)
    float *inImage;
    int length;

    if(! MLGetReal32List(stdlink, &inImage, &length))
amira Projection view
Export " home kashif Amira522 data deconv alphalobe MaxLike.raw",
  result, "Real32" ;
Remote Sensing
Landsat TM Data
Band 3 & Band 4
Reading Landsat Images
   In[4]:=   reader    JavaNew "loci.formats.ImageReader"
  Out[4]=    « JavaObject loci.formats.ImageReader »

   In[5]:=   reader    JavaNew "loci.formats.ChannelSeparator", reader
  Out[5]=    « JavaObject loci.formats.ChannelSeparator »

  In[35]:=   reader setId " Users sabman satellite_images multispectral bhtmref.tif"

   In[7]:=   reader getSeriesCount
  Out[7]=    1

   In[8]:=   sizeC    reader getSizeC
  Out[8]=    6

   In[9]:=   pixelType    reader getPixelType
  Out[9]=    1

  In[11]:=   num     reader getImageCount
 Out[11]=    6

  In[12]:=   pixelType    reader getPixelType
Loading Landsat data
  in Mathematica
    In[14]:=   LoadJavaClass "loci.formats.FormatTools"
   Out[14]=    JavaClass loci.formats.FormatTools,

    In[15]:=   bpp    FormatTools`getBytesPerPixel pixelType
   Out[15]=    1

    In[16]:=   reader getSizeX
   Out[16]=    512

    In[17]:=   isLittle   reader isLittleEndian
   Out[17]=    True

    In[18]:=   reader getSizeY
   Out[18]=    512

    In[19]:=   LoadJavaClass "loci.common.DataTools"
   Out[19]=    JavaClass loci.common.DataTools,
In[31]:=   red      DataTools`makeDataArray
                 reader openBytes 2, 0, 0, reader getSizeX   , reader getSizeY   , bpp, False, True ;

In[53]:=   Image Partition 100 Normalize red , reader getSizeX
In[56]:=   NIR      DataTools`makeDataArray
                 reader openBytes 3, 0, 0, reader getSizeX   , reader getSizeY   , bpp, False, True ;

In[57]:=   Image Partition 100 Normalize NIR , reader getSizeX
In[39]:=   link   Install " Users sabman mathematica_cuda bin darwin emurelease ndvi"
Out[39]=    LinkObject   Users sabman mathematica_cuda bin darwin emurelease ndvi, 41, 10

 In[40]:=   LinkPatterns link
Out[40]=    ndvi a_List, b_List

 In[41]:=   NDVI   ndvi Partition NIR, reader getSizeX    , Partition red, reader getSizeX   ;

 In[42]:=   Image Partition NDVI, reader getSizeX

:Function:        ndvi
:Pattern:         ndvi[ a_List, b_List ]
:Arguments:       { a, b }
:ArgumentTypes:   { Manual }
:ReturnType:      Manual
void ndvi(void)
    short int *h_A, *h_B;
    float *h_C_GPU;
    short int *d_A, *d_B;
    float *d_C;

    char **heads_A, **heads_B;
    int *dims_A, *dims_B;
    int rank_A, rank_B;

   if(! MLGetInteger16Array(stdlink, &h_A, &dims_A, &heads_A, &rank_A))

   if(! MLGetInteger16Array(stdlink, &h_B, &dims_B, &heads_B, &rank_B))
//Initializing data
h_C_GPU = (float *)malloc(dims_A[0]*dims_A[1]*sizeof(float));

//Allocating GPU memory
cutilSafeCall( cudaMalloc((void **)&d_A, dims_A[0]*dims_A[1]*sizeof(short int)) );
cutilSafeCall( cudaMalloc((void **)&d_B, dims_A[0]*dims_A[1]*sizeof(short int)) );
cutilSafeCall( cudaMalloc((void **)&d_C, dims_A[0]*dims_A[1]*sizeof(float)) );

//Copy data to GPU memory for further processing
cutilSafeCall( cudaMemcpy(d_A, h_A, dims_A[0]*dims_A[1]*sizeof(short int),
               cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_B, h_B, dims_A[0]*dims_A[1]*sizeof(short int),
               cudaMemcpyHostToDevice) );

cutilSafeCall( cudaThreadSynchronize() );

dim3 grid(ceil((float)dims_A[0]/(float)16.0f), ceil((float) dims_A[1]/32.0f), 1);
dim3 threads(ceil( dims_A[0]/(float)grid.x), ceil( dims_A[1]/(float)grid.y), 1);

ndviGPU<<<grid, threads>>>(d_C, d_A, d_B, dims_A[0], dims_A[1]);
cutilCheckMsg("ndviGPU() execution failed\n");
cutilSafeCall( cudaThreadSynchronize() );
//Release d_A and d_B
cutilSafeCall( cudaFree(d_B) );
cutilSafeCall( cudaFree(d_A) );

//Read back GPU results into h_C_GPU
cutilSafeCall( cudaMemcpy(h_C_GPU, d_C, dims_A[0]*dims_A[1]*sizeof(float),
               cudaMemcpyDeviceToHost) );

//Release d_C
cutilSafeCall( cudaFree(d_C) );

//Return result
MLPutReal32List(stdlink, h_C_GPU, dims_A[0]*dims_A[1]);

//Release h_A and h_B
MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A);
MLReleaseInteger16Array(stdlink, h_B, dims_B, heads_B, rank_B);

NDVI Kernel
// Calculate ndvi of two channels d_A and d_B on GPU and store result in d_C

__global__ void ndviGPU(
    float *d_C,
    short int *d_A,
    short int *d_B,
    int width,
    int height

    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

    if(xIndex < width && yIndex < height)
        unsigned int i = yIndex * (width) + xIndex;
        d_C[i] = __fdividef( (float)(d_A[i] - d_B[i]), (float)(d_A[i] + d_B[i]) );
NDVI output

                                 0                     1

In[64]:=   ArrayPlot Partition NDVI, reader getSizeX       , ColorFunction   "Rainbow"
         twitter krasul

