SlideShare uma empresa Scribd logo
1 de 67
Baixar para ler offline
Using CUDA
within
Mathematica

Kashif Rasul
and Raqibul Hassan

l a b s
Overview


• Intro to Mathematica and its API
• CUDA + Mathematica
• Some examples
Mathematica intro
• Mathematica is a modular
  computational system in which the
  kernel is separate from the front end
  which handles the interaction with the
  user.

• The most common way to work is to use
  interactive documents called notebooks
  which mix text input and output as well
  as graphics and other material.
Structure of
      Mathematica
• An important aspect of Mathematica is
  that it can also interact with other
  applications.

• This is achieved through MathLink,
  a standardised API for two-way
  communication with the kernel.
MathLink

• MathLink allows external programs both
  to call Mathematica, and to be called by
  Mathematica.

• We will use MathLink to let Mathematica
  call CUDA functions inside an external
  program.
Simple example
                  addtwo.tm
:Begin:
:Function:        addtwo
:Pattern:         AddTwo[i_Integer,j_Integer]
:Arguments:       { i, j }
:ArgumentTypes:   {Integer,Integer}
:ReturnType:      Integer
:End:
addtwo.c

#include <mathlink.h>

/* Return the sum of the two machine integers i and j. */
int addtwo(int i, int j)
{
  int sum = i + j;

  return sum;
}

/* Program entry point: forward the command-line arguments to MLMain and
   use its result as the process exit status. */
int main(int argc, char* argv[])
{
  int status = MLMain(argc, argv);

  return status;
}
mprep & gcc


$ mprep addtwo.tm -o addtwotm.c

$ gcc -I${INCDIR} addtwotm.c addtwo.c
  -L${LIBDIR} -lMLi3 -lstdc++ -o addtwo
In[3]:=   SetDirectory
            " Applications Mathematica.app SystemFiles Links MathLink DeveloperKit
              PrebuiltExamples"

Out[3]=     Applications Mathematica.app SystemFiles Links MathLink DeveloperKit
             PrebuiltExamples

 In[4]:=   link      Install ". addtwo"
Out[4]=    LinkObject
             Applications Mathematica.app SystemFiles Links MathLink DeveloperKit
              PrebuiltExamples addtwo, 524, 8

 In[5]:=   LinkPatterns link
Out[5]=     AddTwo i_Integer, j_Integer

 In[6]:=   ? AddTwo

           AddTwo x , y gives the sum of two machine integers x and y.


 In[7]:=   AddTwo 2, 3
Out[7]=    5

 In[8]:=   AddTwo 2^31         1, 1
Out[8]=        2 147 483 648

 In[9]:=   Uninstall link
Out[9]=     Applications Mathematica.app SystemFiles Links MathLink DeveloperKit
             PrebuiltExamples addtwo
MathLink
        Template file
• When a MathLink template file is
  processed, two basic things are done:
 •   :Pattern:& :Arguments: specifications
     are used to generate a Mathematica
     definition
 •   :Function:, :ArgumentTypes:
     & :ReturnType: specifications are used
     to generate C source code
:ArgumentTypes:

Mathematica specification   C specification

       Integer                  int
         Real                 double
     IntegerList            int*, long
       RealList            double*, long
        String                 char*
        Symbol                 char*
        Manual                 void
Handling
              Lists & Arrays
:Begin:                          int sumList(int *a, long alen)
:Function:     sumList           {
:Pattern:      SumList[a_List]      int i, tot=0;
:Arguments:    {a}
:ArgumentTypes:{IntegerList}         for(i=0; i<alen; i++)
:ReturnType:   Integer                  tot += a[i];
:End:
                                     return tot;
                                 }
Manual ArgumentTypes
                 :Begin:
                 :Function:     sumList
                 :Pattern:      SumList[a:{___Integer}]
                 :Arguments:    {a}
                 :ArgumentTypes:{Manual}
                 :ReturnType:   Integer
                 :End:

int sumList(void) {                       int sumList(void) {
  int n, i;                                 int n;
  int a[MAX];                               int *a;

  MLCheckFunction(stdlink, "List", &n);       MLGetInteger32List(stdlink, &a, &n);
                                              ...
  for (i=0; i<n; i++)                         MLReleaseInteger32List(stdlink, a, n);
    MLGetInteger32(stdlink, a+i);             ...
...                                       }
}
Array of arb. depth
#include <mathlink.h>

/* read an array of double-precision floating-point numbers from a link */
void f(MLINK lp)
{
    double *values;    /* flat element storage filled in by MLGetRealArray */
    int *extents;      /* dimensions of the array */
    char **headNames;  /* heads of the array */
    int depth;         /* stores the rank of the array */

    if (MLGetRealArray(lp, &values, &extents, &headNames, &depth))
    {
        /* ... */

        /* hand the array storage back to MathLink once we are done with it */
        MLReleaseRealArray(lp, values, extents, headNames, depth);
    }
    /* otherwise: unable to read the array from lp, nothing to release */
}
Handling Complex
           numbers
                    In[1]:=   Head 2         3
                   Out[1]=    Complex

If you pass a list of complex numbers to your external program,
then MLGetReal64Array() will create a two-dimensional array
containing a sequence of pairs of real and imaginary parts. In this
case, heads[0] will be "List" while heads[1] will be "Complex".

    //get an array of floating-point numbers of any depth
    MLGetReal64Array(stdlink,double**a,int**dims,char***heads,int*d);
Summary of API
//get a list of integers, allocating the memory needed to store it
MLGetInteger32List(stdlink,int**a,int*n);
//get a list of floating-point numbers
MLGetReal64List(stdlink,double**a,int*n);
//release the memory associated with a list of integers
MLReleaseInteger32List(stdlink,int*a,int n);
//release the memory associated with a list of floating-point numbers
MLReleaseReal64List(stdlink,double*a,int n);



//get an array of integers of any depth
MLGetInteger32Array(stdlink,int**a,int**dims,char***heads,int*d);
//get an array of floating-point numbers of any depth
MLGetReal32Array(stdlink,float**a,int**dims,char***heads,int*d);
//release memory associated with an integer array
MLReleaseInteger32Array(stdlink,int*a,int*dims,char**heads,int d);
//release memory associated with a floating-point array
MLReleaseReal32Array(stdlink,float*a,int*dims,char**heads,int d);
Manual ReturnType
                                   void bits(int i)
                                   {
                                     int a[32], k;
:Begin:
:Function:     bits                    for(k=0; k<32; k++) {
:Pattern:      ToBits[i_Integer]         a[k] = i%2;
:Arguments:    {i}                       i >>= 1;
:ArgumentTypes:{Integer}                 if (i==0) break;
:ReturnType:   Manual                  }
:End:
                                       if (k<32) k++;

                                       MLPutInteger32List(stdlink,
                                                          a, k);
                                       return;
                                   }
General array
     int a[8][16][100];
     int dims[] = {8, 16, 100};

     MLPutInteger32Array(stdlink, a, dims, NULL, 3);

or
     int ***a;

     MLPutFunction(stdlink, "List", n1);
     for (i=0; i<n1; i++) {
       MLPutFunction(stdlink, "List", n2);
       for (j=0; j<n2; j++) {
         MLPutInteger32List(stdlink, a[i][j], n3);
       }
     }
Unknown length
 In[10]:=     Sequence 1, Sequence 4, Sequence
Out[10]=      1, 4


            MLPutFunction(stdlink, "List", 1);

            while( condition )
            {
              /* generate an element */
              MLPutFunction(stdlink, "Sequence", 2);
              MLPutInteger32(stdlink, i );
            }

            MLPutFunction(stdlink, "Sequence", 0);
Return Complex
             numbers
// Complex data type
typedef float2 Complex;

Complex* h_convolved_signal;

// Return transformed signal to Mathematica as a Complex List
MLPutFunction(stdlink,"List",n);
for (long i = 0; i < n; i++) {
    MLPutFunction(stdlink,"Complex",2);
    MLPutFloat(stdlink,h_convolved_signal[i].x*norm);
    MLPutFloat(stdlink,h_convolved_signal[i].y*norm);
}
Return Complex
              numbers
 In[4]:=   list   Table RandomReal   , 12
Out[4]=    0.389421, 0.222396, 0.434636, 0.0886136, 0.233102, 0.941771,
           0.928712, 0.764119, 0.791473, 0.381426, 0.757661, 0.44273

 In[5]:=   Map Function   x , Apply Complex, x   , Partition list, 2


Out[5]=    0.389421   0.222396 , 0.434636   0.0886136 , 0.233102 0.941771 ,
           0.928712   0.764119 , 0.791473   0.381426 , 0.757661 0.44273

           // Return transformed signal to Mathematica as a Complex List
           MLPutFunction(stdlink, "Map", 2);
           MLPutFunction(stdlink, "Function", 2);
           MLPutFunction(stdlink, "List", 1);
           MLPutSymbol(stdlink, "x");
           MLPutFunction(stdlink, "Apply", 2);
           MLPutSymbol(stdlink, "Complex");
           MLPutSymbol(stdlink, "x");
           MLPutFunction(stdlink, "Partition", 2);
           MLPutFunction(stdlink, "Times", 2);
           MLPutReal(stdlink, norm);
           MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n);
           MLPutInteger(stdlink, 2);
Error & Interrupt
if(! MLPutInteger(stdlink, 10))              if(! MLPutReal64(stdlink, 3.22))
{                                            {
  /* check the possible errors */              /* unable to send 3.22 to lp */
  switch(MLError(stdlink))                     printf("MathLink Error: %s\n",
  {                                                   MLErrorMessage(stdlink));
    case MLEDEAD:                              MLClearError(stdlink);
      /* the link died unexpectedly */       }
      break;
    case MLECLOSED:
      /* the other side closed the link */
      break;
    case MLEOK:
      /* no error occurred */                while(len--)
      break;                                 {
    default:                                   sum += *list++;
    /* ... */                                  /* check for the abort */
  }                                            if(MLAbort) return (double)0;
}                                            }
Running on remote
                  computers
           $ ./addtwo -linkcreate -linkprotocol TCPIP
           Link created on: 63166@192.168.1.107,63167@192.168.1.107



 In[5]:=   Install LinkConnect "63166 192.168.1.107,63167 192.168.1.107",
             LinkProtocol "TCPIP"

Out[5]=    LinkObject 63166 192.168.1.107,63167 192.168.1.107, 1110, 8

 In[6]:=   AddTwo 2, 3
Out[6]=    5
Mathematica + CUDA
#include <cutil_inline.h>

// Program entry point: pick the CUDA device, then hand control to MLMain so
// Mathematica can call the functions installed by this program.
int main(int argc, char **argv)
{
    // use command-line specified CUDA device,
    // otherwise use device with highest Gflops/s
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
        cutilDeviceInit(argc, argv);
    } else {
        // Do not silently ignore a failed device selection: every later
        // CUDA call would then fail in a much less obvious place.
        cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) );
    }

    return MLMain(argc, argv);
}
mathematica_cuda
# Add source files here
EXECUTABLE := cuFourier
# CUDA source files (compiled with cudacc)
CUFILES     := cuFourier.cu
# CUDA dependency files
# CU_DEPS       :=
# C/C++ source files (compiled with gcc / c++)
# CCFILES       :=
# Additional libraries needed by the project
USECUFFT        := 1
# MathLink Template files
TMFILES     := cuFourier.tm

###################################################
# Rules and targets

include ../../common/common.mk
FindCUDA +
FindMathLink via CMake

 • CMake http://www.cmake.org/
 • FindCUDA https://gforge.sci.utah.edu/
   gf/project/findcuda/

 • FindMathLink http://github.com/kashif/
   FindMathLink/tree
CMakeLists.txt
set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
set(source_files test_bin.cu)
CUDA_COMPILE(CUDA_FILES test_bin.cu)

MathLink_ADD_TM(test.tm)

INCLUDE_DIRECTORIES(
  ${MathLink_INCLUDE_DIR}
  )
LINK_DIRECTORIES(
  ${MathLink_LIBRARY_DIR}
  )

ADD_EXECUTABLE(cuda_compile_example
   ${CUDA_FILES}
   ${source_files}
   test.tm.c
   main.cc
   external_dependency.h
   )
TARGET_LINK_LIBRARIES(cuda_compile_example
 ${MathLink_LIBRARIES}
 ${CUDA_LIBRARIES}
 )
double to float
                 conversion
#include <cutil_inline.h>
// General check for CUDA GPU SM Capabilities
//inline bool cutilDrvCudaCapabilities(int major_version, int minor_version);

char **heads;
int *dims;
int rank;
float *h_float;
double *h_double;

if (cutilDrvCudaCapabilities( 1,3 ))
{
     MLGetReal64Array(stdlink, &h_double, &dims, &heads, &rank);
}
else
{
     MLGetReal32Array(stdlink, &h_float, &dims, &heads, &rank);
}
CUBLAS & CUFFT

• Follow the usual routine of sending data
  to the MathLink app

• Use CUBLAS or CUFFT
• Return result back to Mathematica
cuFourier
In[1]:=   ListLinePlot Abs Fourier RandomReal 1, 200   ^2

          0.30


          0.25


          0.20


Out[1]= 0.15


          0.10


          0.05



                       50          100        150           200
Clone mathematica_cuda


$ git clone
  git://github.com/kashif/mathematica_cuda.git

$ cd mathematica_cuda/src

$ mkdir cuFourier

$ mate cuFourier
cuFourier.tm


:Begin:
:Function:     cuFourier1D
:Pattern:      CUFourier1D[ a:{__?NumericQ} ]
:Arguments:    { a }
:ArgumentTypes:{ RealList }
:ReturnType:   Manual
:End:
cuFourier.cu
// includes system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes cuda
#include <cufft.h>
#include <cutil_inline.h>

// includes mathlink
#include <mathlink.h>

// Complex data type
typedef float2 Complex;

///////////////////////////////////////////////////////////////
// Showing the use of CUFFT for fast convolution using FFT.
///////////////////////////////////////////////////////////////
extern "C" void cuFourier1D(double*, long);
////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
    // use command-line specified CUDA device, otherwise use device
    // with highest Gflops/s
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
        cutilDeviceInit(argc, argv);
    } else {
        // Check the result like every other CUDA call in this file, so a
        // failed device selection is reported here and not by a later call.
        cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) );
    }

    // Hand control to MathLink; its result is the process exit status.
    return MLMain(argc, argv);
}
////////////////////////////////////////////////////////////////////
// Transform a real-valued signal on the GPU with CUFFT and return the
// result to Mathematica as a list of Complex numbers.
//
//   h_A  host array holding the n real input samples
//   n    number of samples in h_A
//
// The samples are converted to single-precision complex values (imaginary
// part 0), transformed in place on the device with a C2C plan in the
// CUFFT_INVERSE direction, and scaled by 1/sqrt(n) on the Mathematica side
// (presumably to match the convention of Mathematica's Fourier - confirm).
//
// The result is written to stdlink by hand. If n is not positive or the
// host buffer cannot be allocated, the symbol $Failed is returned instead.
////////////////////////////////////////////////////////////////////
void cuFourier1D (double *h_A, long n)
{
    // Nothing to transform: avoid 1/sqrt(0) and a zero-sized CUFFT plan.
    if (n < 1) {
        MLPutSymbol(stdlink, "$Failed");
        return;
    }

    double norm = 1.0/sqrt((double) n);
    long mem_size = sizeof(Complex) * n;

    // Allocate host memory for the signal
    Complex* h_signal = (Complex*)malloc(mem_size);
    if (h_signal == NULL) {
        // Out of host memory: report failure instead of dereferencing NULL.
        MLPutSymbol(stdlink, "$Failed");
        return;
    }

    // Initialize the memory for the signal
    for (long i = 0; i < n; ++i) {
        h_signal[i].x = (float)h_A[i];
        h_signal[i].y = 0.0f;
    }

    // Allocate device memory for signal
    Complex* d_signal;
    cutilSafeCall(cudaMalloc((void**)&d_signal, mem_size));
    // Copy host memory to device
    cutilSafeCall(cudaMemcpy(d_signal, h_signal, mem_size,
                             cudaMemcpyHostToDevice));

    // CUFFT plan
    cufftHandle plan;
    cufftSafeCall(cufftPlan1d(&plan, n, CUFFT_C2C, 1));

    // Transform signal (in place: input and output are the same buffer)
    cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal,
                                     (cufftComplex *)d_signal,
                                     CUFFT_INVERSE));

    // Copy device memory to host, reusing the input buffer for the result
    Complex* h_convolved_signal = h_signal;
    cutilSafeCall(cudaMemcpy(h_convolved_signal, d_signal,
                             mem_size, cudaMemcpyDeviceToHost));

    // Release d_signal
    cutilSafeCall(cudaFree(d_signal));

    // Destroy CUFFT context
    cufftSafeCall(cufftDestroy(plan));

    // Return transformed signal to Mathematica as a Complex List. The
    // expression sent is
    //   Map[Function[{x}, Apply[Complex, x]],
    //       Partition[Times[norm, {re0, im0, re1, im1, ...}], 2]]
    // so the interleaved float buffer goes over the link as one flat list.
    MLPutFunction(stdlink, "Map", 2);
    MLPutFunction(stdlink, "Function", 2);
    MLPutFunction(stdlink, "List", 1);
    MLPutSymbol(stdlink, "x");
    MLPutFunction(stdlink, "Apply", 2);
    MLPutSymbol(stdlink, "Complex");
    MLPutSymbol(stdlink, "x");
    MLPutFunction(stdlink, "Partition", 2);
    MLPutFunction(stdlink, "Times", 2);
    MLPutReal(stdlink, norm);
    MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n);
    MLPutInteger(stdlink, 2);

    // Cleanup memory
    free(h_signal);

    cudaThreadExit();
}
Makefile
##################################################################
#
# Build script for project
#
##################################################################

# Add source files here
EXECUTABLE := cuFourier
# CUDA source files (compiled with cudacc)
CUFILES     := cuFourier.cu
# Additional libraries needed by the project
USECUFFT        := 1

# MathLink Template files
TMFILES     := cuFourier.tm

##################################################################
# Rules and targets
include ../../common/common.mk
In[35]:=   link
             Install
              " Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin
                 release cuFourier"
Out[35]=    LinkObject
              Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin
               release cuFourier, 605, 9

 In[36]:=   LinkPatterns link
Out[36]=     CUFourier1D a : __ ?NumericQ

 In[37]:=   ListLinePlot Abs CUFourier1D RandomReal 1, 200    ^2



            0.4



            0.3


Out[37]=
            0.2



            0.1




                         50          100        150          200


 In[38]:=   Uninstall link
Out[38]=     Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin
              release cuFourier
Image Deconvolution
  for Life Sciences

• Confocal and Widefield microscopy
  3D or 4D images

• Multichannel (3 or more channels)
• Comes in a wide variety of formats
Bio-Formats Java lib.

• Standalone Java library for reading and
  writing life science image formats

• Get both the pixels and metadata
• Licensed under GPL
• http://www.loci.wisc.edu/ome/
  formats.html
Java + Mathematica:
             J/Link
Needs "JLink`"

InstallJava
LinkObject
 ' usr local Wolfram Mathematica 7.0 SystemFiles Java Linux x86 64 bin java' classpath
   " usr local Wolfram Mathematica 7.0 SystemFiles Links JLink JLink.jar"
    Xmx256m   Djava.system.class.loader com.wolfram.jlink.JLinkSystemClassLoader
    Djava.util.prefs.PreferencesFactory com.wolfram.jlink.DisabledPreferencesFactory
   com.wolfram.jlink.Install init " tmp m000001207601", 4, 4

ReinstallJava ClassPath   " home kashif Dropbox BioFormats Java loci_tools.jar"
LinkObject
 ' usr local Wolfram Mathematica 7.0 SystemFiles Java Linux x86 64 bin java' classpath
   " usr local Wolfram Mathematica 7.0 SystemFiles Links JLink JLink.jar"
    Xmx256m   Djava.system.class.loader com.wolfram.jlink.JLinkSystemClassLoader
    Djava.util.prefs.PreferencesFactory com.wolfram.jlink.DisabledPreferencesFactory
   com.wolfram.jlink.Install init " tmp m000002207601", 8, 4
Reading LIF images
reader    JavaNew "loci.formats.ImageReader"      LoadJavaClass "loci.formats.FormatTools"

« JavaObject loci.formats.ImageReader »           JavaClass loci.formats.FormatTools,

                                                  bpp   FormatTools`getBytesPerPixel pixelType
reader setId " media cdrom xyz 1ch by2 MT1.lif"
                                                  1
reader getSeriesCount
                                                  reader getSizeX
7
                                                  512
reader setSeries 0
                                                  reader getSizeY
sizeC    reader getSizeC                          512
1
                                                  reader getSizeZ
pixelType    reader getPixelType                  90

1

num     reader getImageCount
90
Reading pixel volume
 LoadJavaClass "loci.common.DataTools"
 JavaClass loci.common.DataTools,

 volume
   Flatten
    N
      Table DataTools`makeDataArray
        reader openBytes z, 0, 0, reader getSizeX   , reader getSizeY   , bpp, False, True ,
        z, 0, reader getSizeZ    1    ;




unflatten e_, d__ ? IntegerQ               && Positive     &   :
 Fold Partition, e, Take d ,              1, 2, 1      ; Length e                Times d

array   unflatten volume, reader getSizeX                    , reader getSizeY         ,
    reader getSizeZ     ;
View a slice
Image array   165, All, All   255
Image deconvled
                   Result
                  165, All, All
Wiener Deconv.
:Begin:
:Function:      wienerDeconvolve
:Pattern:       WienerDeconvolve[nx_Integer, ny_Integer, nz_Integer,
                                 epsilon_Real, sigma_Real, inImage:{___Real}]
:Arguments:     { nx, ny, nz, epsilon, sigma, inImage }
:ArgumentTypes: { Integer, Integer, Integer, Real, Real, Manual }
:ReturnType:    Manual
:End:



void wienerDeconvolve(int nx, int ny, int nz, double epsilon, double sigma)
{
    float *inImage;
    int length;

    if(! MLGetReal32List(stdlink, &inImage, &length))
    {
        return;
    }
amira Projection view
     ®


http://www.amiravis.com
Export " home kashif Amira522 data deconv alphalobe MaxLike.raw",
  result, "Real32" ;
Remote Sensing
  application
Reflectance
Vegetation
Landsat TM Data
Band 3 & Band 4
NDVI = (NIR - R)/(NIR + R)
Reading Landsat Images
   In[4]:=   reader    JavaNew "loci.formats.ImageReader"
  Out[4]=    « JavaObject loci.formats.ImageReader »

   In[5]:=   reader    JavaNew "loci.formats.ChannelSeparator", reader
  Out[5]=    « JavaObject loci.formats.ChannelSeparator »

  In[35]:=   reader setId " Users sabman satellite_images multispectral bhtmref.tif"

   In[7]:=   reader getSeriesCount
  Out[7]=    1

   In[8]:=   sizeC    reader getSizeC
  Out[8]=    6

   In[9]:=   pixelType    reader getPixelType
  Out[9]=    1

  In[11]:=   num     reader getImageCount
 Out[11]=    6

  In[12]:=   pixelType    reader getPixelType
Loading Landsat data
  in Mathematica
    In[14]:=   LoadJavaClass "loci.formats.FormatTools"
   Out[14]=    JavaClass loci.formats.FormatTools,

    In[15]:=   bpp    FormatTools`getBytesPerPixel pixelType
   Out[15]=    1

    In[16]:=   reader getSizeX
   Out[16]=    512

    In[17]:=   isLittle   reader isLittleEndian
   Out[17]=    True

    In[18]:=   reader getSizeY
   Out[18]=    512

    In[19]:=   LoadJavaClass "loci.common.DataTools"
   Out[19]=    JavaClass loci.common.DataTools,
In[31]:=   red      DataTools`makeDataArray
                 reader openBytes 2, 0, 0, reader getSizeX   , reader getSizeY   , bpp, False, True ;

In[53]:=   Image Partition 100 Normalize red , reader getSizeX
In[56]:=   NIR      DataTools`makeDataArray
                 reader openBytes 3, 0, 0, reader getSizeX   , reader getSizeY   , bpp, False, True ;

In[57]:=   Image Partition 100 Normalize NIR , reader getSizeX
In[39]:=   link   Install " Users sabman mathematica_cuda bin darwin emurelease ndvi"
Out[39]=    LinkObject   Users sabman mathematica_cuda bin darwin emurelease ndvi, 41, 10

 In[40]:=   LinkPatterns link
Out[40]=    ndvi a_List, b_List

 In[41]:=   NDVI   ndvi Partition NIR, reader getSizeX    , Partition red, reader getSizeX   ;

 In[42]:=   Image Partition NDVI, reader getSizeX
ndvi.tm


:Begin:
:Function:        ndvi
:Pattern:         ndvi[ a_List, b_List ]
:Arguments:       { a, b }
:ArgumentTypes:   { Manual }
:ReturnType:      Manual
:End:
ndvi.cu
// Read two integer arrays A and B from stdlink, compute
// (A - B) / (A + B) element-wise on the GPU with ndviGPU, and return the
// result to Mathematica as a flat list of single-precision reals.
//
// Both arguments are read by hand (ArgumentTypes Manual). They must be
// rank-2 arrays with identical, non-zero dimensions; otherwise, or if the
// host result buffer cannot be allocated, the symbol $Failed is returned.
void ndvi(void)
{
    short int *h_A, *h_B;
    float *h_C_GPU;
    short int *d_A, *d_B;
    float *d_C;

    char **heads_A, **heads_B;
    int *dims_A, *dims_B;
    int rank_A, rank_B;

    if(! MLGetInteger16Array(stdlink, &h_A, &dims_A, &heads_A, &rank_A))
    {
        return;
    }

    if(! MLGetInteger16Array(stdlink, &h_B, &dims_B, &heads_B, &rank_B))
    {
        // The first array was already received; do not leak it.
        MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A);
        return;
    }

    // The copies and the kernel below size everything from dims_A, so B
    // must have exactly the same shape or h_B would be read out of bounds.
    int valid = (rank_A == 2 && rank_B == 2 &&
                 dims_A[0] > 0 && dims_A[1] > 0 &&
                 dims_A[0] == dims_B[0] && dims_A[1] == dims_B[1]);

    size_t count = valid ? (size_t)dims_A[0] * (size_t)dims_A[1] : 0;

    //Initializing data
    h_C_GPU = valid ? (float *)malloc(count * sizeof(float)) : NULL;

    if (h_C_GPU == NULL)
    {
        // Bad input or out of host memory: report failure to Mathematica.
        MLPutSymbol(stdlink, "$Failed");
        MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A);
        MLReleaseInteger16Array(stdlink, h_B, dims_B, heads_B, rank_B);
        return;
    }

    //Allocating GPU memory
    cutilSafeCall( cudaMalloc((void **)&d_A, count*sizeof(short int)) );
    cutilSafeCall( cudaMalloc((void **)&d_B, count*sizeof(short int)) );
    cutilSafeCall( cudaMalloc((void **)&d_C, count*sizeof(float)) );

    //Copy data to GPU memory for further processing
    cutilSafeCall( cudaMemcpy(d_A, h_A, count*sizeof(short int),
                   cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMemcpy(d_B, h_B, count*sizeof(short int),
                   cudaMemcpyHostToDevice) );

    cutilSafeCall( cudaThreadSynchronize() );

    // Launch configuration: at most 16 x 32 threads per block, with enough
    // blocks that grid * threads covers dims_A[0] x dims_A[1].
    dim3 grid(ceil((float)dims_A[0]/(float)16.0f), ceil((float) dims_A[1]/32.0f), 1);
    dim3 threads(ceil( dims_A[0]/(float)grid.x), ceil( dims_A[1]/(float)grid.y), 1);

    ndviGPU<<<grid, threads>>>(d_C, d_A, d_B, dims_A[0], dims_A[1]);
    cutilCheckMsg("ndviGPU() execution failed\n");
    cutilSafeCall( cudaThreadSynchronize() );

    //Release d_A and d_B
    cutilSafeCall( cudaFree(d_B) );
    cutilSafeCall( cudaFree(d_A) );

    //Read back GPU results into h_C_GPU
    cutilSafeCall( cudaMemcpy(h_C_GPU, d_C, count*sizeof(float),
                   cudaMemcpyDeviceToHost) );

    //Release d_C
    cutilSafeCall( cudaFree(d_C) );

    //Return result
    MLPutReal32List(stdlink, h_C_GPU, (int)count);

    //Release the host result buffer
    free(h_C_GPU);

    //Release h_A and h_B
    MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A);
    MLReleaseInteger16Array(stdlink, h_B, dims_B, heads_B, rank_B);

    cudaThreadExit();
}
NDVI Kernel
///////////////////////////////////////////////////////////////////////////////
// Calculate ndvi of two channels d_A and d_B on GPU and store result in d_C
//
//   d_C     output, one float per element
//   d_A     first input channel
//   d_B     second input channel
//   width   extent covered by the x index; also the stride between rows
//   height  extent covered by the y index
//
// Each thread handles the element at (x, y) derived from its block and
// thread indices and writes d_C[i] = (d_A[i] - d_B[i]) / (d_A[i] + d_B[i])
// with i = y * width + x. Threads outside width x height do nothing, so the
// launch only has to cover the data, not match it exactly.
//
// Where d_A[i] + d_B[i] is zero the ratio is undefined; 0.0f is stored
// instead of letting the division produce NaN/Inf.
///////////////////////////////////////////////////////////////////////////////

__global__ void ndviGPU(
    float *d_C,
    short int *d_A,
    short int *d_B,
    int width,
    int height
){

    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

    if(xIndex < width && yIndex < height)
    {
        unsigned int i = yIndex * (width) + xIndex;

        // Read each input once; the short operands promote to int, so the
        // sum and difference cannot overflow.
        int a = d_A[i];
        int b = d_B[i];
        int sum = a + b;

        d_C[i] = (sum != 0) ? __fdividef( (float)(a - b), (float)sum )
                            : 0.0f;
    }
}
NDVI output




                                 0                     1


In[64]:=   ArrayPlot Partition NDVI, reader getSizeX       , ColorFunction   "Rainbow"
Questions?

http://hpc.nomad-labs.com
    kashif@nomad-labs.com
         twitter krasul

Mais conteúdo relacionado

Mais procurados

Mais procurados (20)

Data structure lab manual
Data structure lab manualData structure lab manual
Data structure lab manual
 
C Programming - Refresher - Part II
C Programming - Refresher - Part II C Programming - Refresher - Part II
C Programming - Refresher - Part II
 
A taste of Functional Programming
A taste of Functional ProgrammingA taste of Functional Programming
A taste of Functional Programming
 
Scala is java8.next()
Scala is java8.next()Scala is java8.next()
Scala is java8.next()
 
Lecture11 standard template-library
Lecture11 standard template-libraryLecture11 standard template-library
Lecture11 standard template-library
 
Matlab Functions
Matlab FunctionsMatlab Functions
Matlab Functions
 
Single linked list
Single linked listSingle linked list
Single linked list
 
Lec 45.46- virtual.functions
Lec 45.46- virtual.functionsLec 45.46- virtual.functions
Lec 45.46- virtual.functions
 
Introduction to functional programming using Ocaml
Introduction to functional programming using OcamlIntroduction to functional programming using Ocaml
Introduction to functional programming using Ocaml
 
C++ Pointers
C++ PointersC++ Pointers
C++ Pointers
 
C++11 & C++14
C++11 & C++14C++11 & C++14
C++11 & C++14
 
07. Arrays
07. Arrays07. Arrays
07. Arrays
 
Arrays
ArraysArrays
Arrays
 
Functional Core and Imperative Shell - Game of Life Example - Haskell and Scala
Functional Core and Imperative Shell - Game of Life Example - Haskell and ScalaFunctional Core and Imperative Shell - Game of Life Example - Haskell and Scala
Functional Core and Imperative Shell - Game of Life Example - Haskell and Scala
 
L7 pointers
L7 pointersL7 pointers
L7 pointers
 
friends functionToshu
friends functionToshufriends functionToshu
friends functionToshu
 
Pointers [compatibility mode]
Pointers [compatibility mode]Pointers [compatibility mode]
Pointers [compatibility mode]
 
Unit 6 pointers
Unit 6   pointersUnit 6   pointers
Unit 6 pointers
 
13. Java text processing
13.  Java text processing13.  Java text processing
13. Java text processing
 
C tech questions
C tech questionsC tech questions
C tech questions
 

Semelhante a Using Cuda Within Mathematica

Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0Sheik Uduman Ali
 
Arrays and function basic c programming notes
Arrays and function basic c programming notesArrays and function basic c programming notes
Arrays and function basic c programming notesGOKULKANNANMMECLECTC
 
operating system ubuntu,linux,MacProgram will work only if you g.pdf
operating system ubuntu,linux,MacProgram will work only if you g.pdfoperating system ubuntu,linux,MacProgram will work only if you g.pdf
operating system ubuntu,linux,MacProgram will work only if you g.pdfaptcomputerzone
 
C++ Please test your program before you submit the answer.pdf
C++ Please test your program before you submit the answer.pdfC++ Please test your program before you submit the answer.pdf
C++ Please test your program before you submit the answer.pdfaashisha5
 
Best C++ Programming Homework Help
Best C++ Programming Homework HelpBest C++ Programming Homework Help
Best C++ Programming Homework HelpC++ Homework Help
 
Pointers in C Language
Pointers in C LanguagePointers in C Language
Pointers in C Languagemadan reddy
 
Operator overloading2
Operator overloading2Operator overloading2
Operator overloading2zindadili
 
C++ Background Circular Linked List A circular linked list.pdf
C++ Background Circular Linked List A circular linked list.pdfC++ Background Circular Linked List A circular linked list.pdf
C++ Background Circular Linked List A circular linked list.pdfsaradashata
 
An imperative study of c
An imperative study of cAn imperative study of c
An imperative study of cTushar B Kute
 
Background Circular Linked List A circular linked list is .pdf
Background Circular Linked List A circular linked list is .pdfBackground Circular Linked List A circular linked list is .pdf
Background Circular Linked List A circular linked list is .pdfaaseletronics2013
 

Semelhante a Using Cuda Within Mathematica (20)

Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0
 
Arrays and function basic c programming notes
Arrays and function basic c programming notesArrays and function basic c programming notes
Arrays and function basic c programming notes
 
Functional Programming
Functional ProgrammingFunctional Programming
Functional Programming
 
Pointer
PointerPointer
Pointer
 
The STL
The STLThe STL
The STL
 
operating system ubuntu,linux,MacProgram will work only if you g.pdf
operating system ubuntu,linux,MacProgram will work only if you g.pdfoperating system ubuntu,linux,MacProgram will work only if you g.pdf
operating system ubuntu,linux,MacProgram will work only if you g.pdf
 
C++ Please test your program before you submit the answer.pdf
C++ Please test your program before you submit the answer.pdfC++ Please test your program before you submit the answer.pdf
C++ Please test your program before you submit the answer.pdf
 
Best C++ Programming Homework Help
Best C++ Programming Homework HelpBest C++ Programming Homework Help
Best C++ Programming Homework Help
 
TechTalk - Dotnet
TechTalk - DotnetTechTalk - Dotnet
TechTalk - Dotnet
 
Pointers in C Language
Pointers in C LanguagePointers in C Language
Pointers in C Language
 
Codejunk Ignitesd
Codejunk IgnitesdCodejunk Ignitesd
Codejunk Ignitesd
 
Operator overloading2
Operator overloading2Operator overloading2
Operator overloading2
 
CPP Homework Help
CPP Homework HelpCPP Homework Help
CPP Homework Help
 
Copy on write
Copy on writeCopy on write
Copy on write
 
C++ Background Circular Linked List A circular linked list.pdf
C++ Background Circular Linked List A circular linked list.pdfC++ Background Circular Linked List A circular linked list.pdf
C++ Background Circular Linked List A circular linked list.pdf
 
An imperative study of c
An imperative study of cAn imperative study of c
An imperative study of c
 
Background Circular Linked List A circular linked list is .pdf
Background Circular Linked List A circular linked list is .pdfBackground Circular Linked List A circular linked list is .pdf
Background Circular Linked List A circular linked list is .pdf
 
C programming
C programmingC programming
C programming
 
Lecture5
Lecture5Lecture5
Lecture5
 
Lecture5
Lecture5Lecture5
Lecture5
 

Mais de Shoaib Burq

Async. and Realtime Geo Applications with Node.js
Async. and Realtime Geo Applications with Node.jsAsync. and Realtime Geo Applications with Node.js
Async. and Realtime Geo Applications with Node.jsShoaib Burq
 
Global Random Hacks of Kindness Berlin
Global Random Hacks of Kindness BerlinGlobal Random Hacks of Kindness Berlin
Global Random Hacks of Kindness BerlinShoaib Burq
 
OpenStreetMap & Walking-Papers Workflow
OpenStreetMap & Walking-Papers WorkflowOpenStreetMap & Walking-Papers Workflow
OpenStreetMap & Walking-Papers WorkflowShoaib Burq
 
Ian Batley's MAPS (Spatial@Gov 2009)
Ian Batley's MAPS (Spatial@Gov 2009)Ian Batley's MAPS (Spatial@Gov 2009)
Ian Batley's MAPS (Spatial@Gov 2009)Shoaib Burq
 
OpenStreetMap Response to Haiti earthquake
OpenStreetMap Response to Haiti earthquake OpenStreetMap Response to Haiti earthquake
OpenStreetMap Response to Haiti earthquake Shoaib Burq
 
Haiti Qake2010 Bar Camp Canberra2010
Haiti Qake2010 Bar Camp Canberra2010Haiti Qake2010 Bar Camp Canberra2010
Haiti Qake2010 Bar Camp Canberra2010Shoaib Burq
 
Opening of Geographic Data
Opening of Geographic DataOpening of Geographic Data
Opening of Geographic DataShoaib Burq
 
Mapping Multan and beyond with OSM
Mapping Multan and beyond with OSMMapping Multan and beyond with OSM
Mapping Multan and beyond with OSMShoaib Burq
 
Where20 2008 Ruby Tutorial
Where20 2008 Ruby TutorialWhere20 2008 Ruby Tutorial
Where20 2008 Ruby TutorialShoaib Burq
 
learning interoperability from web2.0
learning interoperability from web2.0learning interoperability from web2.0
learning interoperability from web2.0Shoaib Burq
 

Mais de Shoaib Burq (11)

Async. and Realtime Geo Applications with Node.js
Async. and Realtime Geo Applications with Node.jsAsync. and Realtime Geo Applications with Node.js
Async. and Realtime Geo Applications with Node.js
 
Global Random Hacks of Kindness Berlin
Global Random Hacks of Kindness BerlinGlobal Random Hacks of Kindness Berlin
Global Random Hacks of Kindness Berlin
 
OpenStreetMap & Walking-Papers Workflow
OpenStreetMap & Walking-Papers WorkflowOpenStreetMap & Walking-Papers Workflow
OpenStreetMap & Walking-Papers Workflow
 
Ian Batley's MAPS (Spatial@Gov 2009)
Ian Batley's MAPS (Spatial@Gov 2009)Ian Batley's MAPS (Spatial@Gov 2009)
Ian Batley's MAPS (Spatial@Gov 2009)
 
OpenStreetMap Response to Haiti earthquake
OpenStreetMap Response to Haiti earthquake OpenStreetMap Response to Haiti earthquake
OpenStreetMap Response to Haiti earthquake
 
Haiti Qake2010 Bar Camp Canberra2010
Haiti Qake2010 Bar Camp Canberra2010Haiti Qake2010 Bar Camp Canberra2010
Haiti Qake2010 Bar Camp Canberra2010
 
Opening of Geographic Data
Opening of Geographic DataOpening of Geographic Data
Opening of Geographic Data
 
Mapping Multan and beyond with OSM
Mapping Multan and beyond with OSMMapping Multan and beyond with OSM
Mapping Multan and beyond with OSM
 
Where20 2008 Ruby Tutorial
Where20 2008 Ruby TutorialWhere20 2008 Ruby Tutorial
Where20 2008 Ruby Tutorial
 
learning interoperability from web2.0
learning interoperability from web2.0learning interoperability from web2.0
learning interoperability from web2.0
 
Rails Gis Hacks
Rails Gis HacksRails Gis Hacks
Rails Gis Hacks
 

Último

Take control of your SAP testing with UiPath Test Suite
Take control of your SAP testing with UiPath Test SuiteTake control of your SAP testing with UiPath Test Suite
Take control of your SAP testing with UiPath Test SuiteDianaGray10
 
From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .Alan Dix
 
Powerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time ClashPowerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time Clashcharlottematthew16
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek SchlawackFwdays
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Scott Keck-Warren
 
Developer Data Modeling Mistakes: From Postgres to NoSQL
Developer Data Modeling Mistakes: From Postgres to NoSQLDeveloper Data Modeling Mistakes: From Postgres to NoSQL
Developer Data Modeling Mistakes: From Postgres to NoSQLScyllaDB
 
"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr BaganFwdays
 
Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Commit University
 
Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Mattias Andersson
 
Unraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfUnraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfAlex Barbosa Coqueiro
 
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdf
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdfHyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdf
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdfPrecisely
 
Vertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsVertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsMiki Katsuragi
 
SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024Lorenzo Miniero
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxNavinnSomaal
 
The Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsThe Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsPixlogix Infotech
 
DSPy a system for AI to Write Prompts and Do Fine Tuning
DSPy a system for AI to Write Prompts and Do Fine TuningDSPy a system for AI to Write Prompts and Do Fine Tuning
DSPy a system for AI to Write Prompts and Do Fine TuningLars Bell
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebUiPathCommunity
 
How to write a Business Continuity Plan
How to write a Business Continuity PlanHow to write a Business Continuity Plan
How to write a Business Continuity PlanDatabarracks
 

Último (20)

Take control of your SAP testing with UiPath Test Suite
Take control of your SAP testing with UiPath Test SuiteTake control of your SAP testing with UiPath Test Suite
Take control of your SAP testing with UiPath Test Suite
 
From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .
 
Powerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time ClashPowerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time Clash
 
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptxE-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
 
DMCC Future of Trade Web3 - Special Edition
DMCC Future of Trade Web3 - Special EditionDMCC Future of Trade Web3 - Special Edition
DMCC Future of Trade Web3 - Special Edition
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024
 
Developer Data Modeling Mistakes: From Postgres to NoSQL
Developer Data Modeling Mistakes: From Postgres to NoSQLDeveloper Data Modeling Mistakes: From Postgres to NoSQL
Developer Data Modeling Mistakes: From Postgres to NoSQL
 
"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan
 
Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!
 
Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?
 
Unraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfUnraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdf
 
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdf
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdfHyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdf
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdf
 
Vertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsVertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering Tips
 
SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptx
 
The Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsThe Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and Cons
 
DSPy a system for AI to Write Prompts and Do Fine Tuning
DSPy a system for AI to Write Prompts and Do Fine TuningDSPy a system for AI to Write Prompts and Do Fine Tuning
DSPy a system for AI to Write Prompts and Do Fine Tuning
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio Web
 
How to write a Business Continuity Plan
How to write a Business Continuity PlanHow to write a Business Continuity Plan
How to write a Business Continuity Plan
 

Using Cuda Within Mathematica

  • 2. Overview • Intro to Mathematica and its API • CUDA + Mathematica • Some examples
  • 3. Mathematica intro • Mathematica is a modular computational system in which the kernel is separate from the front end which handles the interaction with the user. • The most common way to work is to use interactive documents called notebooks which mix text input and output as well as graphics and other material.
  • 4. Structure of Mathematica • An important aspect of Mathematica is that it can also interact with other applications. • This is achieved through MathLink, a standardised API for two-way communication with the kernel.
  • 5. MathLink • MathLink allows external programs both to call Mathematica, and to be called by Mathematica. • We will use MathLink to let Mathematica call CUDA functions inside an external program.
  • 6. Simple example addtwo.tm :Begin: :Function: addtwo :Pattern: AddTwo[i_Integer,j_Integer] :Arguments: { i, j } :ArgumentTypes: {Integer,Integer} :ReturnType: Integer :End:
  • 7. addtwo.c #include <mathlink.h> int addtwo( int i, int j) { return i+j; } int main(int argc, char* argv[]) { return MLMain(argc, argv); }
  • 8. mprep & gcc $ mprep addtwo.tm -o addtwotm.c $ gcc -I${INCDIR} addtwotm.c addtwo.c -L${LIBDIR} -lMLi3 -lstdc++ -o addtwo
  • 9. In[3]:= SetDirectory " Applications Mathematica.app SystemFiles Links MathLink DeveloperKit PrebuiltExamples" Out[3]= Applications Mathematica.app SystemFiles Links MathLink DeveloperKit PrebuiltExamples In[4]:= link Install ". addtwo" Out[4]= LinkObject Applications Mathematica.app SystemFiles Links MathLink DeveloperKit PrebuiltExamples addtwo, 524, 8 In[5]:= LinkPatterns link Out[5]= AddTwo i_Integer, j_Integer In[6]:= ? AddTwo AddTwo x , y gives the sum of two machine integers x and y. In[7]:= AddTwo 2, 3 Out[7]= 5 In[8]:= AddTwo 2^31 1, 1 Out[8]= 2 147 483 648 In[9]:= Uninstall link Out[9]= Applications Mathematica.app SystemFiles Links MathLink DeveloperKit PrebuiltExamples addtwo
  • 10. MathLink Template file • When a MathLink template file is processed, two basic things are done: • :Pattern: & :Arguments: specifications are used to generate a Mathematica definition • :Function:, :ArgumentTypes: & :ReturnType: specifications are used to generate C source code
  • 11. :ArgumentTypes: Mathematica specification C specification Integer int Real double IntegerList int*, long RealList double*, long String char* Symbol char* Manual void
  • 12. Handling Lists & Arrays :Begin: int sumList(int *a, long alen) :Function: sumList { :Pattern: SumList[a_List] int i, tot=0; :Arguments: {a} :ArgumentTypes:{IntegerList} for(i=0; i<alen; i++) :ReturnType: Integer tot += a[i]; :End: return tot; }
  • 13. Manual ArgumentTypes :Begin: :Function: sumList :Pattern: SumList[a:{___Integer}] :Arguments: {a} :ArgumentTypes:{Manual} :ReturnType: Integer :End: int sumList(void) { int sumList(void) { int n, i; int n; int a[MAX]; int *a; MLCheckFunction(stdlink, "List", &n); MLGetInteger32List(stdlink, &a, &n); ... for (i=0; i<n; i++) MLReleaseInteger32List(stdlink, a, n); MLGetInteger32(stdlink, a+i); ... ... } }
  • 14. Array of arb. depth #include <mathlink.h> /* read an array of double-precision floating-point numbers from a link */ void f(MLINK lp) { double *data; int *dims; char **heads; int d; /* stores the rank of the array */ if(! MLGetRealArray(lp, &data, &dims, &heads, &d)) { /* unable to read the array from lp */ return; } /* ... */ MLReleaseRealArray(lp, data, dims, heads, d); }
  • 15. Handling Complex numbers In[1]:= Head 2 3 Out[1]= Complex If you pass a list of complex numbers to your external program, then MLGetReal64Array() will create a two-dimensional array containing a sequence of pairs of real and imaginary parts. In this case, heads[0] will be "List" while heads[1] will be "Complex". //get an array of floating-point numbers of any depth MLGetReal64Array(stdlink,double**a,int**dims,char***heads,int*d);
  • 16. Summary of API //get a list of integers, allocating the memory needed to store it MLGetInteger32List(stdlink,int**a,int*n); //get a list of floating-point numbers MLGetReal64List(stdlink,double**a,int*n); //release the memory associated with a list of integers MLReleaseInteger32List(stdlink,int*a,int n); //release the memory associated with a list of floating-point numbers MLReleaseReal64List(stdlink,double*a,int n); //get an array of integers of any depth MLGetInteger32Array(stdlink,int**a,int**dims,char***heads,int*d); //get an array of floating-point numbers of any depth MLGetReal32Array(stdlink,float**a,int**dims,char***heads,int*d); //release memory associated with an integer array MLReleaseInteger32Array(stdlink,int*a,int*dims,char**heads,int d); //release memory associated with a floating-point array MLReleaseReal32Array(stdlink,float*a,int*dims,char**heads,int d);
  • 17. Manual ReturnType void bits(int i) { int a[32], k; :Begin: :Function: bits for(k=0; k<32; k++) { :Pattern: ToBits[i_Integer] a[k] = i%2; :Arguments: {i} i >>= 1; :ArgumentTypes:{Integer} if (i==0) break; :ReturnType: Manual } :End: if (k<32) k++; MLPutInteger32List(stdlink, a, k); return; }
  • 18. General array int a[8][16][100]; int dims[] = {8, 16, 100}; MLPutInteger32Array(stdlink, a, dims, NULL, 3); or int ***a; MLPutFunction(stdlink, "List", n1); for (i=0; i<n1; i++) { MLPutFunction(stdlink, "List", n2); for (j=0; j<n2; j++) { MLPutInteger32List(stdlink, a[i][j], n3); } }
  • 19. Unknown length In[10]:= Sequence 1, Sequence 4, Sequence Out[10]= 1, 4 MLPutFunction(stdlink, "List", 1); while( condition ) { /* generate an element */ MLPutFunction(stdlink, "Sequence", 2); MLPutInteger32(stdlink, i ); } MLPutFunction(stdlink, "Sequence", 0);
  • 20. Return Complex numbers // Complex data type typedef float2 Complex; Complex* h_convolved_signal; // Return transformed signal to Mathematica as a Complex List MLPutFunction(stdlink,"List",n); for (long i = 0; i < n; i++) { MLPutFunction(stdlink,"Complex",2); MLPutFloat(stdlink,h_convolved_signal[i].x*norm); MLPutFloat(stdlink,h_convolved_signal[i].y*norm); }
  • 21. Return Complex numbers In[4]:= list Table RandomReal , 12 Out[4]= 0.389421, 0.222396, 0.434636, 0.0886136, 0.233102, 0.941771, 0.928712, 0.764119, 0.791473, 0.381426, 0.757661, 0.44273 In[5]:= Map Function x , Apply Complex, x , Partition list, 2 Out[5]= 0.389421 0.222396 , 0.434636 0.0886136 , 0.233102 0.941771 , 0.928712 0.764119 , 0.791473 0.381426 , 0.757661 0.44273 // Return transformed signal to Mathematica as a Complex List MLPutFunction(stdlink, "Map", 2); MLPutFunction(stdlink, "Function", 2); MLPutFunction(stdlink, "List", 1); MLPutSymbol(stdlink, "x"); MLPutFunction(stdlink, "Apply", 2); MLPutSymbol(stdlink, "Complex"); MLPutSymbol(stdlink, "x"); MLPutFunction(stdlink, "Partition", 2); MLPutFunction(stdlink, "Times", 2); MLPutReal(stdlink, norm); MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n); MLPutInteger(stdlink, 2);
  • 22. Error & Interrupt if(! MLPutInteger(stdlink, 10)) if(! MLPutReal64(stdlink, 3.22)) { { /* check the possible errors */ /* unable to send 3.22 to lp */ switch(MLError(stdlink)) printf("MathLink Error: %s\n", { MLErrorMessage(stdlink)); case MLEDEAD: MLClearError(stdlink); /* the link died unexpectedly */ } break; case MLECLOSED: /* the other side closed the link */ break; case MLEOK: /* no error occurred */ while(len--) break; { default: sum += *list++; /* ... */ /* check for the abort */ } if(MLAbort) return (double)0; } }
  • 23. Running on remote computers $ ./addtwo -linkcreate -linkprotocol TCPIP Link created on: 63166@192.168.1.107,63167@192.168.1.107 In[5]:= Install LinkConnect "63166 192.168.1.107,63167 192.168.1.107", LinkProtocol "TCPIP" Out[5]= LinkObject 63166 192.168.1.107,63167 192.168.1.107, 1110, 8 In[6]:= AddTwo 2, 3 Out[6]= 5
  • 24. Mathematica + CUDA #include <cutil_inline.h> int main(int argc, char **argv) { // use command-line specified CUDA device, // otherwise use device with highest Gflops/s if(cutCheckCmdLineFlag(argc, (const char**)argv, "device")) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); return MLMain(argc, argv); }
  • 25. mathematica_cuda # Add source files here EXECUTABLE := cuFourier # CUDA source files (compiled with cudacc) CUFILES := cuFourier.cu # CUDA dependency files # CU_DEPS := # C/C++ source files (compiled with gcc / c++) # CCFILES := # Additional libraries needed by the project USECUFFT := 1 # MathLink Template files TMFILES := cuFourier.tm ################################################### # Rules and targets include ../../common/common.mk
  • 26. FindCUDA + FindMathLink via CMake • CMake http://www.cmake.org/ • FindCUDA https://gforge.sci.utah.edu/ gf/project/findcuda/ • FindMathLink http://github.com/kashif/ FindMathLink/tree
  • 27. CMakeLists.txt set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) set(source_files test_bin.cu) CUDA_COMPILE(CUDA_FILES test_bin.cu) MathLink_ADD_TM(test.tm) INCLUDE_DIRECTORIES( ${MathLink_INCLUDE_DIR} ) LINK_DIRECTORIES( ${MathLink_LIBRARY_DIR} ) ADD_EXECUTABLE(cuda_compile_example ${CUDA_FILES} ${source_files} test.tm.c main.cc external_dependency.h ) TARGET_LINK_LIBRARIES(cuda_compile_example ${MathLink_LIBRARIES} ${CUDA_LIBRARIES} )
  • 28. double to float conversion #include <cutil_inline.h> // General check for CUDA GPU SM Capabilities //inline bool cutilDrvCudaCapabilities(int major_version, int minor_version); char **heads; int *dims; int rank; float *h_float; double *h_double; if (cutilDrvCudaCapabilities( 1,3 )) { MLGetReal64Array(stdlink, &h_double, &dims, &heads, &rank); } else { MLGetReal32Array(stdlink, &h_float, &dims, &heads, &rank); }
  • 29. CUBLAS & CUFFT • Follow the usual routine of sending data to the MathLink app • Use CUBLAS or CUFFT • Return result back to Mathematica
  • 30. cuFourier In[1]:= ListLinePlot Abs Fourier RandomReal 1, 200 ^2 0.30 0.25 0.20 Out[1]= 0.15 0.10 0.05 50 100 150 200
  • 31. Clone mathematica_cuda $ git clone git://github.com/kashif/mathematica_cuda.git $ cd mathematica_cuda/src $ mkdir cuFourier $ mate cuFourier
  • 32. cuFourier.tm :Begin: :Function: cuFourier1D :Pattern: CUFourier1D[ a:{__?NumericQ} ] :Arguments: { a } :ArgumentTypes:{ RealList } :ReturnType: Manual :End:
  • 33. cuFourier.cu // includes system #include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> // includes cuda #include <cufft.h> #include <cutil_inline.h> // includes mathlink #include <mathlink.h> // Complex data type typedef float2 Complex; /////////////////////////////////////////////////////////////// // Showing the use of CUFFT for fast convolution using FFT. /////////////////////////////////////////////////////////////// extern "C" void cuFourier1D(double*, long);
  • 34. //////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { // use command-line specified CUDA device, otherwise use device // with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); return MLMain(argc, argv); }
  • 35. void cuFourier1D (double *h_A, long n) { double norm = 1.0/sqrt((double) n); long mem_size = sizeof(Complex) * n; // Allocate host memory for the signal Complex* h_signal = (Complex*)malloc(mem_size); // Initialize the memory for the signal for (long i = 0; i < n; ++i) { h_signal[i].x = (float)h_A[i]; h_signal[i].y = 0.0f; } // Allocate device memory for signal Complex* d_signal; cutilSafeCall(cudaMalloc((void**)&d_signal, mem_size)); // Copy host memory to device cutilSafeCall(cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice));
  • 36. // CUFFT plan cufftHandle plan; cufftSafeCall(cufftPlan1d(&plan, n, CUFFT_C2C, 1)); // Transform signal cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE)); // Copy device memory to host Complex* h_convolved_signal = h_signal; cutilSafeCall(cudaMemcpy(h_convolved_signal, d_signal, mem_size, cudaMemcpyDeviceToHost)); // Release d_signal cutilSafeCall(cudaFree(d_signal)); // Destroy CUFFT context cufftSafeCall(cufftDestroy(plan));
  • 37. // Return transformed signal to Mathematica as a Complex List MLPutFunction(stdlink, "Map", 2); MLPutFunction(stdlink, "Function", 2); MLPutFunction(stdlink, "List", 1); MLPutSymbol(stdlink, "x"); MLPutFunction(stdlink, "Apply", 2); MLPutSymbol(stdlink, "Complex"); MLPutSymbol(stdlink, "x"); MLPutFunction(stdlink, "Partition", 2); MLPutFunction(stdlink, "Times", 2); MLPutReal(stdlink, norm); MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n); MLPutInteger(stdlink, 2); // Cleanup memory free(h_signal); cudaThreadExit(); }
  • 38. Makefile ################################################################## # # Build script for project # ################################################################## # Add source files here EXECUTABLE := cuFourier # CUDA source files (compiled with cudacc) CUFILES := cuFourier.cu # Additional libraries needed by the project USECUFFT := 1 # MathLink Template files TMFILES := cuFourier.tm ################################################################## # Rules and targets include ../../common/common.mk
  • 39. In[35]:= link Install " Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin release cuFourier" Out[35]= LinkObject Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin release cuFourier, 605, 9 In[36]:= LinkPatterns link Out[36]= CUFourier1D a : __ ?NumericQ In[37]:= ListLinePlot Abs CUFourier1D RandomReal 1, 200 ^2 0.4 0.3 Out[37]= 0.2 0.1 50 100 150 200 In[38]:= Uninstall link Out[38]= Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin release cuFourier
  • 40. Image Deconvolution for Life Sciences • Confocal and Widefield microscopy 3D or 4D images • Multichannel (3 or more channels) • Comes in a wide variety of formats
  • 41. Bio-Formats Java lib. • Standalone Java library for reading and writing life science image formats • Get both the pixels and metadata • Licensed under GPL • http://www.loci.wisc.edu/ome/ formats.html
  • 42. Java + Mathematica: J/Link Needs "JLink`" InstallJava LinkObject ' usr local Wolfram Mathematica 7.0 SystemFiles Java Linux x86 64 bin java' classpath " usr local Wolfram Mathematica 7.0 SystemFiles Links JLink JLink.jar" Xmx256m Djava.system.class.loader com.wolfram.jlink.JLinkSystemClassLoader Djava.util.prefs.PreferencesFactory com.wolfram.jlink.DisabledPreferencesFactory com.wolfram.jlink.Install init " tmp m000001207601", 4, 4 ReinstallJava ClassPath " home kashif Dropbox BioFormats Java loci_tools.jar" LinkObject ' usr local Wolfram Mathematica 7.0 SystemFiles Java Linux x86 64 bin java' classpath " usr local Wolfram Mathematica 7.0 SystemFiles Links JLink JLink.jar" Xmx256m Djava.system.class.loader com.wolfram.jlink.JLinkSystemClassLoader Djava.util.prefs.PreferencesFactory com.wolfram.jlink.DisabledPreferencesFactory com.wolfram.jlink.Install init " tmp m000002207601", 8, 4
  • 43. Reading LIF images reader JavaNew "loci.formats.ImageReader" LoadJavaClass "loci.formats.FormatTools" « JavaObject loci.formats.ImageReader » JavaClass loci.formats.FormatTools, bpp FormatTools`getBytesPerPixel pixelType reader setId " media cdrom xyz 1ch by2 MT1.lif" 1 reader getSeriesCount reader getSizeX 7 512 reader setSeries 0 reader getSizeY sizeC reader getSizeC 512 1 reader getSizeZ pixelType reader getPixelType 90 1 num reader getImageCount 90
  • 44. Reading pixel volume LoadJavaClass "loci.common.DataTools" JavaClass loci.common.DataTools, volume Flatten N Table DataTools`makeDataArray reader openBytes z, 0, 0, reader getSizeX , reader getSizeY , bpp, False, True , z, 0, reader getSizeZ 1 ; unflatten e_, d__ ? IntegerQ && Positive & : Fold Partition, e, Take d , 1, 2, 1 ; Length e Times d array unflatten volume, reader getSizeX , reader getSizeY , reader getSizeZ ;
  • 45. View a slice Image array 165, All, All 255
  • 46. Image deconvled Result 165, All, All
  • 47. Wiener Deconv. :Begin: :Function: wienerDeconvolve :Pattern: WienerDeconvolve[nx_Integer, ny_Integer, nz_Integer, epsilon_Real, sigma_Real, inImage:{___Real}] :Arguments: { nx, ny, nz, epsilon, sigma, inImage } :ArgumentTypes: { Integer, Integer, Integer, Real, Real, Manual } :ReturnType: Manual :End: void wienerDeconvolve(int nx, int ny, int nz, double epsilon, double sigma) { float *inImage; int length; if(! MLGetReal32List(stdlink, &inImage, &length)) { return; }
  • 48. amira Projection view ® http://www.amiravis.com
  • 49. Export " home kashif Amira522 data deconv alphalobe MaxLike.raw", result, "Real32" ;
  • 50. Remote Sensing application
  • 54. Band 3 & Band 4
  • 56. Reading Landsat Images In[4]:= reader JavaNew "loci.formats.ImageReader" Out[4]= « JavaObject loci.formats.ImageReader » In[5]:= reader JavaNew "loci.formats.ChannelSeparator", reader Out[5]= « JavaObject loci.formats.ChannelSeparator » In[35]:= reader setId " Users sabman satellite_images multispectral bhtmref.tif" In[7]:= reader getSeriesCount Out[7]= 1 In[8]:= sizeC reader getSizeC Out[8]= 6 In[9]:= pixelType reader getPixelType Out[9]= 1 In[11]:= num reader getImageCount Out[11]= 6 In[12]:= pixelType reader getPixelType
  • 57. Loading Landsat data in Mathematica In[14]:= LoadJavaClass "loci.formats.FormatTools" Out[14]= JavaClass loci.formats.FormatTools, In[15]:= bpp FormatTools`getBytesPerPixel pixelType Out[15]= 1 In[16]:= reader getSizeX Out[16]= 512 In[17]:= isLittle reader isLittleEndian Out[17]= True In[18]:= reader getSizeY Out[18]= 512 In[19]:= LoadJavaClass "loci.common.DataTools" Out[19]= JavaClass loci.common.DataTools,
  • 58. In[31]:= red DataTools`makeDataArray reader openBytes 2, 0, 0, reader getSizeX , reader getSizeY , bpp, False, True ; In[53]:= Image Partition 100 Normalize red , reader getSizeX
  • 59. In[56]:= NIR DataTools`makeDataArray reader openBytes 3, 0, 0, reader getSizeX , reader getSizeY , bpp, False, True ; In[57]:= Image Partition 100 Normalize NIR , reader getSizeX
  • 60. In[39]:= link Install " Users sabman mathematica_cuda bin darwin emurelease ndvi" Out[39]= LinkObject Users sabman mathematica_cuda bin darwin emurelease ndvi, 41, 10 In[40]:= LinkPatterns link Out[40]= ndvi a_List, b_List In[41]:= NDVI ndvi Partition NIR, reader getSizeX , Partition red, reader getSizeX ; In[42]:= Image Partition NDVI, reader getSizeX
  • 61. ndvi.tm :Begin: :Function: ndvi :Pattern: ndvi[ a_List, b_List ] :Arguments: { a, b } :ArgumentTypes: { Manual } :ReturnType: Manual :End:
  • 62. ndvi.cu void ndvi(void) { short int *h_A, *h_B; float *h_C_GPU; short int *d_A, *d_B; float *d_C; char **heads_A, **heads_B; int *dims_A, *dims_B; int rank_A, rank_B; if(! MLGetInteger16Array(stdlink, &h_A, &dims_A, &heads_A, &rank_A)) { return; } if(! MLGetInteger16Array(stdlink, &h_B, &dims_B, &heads_B, &rank_B)) { return; }
  • 63. //Initializing data h_C_GPU = (float *)malloc(dims_A[0]*dims_A[1]*sizeof(float)); //Allocating GPU memory cutilSafeCall( cudaMalloc((void **)&d_A, dims_A[0]*dims_A[1]*sizeof(short int)) ); cutilSafeCall( cudaMalloc((void **)&d_B, dims_A[0]*dims_A[1]*sizeof(short int)) ); cutilSafeCall( cudaMalloc((void **)&d_C, dims_A[0]*dims_A[1]*sizeof(float)) ); //Copy data to GPU memory for further processing cutilSafeCall( cudaMemcpy(d_A, h_A, dims_A[0]*dims_A[1]*sizeof(short int), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaMemcpy(d_B, h_B, dims_A[0]*dims_A[1]*sizeof(short int), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaThreadSynchronize() ); dim3 grid(ceil((float)dims_A[0]/(float)16.0f), ceil((float) dims_A[1]/32.0f), 1); dim3 threads(ceil( dims_A[0]/(float)grid.x), ceil( dims_A[1]/(float)grid.y), 1); ndviGPU<<<grid, threads>>>(d_C, d_A, d_B, dims_A[0], dims_A[1]); cutilCheckMsg("ndviGPU() execution failed\n"); cutilSafeCall( cudaThreadSynchronize() );
  • 64. //Release d_A and d_B cutilSafeCall( cudaFree(d_B) ); cutilSafeCall( cudaFree(d_A) ); //Read back GPU results into h_C_GPU cutilSafeCall( cudaMemcpy(h_C_GPU, d_C, dims_A[0]*dims_A[1]*sizeof(float), cudaMemcpyDeviceToHost) ); //Release d_C cutilSafeCall( cudaFree(d_C) ); //Return result MLPutReal32List(stdlink, h_C_GPU, dims_A[0]*dims_A[1]); //Release h_A and h_B MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A); MLReleaseInteger16Array(stdlink, h_B, dims_B, heads_B, rank_B); cudaThreadExit();
  • 65. NDVI Kernel /////////////////////////////////////////////////////////////////////////////// // Calculate ndvi of two channels d_A and d_B on GPU and store result in d_C /////////////////////////////////////////////////////////////////////////////// __global__ void ndviGPU( float *d_C, short int *d_A, short int *d_B, int width, int height ){ unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y; if(xIndex < width && yIndex < height) { unsigned int i = yIndex * (width) + xIndex; d_C[i] = __fdividef( (float)(d_A[i] - d_B[i]), (float)(d_A[i] + d_B[i]) ); } }
  • 66. NDVI output 0 1 In[64]:= ArrayPlot Partition NDVI, reader getSizeX , ColorFunction "Rainbow"
  • 67. Questions? http://hpc.nomad-labs.com kashif@nomad-labs.com twitter krasul