2. Overview
• Intro to Mathematica and its API
• CUDA + Mathematica
• Some examples
3. Mathematica intro
• Mathematica is a modular
computational system in which the
kernel is separate from the front end
which handles the interaction with the
user.
• The most common way to work is to use
interactive documents called notebooks
which mix text input and output as well
as graphics and other material.
4. Structure of
Mathematica
• An important aspect of Mathematica is
that it can also interact with other
applications.
• This is achieved through MathLink,
a standardised API for two-way
communication with the kernel.
5. MathLink
• MathLink allows external programs both
to call Mathematica, and to be called by
Mathematica.
• We will use MathLink to let Mathematica
call CUDA functions inside an external
program.
6. Simple example
addtwo.tm
:Begin:
:Function: addtwo
:Pattern: AddTwo[i_Integer,j_Integer]
:Arguments: { i, j }
:ArgumentTypes: {Integer,Integer}
:ReturnType: Integer
:End:
9. In[3]:= SetDirectory[
"/Applications/Mathematica.app/SystemFiles/Links/MathLink/DeveloperKit/PrebuiltExamples"]
Out[3]= /Applications/Mathematica.app/SystemFiles/Links/MathLink/DeveloperKit/PrebuiltExamples
In[4]:= link = Install["./addtwo"]
Out[4]= LinkObject[
/Applications/Mathematica.app/SystemFiles/Links/MathLink/DeveloperKit/PrebuiltExamples/addtwo, 524, 8]
In[5]:= LinkPatterns[link]
Out[5]= {AddTwo[i_Integer, j_Integer]}
In[6]:= ?AddTwo
AddTwo[x, y] gives the sum of two machine integers x and y.
In[7]:= AddTwo[2, 3]
Out[7]= 5
In[8]:= AddTwo[2^31 - 1, 1]
Out[8]= -2147483648
In[9]:= Uninstall[link]
Out[9]= /Applications/Mathematica.app/SystemFiles/Links/MathLink/DeveloperKit/PrebuiltExamples/addtwo
10. MathLink
Template file
• When a MathLink template file is
processed, two basic things are done:
• :Pattern: & :Arguments: specifications
are used to generate a Mathematica
definition
• :Function:, :ArgumentTypes:
& :ReturnType: specifications are used
to generate C source code
11. :ArgumentTypes:
Mathematica specification C specification
Integer int
Real double
IntegerList int*, long
RealList double*, long
String char*
Symbol char*
Manual void
12. Handling
Lists & Arrays
:Begin: int sumList(int *a, long alen)
:Function: sumList {
:Pattern: SumList[a_List] int i, tot=0;
:Arguments: {a}
:ArgumentTypes:{IntegerList} for(i=0; i<alen; i++)
:ReturnType: Integer tot += a[i];
:End:
return tot;
}
13. Manual ArgumentTypes
:Begin:
:Function: sumList
:Pattern: SumList[a:{___Integer}]
:Arguments: {a}
:ArgumentTypes:{Manual}
:ReturnType: Integer
:End:
int sumList(void) { int sumList(void) {
int n, i; int n;
int a[MAX]; int *a;
MLCheckFunction(stdlink, "List", &n); MLGetInteger32List(stdlink, &a, &n);
...
for (i=0; i<n; i++) MLReleaseInteger32List(stdlink, a, n);
MLGetInteger32(stdlink, a+i); ...
... }
}
14. Array of arb. depth
#include <mathlink.h>
/* Fetch an array of doubles (any depth) from the link lp, then give the
   buffers obtained from MathLink back to it. Nothing is done when the
   array cannot be read. */
void f(MLINK lp)
{
    double *values;     /* element data filled in by MLGetRealArray */
    int    *extents;    /* dimensions reported for the array */
    char  **headNames;  /* heads reported for the array */
    int     rank;       /* stores the rank of the array */

    if (MLGetRealArray(lp, &values, &extents, &headNames, &rank))
    {
        /* ... */

        /* release exactly what the successful read handed out */
        MLReleaseRealArray(lp, values, extents, headNames, rank);
    }
    /* otherwise: unable to read the array from lp, nothing to release */
}
15. Handling Complex
numbers
In[1]:= Head[2 + 3 I]
Out[1]= Complex
If you pass a list of complex numbers to your external program,
then MLGetReal64Array() will create a two-dimensional array
containing a sequence of pairs of real and imaginary parts. In this
case, heads[0] will be "List" while heads[1] will be "Complex".
//get an array of floating-point numbers of any depth
MLGetReal64Array(stdlink,double**a,int**dims,char***heads,int*d);
16. Summary of API
//get a list of integers, allocating the memory needed to store it
MLGetInteger32List(stdlink,int**a,int*n);
//get a list of floating-point numbers
MLGetReal64List(stdlink,double**a,int*n);
//release the memory associated with a list of integers
MLReleaseInteger32List(stdlink,int*a,int n);
//release the memory associated with a list of floating-point numbers
MLReleaseReal64List(stdlink,double*a,int n);
//get an array of integers of any depth
MLGetInteger32Array(stdlink,int**a,int**dims,char***heads,int*d);
//get an array of floating-point numbers of any depth
MLGetReal32Array(stdlink,float**a,int**dims,char***heads,int*d);
//release memory associated with an integer array
MLReleaseInteger32Array(stdlink,int*a,int*dims,char**heads,int d);
//release memory associated with a floating-point array
MLReleaseReal32Array(stdlink,float*a,int*dims,char**heads,int d);
17. Manual ReturnType
void bits(int i)
{
int a[32], k;
:Begin:
:Function: bits for(k=0; k<32; k++) {
:Pattern: ToBits[i_Integer] a[k] = i%2;
:Arguments: {i} i >>= 1;
:ArgumentTypes:{Integer} if (i==0) break;
:ReturnType: Manual }
:End:
if (k<32) k++;
MLPutInteger32List(stdlink,
a, k);
return;
}
18. General array
int a[8][16][100];
int dims[] = {8, 16, 100};
MLPutInteger32Array(stdlink, a, dims, NULL, 3);
or
int ***a;
MLPutFunction(stdlink, "List", n1);
for (i=0; i<n1; i++) {
MLPutFunction(stdlink, "List", n2);
for (j=0; j<n2; j++) {
MLPutInteger32List(stdlink, a[i][j], n3);
}
}
20. Return Complex
numbers
// Complex data type
typedef float2 Complex;
Complex* h_convolved_signal;
// Return transformed signal to Mathematica as a Complex List
MLPutFunction(stdlink,"List",n);
for (long i = 0; i < n; i++) {
MLPutFunction(stdlink,"Complex",2);
MLPutFloat(stdlink,h_convolved_signal[i].x*norm);
MLPutFloat(stdlink,h_convolved_signal[i].y*norm);
}
22. Error & Interrupt
if(! MLPutInteger(stdlink, 10)) if(! MLPutReal64(stdlink, 3.22))
{ {
/* check the possible errors */ /* unable to send 3.22 to lp */
switch(MLError(stdlink)) printf("MathLink Error: %s\n",
{ MLErrorMessage(stdlink));
case MLEDEAD: MLClearError(stdlink);
/* the link died unexpectedly */ }
break;
case MLECLOSED:
/* the other side closed the link */
break;
case MLEOK:
/* no error occurred */ while(len--)
break; {
default: sum += *list++;
/* ... */ /* check for the abort */
} if(MLAbort) return (double)0;
} }
23. Running on remote
computers
$ ./addtwo -linkcreate -linkprotocol TCPIP
Link created on: 63166@192.168.1.107,63167@192.168.1.107
In[5]:= Install[LinkConnect["63166@192.168.1.107,63167@192.168.1.107",
LinkProtocol -> "TCPIP"]]
Out[5]= LinkObject[63166@192.168.1.107,63167@192.168.1.107, 1110, 8]
In[6]:= AddTwo[2, 3]
Out[6]= 5
24. Mathematica + CUDA
#include <cutil_inline.h>
int main(int argc, char **argv)
{
    // Select the CUDA device first, then hand control to MathLink.
    if (!cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
        // no "device" flag on the command line:
        // use the device with the highest Gflops/s
        cudaSetDevice( cutGetMaxGflopsDeviceId() );
    } else {
        // use the command-line specified CUDA device
        cutilDeviceInit(argc, argv);
    }

    // MLMain's result becomes the process exit status
    return MLMain(argc, argv);
}
25. mathematica_cuda
# Add source files here
EXECUTABLE := cuFourier
# CUDA source files (compiled with cudacc)
CUFILES := cuFourier.cu
# CUDA dependency files
# CU_DEPS :=
# C/C++ source files (compiled with gcc / c++)
# CCFILES :=
# Additional libraries needed by the project
USECUFFT := 1
# MathLink Template files
TMFILES := cuFourier.tm
###################################################
# Rules and targets
include ../../common/common.mk
33. cuFourier.cu
// includes system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes cuda
#include <cufft.h>
#include <cutil_inline.h>
// includes mathlink
#include <mathlink.h>
// Complex data type
typedef float2 Complex;
///////////////////////////////////////////////////////////////
// Showing the use of CUFFT for fast convolution using FFT.
///////////////////////////////////////////////////////////////
extern "C" void cuFourier1D(double*, long);
35. void cuFourier1D (double *h_A, long n)
{
double norm = 1.0/sqrt((double) n);
long mem_size = sizeof(Complex) * n;
// Allocate host memory for the signal
Complex* h_signal = (Complex*)malloc(mem_size);
// Initialize the memory for the signal
for (long i = 0; i < n; ++i) {
h_signal[i].x = (float)h_A[i];
h_signal[i].y = 0.0f;
}
// Allocate device memory for signal
Complex* d_signal;
cutilSafeCall(cudaMalloc((void**)&d_signal, mem_size));
// Copy host memory to device
cutilSafeCall(cudaMemcpy(d_signal, h_signal, mem_size,
cudaMemcpyHostToDevice));
37. // Return transformed signal to Mathematica as a Complex List
MLPutFunction(stdlink, "Map", 2);
MLPutFunction(stdlink, "Function", 2);
MLPutFunction(stdlink, "List", 1);
MLPutSymbol(stdlink, "x");
MLPutFunction(stdlink, "Apply", 2);
MLPutSymbol(stdlink, "Complex");
MLPutSymbol(stdlink, "x");
MLPutFunction(stdlink, "Partition", 2);
MLPutFunction(stdlink, "Times", 2);
MLPutReal(stdlink, norm);
MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n);
MLPutInteger(stdlink, 2);
// Cleanup memory
free(h_signal);
cudaThreadExit();
}
38. Makefile
##################################################################
#
# Build script for project
#
##################################################################
# Add source files here
EXECUTABLE := cuFourier
# CUDA source files (compiled with cudacc)
CUFILES := cuFourier.cu
# Additional libraries needed by the project
USECUFFT := 1
# MathLink Template files
TMFILES := cuFourier.tm
##################################################################
# Rules and targets
include ../../common/common.mk
39. In[35]:= link =
Install[
"/Users/kashif/Dropbox/20090630_NDVI_CUDA/mathematica_cuda/bin/darwin/release/cuFourier"]
Out[35]= LinkObject[
/Users/kashif/Dropbox/20090630_NDVI_CUDA/mathematica_cuda/bin/darwin/release/cuFourier, 605, 9]
In[36]:= LinkPatterns[link]
Out[36]= {CUFourier1D[a : {__?NumericQ}]}
In[37]:= ListLinePlot[Abs[CUFourier1D[RandomReal[1, 200]]]^2]
0.4
0.3
Out[37]=
0.2
0.1
50 100 150 200
In[38]:= Uninstall[link]
Out[38]= /Users/kashif/Dropbox/20090630_NDVI_CUDA/mathematica_cuda/bin/darwin/release/cuFourier
40. Image Deconvolution
for Life Sciences
• Confocal and Widefield microscopy
3D or 4D images
• Multichannel (3 or more channels)
• Comes in a wide variety of formats
41. Bio-Formats Java lib.
• Standalone Java library for reading and
writing life science image formats
• Get both the pixels and metadata
• Licensed under GPL
• http://www.loci.wisc.edu/ome/
formats.html
42. Java + Mathematica:
J/Link
Needs["JLink`"]
InstallJava[]
LinkObject[
'/usr/local/Wolfram/Mathematica/7.0/SystemFiles/Java/Linux-x86-64/bin/java' -classpath
"/usr/local/Wolfram/Mathematica/7.0/SystemFiles/Links/JLink/JLink.jar"
-Xmx256m -Djava.system.class.loader=com.wolfram.jlink.JLinkSystemClassLoader
-Djava.util.prefs.PreferencesFactory=com.wolfram.jlink.DisabledPreferencesFactory
com.wolfram.jlink.Install -init "/tmp/m000001207601", 4, 4]
ReinstallJava[ClassPath -> "/home/kashif/Dropbox/BioFormats/Java/loci_tools.jar"]
LinkObject[
'/usr/local/Wolfram/Mathematica/7.0/SystemFiles/Java/Linux-x86-64/bin/java' -classpath
"/usr/local/Wolfram/Mathematica/7.0/SystemFiles/Links/JLink/JLink.jar"
-Xmx256m -Djava.system.class.loader=com.wolfram.jlink.JLinkSystemClassLoader
-Djava.util.prefs.PreferencesFactory=com.wolfram.jlink.DisabledPreferencesFactory
com.wolfram.jlink.Install -init "/tmp/m000002207601", 8, 4]
64. //Release d_A and d_B
cutilSafeCall( cudaFree(d_B) );
cutilSafeCall( cudaFree(d_A) );
//Read back GPU results into h_C_GPU
cutilSafeCall( cudaMemcpy(h_C_GPU, d_C, dims_A[0]*dims_A[1]*sizeof(float),
cudaMemcpyDeviceToHost) );
//Release d_C
cutilSafeCall( cudaFree(d_C) );
//Return result
MLPutReal32List(stdlink, h_C_GPU, dims_A[0]*dims_A[1]);
//Release h_A and h_B
MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A);
MLReleaseInteger16Array(stdlink, h_B, dims_B, heads_B, rank_B);
cudaThreadExit();
65. NDVI Kernel
///////////////////////////////////////////////////////////////////////////////
// Calculate ndvi of two channels d_A and d_B on GPU and store result in d_C:
//
//     d_C[i] = (d_A[i] - d_B[i]) / (d_A[i] + d_B[i])
//
// Each thread handles at most one pixel, selected from its 2D block/thread
// indices as i = y * width + x; threads that fall outside width x height
// write nothing.
//
// Parameters:
//   d_C     output, one float per pixel
//   d_A     first input channel, one short per pixel
//   d_B     second input channel, one short per pixel
//   width   number of pixels per row
//   height  number of rows
//
// NOTE(review): all three buffers are assumed to be device pointers holding
// at least width*height elements - confirm at the launch site.
//
// Where both channels sum to zero the ratio is undefined (0/0); 0.0f is
// stored for such pixels instead of NaN.
///////////////////////////////////////////////////////////////////////////////
__global__ void ndviGPU(
float *d_C,
short int *d_A,
short int *d_B,
int width,
int height
){
    // A non-positive extent describes an empty image. Without this check the
    // signed extents would be converted to huge unsigned values in the
    // comparisons below and every thread would pass the bounds test.
    if (width <= 0 || height <= 0)
        return;

    const unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

    if (xIndex >= (unsigned int)width || yIndex >= (unsigned int)height)
        return;

    // Widen before multiplying so that very large images cannot wrap the
    // 32-bit product.
    const size_t i = (size_t)yIndex * (size_t)width + (size_t)xIndex;

    // Read each channel once; the short values are promoted to int, so the
    // sum and difference cannot overflow.
    const int a = d_A[i];
    const int b = d_B[i];
    const int sum = a + b;

    // __fdividef is the fast, reduced-accuracy single-precision divide.
    d_C[i] = (sum == 0) ? 0.0f
                        : __fdividef( (float)(a - b), (float)sum );
}