Last updated July 08, 2010 09:58, by Michael Bien
This project has moved to http://jocl.jogamp.org/
Java Binding for the OpenCL API
This project provides an easy-to-use Java binding for the OpenCL API. GlueGen is used to generate a low-level binding directly from the official Khronos C header files. A hand-written high-level binding on top of the generated code provides a convenient interface and reduces verbosity to a minimum.
Hello World - Java OpenCL
The HelloJOCL sample can be found in the jocl-demos Git repository.
package com.mbien.opencl.demos.hellojocl;
import com.mbien.opencl.*;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.FloatBuffer;
import java.util.Random;
import static java.lang.System.*;
import static com.mbien.opencl.CLBuffer.Mem.*;
/**
 * Hello Java OpenCL example. Adds all elements of buffer A to buffer B
 * and stores the result in buffer C.<br/>
 * Sample was inspired by the Nvidia VectorAdd example written in C/C++
 * which is bundled in the Nvidia OpenCL SDK.
 * <p>
 * The kernel source is loaded from the classpath resource {@code VectorAdd.cl}
 * located next to this class.
 * @author Michael Bien
 */
public class HelloJOCL {

    /**
     * Runs the vector addition sample and prints a snapshot of the result
     * together with the measured transfer + computation time.
     *
     * @param args ignored
     * @throws IOException if the kernel source resource is missing or cannot be read
     */
    public static void main(String[] args) throws IOException {

        int elementCount = 11444777;  // Length of arrays to process
        int localWorkSize = 256;      // Local work size dimensions
        // rounded up to the nearest multiple of the localWorkSize
        int globalWorkSize = roundUp(localWorkSize, elementCount);

        // Fail early with a readable message instead of handing null to createProgram.
        InputStream kernelSource = HelloJOCL.class.getResourceAsStream("VectorAdd.cl");
        if (kernelSource == null) {
            throw new IOException("kernel source 'VectorAdd.cl' not found on classpath");
        }

        // set up
        CLContext context = CLContext.create();
        try {
            CLProgram program = context.createProgram(kernelSource).build();

            CLBuffer<FloatBuffer> clBufferA =
                    context.createFloatBuffer(globalWorkSize, READ_ONLY);
            CLBuffer<FloatBuffer> clBufferB =
                    context.createFloatBuffer(globalWorkSize, READ_ONLY);
            CLBuffer<FloatBuffer> clBufferC =
                    context.createFloatBuffer(globalWorkSize, WRITE_ONLY);

            // Sum in long so that larger element counts cannot overflow int
            // when multiplied by the 4 bytes each float occupies.
            long totalFloats = (long) clBufferA.buffer.capacity()
                    + clBufferB.buffer.capacity()
                    + clBufferC.buffer.capacity();
            out.println("used device memory: " + (totalFloats * 4 / 1000000) + "MB");

            // fill read buffers with random numbers
            fillBuffer(clBufferA.buffer, 12345);
            fillBuffer(clBufferB.buffer, 67890);

            // get a reference to the kernel function with the name 'VectorAdd'
            // and map the buffers to its input parameters.
            CLKernel kernel = program.getCLKernels().get("VectorAdd");
            kernel.setArg(0, clBufferA)
                  .setArg(1, clBufferB)
                  .setArg(2, clBufferC)
                  .setArg(3, elementCount);

            // create command queue on fastest device.
            CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();

            // asynchronous write of data to GPU device, blocking read later
            // to get the computed results back.
            long time = nanoTime();
            queue.putWriteBuffer(clBufferA, false)
                 .putWriteBuffer(clBufferB, false)
                 .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
                 .putReadBuffer(clBufferC, true);
            time = nanoTime() - time;

            // print first few elements of the resulting buffer to the console.
            out.println("a+b=c results snapshot: ");
            for (int i = 0; i < 10; i++) {
                out.print(clBufferC.buffer.get() + ", ");
            }
            out.println("...; " + clBufferC.buffer.remaining() + " more");

            out.println("computation took: " + (time / 1000000) + "ms");
        } finally {
            // cleanup all resources associated with this context,
            // also when building or running the kernel failed.
            context.release();
        }
    }

    /**
     * Fills the remaining elements of {@code buffer} with pseudo random values
     * in the range [0, 100) and rewinds it afterwards.
     *
     * @param buffer the buffer to fill, starting at its current position
     * @param seed   seed for the random number generator, makes the content reproducible
     */
    private static void fillBuffer(FloatBuffer buffer, int seed) {
        Random rnd = new Random(seed);
        while (buffer.hasRemaining()) {
            buffer.put(rnd.nextFloat() * 100);
        }
        buffer.rewind();
    }

    /**
     * Rounds {@code globalSize} up to the next multiple of {@code groupSize}.
     *
     * @param groupSize  the work group size the result must be divisible by
     * @param globalSize the value to round up
     * @return {@code globalSize} if it is already a multiple of {@code groupSize},
     *         otherwise the next larger multiple
     */
    private static int roundUp(int groupSize, int globalSize) {
        int remainder = globalSize % groupSize;
        if (remainder == 0) {
            return globalSize;
        }
        return globalSize + groupSize - remainder;
    }
}
OpenCL source file
// OpenCL kernel: element-wise vector addition, c[i] = a[i] + b[i].
__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int numElements) {
    // position of this work-item in dimension 0 of the global range
    const int gid = get_global_id(0);
    // work-items whose index is not below numElements leave c untouched
    if (gid < numElements) {
        c[gid] = a[gid] + b[gid];
    }
}






