Java as a Technology Glue, or How to Use GPU From JavaScript
We look into how to run code on the GPU from JavaScript by going through Java bindings. This should be fun!
Join the DZone community and get the full member experience.
Join For Free
Ah! It’s been a while since I last posted!
But this year was quite a DE(VOXX)ED for me! First I had a chance to speak in wonderful Romania, then beautiful Vienna, after that amazing Lugano, then the loveliest Krakow, finally at the huge Antwerp event! In between, I also gave some talks in my hometown of Sofia at Java2Days and St. Petersburg at Joker! Let’s not forget that I was in the dream team of 5 to organize and run the coolest jPrime conf!
Quite an intense year!
So, finally, this weekend I had some time just to play around. Of course I spent it coding!
As some of you may have seen, I’m really interested in the Nashorn engine to run JavaScript on the JVM. I've even given some talks about it!
But this is not the only thing I'm into! For several years I’ve been interested in General Purpose computations on Video Cards (GPGPU)! I even gave some introductory talks on how to use GPU and Java together.
Here's a link to my presentation on this subject from Devoxx in English:
https://www.youtube.com/watch?v=BjdYRtL6qjg
And my talk at Joker, in Russian, will hopefully be available soon.
But, what will happen if I actually unite these two passions? How can we run some code on the GPU but submit it using... JavaScript?!
You will say, “Just use WebCL!” But isn’t it still not quite ready? As far as I know, it is not exactly a full binding to OpenCL and is mainly aimed at browser-oriented use. Please correct me if I’m wrong. I’ve even played around with some experimental drafts.
What if I want to utilize the full control of our computations on GPU through JS and make it very dynamic?
Yes, yes, Nashorn is here to help us! And yes, we have the full power of the available bindings like those provided by JOCL!
So let’s do it! The usual example with vector add will be cool enough.
Here's the JavaScript code:
// Banner printed as soon as the script starts.
print("GPU in JS!")
// Short aliases ("typedefs") for the Java classes this script works with, so
// the rest of the code can use them without the full package name.
// NOTE: the first alias rebinds the global name `String` to java.lang.String
// for the whole script, hiding the built-in JavaScript String constructor.
var String = java.lang.String;
var System = java.lang.System;
// JOCL classes: the static OpenCL entry points (CL), native pointers, type
// sizes, and the handle types for the various OpenCL objects.
var CL = org.jocl.CL;
var Pointer = org.jocl.Pointer;
var Sizeof = org.jocl.Sizeof;
var cl_command_queue = org.jocl.cl_command_queue;
var cl_context = org.jocl.cl_context;
var cl_context_properties = org.jocl.cl_context_properties;
var cl_device_id = org.jocl.cl_device_id;
var cl_kernel = org.jocl.cl_kernel;
var cl_mem = org.jocl.cl_mem;
var cl_platform_id = org.jocl.cl_platform_id;
var cl_program = org.jocl.cl_program;
// Java array types obtained through Nashorn's Java.type(); `new XArray(len)`
// creates a Java array of that element type with `len` elements.
var LongArray = Java.type("long[]");
var ByteArray = Java.type("byte[]");
var FloatArray = Java.type("float[]");
var StringArray = Java.type("java.lang.String[]");
var IntArray = Java.type("int[]");
var ClPlatformIdArray = Java.type("org.jocl.cl_platform_id[]");
var ClDeviceIdArray = Java.type("org.jocl.cl_device_id[]");
var ClMemArray = Java.type("org.jocl.cl_mem[]");
/**
 * Reads a device-info parameter and returns it as a java.lang.String.
 *
 * The value is fetched with two clGetDeviceInfo calls: the first only asks
 * for the size of the value, the second fills a byte buffer of that size.
 * The last byte of the buffer is left out of the returned string
 * (presumably the trailing NUL terminator - confirm against the OpenCL spec).
 *
 * @param device    the cl_device_id to query
 * @param paramName the CL_DEVICE_* constant naming the parameter
 * @returns the parameter value built from all bytes except the last one
 */
function getString(device, paramName) {
    // Step 1: ask only for the size of the value.
    var sizeOut = new LongArray(1);
    CL.clGetDeviceInfo(device, paramName, 0, null, sizeOut);

    // Step 2: fetch the raw bytes into a buffer of exactly that size.
    var raw = new ByteArray(sizeOut[0]);
    var rawPointer = Pointer.to(raw);
    CL.clGetDeviceInfo(device, paramName, raw.length, rawPointer, null);

    // `String` is the java.lang.String alias declared at the top of the script.
    var textLength = raw.length - 1;
    return new String(raw, 0, textLength);
}
// OpenCL C source that goes to the GPU: every work-item adds one pair of
// elements, c[gid] = a[gid] + b[gid]. The fragments are joined without any
// separator, so the whole kernel ends up on a single line.
var programSource = [
    "__kernel void ",
    "sampleKernel(__global const float *a,",
    " __global const float *b,",
    " __global float *c)",
    "{",
    " int gid = get_global_id(0);",
    " c[gid] = a[gid] + b[gid];",
    "}"
].join("");
// Wall-clock start; the elapsed time is printed at the very end of the script.
var start = System.currentTimeMillis();

// Number of elements in each vector.
var n = 1024;

// Host-side Java float[] arrays: two inputs and one output.
var srcArrayA = new FloatArray(n);
var srcArrayB = new FloatArray(n);
var dstArray = new FloatArray(n);

// Both inputs hold 0, 1, 2, ..., n-1.
for (var i = 0; i < n; i += 1) {
    srcArrayB[i] = i;
    srcArrayA[i] = i;
}

// JOCL pointers wrapping the host arrays.
var dst = Pointer.to(dstArray);
var srcB = Pointer.to(srcArrayB);
var srcA = Pointer.to(srcArrayA);
// ---- Platform and device selection ----
// The indices are positions in the lists returned by the OpenCL runtime, so
// they are machine-specific: adjust deviceIndex to pick a different device.
var platformIndex = 0;
var deviceType = CL.CL_DEVICE_TYPE_ALL;
var deviceIndex = 2;

// Have JOCL report OpenCL errors as exceptions rather than return codes.
CL.setExceptionsEnabled(true);

// First ask how many platforms exist, then fetch them all.
var numPlatformsArray = new IntArray(1);
CL.clGetPlatformIDs(0, null, numPlatformsArray);
var numPlatforms = numPlatformsArray[0];
// Fail with a readable message instead of a bare ArrayIndexOutOfBoundsException.
if (platformIndex < 0 || platformIndex >= numPlatforms) {
    throw new Error("OpenCL platform index " + platformIndex +
        " is out of range: " + numPlatforms + " platform(s) available");
}
var platforms = new ClPlatformIdArray(numPlatforms);
CL.clGetPlatformIDs(platforms.length, platforms, null);
var platform = platforms[platformIndex];

// Context properties tying the future context to the chosen platform.
var contextProperties = new cl_context_properties();
contextProperties.addProperty(CL.CL_CONTEXT_PLATFORM, platform);

// Same two-step query for the devices of that platform.
var numDevicesArray = new IntArray(1);
CL.clGetDeviceIDs(platform, deviceType, 0, null, numDevicesArray);
var numDevices = numDevicesArray[0];
if (deviceIndex < 0 || deviceIndex >= numDevices) {
    throw new Error("OpenCL device index " + deviceIndex +
        " is out of range: " + numDevices + " device(s) available on platform " +
        platformIndex);
}
var allDevices = new ClDeviceIdArray(numDevices);
CL.clGetDeviceIDs(platform, deviceType, numDevices, allDevices, null);
var device = allDevices[deviceIndex];

//just print out the name
var deviceName = getString(device, CL.CL_DEVICE_NAME);
print("Device: "+deviceName);

// One-element array holding only the selected device; this is what the
// context is created from.
var devices = new ClDeviceIdArray(1);
devices[0] = device;
// Context for the single selected device.
var context = CL.clCreateContext(contextProperties, 1, devices, null, null, null);

// Command queue on that device, created with properties = 0.
var commandQueue = CL.clCreateCommandQueue(context, device, 0, null);

// Size in bytes of each float vector.
var vectorBytes = Sizeof.cl_float * n;
// Flags shared by both input buffers: read-only, initialised from the host pointer.
var inputFlags = CL.CL_MEM_READ_ONLY | CL.CL_MEM_COPY_HOST_PTR;

// memObjects[0] and [1] are the inputs (copied from srcA / srcB),
// memObjects[2] is the output buffer, created without host data.
var memObjects = new ClMemArray(3);
memObjects[0] = CL.clCreateBuffer(context, inputFlags, vectorBytes, srcA, null);
memObjects[1] = CL.clCreateBuffer(context, inputFlags, vectorBytes, srcB, null);
memObjects[2] = CL.clCreateBuffer(context, CL.CL_MEM_READ_WRITE, vectorBytes, null, null);
// The program source is handed over as a Java String[] with a single entry.
var ps = new StringArray(1);
ps[0] = programSource;
var program = CL.clCreateProgramWithSource(context, ps.length, ps, null, null);

// Build the program, then look up the kernel by its name in the source.
CL.clBuildProgram(program, 0, null, null, null, null);
var kernel = CL.clCreateKernel(program, "sampleKernel", null);

// Kernel argument k is buffer k: a, b and c in that order.
for (var argIndex = 0; argIndex < memObjects.length; argIndex++) {
    CL.clSetKernelArg(kernel, argIndex,
        Sizeof.cl_mem, Pointer.to(memObjects[argIndex]));
}
// One work-item per vector element.
var global_work_size = new LongArray(1);
global_work_size[0] = n;
// Left as null so the OpenCL implementation chooses the work-group size
// itself. A fixed size of 1 would put every work-item into its own
// work-group, which wastes most of the device's parallel hardware.
var local_work_size = null;
// Execute the kernel
CL.clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
    global_work_size, local_work_size, 0, null, null);
// Blocking read (CL_TRUE) of the output buffer back into dstArray via `dst`.
CL.clEnqueueReadBuffer(commandQueue, memObjects[2], CL.CL_TRUE, 0,
    n * Sizeof.cl_float, dst, 0, null, null);
// Release the OpenCL objects: the buffers first, then the kernel, program,
// command queue and finally the context.
for (var bufferIndex = 0; bufferIndex < memObjects.length; bufferIndex++) {
    CL.clReleaseMemObject(memObjects[bufferIndex]);
}
CL.clReleaseKernel(kernel);
CL.clReleaseProgram(program);
CL.clReleaseCommandQueue(commandQueue);
CL.clReleaseContext(context);
// Print every addition, one per line, followed by the total elapsed time.
print("Calculated on GPU:");
var i = 0;
while (i < n) {
    print(srcArrayA[i], " + ", srcArrayB[i], " = ", dstArray[i]);
    i++;
}
var elapsedMs = System.currentTimeMillis() - start;
print("Done in: " + elapsedMs + " ms.");
I just put it all in a jscl.js file.
As you'll note, in the beginning I’ve added some type definition shortcuts just to make the code look more readable.
To run it in the console I just use the following command:
jjs -cp ./lib/JOCL-0.1.9.jar -scripting jscl.js
JJS is available by default if you have a minimum of Java 8 installed.
You can just get the JOCL jar from their site.
And what we have (on my Mac):
GPU in JS!
Device: GeForce GT 650M
Calculated on GPU:
0 + 0 = 0
1 + 1 = 2
2 + 2 = 4
....
1019 + 1019 = 2038
1020 + 1020 = 2040
1021 + 1021 = 2042
1022 + 1022 = 2044
1023 + 1023 = 2046
Done in: 704 ms.
Isn’t this lovely?!
Fun fact: because of memory latency, it takes about 700 ms whether I make 1,000 computations or 100 million.
Still, this may be a good example of how Java/JVM can be a kind of technology glue. Keeping in mind that there are more than 40 languages running on JVM, all of them can benefit from the de facto standard libraries already available for Java.
This is so cool! Have fun!
Published at DZone with permission of Dmitry Alexandrov. See the original article here.
Opinions expressed by DZone contributors are their own.
Comments