library('clrng')
runifGpu
get_system_info()
## $`1. Operating System`
## [1] "Linux"
##
## $`2. CPU`
## [1] "Intel(R) Xeon(R) W-2145 CPU @ 3.70GHz"
##
## $`3. GPU`
## [1] "\020n\r\xaa9\177" "\020n\r\xaa9\177" "gfx906:sramecc+:xnack-"
##
## $`4. OpenCL Version`
## [1] "OpenCL 3.0 LINUX"
## [2] "OpenCL 1.2 Intel(R) FPGA SDK for OpenCL(TM), Version 20.3"
## [3] "OpenCL 2.1 AMD-APP (3614.0)"
if (detectGPUs()) {
## show all the available platforms
gpuR::listContexts()[,'platform']
## set the context and show the device we are using
setContext(grep("gpu", listContexts()$device_type)[1])
currentDevice()
## set the initial seed of the package/first stream
setBaseCreator(rep(12235,6))
## check the size of work items and GPU precision type at the moment
getOption('clrng.Nglobal')
getOption('clrng.type')
## create default number of streams on GPU
myStreamsGpu = clrng::createStreamsGpu()
## show its dimention
dim(myStreamsGpu)
## create 10 uniform random numbers using the streams and show detailed backend code
as.vector(clrng::runifGpu(10, myStreamsGpu, verbose=2))
# generating a 100 x 100 matrix of random uniforms
b<-clrng::runifGpu(c(100,100), myStreamsGpu)
bvector<-as.vector(as.matrix(b))
# plot the histogram of the uniform random numbers
hist(bvector, breaks=40)
# check its quantiles
quantile(bvector)
} else {
message("No GPU detected. Skipping GPU-dependent code.")
}
##
## #pragma OPENCL EXTENSION cl_khr_fp64 : enable
## #define PI_2 M_PI_2
## #define TWOPI 6.283185307179586231996
## #define mrg31k3p_NORM_cl 4.656612873077392578125e-10
## //TWOPI * mrg31k3p_NORM_cl
## #define TWOPI_mrg31k3p_NORM_cl 2.925836158534319248049e-09
##
##
## #define Nrow 10
## #define Ncol 1
## #define NpadStreams 128
## #define NpadCol 128
## #define mrg31k3p_M1 2147483647
## #define mrg31k3p_M2 2147462579
## #define mrg31k3p_MASK12 511
## #define mrg31k3p_MASK13 16777215
## #define mrg31k3p_MASK2 65535
## #define mrg31k3p_MULT2 21069
##
## void streamsToPrivate(__global int* streams, uint* g1, uint* g2, const int start){
## int Drow, Dcol, DrowStart; for(Drow = 0, DrowStart = start, Dcol = DrowStart + 3;
## Drow < 3; Drow++, DrowStart++, Dcol++){
## g1[Drow] = streams[DrowStart];
## g2[Drow] = streams[Dcol];
## }
## }
##
## void streamsFromPrivate(__global int* streams, uint* g1, uint* g2, const int start){
## int Drow, Dcol, DrowStart; for(Drow = 0,DrowStart = start, Dcol = DrowStart + 3;
## Drow < 3; Drow++, DrowStart++, Dcol++){
## streams[DrowStart] = g1[Drow];
## streams[Dcol] = g2[Drow];
## }
## }
##
## uint clrngMrg31k3pNextState(uint *g1, uint *g2) {
## uint y1, y2;
## y1 = ((g1[1] & mrg31k3p_MASK12) << 22) + (g1[1] >> 9)
## + ((g1[2] & mrg31k3p_MASK13) << 7) + (g1[2] >> 24);
## if (y1 >= mrg31k3p_M1)
## y1 -= mrg31k3p_M1;
## y1 += g1[2];
## if (y1 >= mrg31k3p_M1)
## y1 -= mrg31k3p_M1;
## g1[2] = g1[1];
## g1[1] = g1[0];
## g1[0] = y1;
## y1 = ((g2[0] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[0] >> 16));
## if (y1 >= mrg31k3p_M2)
## y1 -= mrg31k3p_M2;
## y2 = ((g2[2] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[2] >> 16));
## if (y2 >= mrg31k3p_M2)
## y2 -= mrg31k3p_M2;
## y2 += g2[2];
## if (y2 >= mrg31k3p_M2)
## y2 -= mrg31k3p_M2;
## y2 += y1;
## if (y2 >= mrg31k3p_M2)
## y2 -= mrg31k3p_M2;
## g2[2] = g2[1];
## g2[1] = g2[0];
## g2[0] = y2;
## if (g1[0] <= g2[0]){
## return (g1[0] - g2[0] + mrg31k3p_M1);
## } else {
## return(g1[0] - g2[0]);
## }
## }
##
##
##
## __kernel void mrg31k3pMatrix(
## __global int* streams,
## __global double* out){
##
## const int index = get_global_id(0)*get_global_size(1) + get_global_id(1);
## int Drow, Dcol, DrowStart, Dentry, DrowBlock, DcolBlock, DrowInBounds;
## const int DrowStartInc = get_global_size(0) * NpadCol;
## uint g1[3], g2[3];
## const int startvalue=index * NpadStreams;
## double temp;
## const double fact = mrg31k3p_NORM_cl;
## streamsToPrivate(streams,g1,g2,startvalue);
## for(DrowBlock = 0, Drow=get_global_id(0), DrowStart = Drow * NpadCol;
## DrowBlock < Nrow;
## Drow += get_global_size(0), DrowBlock +=get_global_size(0), DrowStart += DrowStartInc) {
## DrowInBounds = Drow < Nrow;
## for(DcolBlock = 0, Dcol=get_global_id(1), Dentry = DrowStart + Dcol;
## DcolBlock < Ncol;
## DcolBlock += get_global_size(1), Dentry += get_global_size(1) ) {
## temp = fact * clrngMrg31k3pNextState(g1, g2);
## if(DrowInBounds) out[Dentry] = temp;
## }//Dcol
## }//Drow
## streamsFromPrivate(streams,g1,g2,startvalue);
## }//kernel
## 0% 25% 50% 75% 100%
## 4.066154e-06 2.527364e-01 5.037022e-01 7.535758e-01 9.999677e-01
rnormGpu
if (detectGPUs()) {
setContext(grep("gpu", listContexts()$device_type)[1])
## configure the size of work items
options(clrng.Nglobal = c(128,64))
## create default (here is 128*64) number of streams on GPU
myStreamsGpu = clrng::createStreamsGpu()
## check its dimension
dim(myStreamsGpu)
## generate a vector of 10 random normal numbers on GPU, and print out the kernel code
as.vector(clrng::rnormGpu(10, myStreamsGpu, verbose=2))
## generate a 4x4 matrix of random normal numbers on GPU, still using the ctreated streams
as.matrix(clrng::rnormGpu(c(4, 4), myStreamsGpu))
## create many new streams and generate a 1024x512 matrix of normal random numbers, with specified Nglobal
streamsGpu2 <- createStreamsGpu(n =512*128)
a<-clrng::rnormGpu(c(1024,512), streams=streamsGpu2, Nglobal=c(512,128))
## see the histogram of the produced normal random numbers
avector<-as.vector(as.matrix(a))
hist(avector,breaks=40)
# do a Q-Q plot with quantiles computed on GPU by clrng package
clrng::qqnormGpu(avector)
# can also do the Q-Q plot calculation on CPU by stats::qqnorm
stats::qqnorm(avector)
} else {
message("No GPU detected. Skipping GPU-dependent code.")
}
##
## #pragma OPENCL EXTENSION cl_khr_fp64 : enable
## #define PI_2 M_PI_2
## #define TWOPI 6.283185307179586231996
## #define mrg31k3p_NORM_cl 4.656612873077392578125e-10
## //TWOPI * mrg31k3p_NORM_cl
## #define TWOPI_mrg31k3p_NORM_cl 2.925836158534319248049e-09
##
##
## #define Nrow 1
## #define Ncol 10
## #define NpadStreams 128
## #define NpadCol 128
## #define mrg31k3p_M1 2147483647
## #define mrg31k3p_M2 2147462579
## #define mrg31k3p_MASK12 511
## #define mrg31k3p_MASK13 16777215
## #define mrg31k3p_MASK2 65535
## #define mrg31k3p_MULT2 21069
##
## void streamsToPrivate(__global int* streams, uint* g1, uint* g2, const int start){
## int Drow, Dcol, DrowStart; for(Drow = 0, DrowStart = start, Dcol = DrowStart + 3;
## Drow < 3; Drow++, DrowStart++, Dcol++){
## g1[Drow] = streams[DrowStart];
## g2[Drow] = streams[Dcol];
## }
## }
##
## void streamsFromPrivate(__global int* streams, uint* g1, uint* g2, const int start){
## int Drow, Dcol, DrowStart; for(Drow = 0,DrowStart = start, Dcol = DrowStart + 3;
## Drow < 3; Drow++, DrowStart++, Dcol++){
## streams[DrowStart] = g1[Drow];
## streams[Dcol] = g2[Drow];
## }
## }
##
## uint clrngMrg31k3pNextState(uint *g1, uint *g2) {
## uint y1, y2;
## y1 = ((g1[1] & mrg31k3p_MASK12) << 22) + (g1[1] >> 9)
## + ((g1[2] & mrg31k3p_MASK13) << 7) + (g1[2] >> 24);
## if (y1 >= mrg31k3p_M1)
## y1 -= mrg31k3p_M1;
## y1 += g1[2];
## if (y1 >= mrg31k3p_M1)
## y1 -= mrg31k3p_M1;
## g1[2] = g1[1];
## g1[1] = g1[0];
## g1[0] = y1;
## y1 = ((g2[0] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[0] >> 16));
## if (y1 >= mrg31k3p_M2)
## y1 -= mrg31k3p_M2;
## y2 = ((g2[2] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[2] >> 16));
## if (y2 >= mrg31k3p_M2)
## y2 -= mrg31k3p_M2;
## y2 += g2[2];
## if (y2 >= mrg31k3p_M2)
## y2 -= mrg31k3p_M2;
## y2 += y1;
## if (y2 >= mrg31k3p_M2)
## y2 -= mrg31k3p_M2;
## g2[2] = g2[1];
## g2[1] = g2[0];
## g2[0] = y2;
## if (g1[0] <= g2[0]){
## return (g1[0] - g2[0] + mrg31k3p_M1);
## } else {
## return(g1[0] - g2[0]);
## }
## }
##
##
##
## __kernel void mrg31k3pMatrix(
## __global int* streams,
## __global double* out){
##
## const int index = get_global_id(0)*get_global_size(1) + get_global_id(1);
## int Drow, Dcol, DrowStart, Dentry, DrowBlock, DcolBlock, DrowInBounds;
## const int DrowStartInc = get_global_size(0) * NpadCol;
## uint g1[3], g2[3];
## const int startvalue=index * NpadStreams;
## double temp;
## const double fact[2] = { mrg31k3p_NORM_cl, TWOPI * mrg31k3p_NORM_cl };
## const double addForSine[2] = { 0.0, - PI_2 };
## local double part[2];
## streamsToPrivate(streams,g1,g2,startvalue);
## for(DrowBlock = 0, Drow=get_global_id(0), DrowStart = Drow * NpadCol;
## DrowBlock < Nrow;
## Drow += get_global_size(0), DrowBlock +=get_global_size(0), DrowStart += DrowStartInc) {
## DrowInBounds = Drow < Nrow;
## for(DcolBlock = 0, Dcol=get_global_id(1), Dentry = DrowStart + Dcol;
## DcolBlock < Ncol;
## DcolBlock += get_global_size(1), Dentry += get_global_size(1) ) {
## part[get_local_id(1)] = fact[get_local_id(1)] * clrngMrg31k3pNextState(g1, g2);
## barrier(CLK_LOCAL_MEM_FENCE);
## temp = sqrt( -2.0*log(part[0]) ) * cos(part[1] + addForSine[get_local_id(1)] );// is cos for local0, sine for local1
## if(DrowInBounds) out[Dentry] = temp;
## barrier(CLK_LOCAL_MEM_FENCE);
## }//Dcol
## }//Drow
## streamsFromPrivate(streams,g1,g2,startvalue);
## }//kernel
rexpGpu
if (detectGPUs()) {
## generate a 200x100 matrix of exponential random numbers of mean = 1,
## using the supplied streams, and specify the size of Nglobal to be equal to the number of strams
b<-clrng::rexpGpu(c(200,100), rate=1, streams = myStreamsGpu, Nglobal=c(64,16))
# convert GPU matrix to an R vector on CPU, plot its histogram
bGpu<-as.vector(as.matrix(b))
hist(bGpu, freq=TRUE, breaks=40)
# generate on CPU a vector of exponential random numbers of mean = 1, plot its histogram
bCpu <- stats::rexp(20000, rate=1)
hist(bCpu, freq=TRUE, breaks = 40)
# compare the two sequence of random exponential numbers by looking at their distribution quantiles
quantile(bGpu)
quantile(bCpu)
} else {
message("No GPU detected. Skipping GPU-dependent code.")
}
## 0% 25% 50% 75% 100%
## 9.343121e-05 2.847599e-01 6.849064e-01 1.386177e+00 9.095221e+00