Hi, I think there is a thread on this:
http://www.fractalforums.com/programming/buddhabrot-on-gpu/I'm don't think the approach used in Fragmentarium will help here: for something to run efficiently on a GPU you must be able to divide the job into many separate tasks (threads). For a GPU, the number of threads must be even larger than the number of computational cores, to hide the different kind of latencies. This means 1000+ of tasks on a modern GPU. In Fragmentarium each pixel is calculated in its own thread - which is the dream job for a GPU - roughly a million threads, which can be executed independently. THis is erfect for ray marched, distance estimated 3D fractals, but I don't think it is easy to frame the Buddabrot formula in this setting.
That's correct.
Below is an openCL implementation of the buddhabrot. but this is inefficient. The memory latency of the GPU (and branching) is a huge bottleneck and there isn't anything you can do about it.
The threads can write anywhere (and it's not predictible, of course) on the global memory, and the global memory latency kill the performances.
bool isInMSet(
float cr,
float ci,
const uint maxIter,
const float escapeOrbit)
{
int iter = 0;
float zr = 0.0;
float zi = 0.0;
float ci2 = ci*ci;
float temp;
//Quick rejection check if c is in 2nd order period bulb
if( (cr+1.0) * (cr+1.0) + ci2 < 0.0625) return true;
//Quick rejection check if c is in main cardioid
float q = (cr-0.25)*(cr-0.25) + ci2;
if( q*(q+(cr-0.25)) < 0.25*ci2) return true;
// test for the smaller bulb left of the period-2 bulb
if (( ((cr+1.309)*(cr+1.309)) + ci*ci) < 0.00345) return true;
// check for the smaller bulbs on top and bottom of the cardioid
if ((((cr+0.125)*(cr+0.125)) + (ci-0.744)*(ci-0.744)) < 0.0088) return true;
if ((((cr+0.125)*(cr+0.125)) + (ci+0.744)*(ci+0.744)) < 0.0088) return true;
while( (iter < maxIter) && ((zr*zr+zi*zi) < escapeOrbit) )
{
temp = zr * zi;
zr = zr*zr - zi*zi + cr;
zi = temp + temp + ci;
iter++;
}
if( iter < maxIter)
{
return false;
} else {
return true;
}
}
//Main kernel
__kernel void buddhabrot(
const float realMin,
const float realMax,
const float imaginaryMin,
const float imaginaryMax,
const uint minIter,
const uint maxIter,
const uint width,
const uint height,
const float escapeOrbit,
const uint4 minColor,
const uint4 maxColor,
__global float2* randomXYBuffer,
__global uint4* outputBuffer)
{
float2 rand = randomXYBuffer[get_global_id(0)];
const float deltaReal = (realMax - realMin);
const float deltaImaginary = (imaginaryMax - imaginaryMin);
//mix(a,b,c) = a + (b-a)*c //(c must be in the range 0.0 ... 1.0
//float cr = realMin + rand.x * deltaReal ;
//float cr = realMin + (realMax - realMin) * rand.x ;
//float ci = imaginaryMin + rand.y * deltaImaginary ;
float cr = mix(realMin, realMax, rand.x);
float ci = mix(imaginaryMin, imaginaryMax, rand.y);
int x, y;
int iter = 0;
float zr = 0.0;
float zi = 0.0;
float temp = 0.0;
if( isInMSet(cr,ci, maxIter, escapeOrbit) == false)
{
while( (iter < maxIter) && ((zr*zr+zi*zi) < escapeOrbit) )
{
temp = zr * zi;
zr = zr*zr - zi*zi + cr;
zi = temp + temp + ci;
x = ((width) * (zr - realMin) / deltaReal);
y = ((height) * (zi - imaginaryMin) / deltaImaginary);
if( (iter > minIter) && (x>0) && (y>0) && (x<width) && (y<height) )
{
if( (iter > minColor.x) && (iter < maxColor.x) ) { outputBuffer[x + (y * width)].x++; }
if( (iter > minColor.y) && (iter < maxColor.y) ) { outputBuffer[x + (y * width)].y++; }
if( (iter > minColor.z) && (iter < maxColor.z) ) { outputBuffer[x + (y * width)].z++; }
}
iter++;
}
}
}
__kernel void xorshift(
uint s1,
uint s2,
uint s3,
uint s4,
const int bufferSize,
__global float2* randomXYBuffer
)
{
uint st;
float2 tmp;
for(int i=0; i < bufferSize; i++)
{
st = s1 ^ (s1 << 11);
s1 = s2;
s2 = s3;
s3 = s4;
s4 = s4 ^ (s4 >> 19) ^ ( st ^ (st >> 18));
tmp.x = (float)s4 / UINT_MAX;
st = s1 ^ (s1 << 11);
s1 = s2;
s2 = s3;
s3 = s4;
s4 = s4 ^ (s4 >> 19) ^ ( st ^ (st >> 18));
tmp.y = (float)s4 / UINT_MAX;
randomXYBuffer[i] = tmp;
}
}