It's not fast enough for 2560x1600 yet, but who knows... a Fermi chip is hopefully coming soon
60 FPS in 1280x800, GT200b
3 iterations:
4 iterations:
"Improvement" over what I've seen on the forum:
- scalar derivative computation
These images were produced by the following HLSL code:
#define P 8

// One power step of the iteration: replaces z with z^P (expressed through two
// angles and a radius) and advances the scalar running derivative dr.
//   z   - in/out: point to raise to the P-th power; overwritten with the result
//   zr0 - in:     length of z, passed in by the caller instead of being
//                 recomputed here (must be non-zero, it is used as a divisor)
//   dr  - in/out: running derivative; becomes zr0^(P-1) * dr * P + 1
inline void powN1(inout float3 z, float zr0, inout float dr)
{
    // Angles of the incoming point, each multiplied by the power.
    float elevation = asin(z.z / zr0) * P;
    float azimuth = atan2(z.y, z.x) * P;

    // zr0^(P-1) feeds both the derivative update and the new radius.
    float radiusPow = pow(zr0, P - 1);
    dr = radiusPow * dr * P + 1;

    // New radius is zr0^P; rebuild the Cartesian point from the scaled angles.
    float newRadius = radiusPow * zr0;
    float3 direction = float3(cos(elevation) * cos(azimuth),
                              cos(elevation) * sin(azimuth),
                              sin(elevation));
    z = newRadius * direction;
}
// Distance estimate for the point z0.
// Iterates z -> powN1(z) + z0 starting from z = z0, for at most 4 steps and
// only while |z| < 4, tracking the running derivative dr alongside.
// Returns -0.5 * log(r) * r / dr, where r is the final |z|; note the result
// carries a negative sign.
inline float DE(float3 z0)
{
    float3 pos = z0;
    float radius = length(pos);
    float deriv = 1;

    // Up to 4 iterations; stops early once the point has escaped (radius >= 4).
    for (int remaining = 4; radius < 4 && remaining > 0; --remaining)
    {
        powN1(pos, radius, deriv);
        pos += z0;
        radius = length(pos);
    }

    return -0.5 * log(radius) * radius / deriv;
}
DX10 bytecode disassembly:
// Compiled form of DE() with powN1() inlined (P = 8, 4 iterations).
// Register roles as read from this listing:
//   r1.xyz = z0 (written before this excerpt starts)
//   r2.xyz = z,  r1.w = r (length of z),  r2.w = dr,  r3.x = loop counter i
//   r0.w   = final result
// r = length(z0); z = z0; dr = 1; i = 4
dp3 r0.w, r1.xyzx, r1.xyzx
sqrt r0.w, r0.w
mov r2.xyz, r1.xyzx
mov r1.w, r0.w
mov r2.w, l(1.000000)
mov r3.x, l(4)
loop
// Loop test for while (r < 4 && i--): r3.y = (r < 4) & (i != 0), i is
// decremented, and the loop exits when the combined condition is zero.
lt r3.y, r1.w, l(4.000000)
iadd r3.z, r3.x, l(-1)
ine r3.w, r3.x, l(0)
and r3.y, r3.y, r3.w
mov r3.x, r3.z
breakc_z r3.y
// zo0 = asin(z.z / r), expanded inline. With x = z.z / r:
//   sqrt(1 - |x|) * cubic(|x|) approximates acos(|x|); for negative x the value
//   is reflected as pi - acos(|x|); finally asin(x) = pi/2 - acos(x).
div r3.y, r2.z, r1.w
add r3.w, -|r3.y|, l(1.000000)
sqrt r3.w, r3.w
mad r4.x, |r3.y|, l(-0.018729), l(0.074261)
mad r4.x, r4.x, |r3.y|, l(-0.212114)
mad r4.x, r4.x, |r3.y|, l(1.570729)
mul r4.y, r3.w, r4.x
mad r4.y, r4.y, l(-2.000000), l(3.141593)
// x < -x is true exactly when x is negative; the mask selects the reflection term.
lt r3.y, r3.y, -r3.y
and r3.y, r4.y, r3.y
mad r3.y, r4.x, r3.w, r3.y
add r3.y, -r3.y, l(1.570796)
// zi0 = atan2(z.y, z.x), expanded inline:
//   t = min(|x|,|y|) / max(|x|,|y|); a = t * polynomial(t*t)
div r3.w, l(1.000000), l(1.000000)
min r3.w, |r2.x|, |r2.y|
max r4.x, |r2.x|, |r2.y|
div r4.x, l(1.000000, 1.000000, 1.000000, 1.000000), r4.x
mul r3.w, r3.w, r4.x
mul r4.x, r3.w, r3.w
mad r4.y, r4.x, l(0.020835), l(-0.085133)
mad r4.y, r4.x, r4.y, l(0.180141)
mad r4.y, r4.x, r4.y, l(-0.330299)
mad r4.x, r4.x, r4.y, l(0.999866)
mul r4.y, r3.w, r4.x
// If |x| < |y| the result becomes pi/2 - a (adds the masked term pi/2 - 2a).
lt r4.z, |r2.x|, |r2.y|
mad r4.y, r4.y, l(-2.000000), l(1.570796)
and r4.y, r4.z, r4.y
mad r3.w, r3.w, r4.x, r4.y
// If x is negative, add -pi (0xc0490fdb is the float bit pattern of -3.14159274).
lt r4.x, r2.x, -r2.x
and r4.x, r4.x, l(0xc0490fdb)
add r3.w, r3.w, r4.x
// Negate the angle when min(x,y) < 0 and max(x,y) >= 0.
min r4.x, r2.x, r2.y
max r4.y, r2.x, r2.y
lt r4.x, r4.x, -r4.x
ge r4.y, r4.y, -r4.y
and r4.x, r4.x, r4.y
movc r3.w, r4.x, -r3.w, r3.w
// zr = pow(r, 7), computed as exp(7 * log(r)).
log r4.x, r1.w
mul r4.x, r4.x, l(7.000000)
exp r4.x, r4.x
// zo = 8 * zo0 (r3.y), zi = 8 * zi0 (r3.w)
mul r3.yw, r3.yyyw, l(0.000000, 8.000000, 0.000000, 8.000000)
// dr = zr * dr * 8 + 1
mul r4.y, r2.w, r4.x
mad r2.w, r4.y, l(8.000000), l(1.000000)
// zr *= r, giving r^8
mul r4.x, r1.w, r4.x
// r4.y = cos(zo), r4.z = cos(zi)   (sine destination is null)
sincos null, r4.yz, r3.yywy
mul r5.x, r4.z, r4.y
// r3.w = sin(zi)
sincos r3.w, null, r3.w
mul r5.y, r4.y, r3.w
// r5.z = sin(zo)
sincos r5.z, null, r3.y
// z = zr * (cos(zo)cos(zi), cos(zo)sin(zi), sin(zo)) + z0, then r = length(z)
mad r2.xyz, r4.xxxx, r5.xyzx, r1.xyzx
dp3 r3.y, r2.xyzx, r2.xyzx
sqrt r1.w, r3.y
// r3.z has not changed since the copy before breakc_z, so this stores the same i again.
mov r3.x, r3.z
endloop
// Result: r * log(r) * -0.346574 / dr. The constant equals -0.5 * ln(2), which
// matches the HLSL -0.5*log(r) if this log instruction is base 2
// (NOTE(review): confirm against the SM4 instruction reference).
log r0.w, r1.w
mul r0.w, r1.w, r0.w
mul r0.w, r0.w, l(-0.346574)
div r0.w, r0.w, r2.w
------------------------------------
added: exe + source
edit shader.fx for
power of z^p+c
max iteration count (4)
max raytrace step count (50)
distance threshold (-0.00025)
4x 16x AA
30 fps in 1920x1080 on default settings, GT200b 1620MHz
F1 - fly mode
F8 - stereo, O,P,K,L - separation/convergence
http://rapidshare.de/files/48733881/mandelbulb.enforcer.v1.zip.html