Skip to content

Commit

Permalink
Merge branch 'master' into filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
edgomez committed Apr 24, 2012
2 parents 9e2d427 + 48274c4 commit 38e4950
Show file tree
Hide file tree
Showing 5 changed files with 362 additions and 107 deletions.
262 changes: 186 additions & 76 deletions data/kernels/nlmeans.cl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*
This file is part of darktable,
copyright (c) 2011 johannes hanika.
copyright (c) 2012 Ulrich Pegelow.
darktable is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand All @@ -18,107 +19,216 @@

const sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
const sampler_t samplerf = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
const sampler_t samplerc = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

#define ICLAMP(a, mn, mx) ((a) < (mn) ? (mn) : ((a) > (mx) ? (mx) : (a)))



/*
To speed up processing we use an algorithm proposed from B. Goossens, H.Q. Luong, J. Aelterman, A. Pizurica, and W. Philips,
"A GPU-Accelerated Real-Time NLMeans Algorithm for Denoising Color Video Sequences", in Proc. ACIVS (2), 2010, pp.46-57.
Benchmarking figures (export of a 20MPx image on a i7-2600 with an NVIDIA GTS450):
This GPU-code: 18s
Brute force GPU-code: 136s
Optimized CPU-code: 27s
*/


float gh(const float f)
{
// make spread bigger: less smoothing
const float spread = 100.f;
return 1.0f/(1.0f + fabs(f)*spread);
}


float ddirac(const int2 q)
{
return ((q.x || q.y) ? 1.0f : 0.0f);
}


kernel void
nlmeans (read_only image2d_t in, write_only image2d_t out, const int width, const int height, const int P, const int K, const float nL, const float nC)
nlmeans_init(write_only image2d_t out, const int width, const int height)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
const int maxx = width - 1;
const int maxy = height - 1;
const float4 norm2 = (float4)(nL, nC, nC, 1.0f);

#if 0
// this is 20s (compared to 29s brute force below) but still unusable:
// load a block of shared memory, initialize to zero
local float4 block[32*32];//get_local_size(0)*get_local_size(1)];
block[get_local_id(0) + get_local_id(1) * get_local_size(0)] = (float4)0.0f;
barrier(CLK_LOCAL_MEM_FENCE);

if(x >= width || y >= height) return;

// coalesced mem accesses:
const float4 p1 = read_imagef(in, sampleri, (int2)(x, y));
write_imagef (out, (int2)(x, y), (float4)0.0f);
}


// for each shift vector
for(int kj=-K;kj<=K;kj++)
kernel void
nlmeans_dist(read_only image2d_t in, write_only image2d_t U4, const int width, const int height,
const int2 q, const float nL2, const float nC2)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
const float4 norm2 = (float4)(nL2, nC2, nC2, 1.0f);

if(x >= width || y >= height) return;

float4 p1 = read_imagef(in, sampleri, (int2)(x, y));
float4 p2 = read_imagef(in, sampleri, (int2)(x, y) + q);
float4 tmp = (p1 - p2)*(p1 - p2)*norm2;
float dist = tmp.x + tmp.y + tmp.z;

write_imagef (U4, (int2)(x, y), dist);
}

kernel void
nlmeans_horiz(read_only image2d_t U4_in, write_only image2d_t U4_out, const int width, const int height,
const int2 q, const int P, local float *buffer)
{
const int lid = get_local_id(0);
const int lsz = get_local_size(0);
const int x = get_global_id(0);
const int y = get_global_id(1);

if(y >= height) return;

/* fill center part of buffer */
buffer[P + lid] = read_imagef(U4_in, samplerc, (int2)(x, y)).x;

/* left wing of buffer */
for(int n=0; n <= P/lsz; n++)
{
for(int ki=-K;ki<=K;ki++)
{
const float4 p2 = read_imagef(in, sampleri, (int2)(ICLAMP(x+ki, 0, maxx), ICLAMP(y+kj, 0, maxy)));
const float4 tmp = (p1 - p2)*norm2;
const float dist = tmp.x + tmp.y + tmp.z;
for(int pj=-P;pj<=P;pj++)
{
for(int pi=-P;pi<=P;pi++)
{
float4 p2 = read_imagef(in, sampleri, (int2)(ICLAMP(x+pi+ki, 0, maxx), ICLAMP(y+pj+kj, 0, maxy)));
p2.w = dist;
const int i = get_local_id(0) + pi, j = get_local_id(1) + pj;
if(i >= 0 && i < get_local_size(0) && j >= 0 && j < get_local_size(1))
{
// TODO: for non-linear gh(), this produces results different than the CPU
block[i + get_local_size(0) * j].x += gh(p2.x);
block[i + get_local_size(0) * j].y += gh(p2.y);
block[i + get_local_size(0) * j].z += gh(p2.z);
block[i + get_local_size(0) * j].w += gh(p2.w);
}
}
}
}
const int l = mad24(n, lsz, lid + 1);
if(l > P) continue;
const int xx = mad24((int)get_group_id(0), lsz, -l);
buffer[P - l] = read_imagef(U4_in, samplerc, (int2)(xx, y)).x;
}
// write back normalized shm

/* right wing of buffer */
for(int n=0; n <= P/lsz; n++)
{
const int r = mad24(n, lsz, lsz - lid);
if(r > P) continue;
const int xx = mad24((int)get_group_id(0), lsz, lsz - 1 + r);
buffer[P + lsz - 1 + r] = read_imagef(U4_in, samplerc, (int2)(xx, y)).x;
}

barrier(CLK_LOCAL_MEM_FENCE);
const float4 tmp = block[get_local_id(0) + get_local_id(1) * get_local_size(0)];
tmp.x *= 1.0f/tmp.w;
tmp.y *= 1.0f/tmp.w;
tmp.z *= 1.0f/tmp.w;
write_imagef (out, (int2)(x, y), tmp);
#endif

if(x >= width) return;

#if 1
if(x >= width || y >= height) return;
buffer += lid + P;

float distacc = 0.0f;
for(int pi = -P; pi <= P; pi++)
{
distacc += buffer[pi];
}

write_imagef (U4_out, (int2)(x, y), distacc);
}


kernel void
nlmeans_vert(read_only image2d_t U4_in, write_only image2d_t U4_out, const int width, const int height,
const int2 q, const int P, local float *buffer)
{
const int lid = get_local_id(1);
const int lsz = get_local_size(1);
const int x = get_global_id(0);
const int y = get_global_id(1);

if(x >= width) return;

/* fill center part of buffer */
buffer[P + lid] = read_imagef(U4_in, samplerc, (int2)(x, y)).x;

/* left wing of buffer */
for(int n=0; n <= P/lsz; n++)
{
const int l = mad24(n, lsz, lid + 1);
if(l > P) continue;
const int yy = mad24((int)get_group_id(1), lsz, -l);
buffer[P - l] = read_imagef(U4_in, samplerc, (int2)(x, yy)).x;
}

const float4 acc = (float4)(0.0f);
// brute force (superslow baseline)!
// for each shift vector
for(int kj=-K;kj<=K;kj++)
/* right wing of buffer */
for(int n=0; n <= P/lsz; n++)
{
for(int ki=-K;ki<=K;ki++)
{
float dist = 0.0f;
for(int pj=-P;pj<=P;pj++)
{
for(int pi=-P;pi<=P;pi++)
{
float4 p1 = read_imagef(in, sampleri, (int2)(ICLAMP(x+pi, 0, maxx), ICLAMP(y+pj, 0, maxy)));
float4 p2 = read_imagef(in, sampleri, (int2)(ICLAMP(x+pi+ki, 0, maxx), ICLAMP(y+pj+kj, 0, maxy)));
float4 tmp = (p1 - p2)*norm2;
dist += tmp.x + tmp.y + tmp.z;
}
}
float4 pin = read_imagef(in, sampleri, (int2)(ICLAMP(x+ki, 0, maxx), ICLAMP(y+kj, 0, maxy)));
dist = gh(dist);
acc.x += dist * pin.x;
acc.y += dist * pin.y;
acc.z += dist * pin.z;
acc.w += dist;
}
const int r = mad24(n, lsz, lsz - lid);
if(r > P) continue;
const int yy = mad24((int)get_group_id(1), lsz, lsz - 1 + r);
buffer[P + lsz - 1 + r] = read_imagef(U4_in, samplerc, (int2)(x, yy)).x;
}
acc.x *= 1.0f/acc.w;
acc.y *= 1.0f/acc.w;
acc.z *= 1.0f/acc.w;
write_imagef (out, (int2)(x, y), acc);
#endif

barrier(CLK_LOCAL_MEM_FENCE);

if(y >= height) return;

buffer += lid + P;

float distacc = 0.0f;
for(int pj = -P; pj <= P; pj++)
{
distacc += buffer[pj];
}

distacc = gh(distacc);

write_imagef (U4_out, (int2)(x, y), distacc);
}



kernel void
nlmeans_accu(read_only image2d_t in, read_only image2d_t U2_in, read_only image2d_t U4_in,
write_only image2d_t U2_out, const int width, const int height,
const int2 q)
{
const int x = get_global_id(0);
const int y = get_global_id(1);

if(x >= width || y >= height) return;

float4 u1_pq = read_imagef(in, sampleri, (int2)(x, y) + q);
float4 u1_mq = read_imagef(in, sampleri, (int2)(x, y) - q);

float4 u2 = read_imagef(U2_in, sampleri, (int2)(x, y));

float u4 = read_imagef(U4_in, sampleri, (int2)(x, y)).x;
float u4_mq = read_imagef(U4_in, sampleri, (int2)(x, y) - q).x;

float u3 = u2.w;
float u4_mq_dd = u4_mq * ddirac(q);

u2 += (u4 * u1_pq) + (u4_mq_dd * u1_mq);
u3 += (u4 + u4_mq_dd);

u2.w = u3;

write_imagef(U2_out, (int2)(x, y), u2);
}


kernel void
nlmeans_finish(read_only image2d_t in, read_only image2d_t U2, write_only image2d_t out,
const int width, const int height, const float4 weight)
{
const int x = get_global_id(0);
const int y = get_global_id(1);

if(x >= width || y >= height) return;

float4 i = read_imagef(in, sampleri, (int2)(x, y));
float4 u2 = read_imagef(U2, sampleri, (int2)(x, y));
float u3 = u2.w;

float4 o = i * ((float4)1.0f - weight) + u2/u3 * weight;
o.w = i.w;

write_imagef(out, (int2)(x, y), o);
}



1 change: 1 addition & 0 deletions po/POTFILES.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ src/control/jobs/film_jobs.c
src/control/jobs/image_jobs.c
src/develop/develop.c
src/develop/imageop.c
src/develop/tiling.c
src/dtgtk/resetlabel.c
src/dtgtk/slider.c
src/libs/similarity.c
Expand Down
6 changes: 3 additions & 3 deletions src/common/mipmap_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -674,13 +674,13 @@ void dt_mipmap_cache_init(dt_mipmap_cache_t *cache)
thumbnails, k, thumbnails * cache->mip[k].buffer_size/(1024.0*1024.0));
}
// full buffer needs dynamic alloc:
const int full_entries = parallel;
const int full_entries = MAX(2, parallel); // even with one thread you want two buffers. one for dr one for thumbs.
int32_t max_mem_bufs = nearest_power_of_two(full_entries);

// for this buffer, because it can be very busy during import, we want the minimum
// number of entries in the hashtable to be 16, but leave the quota as is. the dynamic
// alloc/free properties of this cache take care that no more memory is required.
dt_cache_init(&cache->mip[DT_MIPMAP_FULL].cache, MAX(2, max_mem_bufs), parallel, 64, 0.9f*max_mem_bufs);
dt_cache_init(&cache->mip[DT_MIPMAP_FULL].cache, max_mem_bufs, parallel, 64, max_mem_bufs);
dt_cache_set_allocate_callback(&cache->mip[DT_MIPMAP_FULL].cache,
dt_mipmap_cache_allocate_dynamic, &cache->mip[DT_MIPMAP_FULL]);
// dt_cache_set_cleanup_callback(&cache->mip[DT_MIPMAP_FULL].cache,
Expand Down Expand Up @@ -860,7 +860,7 @@ dt_mipmap_cache_read_get(
if(cache->compression_type)
{
// get per-thread temporary storage without malloc from a separate cache:
const int key = (int)pthread_self();
const int key = dt_control_get_threadid();
// const void *cbuf =
dt_cache_read_get(&cache->scratchmem.cache, key);
uint8_t *scratchmem = (uint8_t *)dt_cache_write_get(&cache->scratchmem.cache, key);
Expand Down
16 changes: 6 additions & 10 deletions src/control/control.c
Original file line number Diff line number Diff line change
Expand Up @@ -868,20 +868,16 @@ int32_t dt_control_revive_job(dt_control_t *s, dt_job_t *job)

int32_t dt_control_get_threadid()
{
int32_t threadid = 0;
while( !pthread_equal(darktable.control->thread[threadid],pthread_self()) && threadid < darktable.control->num_threads-1)
threadid++;
assert(pthread_equal(darktable.control->thread[threadid],pthread_self()));
return threadid;
for(int k=0;k<darktable.control->num_threads;k++)
if(pthread_equal(darktable.control->thread[k], pthread_self())) return k;
return darktable.control->num_threads;
}

int32_t dt_control_get_threadid_res()
{
int32_t threadid = 0;
while(!pthread_equal(darktable.control->thread_res[threadid],pthread_self()) && threadid < DT_CTL_WORKER_RESERVED-1)
threadid++;
assert(pthread_equal(darktable.control->thread_res[threadid], pthread_self()));
return threadid;
for(int k=0;k<DT_CTL_WORKER_RESERVED;k++)
if(pthread_equal(darktable.control->thread_res[k], pthread_self())) return k;
return DT_CTL_WORKER_RESERVED;
}

void *dt_control_work_res(void *ptr)
Expand Down
Loading

0 comments on commit 38e4950

Please sign in to comment.