Merge branch 'master' into filtering

darktable-org · Apr 24, 2012 · 38e4950 · 38e4950
2 parents 9e2d427 + 48274c4
commit 38e4950
Show file tree

Hide file tree

Showing 5 changed files with 362 additions and 107 deletions.
diff --git a/data/kernels/nlmeans.cl b/data/kernels/nlmeans.cl
@@ -1,6 +1,7 @@
 /*
     This file is part of darktable,
     copyright (c) 2011 johannes hanika.
+    copyright (c) 2012 Ulrich Pegelow.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -18,107 +19,216 @@
 
 const sampler_t sampleri =  CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
 const sampler_t samplerf =  CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+const sampler_t samplerc =  CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP         | CLK_FILTER_NEAREST;
 
 #define ICLAMP(a, mn, mx) ((a) < (mn) ? (mn) : ((a) > (mx) ? (mx) : (a)))
 
+
+
+/* 
+    To speed up processing we use an algorithm proposed from B. Goossens, H.Q. Luong, J. Aelterman, A. Pizurica,  and W. Philips, 
+    "A GPU-Accelerated Real-Time NLMeans Algorithm for Denoising Color Video Sequences", in Proc. ACIVS (2), 2010, pp.46-57. 
+
+    Benchmarking figures (export of a 20MPx image on a i7-2600 with an NVIDIA GTS450):
+
+    This GPU-code: 18s
+    Brute force GPU-code: 136s
+    Optimized CPU-code: 27s
+
+*/
+
+
 float gh(const float f)
 {
   // make spread bigger: less smoothing
   const float spread = 100.f;
   return 1.0f/(1.0f + fabs(f)*spread);
 }
 
+
+float ddirac(const int2 q)
+{
+  return ((q.x || q.y) ? 1.0f : 0.0f);
+}
+
+
 kernel void
-nlmeans (read_only image2d_t in, write_only image2d_t out, const int width, const int height, const int P, const int K, const float nL, const float nC)
+nlmeans_init(write_only image2d_t out, const int width, const int height)
 {
   const int x = get_global_id(0);
   const int y = get_global_id(1);
-  const int maxx = width - 1;
-  const int maxy = height - 1;
-  const float4 norm2 = (float4)(nL, nC, nC, 1.0f);
-
-#if 0
-  // this is 20s (compared to 29s brute force below) but still unusable:
-  // load a block of shared memory, initialize to zero
-  local float4 block[32*32];//get_local_size(0)*get_local_size(1)];
-  block[get_local_id(0)  + get_local_id(1) * get_local_size(0)] = (float4)0.0f;
-  barrier(CLK_LOCAL_MEM_FENCE);
 
   if(x >= width || y >= height) return;
 
-  // coalesced mem accesses:
-  const float4 p1  = read_imagef(in, sampleri, (int2)(x, y));
+  write_imagef (out, (int2)(x, y), (float4)0.0f);
+}
+
 
-  // for each shift vector
-  for(int kj=-K;kj<=K;kj++)
+kernel void
+nlmeans_dist(read_only image2d_t in, write_only image2d_t U4, const int width, const int height, 
+             const int2 q, const float nL2, const float nC2)
+{
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  const float4 norm2 = (float4)(nL2, nC2, nC2, 1.0f);
+
+  if(x >= width || y >= height) return;
+
+  float4 p1 = read_imagef(in, sampleri, (int2)(x, y));
+  float4 p2 = read_imagef(in, sampleri, (int2)(x, y) + q);
+  float4 tmp = (p1 - p2)*(p1 - p2)*norm2;
+  float dist = tmp.x + tmp.y + tmp.z;
+
+  write_imagef (U4, (int2)(x, y), dist);
+}
+
+kernel void
+nlmeans_horiz(read_only image2d_t U4_in, write_only image2d_t U4_out, const int width, const int height, 
+              const int2 q, const int P, local float *buffer)
+{
+  const int lid = get_local_id(0);
+  const int lsz = get_local_size(0);
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+
+  if(y >= height) return;
+
+  /* fill center part of buffer */
+  buffer[P + lid] = read_imagef(U4_in, samplerc, (int2)(x, y)).x;
+
+  /* left wing of buffer */
+  for(int n=0; n <= P/lsz; n++)
   {
-    for(int ki=-K;ki<=K;ki++)
-    {
-      const float4 p2  = read_imagef(in, sampleri, (int2)(ICLAMP(x+ki, 0, maxx), ICLAMP(y+kj, 0, maxy)));
-      const float4 tmp = (p1 - p2)*norm2;
-      const float dist = tmp.x + tmp.y + tmp.z;
-      for(int pj=-P;pj<=P;pj++)
-      {
-        for(int pi=-P;pi<=P;pi++)
-        {
-          float4 p2 = read_imagef(in, sampleri, (int2)(ICLAMP(x+pi+ki, 0, maxx), ICLAMP(y+pj+kj, 0, maxy)));
-          p2.w = dist;
-          const int i = get_local_id(0) + pi, j = get_local_id(1) + pj;
-          if(i >= 0 && i < get_local_size(0) && j >= 0 && j < get_local_size(1))
-          {
-            // TODO: for non-linear gh(), this produces results different than the CPU
-            block[i + get_local_size(0) * j].x += gh(p2.x);
-            block[i + get_local_size(0) * j].y += gh(p2.y);
-            block[i + get_local_size(0) * j].z += gh(p2.z);
-            block[i + get_local_size(0) * j].w += gh(p2.w);
-          }
-        }
-      }
-    }
+    const int l = mad24(n, lsz, lid + 1);
+    if(l > P) continue;
+    const int xx = mad24((int)get_group_id(0), lsz, -l);
+    buffer[P - l] = read_imagef(U4_in, samplerc, (int2)(xx, y)).x;
   }
-  // write back normalized shm
+
+  /* right wing of buffer */
+  for(int n=0; n <= P/lsz; n++)
+  {
+    const int r = mad24(n, lsz, lsz - lid);
+    if(r > P) continue;
+    const int xx = mad24((int)get_group_id(0), lsz, lsz - 1 + r);
+    buffer[P + lsz - 1 + r] = read_imagef(U4_in, samplerc, (int2)(xx, y)).x;
+  }
+
   barrier(CLK_LOCAL_MEM_FENCE);
-  const float4 tmp = block[get_local_id(0)  + get_local_id(1) * get_local_size(0)];
-  tmp.x *= 1.0f/tmp.w;
-  tmp.y *= 1.0f/tmp.w;
-  tmp.z *= 1.0f/tmp.w;
-  write_imagef (out, (int2)(x, y), tmp);
-#endif
 
+  if(x >= width) return;
 
-#if 1
-  if(x >= width || y >= height) return;
+  buffer += lid + P;
+
+  float distacc = 0.0f;
+  for(int pi = -P; pi <= P; pi++)
+  {
+    distacc += buffer[pi];
+  }
+
+  write_imagef (U4_out, (int2)(x, y), distacc);
+}
+
+
+kernel void
+nlmeans_vert(read_only image2d_t U4_in, write_only image2d_t U4_out, const int width, const int height, 
+              const int2 q, const int P, local float *buffer)
+{
+  const int lid = get_local_id(1);
+  const int lsz = get_local_size(1);
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+
+  if(x >= width) return;
+
+  /* fill center part of buffer */
+  buffer[P + lid] = read_imagef(U4_in, samplerc, (int2)(x, y)).x;
+
+  /* left wing of buffer */
+  for(int n=0; n <= P/lsz; n++)
+  {
+    const int l = mad24(n, lsz, lid + 1);
+    if(l > P) continue;
+    const int yy = mad24((int)get_group_id(1), lsz, -l);
+    buffer[P - l] = read_imagef(U4_in, samplerc, (int2)(x, yy)).x;
+  }
 
-  const float4 acc   = (float4)(0.0f);
-  // brute force (superslow baseline)!
-  // for each shift vector
-  for(int kj=-K;kj<=K;kj++)
+  /* right wing of buffer */
+  for(int n=0; n <= P/lsz; n++)
   {
-    for(int ki=-K;ki<=K;ki++)
-    {
-      float dist = 0.0f;
-      for(int pj=-P;pj<=P;pj++)
-      {
-        for(int pi=-P;pi<=P;pi++)
-        {
-          float4 p1  = read_imagef(in, sampleri, (int2)(ICLAMP(x+pi, 0, maxx), ICLAMP(y+pj, 0, maxy)));
-          float4 p2  = read_imagef(in, sampleri, (int2)(ICLAMP(x+pi+ki, 0, maxx), ICLAMP(y+pj+kj, 0, maxy)));
-          float4 tmp = (p1 - p2)*norm2;
-          dist += tmp.x + tmp.y + tmp.z;
-        }
-      }
-      float4 pin = read_imagef(in, sampleri, (int2)(ICLAMP(x+ki, 0, maxx), ICLAMP(y+kj, 0, maxy)));
-      dist = gh(dist);
-      acc.x += dist * pin.x;
-      acc.y += dist * pin.y;
-      acc.z += dist * pin.z;
-      acc.w += dist;
-    }
+    const int r = mad24(n, lsz, lsz - lid);
+    if(r > P) continue;
+    const int yy = mad24((int)get_group_id(1), lsz, lsz - 1 + r);
+    buffer[P + lsz - 1 + r] = read_imagef(U4_in, samplerc, (int2)(x, yy)).x;
   }
-  acc.x *= 1.0f/acc.w;
-  acc.y *= 1.0f/acc.w;
-  acc.z *= 1.0f/acc.w;
-  write_imagef (out, (int2)(x, y), acc);
-#endif
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  if(y >= height) return;
+
+  buffer += lid + P;
+
+  float distacc = 0.0f;
+  for(int pj = -P; pj <= P; pj++)
+  {
+    distacc += buffer[pj];
+  }
+
+  distacc = gh(distacc);
+
+  write_imagef (U4_out, (int2)(x, y), distacc);
 }
 
+
+
+kernel void
+nlmeans_accu(read_only image2d_t in, read_only image2d_t U2_in, read_only image2d_t U4_in,
+             write_only image2d_t U2_out, const int width, const int height, 
+             const int2 q)
+{
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+
+  if(x >= width || y >= height) return;
+
+  float4 u1_pq = read_imagef(in, sampleri, (int2)(x, y) + q);
+  float4 u1_mq = read_imagef(in, sampleri, (int2)(x, y) - q);
+
+  float4 u2    = read_imagef(U2_in, sampleri, (int2)(x, y));
+
+  float  u4    = read_imagef(U4_in, sampleri, (int2)(x, y)).x;
+  float  u4_mq = read_imagef(U4_in, sampleri, (int2)(x, y) - q).x;
+
+  float u3 = u2.w;
+  float u4_mq_dd = u4_mq * ddirac(q);
+
+  u2 += (u4 * u1_pq) + (u4_mq_dd * u1_mq);
+  u3 += (u4 + u4_mq_dd);
+
+  u2.w = u3;
+
+  write_imagef(U2_out, (int2)(x, y), u2);
+}
+
+
+kernel void
+nlmeans_finish(read_only image2d_t in, read_only image2d_t U2, write_only image2d_t out, 
+               const int width, const int height, const float4 weight)
+{
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+
+  if(x >= width || y >= height) return;
+
+  float4 i  = read_imagef(in, sampleri, (int2)(x, y));
+  float4 u2 = read_imagef(U2, sampleri, (int2)(x, y));
+  float  u3 = u2.w;
+
+  float4 o = i * ((float4)1.0f - weight) + u2/u3 * weight;
+  o.w = i.w;
+
+  write_imagef(out, (int2)(x, y), o);
+}
+
+
+
diff --git a/po/POTFILES.in b/po/POTFILES.in
@@ -17,6 +17,7 @@ src/control/jobs/film_jobs.c
 src/control/jobs/image_jobs.c
 src/develop/develop.c
 src/develop/imageop.c
+src/develop/tiling.c
 src/dtgtk/resetlabel.c
 src/dtgtk/slider.c
 src/libs/similarity.c

diff --git a/src/common/mipmap_cache.c b/src/common/mipmap_cache.c
@@ -674,13 +674,13 @@ void dt_mipmap_cache_init(dt_mipmap_cache_t *cache)
         thumbnails, k, thumbnails * cache->mip[k].buffer_size/(1024.0*1024.0));
   }
   // full buffer needs dynamic alloc:
-  const int full_entries = parallel;
+  const int full_entries = MAX(2, parallel); // even with one thread you want two buffers. one for dr one for thumbs.
   int32_t max_mem_bufs = nearest_power_of_two(full_entries);
 
   // for this buffer, because it can be very busy during import, we want the minimum
   // number of entries in the hashtable to be 16, but leave the quota as is. the dynamic
   // alloc/free properties of this cache take care that no more memory is required.
-  dt_cache_init(&cache->mip[DT_MIPMAP_FULL].cache, MAX(2, max_mem_bufs), parallel, 64, 0.9f*max_mem_bufs);
+  dt_cache_init(&cache->mip[DT_MIPMAP_FULL].cache, max_mem_bufs, parallel, 64, max_mem_bufs);
   dt_cache_set_allocate_callback(&cache->mip[DT_MIPMAP_FULL].cache,
       dt_mipmap_cache_allocate_dynamic, &cache->mip[DT_MIPMAP_FULL]);
   // dt_cache_set_cleanup_callback(&cache->mip[DT_MIPMAP_FULL].cache,
@@ -860,7 +860,7 @@ dt_mipmap_cache_read_get(
           if(cache->compression_type)
           {
             // get per-thread temporary storage without malloc from a separate cache:
-            const int key = (int)pthread_self();
+            const int key = dt_control_get_threadid();
             // const void *cbuf =
             dt_cache_read_get(&cache->scratchmem.cache, key);
             uint8_t *scratchmem = (uint8_t *)dt_cache_write_get(&cache->scratchmem.cache, key);

diff --git a/src/control/control.c b/src/control/control.c
@@ -868,20 +868,16 @@ int32_t dt_control_revive_job(dt_control_t *s, dt_job_t *job)
 
 int32_t dt_control_get_threadid()
 {
-  int32_t threadid = 0;
-  while( !pthread_equal(darktable.control->thread[threadid],pthread_self()) && threadid < darktable.control->num_threads-1)
-    threadid++;
-  assert(pthread_equal(darktable.control->thread[threadid],pthread_self()));
-  return threadid;
+  for(int k=0;k<darktable.control->num_threads;k++)
+    if(pthread_equal(darktable.control->thread[k], pthread_self())) return k;
+  return darktable.control->num_threads;
 }
 
 int32_t dt_control_get_threadid_res()
 {
-  int32_t threadid = 0;
-  while(!pthread_equal(darktable.control->thread_res[threadid],pthread_self()) && threadid < DT_CTL_WORKER_RESERVED-1)
-    threadid++;
-  assert(pthread_equal(darktable.control->thread_res[threadid], pthread_self()));
-  return threadid;
+  for(int k=0;k<DT_CTL_WORKER_RESERVED;k++)
+    if(pthread_equal(darktable.control->thread_res[k], pthread_self())) return k;
+  return DT_CTL_WORKER_RESERVED;  
 }
 
 void *dt_control_work_res(void *ptr)