parallella · ebadi · Jun 7, 2015 · Jun 7, 2015 · Jun 7, 2015 · Jun 7, 2015
diff --git a/README.md b/README.md
@@ -218,7 +218,7 @@ FUNCTION     | NOTES
 [p_max()](src/math/p_max.c)           | finds max val  
 [p_min()](src/math/p_min.c)           | finds min val  
 [p_mean()](src/math/p_mean.c)         | mean operation  
-[p_median()](src/math/p_mean.c)       | finds middle value  
+[p_median()](src/math/p_median.c)     | finds middle value  
 [p_mode()](src/math/p_mode.c)         | finds most common value  
 [p_mul()](src/math/p_mul.c)           | multiplication  
 [p_popcount()](src/math/p_popcount.c) | count the number of bits set  

diff --git a/include/pal_image.h b/include/pal_image.h
@@ -9,7 +9,8 @@
  * r         : output matrix pointer
  * rows      : rows in input image
  * columns   : columns in input image (multiple of 4)
- * msize     : mask size (square)
+ * mrows     : mask rows
+ * mcols     : mask cols
  * opt       : options
  * p         : number of processors "parallelism"
  * team      : team to work with
@@ -18,8 +19,8 @@
  */
 
 /*2d convolution */
-void p_conv2d_f32(const float *x, float *m, float *r, int rows, int cols,
-                  int msize, int p, p_team_t team);
+void p_conv2d_f32(const float *x, const float *m, float *r, int rows, int cols,
+                  int mrows, int mcols,  int p, p_team_t team);
 
 /*2d box (i.e mean) filter(3x3) */
 void p_box3x3_f32(const float *x, float *r, int rows, int cols,

diff --git a/src/dsp/p_conv.c b/src/dsp/p_conv.c
@@ -1,3 +1,22 @@
+/*
+Remove this comment after merge
+
+// FreeMat 
+//--> x = [1, 2, 3, 4];
+//--> h = [5,6,7];
+//--> r = conv(x, h)
+//r =
+//  5 16 34 52 45 28 
+
+// result from pal : 
+   float x[4] = {1,2,3,4};
+   float h[3] = {5,6,7} ;
+   float r[10] = {100,100,100,100,100,100, 100, 100,100, 100};  
+   p_conv_f32( x, h , r ,4 ,3);
+   printf ("%f, %f, %f, %f, %f, %f, %f, %f", r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);
+//100.000000, 105.000000, 116.000000, 134.000000, 152.000000, 145.000000, 128.000000, 100.000000
+*/
+
 #include <pal.h>
 
 /**
@@ -27,11 +46,15 @@ void p_conv_f32(const float *x, const float *h, float *r,
 {
     const float *xc = x;
     float *rx = r;
-  	for ( int i = 0; i < nx; i++) {
+    int i,j ;
+        for ( i = 0; i < nx+nh-1; i++) { *(rx++) = 0; } 
+        rx = r ;
+  	for ( i = 0; i < nx; i++) {
         float xv = *xc++;
-        rx++;
-  		for (int j = 0; j < nh; j++) {
-  			*(rx + j) += xv * *(h + j);		
+
+  		for (j = 0; j < nh; j++) {
+  			*(rx + j) += xv * *(h + j);	
   		}
+        rx++;
   	}
 }
diff --git a/src/image/p_conv2d.c b/src/image/p_conv2d.c
@@ -1,3 +1,66 @@
+/*
+Remove this comment after merge
+// A= [17 24 1 8 15; 23 5 7 14 16; 4 6 13 20 22;10 12 19 21 3; 11 18 25 2 9]
+// B=[1 3 1; 0 5 0; 2 1 2]
+// E=[0 0 0; 0 1 0; 0 0 0]
+// C2 = conv2(A,E,'valid')   <-- works correctly since kernel is symmetric   
+// C1 = conv2(A,B,'valid')   <-- Doesn't work correctly
+
+// Response from pal:
+// 155.0, 135.0, 200.0, 
+// 145.0, 190.0, 230.0, 
+// 185.0, 225.0, 270.0,
+
+// Response from FreeMat:
+// 120, 165, 205,
+// 160, 200, 245,
+// 190, 255, 235,
+
+#define W 5
+#define W2 (W-2)
+
+int main(int argc, char *argv[])
+{
+
+    int i, j;
+
+// http://www.johnloomis.org/ece563/notes/filter/conv/convolution.html
+   float src[5*5] = {17,24,1,8,15,
+                    23,5,7,14,16,
+                    4,6,13,20,22,
+                    10,12,19,21,3,
+                    11,18,25,2,9 };
+
+   float kernel[3*3] = {
+          1,3,1,
+          0,5,0,
+          2,1,2 };
+
+    float dest[3*3];
+    p_conv2d_f32(src, kernel, dest, 5, 5, 3);
+
+    // src 
+    for (i = 0; i < W; i++) {
+        for (j = 0; j < W; j++) {    	
+        	printf("%.1f, ", src[i*W+j]);
+        }
+        printf("\n");
+    }
+
+    printf("\n");
+
+    // response
+    for (i = 0; i < W2; i++) {
+        for (j = 0; j < W2; j++) {
+        	printf("%.1f, ", dest[i*W2+j]);
+        }
+        printf("\n");
+    }
+}
+*/
+
+
+
 #include <pal.h>
 
-/** Convolution on input image 'x' with a square kernel 'm' of size 'msize'.
+/** Convolution on input image 'x' with a kernel 'm' of 'mrows' x 'mcols'.
@@ -12,7 +75,9 @@
  *
  * @param cols  Number of columns in input image
  *
- * @param msize Size of convolution kernel
+ * @param mrows number of rows in convolution kernel
+ *
+ * @param mcols number of cols in convolution kernel
  *
  * @param p     Number of processor to use (task parallelism)
  *
@@ -22,37 +87,35 @@
  *
  */
 
-void p_conv2d_f32(const float *x, float *m, float *r, int rows, int cols,
-                  int msize, int p, p_team_t team)
+void p_conv2d_f32(const float *x, const float *m, float *r, int rows, int cols,
+                  int mrows, int mcols, int p, p_team_t team)
 
 {
-    int i, j, k;
-    float P, part;
+    int i, j, ki, kj;
+    float P;
     const float *px, *pm;
     float *pr;
 
     px = x;
-    pm = m;
     pr = r;
 
-    for (i = msize * 0.5; i < (rows - msize * 0.5); i++) {
-        for (j = msize * 0.5; j < (cols - msize * 0.5); j++) {
+    for (i = 0; i < rows - mrows + 1; i++) {
+        for (j = 0; j < cols - mcols + 1; j++) {
             P = 0.0f;
-            pm = m;
-            for (k = 0; k < msize; k++) {
-                p_dot_f32(px, pm, &part, msize, 0, team);
-                P += part;
-                px += cols;
-                pm += msize;
+            // read the mask back to front so it is flipped in both axes
+            pm = m + (mcols * mrows);
+            for (ki = 0; ki < mrows; ki++) {
+                for (kj = 0; kj < mcols; kj++) {
+                    P += (*px++) * (*--pm);
+                }
+                // continue at the same column of the next image row
+                px += cols - mcols;
             }
-            *pr = P;
-            pr++;
-            // move image pointer one index forward compared to
-            // the position from before `for` loop
-            px += 1 - msize * cols;
+            // rewind to the window origin and step one column to the right
+            px -= (mrows * cols) - 1;
+            *(pr++) = P;
         }
-        // move image pointer to the beginning of line
-        // beneath the current line
-        px += (int)(msize * 0.5) * 2;
+        // skip the mcols - 1 columns where the mask no longer fits
+        px += mcols - 1;
     }
 }
diff --git a/src/image/p_median3x3.c b/src/image/p_median3x3.c
@@ -1,5 +1,45 @@
 #include <pal.h>
 
+/*
+ * The following routines have been built from knowledge gathered
+ * around the Web. I am not aware of any copyright problem with
+ * them, so use it as you want.
+ * N. Devillard - 1998
+ */
+
+typedef float pixelvalue ;
+
+#define PIX_SORT(a,b) { if ((a)>(b)) PIX_SWAP((a),(b)); }
+#define PIX_SWAP(a,b) { pixelvalue temp=(a);(a)=(b);(b)=temp; }
+
+/*----------------------------------------------------------------------------
+   Function :   opt_med9()
+   In       :   pointer to an array of 9 pixelvalues
+   Out      :   a pixelvalue
+   Job      :   optimized search of the median of 9 pixelvalues
+   Notice   :   in theory, cannot go faster without assumptions on the
+                signal.
+                Formula from:
+                XILINX XCELL magazine, vol. 23 by John L. Smith
+
+                The input array is *NOT* modified in the process
+                The result array is guaranteed to contain the median
+                value
+ ---------------------------------------------------------------------------*/
+
+static pixelvalue opt_med9(const pixelvalue *pointer)
+{
+    pixelvalue p[9];
+    memcpy(p, pointer, sizeof p); /* sort a private copy; the caller's window is left untouched */
+    PIX_SORT(p[1], p[2]) ; PIX_SORT(p[4], p[5]) ; PIX_SORT(p[7], p[8]) ;
+    PIX_SORT(p[0], p[1]) ; PIX_SORT(p[3], p[4]) ; PIX_SORT(p[6], p[7]) ;
+    PIX_SORT(p[1], p[2]) ; PIX_SORT(p[4], p[5]) ; PIX_SORT(p[7], p[8]) ;
+    PIX_SORT(p[0], p[3]) ; PIX_SORT(p[5], p[8]) ; PIX_SORT(p[4], p[7]) ;
+    PIX_SORT(p[3], p[6]) ; PIX_SORT(p[1], p[4]) ; PIX_SORT(p[2], p[5]) ;
+    PIX_SORT(p[4], p[7]) ; PIX_SORT(p[4], p[2]) ; PIX_SORT(p[6], p[4]) ;
+    PIX_SORT(p[4], p[2]) ; return(p[4]) ;
+}
+
 /*
  * A median 3x3 filter.
  *
@@ -19,45 +59,40 @@
  *
  */
 
 void p_median3x3_f32(const float *x, float *r, int rows, int cols,
                      int p, p_team_t team)
 {
     float buffer[9];
     const float *px;
     float *pr;
-    int i, j, buffer_col;
+    int i, j, buffer_pointer;
 
     px = x;
     pr = r;
 
     for (i = 0; i < rows - 2; i++) {
-        // fully filling first window
-        buffer[0] = *px;
-        buffer[1] = *(px + 1);
-        buffer[2] = *(px + 2);
+        // preload the first two columns of the window, column by column
+        buffer[0] = *(px);
+        buffer[1] = *(px + cols);
+        buffer[2] = *(px + cols + cols);
 
-        buffer[3] = *(px + cols);
+        buffer[3] = *(px + 1);
         buffer[4] = *(px + cols + 1);
-        buffer[5] = *(px + cols + 2);
-
-        buffer[6] = *(px + cols + cols);
-        buffer[7] = *(px + cols + cols + 1);
-        buffer[8] = *(px + cols + cols + 2);
-
-        p_median_f32(buffer, pr, 9, 0, 0);
-        pr++;
-        px += 3;
+        buffer[5] = *(px + cols + cols + 1);
+        buffer_pointer = 6;
         // other windows differ only by one column
-        // so only one is exchanged
-        for (j = 0; j < cols - 3; j++) {
-            buffer_col = j % 3;
-            buffer[buffer_col] = *px;
-            buffer[buffer_col + 3] = *(px + cols);
-            buffer[buffer_col + 6] = *(px + cols + cols);
-
-            p_median_f32(buffer, pr, 9, 0, 0);
+        // so only one column is written, at the slot buffer_pointer selects
+        for (j = 2; j < cols; j++) {
+            // one column (3 values) goes into the next circular slot, replacing the oldest once full
+            buffer_pointer = buffer_pointer % 9;
+            buffer[buffer_pointer] = *(px + j);
+            buffer[buffer_pointer + 1] = *(px + j + cols);
+            buffer[buffer_pointer + 2] = *(px + j + cols + cols);
+            buffer_pointer += 3;
+            *pr = opt_med9(buffer);
             pr++;
-            px++;
         }
+        // advance to the next image row
+        px += cols;
     }
 }