diff --git a/paralle2/README.md b/paralle2/README.md
index d165035..928cca0 100644
--- a/paralle2/README.md
+++ b/paralle2/README.md
@@ -1,22 +1,45 @@
-Pseudo Eternity II solver under Parallella !
+# A 10x10 Eternity II solver
 
-Demo with a 10x10 subproblem with 16 cores running in parallel.
+## Build and Run
 
-A single eCore reaches 3 Mn/s, that's a tremendous 48 Mn/s for 4.7 W !
-My high-end computer reaches 130 Mn/s with a single core and heavy optimizations, but it consumes 72 W.
+Build the data BEFORE anything else; this creates bin/bench.bin, a set of benchmark inputs:
 
-This is only the beginning with a basic algorithm, I'm confident we can reach 80 Mn/s with a single 16-core Parallella :D
-For your information a single ARM core with the same basic algorithm reaches 6 Mn/s.
-This kind of problem would be a good candidate to run under the unused parts of the FPGA... that's another story. For experts.
+ ./build_data.sh
+ ./run_data.sh
 
-***
+ ./build.sh
+ ./run.sh value
 
-How to run the demo under a 16-core Parallella:
-./build.sh e2
-time ./run.sh e2
+ Wanting an assembly output ? Use ./buildasm.sh
+ Cross compiling for an x86_64 platform ? You can use ./x86*.sh  
+ 
+## Benchmarks
 
-Please check you use a 16-core Epiphany like mine: E16G301, a Kickstarter model with Zynq 7020 and a 'headless' configuration.
-If not the case, you may adapt the sources: src/e2.c and src/e_e2.c
-This kind of problem is a perfect candidate for clusters too.
-You can easily tweak this code if you have a 64-core Parallella (if you don't know how to code it, just sell it to me lol)
+All programs are full C, sometimes with some assembly.
+Mn/s/W = Million nodes per second per Watt
+
+
+GPU OpenCL                      : not even a tenth of a modest x86 core with a Radeon 5770 graphics card. The numerous branches are a dead end, not to mention the watts.
+Parallella, one ARM A9 core     :    6 Mn/s ;   3.0 W ;  2    Mn/s/W
+My high-end computer, one core  :  166 Mn/s ;  72.0 W ;  2.3  Mn/s/W ; x86_64, Fedora Core 23, i7 5820k
+Raspberry Pi 3                  :                        8-10 Mn/s/W iirc ; A53, 4-core, 1.2 GHz
+My high-end computer, 12 threads: 1470 Mn/s ; 140.0 W ; 10.0  Mn/s/W
+Odroid XU4                      :  245 Mn/s ;  15.7 W ; 15.6 Mn/s/W ; 8-core ; deeply optimized, not much margin
+Parallella 16-core Epiphany     :  103 Mn/s ;   5.0 W ; 20.6 Mn/s/W ; remove the Ethernet cable to save 0.6 W due to ssh with the headless Parabuntu distro
+
+
+So...
+To my knowledge, Parallella is today the most energy-efficient platform for this highly recursive task...
+although it does *not* use any float !
+
+Eagerly awaiting the 1024-core Epiphany V...
+
+## Author
+
+DonQuichotteComputers at gmail dot com
+2017
+
+## License
+
+BSD 3-Clause.
 
diff --git a/paralle2/build.sh b/paralle2/build.sh
index a7d2d11..b74c16d 100644
--- a/paralle2/build.sh
+++ b/paralle2/build.sh
@@ -3,8 +3,8 @@
 set -e
 
 ESDK=${EPIPHANY_HOME}
-ELIBS="-L ${ESDK}/tools/host/lib"
-EINCS="-I ${ESDK}/tools/host/include"
+ELIBS=${ESDK}/tools/host/lib
+EINCS=${ESDK}/tools/host/include
 ELDF=${ESDK}/bsps/current/internal.ldf
 
 SCRIPT=$(readlink -f "$0")
@@ -23,14 +23,25 @@ case $(uname -p) in
 		;;
 esac
 
+# Create output dir
+mkdir -p bin
+
 # Build HOST side application
-${CROSS_PREFIX}gcc -Ofast src/$1.c -o Debug/$1.elf ${EINCS} ${ELIBS} -le-hal -le-loader -lpthread
+"${CROSS_PREFIX}gcc" src/e2g.c -o bin/e2g.elf -I "${EINCS}" -L "${ELIBS}" -le-hal -le-loader
 
 # Build DEVICE side program
-#for speed optimization replace $2... with -Ofast
-#e-gcc --help=optimizers gives you hints
-e-gcc -T ${ELDF} -Ofast $2 $3 $4 $5 $6 $7 $8 $9 src/e_$1.c -o Debug/e_$1.elf -le-lib
+# -msmall16 still does not work with 2016.11 ESDK and gcc 5.4,
+# but -mshort-calls still works :D
+# (paths are quoted below so that an EPIPHANY_HOME containing spaces still builds)
+#e-gcc -Ofast -T ${ELDF} -msmall16 src/e_e2g.c -o bin/e_e2g.elf -le-lib
+#e-gcc 5.4 makes poor use of the option -mfp-mode=int
+#  the option -mfp-iarith slows DOWN my program -- more than 20 % :/
 
-# Convert ebinary to SREC file
-e-objcopy --srec-forceS3 --output-target srec Debug/e_$1.elf Debug/e_$1.srec
+e-gcc -Ofast -T "${ELDF}" -mfp-mode=int -mshort-calls -m1reg-r63 src/e_e2g.c -o bin/e_e2g.elf -le-lib
 
+# trick to get the spare room usage: epiphany-elf-size your_program.elf ; with internal.ldf the value of 'dec' cannot be beyond 32767
+#
+#parallella@parallella:~/parallella-examples/tmp$ epiphany-elf-size bin/e_e2g.elf 
+#   text	   data	    bss	    dec	    hex	filename
+#  18730	   2148	   2808	  23686	   5c86	bin/e_e2g.elf
+#
diff --git a/paralle2/build_data.sh b/paralle2/build_data.sh
new file mode 100644
index 0000000..587ae30
--- /dev/null
+++ b/paralle2/build_data.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Builds the host-side bench-data generator: src/build_data.c -> bin/build_data.elf
+set -e
+
+ESDK=${EPIPHANY_HOME}
+ELIBS=${ESDK}/tools/host/lib
+EINCS=${ESDK}/tools/host/include
+ELDF=${ESDK}/bsps/current/internal.ldf
+
+SCRIPT=$(readlink -f "$0")
+EXEPATH=$(dirname "$SCRIPT")
+cd "$EXEPATH"
+# uname -m is the POSIX machine name ; uname -p is non-portable and prints "unknown" on some distros
+CROSS_PREFIX=
+case "$(uname -m)" in
+	arm*)
+		# Use native arm compiler (no cross prefix)
+		CROSS_PREFIX=
+		;;
+	   *)
+		# Use cross compiler
+		CROSS_PREFIX="arm-linux-gnueabihf-"
+		;;
+esac
+
+# Create output dir
+mkdir -p bin
+
+# Build HOST side application (paths quoted so that an SDK path with spaces still works)
+"${CROSS_PREFIX}gcc" src/build_data.c -o bin/build_data.elf -I "${EINCS}" -L "${ELIBS}"
diff --git a/paralle2/buildasm.sh b/paralle2/buildasm.sh
new file mode 100644
index 0000000..d513d73
--- /dev/null
+++ b/paralle2/buildasm.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Rebuilds the host program and emits an assembly listing (-S) of the device program src/e_e2g.c
+set -e
+
+ESDK=${EPIPHANY_HOME}
+ELIBS=${ESDK}/tools/host/lib
+EINCS=${ESDK}/tools/host/include
+ELDF=${ESDK}/bsps/current/internal.ldf
+
+SCRIPT=$(readlink -f "$0")
+EXEPATH=$(dirname "$SCRIPT")
+cd "$EXEPATH"
+# uname -m is the POSIX machine name ; uname -p is non-portable and prints "unknown" on some distros
+CROSS_PREFIX=
+case "$(uname -m)" in
+	arm*)
+		# Use native arm compiler (no cross prefix)
+		CROSS_PREFIX=
+		;;
+	   *)
+		# Use cross compiler
+		CROSS_PREFIX="arm-linux-gnueabihf-"
+		;;
+esac
+
+# Create output dir
+mkdir -p bin
+
+# Build HOST side application (paths quoted so that an SDK path with spaces still works)
+"${CROSS_PREFIX}gcc" src/e2g.c -o bin/e2g.elf -I "${EINCS}" -L "${ELIBS}" -le-hal -le-loader
+
+# Build DEVICE side program
+# -msmall16 still does not work with 2016.11 ESDK and gcc 5.4,
+# but -mshort-calls still works :D
+
+#e-gcc -Ofast -T ${ELDF} -msmall16 src/e_e2g.c -o bin/e_e2g.elf -le-lib
+#e-gcc 5.4 makes poor use of the option -mfp-mode=int
+#  the option -mfp-iarith slows DOWN my program -- more than 20 % :/
+# NOTE(review): the line below adds -mfp-iarith, which build.sh does not use, so the listing differs from the built binary -- confirm this is intended
+e-gcc -Ofast -T "${ELDF}" -mfp-mode=int -mshort-calls -m1reg-r63 -mfp-iarith src/e_e2g.c -S -le-lib
diff --git a/paralle2/doc/tutorial1b-Parallella_starter_kit_SDK2016.3.1.pdf b/paralle2/doc/tutorial1b-Parallella_starter_kit_SDK2016.3.1.pdf
new file mode 100644
index 0000000..31324b1
Binary files /dev/null and b/paralle2/doc/tutorial1b-Parallella_starter_kit_SDK2016.3.1.pdf differ
diff --git a/paralle2/run.sh b/paralle2/run.sh
index e332839..3c78ee8 100644
--- a/paralle2/run.sh
+++ b/paralle2/run.sh
@@ -2,8 +2,20 @@
 
 set -e
 
+BENCH_INDEX=""
 
-cd Debug
+if [ $# -lt 1 ]; then
+	echo "Usage:  ./run.sh numeric-value" >&2
+	exit 1
+else
+	if [[ ! "$1" =~ ^[0-9]+$ ]]; then
+		echo "ERROR:  value must be numeric" >&2
+		echo "Usage:  ./run.sh numeric-value" >&2
+		exit 1
+	else
+		BENCH_INDEX=$1
+	fi
+fi
 
-./$1.elf $2 $3 $4 $5 $6 $7 $8 $9
+time bin/e2g.elf "${BENCH_INDEX}"
 
diff --git a/paralle2/run_data.sh b/paralle2/run_data.sh
new file mode 100644
index 0000000..7da8aff
--- /dev/null
+++ b/paralle2/run_data.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Generates the bench data by running the generator produced by ./build_data.sh
+set -e
+cd "$(dirname "$(readlink -f "$0")")" # bin/ is relative to the script directory, as in the build scripts
+bin/build_data.elf
+
+echo "Building data: done.  Now you can run './build.sh'"
diff --git a/paralle2/src/C_common2.h b/paralle2/src/C_common2.h
new file mode 100644
index 0000000..ebea59a
--- /dev/null
+++ b/paralle2/src/C_common2.h
@@ -0,0 +1,46 @@
+// avoid stdint.h ; these are macros, so they also rewrite the names if <stdint.h>/<inttypes.h> was included earlier
+#define uint8_t		unsigned char
+#define uint16_t	unsigned short
+#define uint32_t	unsigned int
+#define uint64_t	unsigned long long // <!> unsigned long = 4 bytes under ARM
+
+#define int8_t		signed char // plain char is unsigned by default with GCC on ARM, so 'signed' must be explicit
+#define int16_t		short
+#define int32_t		int
+#define int64_t		long long
+
+// my semantic ; 'print' and 'pfv' carry their own ';' (e2g.c calls 'print' without one)
+#define uc     unsigned char
+#define ull    unsigned long long
+#define pf      printf
+#define print   printf("\n");
+#define pfv(x)  printf("v: %d\n",x);
+#define LOOP1(x) for(fn1=0;fn1<(x);fn1++)
+#define LOOP2(x) for(fn2=0;fn2<(x);fn2++)
+#define LOOP3(x) for(fn3=0;fn3<(x);fn3++)
+#define LOOP4(x) for(fn4=0;fn4<(x);fn4++)
+#define LOOP5(x) for(fn5=0;fn5<(x);fn5++)
+#define LOOP6(x) for(fn6=0;fn6<(x);fn6++)
+#define LOOP7(x) for(fn7=0;fn7<(x);fn7++)
+#define LOOP8(x) for(fn8=0;fn8<(x);fn8++)
+#define LOOP(x,y) for(x=0;x<(y);x++)
+// LOOPn(x) needs an integer variable fnN declared in the enclosing scope ; LOOP(x,y) takes the counter as x
+// #######################################
+// think x86 asm, think jz... setz... cmovz... think flags...
+#define ifz(x) if(!(x))
+#define ifnz(x) if(x)
+#define ife(x) if(!(x))
+#define ifne(x) if(x)
+// booleans: test a single bit of a 32-bit / 64-bit value
+#define ifzbool32(x,bit) ifz((x)&(1U<<(bit)))
+#define ifbool32(x,bit)  if((x)&(1U<<(bit)))
+#define ifzbool64(x,bit) ifz((uint64_t)(x)&(1ULL<<(bit)))
+#define ifbool64(x,bit)  if((uint64_t)(x)&(1ULL<<(bit)))
+
+// __asm__ __volatile__ ("idle");// rest !
+
+/* reminder
+#define ALIGN(x)   __attribute__ ((aligned (x))) 
+#define PACKED     __attribute__ ((packed)) 
+#define SECTION(x) __attribute__ ((section (x)))
+*/
diff --git a/paralle2/src/build_data.c b/paralle2/src/build_data.c
new file mode 100644
index 0000000..deed086
--- /dev/null
+++ b/paralle2/src/build_data.c
@@ -0,0 +1,39 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#define BENCH_INIT 0
+#define BENCH_MAX  1023
+// 16 bench records of 17 bytes, as decoded by e2g.c: bytes 0-7 -> tuile2do, 8-11 -> bordertuile2do, 12-15 -> eight nibbles (low first) for tdam[C2N..C9N], 16 -> low nibble tdam[C1N]+1, high nibble tdam[C10N]+1
+const unsigned char tbench[16][17]={
+  { 0xFF,0xED,0xDF,0xFC,0xFF,0xF4,0xFF,0xFF,0x25,0x7E,0xEB,0xEF,0x79,0x02,0x05,0x84,0x43 },
+  { 0xE5,0xFF,0xFB,0xFB,0xEF,0xFE,0xFF,0xFB,0xF5,0xFA,0x56,0xBE,0x39,0x19,0x17,0x15,0x44 },
+  { 0xFE,0xBF,0xFF,0xDF,0xEC,0xFB,0xFF,0xFE,0x8D,0x1F,0xAF,0xBF,0x82,0x23,0x45,0x50,0x23 },
+  { 0xFF,0xF5,0xFF,0xFF,0x6E,0xFF,0xFF,0x3B,0x5C,0x0F,0xDF,0xF7,0x32,0x79,0x03,0x97,0x23 },
+  { 0xBF,0xFF,0xFF,0xF9,0x7F,0xF7,0xDE,0xDF,0x3C,0x3E,0x9D,0xFF,0x55,0x58,0x03,0x46,0x33 },
+  { 0xE6,0xD7,0xDF,0xFF,0xFF,0xFF,0xEF,0xFD,0xDD,0x6E,0x5B,0x6F,0x79,0x13,0x30,0x83,0x43 },
+  { 0xFE,0xFB,0xF3,0xEB,0xFF,0x7F,0xFD,0xFF,0xF9,0xE7,0xC8,0xFD,0x63,0x00,0x35,0x33,0x34 },
+  { 0xEF,0xF7,0xFD,0xF7,0xFE,0xF1,0xFF,0xFF,0xF8,0xBB,0x4F,0xAF,0x27,0x50,0x62,0x30,0x13 },
+  { 0xFE,0xF7,0xF7,0xF0,0xFB,0xFF,0xFF,0xFF,0xF9,0x3E,0x5F,0x4F,0x55,0x53,0x10,0x57,0x43 },
+  { 0xBF,0xF6,0xDF,0xEF,0xFF,0xFE,0xBF,0xF7,0x6C,0x9E,0x9B,0xFF,0x33,0x22,0x22,0x98,0x23 },
+  { 0xFF,0xEF,0x4F,0xBB,0xBF,0xBF,0xFF,0xFF,0x68,0xFC,0xEF,0xE7,0x69,0x31,0x02,0x93,0x13 },
+  { 0xBB,0xFD,0xF7,0xFF,0x9F,0xFF,0xFB,0xFE,0xE1,0xFB,0xAD,0xDD,0x22,0x37,0x15,0x60,0x34 },
+  { 0x7F,0xCE,0xFF,0xBD,0xFF,0xFE,0xEF,0xFF,0xE5,0x7A,0x78,0xFF,0x13,0x50,0x07,0x00,0x34 },
+  { 0xFC,0xFF,0x7F,0xFE,0xDF,0xFD,0x3F,0xFF,0x64,0xFA,0xE7,0x7F,0x82,0x58,0x99,0x05,0x14 },
+  { 0xFF,0x7B,0xF7,0xB3,0xFF,0xFF,0xFE,0xBF,0xB7,0x36,0xEC,0xFE,0x64,0x57,0x11,0x06,0x44 },
+  { 0xFD,0xEF,0xDF,0xFB,0x7F,0xFF,0xF2,0xFF,0xF2,0x73,0x3F,0x3F,0x19,0x96,0x07,0x19,0x14 },
+};
+
+// Writes 2048 back-to-back copies of tbench to bin/bench.bin (relative to the current directory).
+// Returns 0 on success ; exits with -1 after a message on stderr if the file cannot be opened, written or closed.
+int main(void) {
+  unsigned i;
+  FILE *f=fopen("bin/bench.bin", "wb");
+
+  if(f == NULL) { perror("Error opening bin/bench.bin for writing"); exit(-1); }
+
+  for(i=0; i<2048; i++) // a short write (disk full...) would leave a truncated data file, so every write is checked
+    if(fwrite(tbench, sizeof(tbench), 1, f) != 1) { perror("Error writing bin/bench.bin"); fclose(f); exit(-1); }
+  if(fclose(f) != 0) { perror("Error closing bin/bench.bin"); exit(-1); } // buffered data is flushed here
+  return 0;
+}
diff --git a/paralle2/src/e2g.c b/paralle2/src/e2g.c
new file mode 100644
index 0000000..93e1ba6
--- /dev/null
+++ b/paralle2/src/e2g.c
@@ -0,0 +1,199 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <e-hal.h> // HOST side ; mandatory
+
+#include "C_common2.h"  // common definitions for C
+#include "e2g_common.h" // common definitions for EII project
+
+#define BENCH_MIN 0    //min bench to start with
+#define BENCH_MAX 1024 //max bench to start with
+#define BENCH_N   1 //16   //16 to solve per core
+#define BENCH_LIMIT 10000 //stop after x benchs done ; not implemented actually
+
+//#######################################
+
+void Epiphany_Boot(e_platform_t *epiphany) { // fills *epiphany ; exits with -1 if an e-hal call reports E_ERR
+  if(e_init(NULL) == E_ERR) { fprintf(stderr, "Error: e_init failed\n"); exit(-1); } // initialise the system, establish connection to the device
+  if(e_reset_system() == E_ERR) { fprintf(stderr, "Error: e_reset_system failed\n"); exit(-1); } // reset the Epiphany chip
+  if(e_get_platform_info(epiphany) == E_ERR) { fprintf(stderr, "Error: e_get_platform_info failed\n"); exit(-1); } // get the configuration info for the parallella platform
+}
+
+//#######################################
+// Prints 80 counters as a grid: a header with the column numbers 1..10, then 8 rows labelled 'C'..'J'.
+// tsolN is read in row-major order ; a zero counter is shown as its cell name (e.g. C1) instead of a number.
+void Node_Board_Print2(uint *tsolN) {
+  int row, col;
+  uint *cell=tsolN; // next counter to print
+
+  printf("\n");
+  for(col=1; col<=9; col++)
+    printf("              %d", col);
+  printf("             %2d\n", col); // col is 10 once the loop is over
+  // every cell below is 15 characters wide, like the header columns
+  
+  for(row=0; row<8; row++) {
+    const char letter='C'+row; // 'C' because basis is C1
+    for(col=1; col<=10; col++, cell++) {
+      if(*cell == 0) {
+        printf("       %c%d     ", letter, col);
+        if(col != 10) printf(" "); // "10" is one character wider, so the last column skips the pad
+      }
+      else {
+        printf("   %012u", *cell);
+      }
+    }
+    // the row letter closes the line
+    printf("  -  %c\n", letter);
+  }
+}
+
+//#######################################
+//print out result
+int64_t Output_Print(Soutput out) { // prints one core's counters ; returns their sum plus out.globalres
+  int64_t l1=0;
+  int fn1;
+  
+  LOOP1(DAM_SZ) l1+=(uint)out.globaltsolN[fn1]; // summed as unsigned, the way the counters are printed below
+  pf("\n   %012llu nodes\n", (unsigned long long)l1); // casts: the format specifiers are unsigned, the fields are not
+  pf("\n   res: %09u\n", (uint)out.globalres);
+
+  Node_Board_Print2((uint *)out.globaltsolN); // the field is int[], the printer takes uint *
+
+  return l1 + out.globalres;
+}
+
+//#######################################
+
+int main(int argc, char *argv[]) { // optional argv[1]: bench index, used only when inside [BENCH_MIN, BENCH_MAX)
+  // Epiphany input/output through shared RAM ; details: e2g_common.h
+  Sio     fromio;//Sio *fromio=(Sio *)malloc(sizeof(Sio));
+  int64_t l1=0;
+  int row, col, i, j, fn1, bench_start=BENCH_MIN, toccN[CORE_N]={0}, benchlimit=0;
+  e_platform_t epiphany;// Epiphany platform configuration
+  e_epiphany_t dev;
+  FILE *fin;
+  unsigned char *tbench=(unsigned char *)malloc(MAX_CORE_N * 17 * 16); // unsigned: the nibble shifts below must never sign-extend
+  ifz(tbench) { fprintf(stderr, "Error: cannot allocate the bench buffer\n"); exit(-1); }
+  if(argc > 1) {
+    i=atoi(argv[1]); if(i >= BENCH_MIN && i < BENCH_MAX) bench_start=i; // a negative index would seek and index out of bounds
+  }
+
+  //get data
+  fin=fopen("./bin/bench.bin", "rb");
+  ifz(fin) { printf("Error reading file bin/bench.bin ; did you generate it with build_data.sh ?\n"); exit(-1); }
+  if(fseek(fin, 17 * bench_start, SEEK_SET) != 0 || // tbench would hold garbage if either step failed
+     fread(tbench, MAX_CORE_N * 17 * 16, 1, fin) != 1) { fprintf(stderr, "Error reading file bin/bench.bin ; is it truncated ?\n"); exit(-1); }
+  fclose(fin);
+  memset(&fromio, 0, sizeof(fromio)); // in.east and out.fn_idx are never assigned below ; do not ship stack garbage to the cores
+  printf("\n\nEternity II running under Parallella :) \n\n\n");
+
+  Epiphany_Boot(&epiphany);
+  if(epiphany.rows * epiphany.cols > CORE_N) { fprintf(stderr, "Error: more than CORE_N cores ; adapt CORE_N in e2g_common.h\n"); exit(-1); } // toccN[] has CORE_N slots
+  // Create a workgroup using all of the cores
+  if(e_open(&dev, 0, 0, epiphany.rows, epiphany.cols) == E_ERR) { fprintf(stderr, "Error: e_open failed\n"); exit(-1); }
+  e_reset_group(&dev);
+
+  // Load the device code into each core of the chip, and don't start it yet
+  if(e_load_group("bin/e_e2g.elf", &dev, 0, 0, epiphany.rows, epiphany.cols, E_FALSE) == E_ERR) { fprintf(stderr, "Error: cannot load bin/e_e2g.elf ; did you run build.sh ?\n"); exit(-1); }
+
+  // Fill one Sio per core and write it into the core's local memory at SHARED_IN
+  i=0;
+  for(row=0;row<epiphany.rows;row++) {
+    for(col=0;col<epiphany.cols;col++) {
+      fromio.out.cmd=CMD_INIT;
+      LOOP1(DAM_SZ)
+        fromio.in.tdam[fn1]=0;
+      j=17 * bench_start; // for demo purpose: every core gets the same record. NOTE(review): tbench was already read from this file offset, so the offset is applied twice -- confirm
+
+      LOOP1(8)
+        fromio.in.tdam[C2N + fn1] = (tbench[j + 12 + (fn1/2)] >> (4 * (fn1&1))) & 15; //format: 2 nibbles per byte
+      fromio.in.tdam[C1N] =(tbench[j + 16] & 15) - 1;
+      fromio.in.tdam[C10N]=(tbench[j + 16] >> 4) - 1;
+      memcpy(&fromio.in.tuile2do, &tbench[j + 0], sizeof(fromio.in.tuile2do)); // memcpy: tbench + j is not always aligned for a direct 64-bit load
+      memcpy(&fromio.in.bordertuile2do, &tbench[j + 8], sizeof(fromio.in.bordertuile2do));
+      LOOP1(10) pf("%u ", fromio.in.tdam[C1N+fn1]); print
+
+      pf("0x%016llX tiles\n",      fromio.in.tuile2do);
+      pf("0x%08X       borders\n", fromio.in.bordertuile2do);
+      pf("sz(io) = %u\n", (unsigned)sizeof(Sio)); // sizeof yields a size_t, which %u does not match on every host
+      LOOP1(DAM_SZ) fromio.out.globaltsolN[fn1]=0;
+      fromio.out.globalres=0;
+
+      e_write(&dev, row, col, SHARED_IN, &fromio, sizeof(Sio));
+      pf("i %u ; in written ; C1N = %u\n", i, fromio.in.tdam[C1N]);
+
+      i++;
+    }
+  }
+
+  // Start all of the cores
+  pf("Some results in a minute... starting the core workgroup...\n\n");
+  e_start_group(&dev);
+  pf("... core workgroup started ; the whole test will last about 120 seconds...\n\n");
+
+  while(1) { // poll every 100 ms ; leave when CORE_N cores are finished in the same pass or BENCH_LIMIT results were printed
+    usleep(100000);
+    //pf("fromio.out.cmd: 0x%08X\n", fromio.out.cmd);
+    int done = 0;
+
+    // wait for the cores to complete their work
+    i=0;
+    for(row=0;row<epiphany.rows;row++) {
+      for(col=0;col<epiphany.cols;col++) {
+        // Read the core's command word ; the host wrote CMD_INIT, so any other value was written by the core
+        if(e_read(&dev, row, col, SHARED_CMD, &fromio.out.cmd, sizeof(uint)) != sizeof(uint))
+          fprintf(stderr, "\n\nFailed to read\n\n\n");
+
+        if ( fromio.out.cmd != CMD_INIT) { //== CMD_DONE) {
+          if(e_read(&dev, row, col, SHARED_OUT, &fromio.out, sizeof(uint) * (DAM_SZ+1)) != sizeof(uint) * (DAM_SZ+1))
+            fprintf(stderr, "\n\nFailed to read 2\n\n\n");
+          l1 += Output_Print(fromio.out); // NOTE(review): a core that already finished is printed and counted again on every later pass
+          pf("Crunched %015llu nodes. bench # %u ; cmd output = 0x%08X\n\n", l1, benchlimit, fromio.out.cmd);
+
+          benchlimit++;
+          if(benchlimit >= BENCH_LIMIT) break;
+          if(toccN[i] == BENCH_N)
+            done++;
+          else { // hand the same record to the core again and restart it
+            toccN[i]++;
+            pf("core %4u: done %2u times ; cmd 0x%08X.\n", i, toccN[i], fromio.out.cmd);
+
+            fromio.out.cmd=CMD_INIT;
+            LOOP1(DAM_SZ)
+              fromio.in.tdam[fn1]=0;
+            j=17 * bench_start; // for demo purpose
+
+            LOOP1(8)
+              fromio.in.tdam[C2N + fn1] = (tbench[j + 12 + (fn1/2)] >> (4 * (fn1&1))) & 15; //format: 2 nibbles per byte
+            fromio.in.tdam[C1N] =(tbench[j + 16] & 15) - 1;
+            fromio.in.tdam[C10N]=(tbench[j + 16] >> 4) - 1;
+            memcpy(&fromio.in.tuile2do, &tbench[j + 0], sizeof(fromio.in.tuile2do)); // memcpy: see the first fill above
+            memcpy(&fromio.in.bordertuile2do, &tbench[j + 8], sizeof(fromio.in.bordertuile2do));
+            LOOP1(DAM_SZ) fromio.out.globaltsolN[fn1]=0;
+            fromio.out.globalres=0;
+
+            e_write(&dev, row, col, SHARED_IN, &fromio, sizeof(Sio));
+            pf("i %u ; in written again ; C1N = %u\n", i, fromio.in.tdam[C1N]);
+            //OBSOLETE ! esdk doc too :/    e_reset_core(&dev, row, col);
+            e_start(&dev, row, col);
+          }
+        }
+
+        i++;
+      }
+    }
+
+    if ( done >= CORE_N ) // some benchmarks are lengthy
+      break;
+
+    if(benchlimit >= BENCH_LIMIT) break;
+  }
+  free(tbench);
+  e_finalize();
+  pf("Crunched %015llu nodes.\n\n", l1);
+
+  return 0;
+}
diff --git a/paralle2/src/e2g_common.h b/paralle2/src/e2g_common.h
new file mode 100644
index 0000000..2fd506d
--- /dev/null
+++ b/paralle2/src/e2g_common.h
@@ -0,0 +1,132 @@
+// 2017/01/28: 103 Mn/s (Million nodes per second) C version compared to previous 81 Mn/s assembly version.
+//   ELF support instead of SREC. Quicker load. File input. No more UNsigned integers, no more char loads, no more "ctz" and "popcount" instructions.
+//   Removed bug from multiple 0x6000 section inputs.
+//   Eagerly waiting for Epiphany V...
+
+#define CORE_N 16       //change it if needed ; our choice for standard 16-core Epiphany
+#define STATS           //undefine STATS to get full performance (from 111.2 to 103.4 s with a 16-core Parallella)
+#define MAX_CORE_N 1024 //Epiphany V ready ;)
+
+// specific to the project
+#define DAM_SZ 90 // length of Sinput.tdam[] and Soutput.globaltsolN[]
+
+// to DEVICE ; the pack(4) below is never reset, so it also applies to every struct declared after it
+#pragma pack(4)
+typedef struct S_input {
+	int64_t tuile2do; // the host (e2g.c) copies bytes 0-7 of a bench record here
+	int bordertuile2do; // the host copies bytes 8-11 of the bench record here
+	int tdam[DAM_SZ]; // indexed with the B1N..J10N constants below ; the host fills C1N..C10N and zeroes the rest
+  int east; // not assigned by the host code in e2g.c
+}Sinput;
+// from DEVICE
+typedef struct S_output {
+	int globaltsolN[DAM_SZ]; //int64_t is twice as long to execute, you need at least 6 ic to increment a 64-bit memory value :/
+  int globalres; // printed as "res:" by the host
+  int cmd; // CMD_INIT when written by the host, which then polls SHARED_CMD until the value changes
+  int fn_idx; // not read back by the host: it only fetches the first DAM_SZ+1 ints of this struct
+}Soutput;
+// shared MEMORY: the host writes one whole Sio at SHARED_IN ; 'out' directly follows 'in'
+typedef struct S_io {
+  Sinput  in;
+  Soutput out;
+}Sio;
+// tmp variables for DEVICE, trying a workaround for the -msmall16 compilation option
+typedef struct S_tmp {
+  //int fn_idx;
+  int ttiles[64 + 1 + 32];
+  int j9e;
+  int j1n;
+}Stmp;
+
+// global offset for shared RAM
+#define SHARED_RAM (0x01000000)
+
+// a whole forum post for that
+#define PERFECT_ALIGN8 __asm__ (".balignw 4, 0x01a2\n"); __asm__ (".balignl 8, 0xfc02fcef\n");
+
+// Epiphany local offsets ; SHARED_RES and SHARED_CMD hard-code DAM_SZ*4 as the byte size of globaltsolN[] (4-byte int assumed)
+#define SHARED_IN  0x6000
+#define SHARED_OUT (SHARED_IN  + sizeof(Sinput))
+#define SHARED_RES (SHARED_OUT + DAM_SZ*4) // offset for result
+#define SHARED_CMD (SHARED_OUT + DAM_SZ*4 + 4) // offset for 'cmd'
+#define R_IDX      (SHARED_OUT + sizeof(Soutput))
+
+// commands for the Epiphany core
+#define CMD_INIT 0x80000000 // host init
+#define CMD_DONE 0x40000000 // eCore did the job properly (probably ; some bug might crush this word but it's highly improbable)
+
+// specific to the project ; both trace macros expand to a full statement (';' included) and need 'out' to name the Soutput
+#ifdef STATS
+  #define macro_globaltrace(niveau) out.globaltsolN[niveau]++;
+#else
+  #define macro_globaltrace(niveau)
+#endif
+// macro_globaltrace2 counts even when STATS is undefined
+#define macro_globaltrace2(niveau) out.globaltsolN[niveau]++;
+
+#define NORTH 0
+#define EAST  1
+#define SOUTH 2
+#define WEST  3
+// Cell indices (the host uses them on Sinput.tdam[]): 10 * (row letter - 'B') + (column - 1), so B1 = 0 and J10 = 89 = DAM_SZ - 1
+#define B1N   0
+#define C1N  10
+#define C2N  11
+#define C3N  12
+#define C4N  13
+#define C5N  14
+#define C6N  15
+#define C7N  16
+#define C8N  17
+#define C9N  18
+#define C10N 19
+#define D1N  20 //etc
+// rows D (beyond D1), E and F have no named constants here
+#define G1N  50
+#define G2N  51
+#define G3N  52
+#define G4N  53
+#define G5N  54
+#define G6N  55
+#define G7N  56
+#define G8N  57
+#define G9N  58
+#define G10N 59
+
+#define H1N  60
+#define H2N  61
+#define H3N  62
+#define H4N  63
+#define H5N  64
+#define H6N  65
+#define H7N  66
+#define H8N  67
+#define H9N  68
+#define H10N 69
+
+#define I1N  70
+#define I2N  71
+#define I3N  72
+#define I4N  73
+#define I5N  74
+#define I6N  75
+#define I7N  76
+#define I8N  77
+#define I9N  78
+#define I10N 79
+
+#define J1N  80
+#define J2N  81
+#define J3N  82
+#define J4N  83
+#define J5N  84
+#define J6N  85
+#define J7N  86
+#define J8N  87
+#define J9N  88
+#define J10N 89
+// border colour numbering ; D = droite (right), G = gauche (left), I = interieur (inner)
+#define BORDERCOLOR_D 0
+#define BORDERCOLOR_G 4
+#define BORDERCOLOR_I 9
+#define BORDERCOLOR_N 19 // 19 colors ; 1st one is empty, colors 1-4 stand for D(roite), 5-8 for G(auche), 9-18 for I(nterieur)
diff --git a/paralle2/src/e_e2g.c b/paralle2/src/e_e2g.c
new file mode 100644
index 0000000..9e281d7
--- /dev/null
+++ b/paralle2/src/e_e2g.c
@@ -0,0 +1,891 @@
+#include "e-lib.h" // mandatory even for a minimalist design -- e_get_coreid(), e_read(), e_write()
+
+//from notzed on the forum, "...gcc extended inline asm, 'cc' clobber_php.htm"
+//volatile needed, the compiler may mix code without taking care of the condition flags :/ 
+unsigned int bitrev(unsigned int val) {
+  unsigned int res;
+  // one 'bitr' instruction: val goes in through a register, res comes out of a register ; presumably a bit-order reversal -- confirm in the Epiphany architecture reference
+  __asm__ volatile ("bitr %[res],%[val]"
+    : [res] "=r" (res)
+    : [val] "r" (val)
+    : "cc");
+  // "cc" above tells the compiler that the instruction changes the condition flags
+  return res;
+}
+
+//#include "C_common2.h"  // common definitions for C (copied below instead of being included)
+// avoid stdint.h
+#define uint8_t		unsigned char
+#define uint16_t	unsigned short
+#define uint32_t	unsigned int
+#define uint64_t	unsigned long long // <!> unsigned long = 4 bytes under 32-bit ARM
+
+#define int8_t		signed char // the signedness of plain char is implementation-defined, so 'signed' must be explicit
+#define int16_t		short
+#define int32_t		int
+#define int64_t		long long
+
+// my semantic ; 'print' and 'pfv' carry their own ';'
+#define uc     unsigned char
+#define ull    unsigned long long
+#define pf      printf
+#define print   printf("\n");
+#define pfv(x)  printf("v: %d\n",x);
+#define LOOP1(x) for(fn1=0;fn1<(x);fn1++)
+#define LOOP2(x) for(fn2=0;fn2<(x);fn2++)
+#define LOOP3(x) for(fn3=0;fn3<(x);fn3++)
+#define LOOP4(x) for(fn4=0;fn4<(x);fn4++)
+#define LOOP5(x) for(fn5=0;fn5<(x);fn5++)
+#define LOOP6(x) for(fn6=0;fn6<(x);fn6++)
+#define LOOP7(x) for(fn7=0;fn7<(x);fn7++)
+#define LOOP8(x) for(fn8=0;fn8<(x);fn8++)
+#define LOOP(x,y) for(x=0;x<(y);x++)
+// LOOPn(x) needs an integer variable fnN declared in the enclosing scope ; LOOP(x,y) takes the counter as x
+// #######################################
+// think x86 asm, think jz... setz... cmovz... think flags...
+#define ifz(x) if(!(x))
+#define ifnz(x) if(x)
+#define ife(x) if(!(x))
+#define ifne(x) if(x)
+// booleans: test a single bit of a 32-bit / 64-bit value
+#define ifzbool32(x,bit) ifz((x)&(1U<<(bit)))
+#define ifbool32(x,bit)  if((x)&(1U<<(bit)))
+#define ifzbool64(x,bit) ifz((uint64_t)(x)&(1ULL<<(bit)))
+#define ifbool64(x,bit)  if((uint64_t)(x)&(1ULL<<(bit)))
+
+#include "e2g_common.h" // common definitions for EII project
+
+//#######################################
+//INPUT/OUTPUT DATA
+
+/* previous code was:
+    
+   volatile Sinput  in  SECTION(".data_bank3"); // SHARED_IN
+   volatile Soutput out SECTION(".data_bank3"); // SHARED_OUT
+  
+  this way of coding is BAD: the linker will NOT necessarily place 'in' at offset 0x6000 and 'out' just AFTER 'in' (actually it places 'out' BEFORE 'in' !)
+  
+   => ONE reliable way of coding is ONE structure for exchanging with the rest of the world
+*/
+
+volatile Sio  io  SECTION(".data_bank3"); // the single exchange structure ; presumably .data_bank3 starts at local offset 0x6000 (SHARED_IN) -- confirm against the linker script
+// from here on every 'in' / 'out' token in this file is rewritten to io.in / io.out
+#define in  io.in
+#define out io.out
+
+//#######################################
+// scratch variables of the device program (see Stmp in e2g_common.h)
+Stmp    tmp;
+
+//#######################################
+//THE 'COMPUTE KERNEL'
+//#include "e2c_solver.c"
+
+void BorderWest(const int, const int);
+void InnerTile0(const int, const int);
+void InnerTile1(const int, const int);
+void InnerTile(const int, const int);
+void InnerRow(const int, const int);
+void InnerRow2(const int, const int); // with tinner_Upd
+void BorderEast(const int, const int);
+void BorderEastUpdate(const int, const int);
+void Special_H10(const int, const int);
+void Special_I1(const int, const int);
+void Special_I10(const int, const int);
+void BorderEastBottom(const int, const int);
+void Special_J2(const int, const int);
+void Special_Debug(const int, const int); // for debugging purpose
+
+void __attribute__ ((noinline)) Input_Copy(int, int *);
+
+//#######################################
+//STATIC DATA
+// tlscouleur_B2016[c]: 32-bit mask, bit t set when border tile t carries colour c (checked on the data: c = tbordureD[t]+1 for 1-4, tbordureG[t]+5 for 5-8, tbordureI[t]+9 for 9-18) ; entries 0 and 19 are empty
+const int tlscouleur_B2016[BORDERCOLOR_N+1]={   
+  0x00000000, 
+  0x0870809A, 0x91032001, 0x42845140, 0x24080E24, 
+  0x000000FF, 0x00007F00, 0x00FF8000, 0xFF000000, 
+  0x00000001, 0x00018300, 0x03000406, 0x00000008, 0x00020830, 0x04041040, 0x38080000, 0x40100000, 0x00202000, 0x80C04080, 
+  0x00000000
+};
+// per-tile colours of the 32 border tiles, one table per side (D, G, I) ; index = border tile number
+//<!> colors 0-3 for D and G, 0-9 for I
+const int tbordureD[32]={ 0x01, 0x00, 0x03, 0x00, 0x00, 0x03, 0x02, 0x00, 0x02, 0x03, 0x03, 0x03, 0x02, 0x01, 0x02, 0x00, 0x01, 0x01, 0x02, 0x03, 0x00, 0x00, 0x00, 0x02, 0x01, 0x02, 0x03, 0x00, 0x01, 0x03, 0x02, 0x01 };
+const int tbordureG[32]={ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 };
+const int tbordureI[32]={ 0x00, 0x02, 0x02, 0x03, 0x04, 0x04, 0x05, 0x09, 0x01, 0x01, 0x02, 0x04, 0x05, 0x08, 0x09, 0x01, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x02, 0x02, 0x05, 0x06, 0x06, 0x06, 0x07, 0x09 };
+// ttileN[k] = number of real entries (not the 0x00000040 filler) in row k of t14 ; holds for every t14 row visible from here (rows 0-61)
+const int ttileN[100]={
+   1,  4,  3,  2,  1,  4,  4,  5,  2,  3,  5,  2,  5,  3,  0,  3,  3,  2,  0,  3,
+   1,  5,  3,  5,  2,  0,  1,  2,  3,  3,  6,  1,  4,  3,  6,  1,  2,  1,  3,  2,
+   2,  2,  2,  2,  4,  1,  0,  1,  4,  6,  5,  3,  2,  1,  1,  2,  4,  1,  1,  4,
+   3,  0,  2,  5,  3,  5,  0,  1,  4,  1,  1,  4,  1,  5,  2,  4,  1,  6,  2,  0,
+   0,  4,  1,  0,  4,  2,  7,  4,  3,  1,  5,  1,  2,  3,  1,  2,  2,  3,  4,  0
+};
+
+// color format: G + 4*D (i.e tbordureG + 4*tbordureD) ; tGDN[k] = number of (tile, colour) pairs stored in tGD[k]
+const int tGDN[16]={ 4, 0, 4, 1, 1, 1, 2, 3, 1, 3, 2, 2, 2, 3, 1, 2 };
+// tGD[k] = pairs (border tile t, tbordureI[t]) for the tiles whose G + 4*D key is k ; unused slots are 0, so tGDN is needed (row 4 stores the genuine pair 0,0)
+const int tGD[16][8]={ 
+  {  1, 2,  3, 3,  4, 4,  7, 9 },
+  {  },
+  { 15, 1, 20, 7, 21, 8, 22, 9 },
+  { 27, 6, },
+  {  0, 0, },
+  { 13, 8, },
+  { 16, 1, 17, 4, },
+  { 24, 2, 28, 6, 31, 9, },
+  {  6, 5, },
+  {  8, 1, 12, 5, 14, 9, },
+  { 18, 5, 23, 9, },
+  { 25, 2, 30, 7, },
+  {  2, 2,  5, 4, },
+  {  9, 1, 10, 2, 11, 4, },
+  { 19, 6, },
+  { 26, 5, 29, 6, }
+};
+
+#define VOIDTILE 64
+#define VOIDSOUTH 0
+
+//tcount(11) = 2 tcount(19) = 2 tcount(22) = 3  tcount(27) = 2  tcount(36) = 2  tcount(38) = 2
+//const int tbordereast_uniquecolor[20]={ 1, 4, 5, 6, 7, 8, 9, 10, 12, 16, 17, 23, 24, 25, 28, 30, 32, 33, 37, }; //19 actually
+//const int tbordereast_uniquetile[20]={ 0, 0, 4, 1, 24, 0, 27, 28, 0, 15, 17, 0, 29, 20, 0, 30, 0, 0, 0, }; //19 actually
+
+//from tmpbordure*.c
+const int t14[100][8]={ // format: LSB = tile, 2nd byte = east, 3rd byte = south, MSB = 0
+  { 0x00060500, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000901, 0x00020308, 0x00010613, 0x0009030C, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000506, 0x00010915, 0x0004030A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0004040D, 0x00080510, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010307, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050103, 0x00050918, 0x00000206, 0x00020102, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000712, 0x00080511, 0x00050000, 0x00020614, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00040917, 0x00000612, 0x00030309, 0x00070104, 0x0007050F, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00070105, 0x00010916, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000101, 0x0006030B, 0x0006040E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050503, 0x00090001, 0x00080705, 0x00070704, 0x00050202, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050919, 0x00060013, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00070823, 0x00030008, 0x0001051A, 0x00090720, 0x0001071B, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000407, 0x0006021C, 0x0004041E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00060822, 0x00090119, 0x0001021A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000113, 0x0007051F, 0x00050821, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0006031D, 0x0001021B, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000816, 0x00000215, 0x0003000C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050006, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00090015, 0x0005011A, 0x00000502, 0x0003061C, 0x0007011B, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00030325, 0x00020924, 0x00090224, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00030225, 0x00000108, 0x00020826, 0x00020927, 0x00040328, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007072C, 0x0003000A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000614, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00080123, 0x00050429, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0006052B, 0x0006042A, 0x00020326, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020224, 0x00020327, 0x00070120, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0009060B, 0x00070309, 0x00010208, 0x00040107, 0x0001090C, 0x0002040A, 0x00000040, 0x00000040 },
+  { 0x0007061D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020325, 0x00080226, 0x00090227, 0x00030428, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020225, 0x0007072D, 0x00000709, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0004000D, 0x00080732, 0x00040935, 0x0005062F, 0x00030228, 0x0004011E, 0x00000040, 0x00000040 },
+  { 0x00090734, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00080631, 0x0002011C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007032D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00040936, 0x00050010, 0x0006052E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050630, 0x00080733, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0003040D, 0x0009060E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00030007, 0x0003041E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00070529, 0x0008062A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020328, 0x0000020A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0000030D, 0x00090335, 0x00080737, 0x0001031E, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0006032F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007022C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00070332, 0x00060839, 0x00070437, 0x0006093C, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000717, 0x00030435, 0x0008083B, 0x00030836, 0x00050638, 0x0007083A, 0x00000040, 0x00000040 },
+  { 0x00060811, 0x00020006, 0x00000600, 0x00030810, 0x0007070F, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000503, 0x0002011A, 0x0006071F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0008062B, 0x00010002, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0008062E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020729, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010003, 0x00090018, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00080122, 0x00030930, 0x0003042F, 0x00040938, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007073E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010621, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000518, 0x00070334, 0x00010119, 0x0008063D, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00070012, 0x00010113, 0x00060214, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00060014, 0x0001031C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00090530, 0x0000090B, 0x00060831, 0x0001071D, 0x0004052F, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0000090E, 0x0002082A, 0x00090538, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0002082B, 0x00000000, 0x0009083D, 0x0003082E, 0x00080121, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0005011F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010522, 0x00050011, 0x00060331, 0x00040839, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0004083C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00060012, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000805, 0x00000704, 0x00020920, 0x0002011B, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0004072C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00040832, 0x0003072D, 0x00030009, 0x00050934, 0x00090833, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00090017, 0x00040837, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007073E, 0x00040229, 0x0001061F, 0x0000070F, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0003011D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007053E, 0x0005073E, 0x0003032D, 0x00010004, 0x0002042C, 0x0005000F, 0x00000040, 0x00000040 },
+  { 0x00010223, 0x0004093A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050622, 0x00020723, 0x00090016, 0x00060521, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00030226, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00080639, 0x0009083B, 0x00090336, 0x0009073A, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000611, 0x00000310, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0005022B, 0x00030631, 0x00080439, 0x0005093D, 0x0009043C, 0x0004022A, 0x0005032E, 0x00000040 },
+  { 0x00030432, 0x00010005, 0x00040437, 0x00030933, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0004093B, 0x0008093F, 0x0009083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0008083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010001, 0x00070417, 0x00080116, 0x00050518, 0x00020115, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010519, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020224, 0x00030227, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00040435, 0x00080436, 0x0000010C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0008063C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00060330, 0x00060438, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0003000B, 0x0004000E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00030534, 0x00010220, 0x0008043A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0008043B, 0x00070333, 0x0006053D, 0x0008083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }
+};
+
+// Working copy of the ttileN[] candidate counts. TileN_Update() refreshes it;
+// InnerTile1() and InnerRow2() use it as their loop bound instead of ttileN[].
+// Author's note: this experiment does not work yet.
+ALIGN(8)
+int ttileN_upd[100]={0}; // ? does not work yet
+
+// Refresh ttileN_upd[]: start from a copy of the 100 ttileN[] entries, then
+// overwrite entry 1 with the current flag of tile 0 (tmp.ttiles[0]).
+static inline void TileN_Update(void) {
+  int fn1;
+
+  for(fn1=0; fn1<100; fn1++) {
+    ttileN_upd[fn1]=ttileN[fn1];
+  }
+
+  ttileN_upd[1]=tmp.ttiles[0];
+
+  // template : tmp.ttileN_upd[1]=tmp.ttiles[0];
+  /* further overrides, currently disabled:
+  ttileN_upd[1]=tmp.ttiles[0];
+  ttileN_upd[4]=tmp.ttiles[15];
+  ttileN_upd[5]=tmp.ttiles[16];
+  ttileN_upd[6]=tmp.ttiles[8];
+  ttileN_upd[7]=tmp.ttiles[9];
+  ttileN_upd[8]=tmp.ttiles[1];
+  ttileN_upd[9]=tmp.ttiles[24];
+  ttileN_upd[10]=tmp.ttiles[25];
+  ttileN_upd[12]=tmp.ttiles[3];
+  ttileN_upd[16]=tmp.ttiles[4];
+  ttileN_upd[17]=tmp.ttiles[17];
+  ttileN_upd[23]=tmp.ttiles[26];
+  ttileN_upd[24]=tmp.ttiles[27];
+  ttileN_upd[25]=tmp.ttiles[28];
+  ttileN_upd[28]=tmp.ttiles[20];
+  ttileN_upd[30]=tmp.ttiles[30];
+  ttileN_upd[32]=tmp.ttiles[21];
+  ttileN_upd[33]=tmp.ttiles[13];
+  ttileN_upd[37]=tmp.ttiles[31];
+  */
+}
+
+//typedef void (*ptrFonction) (const signed int, const signed int);
+
+// Dispatch table of the recursive search, indexed by out.fn_idx: every handler
+// fetches its successor from this table before recursing.
+// Layout: 7 groups of 10 entries (labelled C10 .. I10 on their last entry),
+// followed by 8 entries for the bottom line J9 .. J2 (78 entries in total).
+// InnerRow() advances out.fn_idx by 8 and dispatches to the entry 8 slots
+// further, so the Special_Debug entries that follow each InnerRow are
+// placeholders ("only for clean stats").
+ALIGN(8)
+void (* tfncall[78]) (const int, const int) ={
+   BorderWest,
+   InnerRow,
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 BorderEast, // C10
+    //Special_Debug, 
+  
+   BorderWest,
+   InnerRow, // replaces InnerTile() * 8
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   BorderEast, //	 BorderEastUpdate, // D10   // ? fail
+  
+   BorderWest,
+   InnerRow, 
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 BorderEast, // E10
+  
+   BorderWest,
+   InnerRow, 
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 BorderEast, // F10
+  
+//Special_Debug,
+  
+   BorderWest,
+   InnerRow, 
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 BorderEast, // G10
+  
+   BorderWest,
+   InnerRow, 
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 Special_H10,    // H10
+   
+   Special_I1, 
+   InnerRow, 
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 Special_I10,    // I10
+   
+   // Bottom line, walked from J9 down to J2. These handlers take their
+   // arguments in the order (northI, eastcolor).
+   BorderEastBottom, // J9
+   BorderEastBottom, // J8
+   BorderEastBottom, // J7
+   BorderEastBottom, // J6
+   BorderEastBottom, // J5
+   BorderEastBottom, // J4
+   BorderEastBottom, // J3
+   Special_J2, // J2, last square
+};
+
+//dynamic
+// Border lookup tables, rebuilt by main() from the border tiles still flagged
+// as available in tmp.ttiles[].
+
+// Number of entries per key. West key = tbordureG[tile];
+// east key = tbordureI[tile]*4 + tbordureD[tile].
+int tborderwestN[4]={0};
+int tbordereastN[40]={0};
+
+// West tables, [key][entry]: T = border tile index, E = tbordureI[tile],
+// S = tbordureD[tile].
+int tborderwestT[4][9]={0}; 
+int tborderwestE[4][9]={0}; 
+int tborderwestS[4][9]={0};
+
+// East tables, [key][entry]: T = border tile index, S = tbordureG[tile].
+int tbordereastT[40][4]={0};
+int tbordereastS[40][4]={0};
+
+// Sandwiched BorderWest for square I1. J1N can only be 0 or 3 for this specific
+// problem, so two passes are made over the tGD[] candidates:
+//   pass 0: key = north + 4*0, tmp.j9e = 3
+//   pass 1: key = north + 4*3, tmp.j9e = 0
+// Each available border tile is flagged used, the next handler in tfncall[] is
+// called with the tile's east colour, then the tile is released again.
+void Special_I1(const int north, const int northI) {
+  int pass, j1n, key, count, fn1, tile, east;
+  void (*next)(const int, const int);
+  macro_globaltrace2(out.fn_idx);
+  next=tfncall[out.fn_idx+1];
+
+  for(pass=0; pass<2; pass++) {
+    j1n=pass ? 3 : 0;
+    key=north + 4*j1n;
+    tmp.j9e=3 - j1n;
+    count=tGDN[key];
+    for(fn1=0; fn1<count; fn1++) {
+      tile=tGD[key][fn1*2 + 0];
+      if(!tmp.ttiles[64+1 + tile]) continue; // border tile already in use
+      east=tGD[key][fn1*2 + 1];
+      tmp.ttiles[64+1 + tile]=0;
+      out.fn_idx++;
+      (*next)(east, northI+1);
+      out.fn_idx--;
+      tmp.ttiles[64+1 + tile]=1;
+    }
+  }
+}
+
+// West border square of a regular row.
+// north  : key into the tborderwest* tables (in.tdam[C1N] for the first call).
+// northI : index into in.tdam[] for this square.
+// For every available border tile listed under 'north': store its S value at
+// in.tdam[northI+10], flag the tile used, call the next handler with the
+// tile's E value, then release the tile.
+void BorderWest(const int north, const int northI) {
+  int count, fn1, tile, east;
+  void (*next)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  next=tfncall[out.fn_idx+1];
+  out.fn_idx++;
+
+  count=tborderwestN[north];
+  for(fn1=0; fn1<count; fn1++) {
+    tile=tborderwestT[north][fn1];
+    if(!tmp.ttiles[64+1 + tile]) continue; // border tile already in use
+    east=tborderwestE[north][fn1];
+    in.tdam[northI+10]=tborderwestS[north][fn1];
+    tmp.ttiles[64+1 + tile]=0;
+    (*next)(east, northI+1);
+    tmp.ttiles[64+1 + tile]=1;
+  }
+
+  out.fn_idx--;
+}
+
+// One inner square.
+// eastcolor : colour handed over by the square to the west.
+// northI    : index into in.tdam[] for this square.
+// Candidates come from t14[key] with key = eastcolor + 10*in.tdam[northI];
+// each packed entry holds the tile index in bits 0..7, the colour passed to
+// the next square in bits 8..15, and the value stored at in.tdam[northI+10]
+// in the bits above.
+void InnerTile(const int eastcolor, const int northI) {
+  int key, count, fn1, packed, tile, east;
+  void (*next)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  next=tfncall[out.fn_idx+1];
+
+  key=eastcolor + 10*in.tdam[northI];
+  count=ttileN[key];
+  for(fn1=0; fn1<count; fn1++) {
+    packed=t14[key][fn1];
+    tile=packed & 0xff;
+    if(!tmp.ttiles[tile]) continue; // tile already in use
+    east=(packed>>8) & 0xff;
+    in.tdam[northI+10]=packed>>16;
+    tmp.ttiles[tile]=0;
+    out.fn_idx++;
+    (*next)(east, northI+1);
+    out.fn_idx--;
+    tmp.ttiles[tile]=1;
+  }
+}
+
+// East border square of a regular row.
+// Candidates come from the tbordereast* tables under
+// key = eastcolor*4 + in.tdam[northI]. For each available one: store its S
+// value at in.tdam[northI+10], flag the tile used, call the next handler with
+// in.tdam[northI+1] as its first argument, then release the tile.
+void BorderEast(const int eastcolor, const int northI) {
+  int key, count, fn1, tile;
+  void (*next)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  next=tfncall[out.fn_idx+1];
+  out.fn_idx++;
+
+  key=eastcolor*4 + in.tdam[northI];
+  count=tbordereastN[key];
+  for(fn1=0; fn1<count; fn1++) {
+    tile=tbordereastT[key][fn1];
+    if(!tmp.ttiles[64+1 + tile]) continue; // border tile already in use
+    in.tdam[northI+10]=tbordereastS[key][fn1];
+    tmp.ttiles[64+1 + tile]=0;
+    (*next)(in.tdam[northI+1], northI+1);
+    tmp.ttiles[64+1 + tile]=1;
+  }
+
+  out.fn_idx--;
+}
+
+// BorderEast variant for the bottom line, from J9 down to J3 (J2 is handled
+// by Special_J2). Differences from BorderEast are tagged '//#':
+//  - arguments arrive in the order (northI, eastcolor);
+//  - the lookup key is in.tdam[northI]*4 + eastcolor;
+//  - the next handler is called with northI-1 and the tile's tbordureG value;
+//  - nothing is written to in.tdam[].
+void BorderEastBottom(const int northI, const int eastcolor) {
+  int key, count, fn1, tile;
+  void (*next)(const int, const int);
+  macro_globaltrace2(out.fn_idx);
+  next=tfncall[out.fn_idx+1];
+
+  key=in.tdam[northI] * 4 + eastcolor; //#
+  count=tbordereastN[key];
+  for(fn1=0; fn1<count; fn1++) {
+    tile=tbordereastT[key][fn1];
+    if(!tmp.ttiles[64+1 + tile]) continue; // border tile already in use
+    tmp.ttiles[64+1 + tile]=0;
+    out.fn_idx++;
+    (*next)(northI-1, tbordureG[tile]); //#
+    out.fn_idx--;
+    tmp.ttiles[64+1 + tile]=1;
+  }
+}
+
+// BorderEastBottom with the final check for J2, the last square.
+// Arguments arrive in the order (northI, eastcolor), as for BorderEastBottom.
+// Every available border tile under key in.tdam[northI]*4 + eastcolor whose
+// tbordureG value is 1 (tdam[J1E] == 1 for this specific problem) completes
+// a solution and increments out.globalres.
+// This handler is the last entry of tfncall[], so no successor is fetched:
+// tfncall[out.fn_idx+1] would be one element past the end of the table.
+void Special_J2(const int northI, const int eastcolor) {
+  int couleur, tuileN, fn1, tuile;
+  macro_globaltrace2(out.fn_idx);
+
+  couleur=in.tdam[northI] * 4 + eastcolor; //#
+  tuileN=tbordereastN[couleur];
+  LOOP1(tuileN) {
+    tuile=tbordereastT[couleur][fn1];
+    ifnz(tmp.ttiles[64+1 + tuile]) {
+      if(tbordureG[tuile] != 1) continue; // tdam[J1E] == 1 for this specific problem
+      
+      out.globalres++; // O_O reach this point after about 10^17 nodes...
+      
+    }
+  }
+}
+
+// BorderEast for square H10, with a check-up for I10: candidates whose
+// tbordureG value is 0 are rejected (borders 0/1/x/0 do not exist on this
+// specific problem). Unlike BorderEast, in.tdam[northI+10] is not written.
+void Special_H10(const int eastcolor, const int northI) {
+  int key, count, fn1, tile;
+  void (*next)(const int, const int);
+  macro_globaltrace2(out.fn_idx);
+  next=tfncall[out.fn_idx+1];
+  out.fn_idx++;
+
+  key=eastcolor*4 + in.tdam[northI];
+  count=tbordereastN[key];
+  for(fn1=0; fn1<count; fn1++) {
+    tile=tbordereastT[key][fn1];
+    if(!tmp.ttiles[64+1 + tile]) continue; // border tile already in use
+    if(tbordureG[tile] == 0) continue; // borders 0/1/x/0 do not exist on this specific problem
+    //in.tdam[northI+10]=tbordereastS[key][fn1];
+
+    tmp.ttiles[64+1 + tile]=0;
+    (*next)(in.tdam[northI+1], northI+1);
+    tmp.ttiles[64+1 + tile]=1;
+  }
+
+  out.fn_idx--;
+}
+
+// BorderEast for square I10 with a strong constraint: only candidates whose
+// tbordureG value is 2 are kept (J10N == 2 for this specific problem).
+// The next handler starts the bottom line and is therefore called with
+// (northI+9, tmp.j9e), i.e. in (northI, eastcolor) order.
+void Special_I10(const int eastcolor, const int northI) {
+  int key, count, fn1, tile;
+  void (*next)(const int, const int);
+  macro_globaltrace2(out.fn_idx);
+  next=tfncall[out.fn_idx+1];
+  out.fn_idx++;
+
+  key=eastcolor*4 + in.tdam[northI];
+  count=tbordereastN[key];
+  for(fn1=0; fn1<count; fn1++) {
+    tile=tbordereastT[key][fn1];
+    if(!tmp.ttiles[64+1 + tile]) continue; // border tile already in use
+    if(tbordureG[tile] != 2) continue; // J10N == 2 for this specific problem
+
+    tmp.ttiles[64+1 + tile]=0;
+    (*next)(northI+9, tmp.j9e);
+    tmp.ttiles[64+1 + tile]=1;
+  }
+
+  out.fn_idx--;
+}
+
+//??? why does it NOT work ? should be 10 % faster
+// BorderEast variant that calls TileN_Update() once on entry, before the
+// candidate loop; otherwise the same search as BorderEast.
+void BorderEastUpdate(const int eastcolor, const int northI) {
+  int key, count, fn1, tile;
+  void (*next)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  next=tfncall[out.fn_idx+1];
+
+  TileN_Update();
+
+  out.fn_idx++;
+
+  key=eastcolor*4 + in.tdam[northI];
+  count=tbordereastN[key];
+  for(fn1=0; fn1<count; fn1++) {
+    tile=tbordereastT[key][fn1];
+    if(!tmp.ttiles[64+1 + tile]) continue; // border tile already in use
+    in.tdam[northI+10]=tbordereastS[key][fn1];
+    tmp.ttiles[64+1 + tile]=0;
+    (*next)(in.tdam[northI+1], northI+1);
+    tmp.ttiles[64+1 + tile]=1;
+  }
+
+  out.fn_idx--;
+}
+
+// Inner-square handler with the same logic as InnerTile(): candidates from
+// t14[key], key = eastcolor + 10*in.tdam[northI], bounded by ttileN[key];
+// out.fn_idx is bumped around each recursive call.
+void InnerTile0(const int eastcolor, const int northI) {
+  int key, count, fn1, packed, tile, east;
+  void (*next)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  next=tfncall[out.fn_idx+1];
+
+  key=eastcolor + 10*in.tdam[northI];
+  count=ttileN[key];
+  for(fn1=0; fn1<count; fn1++) {
+    packed=t14[key][fn1];
+    tile=packed & 0xff;              // bits 0..7: tile index
+    if(!tmp.ttiles[tile]) continue;  // tile already in use
+    east=(packed>>8) & 0xff;         // bits 8..15: colour for the next square
+    in.tdam[northI+10]=packed>>16;   // remaining bits: stored one row below
+    tmp.ttiles[tile]=0;
+    out.fn_idx++;
+    (*next)(east, northI+1);
+    out.fn_idx--;
+    tmp.ttiles[tile]=1;
+  }
+}
+
+// Inner-square handler that bounds its candidate loop with ttileN_upd[]
+// instead of ttileN[], and bumps out.fn_idx once around the whole loop
+// rather than around each call.
+void InnerTile1(const int eastcolor, const int northI) {
+  int key, count, fn1, packed, tile, east;
+  void (*next)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  next=tfncall[out.fn_idx+1];
+  out.fn_idx++;
+
+  key=eastcolor + 10*in.tdam[northI];
+  count=ttileN_upd[key];
+  for(fn1=0; fn1<count; fn1++) {
+    packed=t14[key][fn1];
+    tile=packed & 0xff;
+    if(!tmp.ttiles[tile]) continue; // tile already in use
+    east=(packed>>8) & 0xff;
+    in.tdam[northI+10]=packed>>16;
+    tmp.ttiles[tile]=0;
+    (*next)(east, northI+1);
+    tmp.ttiles[tile]=1;
+  }
+
+  out.fn_idx--;
+}
+
+// Watch out: level idx keys on teast[idx - 1]; everything else is standard.
+// macro_innerrow_loop(idx) opens nesting level idx of InnerRow(): it computes
+// the lookup key for square northI+idx, starts the candidate loop, unpacks the
+// t14[] entry and flags the tile used. It deliberately leaves TWO braces open
+// (the LOOP body and the ifnz body); macro_innerrow_loopz(idx) releases the
+// tile and closes both.
+// Expects these names in the enclosing scope: tcouleur, ttuileN, tfn1, tfn2,
+// ttuile, teast (arrays indexed by level), northI and south.
+#define macro_innerrow_loop(idx)\
+			macro_globaltrace(out.fn_idx - 8 + idx);\
+			tcouleur[idx]=teast[idx - 1] + 10*in.tdam[northI +idx];\
+			ttuileN[idx]=ttileN[tcouleur[idx]];\
+			LOOP(tfn1[idx], ttuileN[idx]) {\
+				tfn2[idx]=t14[tcouleur[idx]][tfn1[idx]];\
+				ttuile[idx]=tfn2[idx] & 0xff;\
+				ifnz(tmp.ttiles[ttuile[idx]]) {\
+					teast[idx]=(tfn2[idx]>>8) & 0xff;\
+					in.tdam[south + idx]=tfn2[idx] >> 16;\
+					tmp.ttiles[ttuile[idx]]=0;
+					
+#define macro_innerrow_loopz(idx) tmp.ttiles[ttuile[idx]]=1; } }
+
+
+// Handles 8 consecutive inner squares (northI .. northI+7) in one function.
+// Level 0 is written out; levels 1..7 are opened by macro_innerrow_loop() and
+// closed, in reverse order, by macro_innerrow_loopz().
+// eastcolor : colour handed over by the square to the west of the first one.
+// northI    : in.tdam[] index of the first square; 'south' (northI+10) is the
+//             base index where the unpacked upper bits of each entry go.
+// out.fn_idx is advanced by 8 for the duration of the call, so the successor
+// is the tfncall[] entry 8 slots after this one; it is called with the east
+// colour of the 8th square and northI+8.
+void InnerRow(const int eastcolor, const int northI) {
+  int tcouleur[8], ttuileN[8], tfn1[8], tfn2[8], ttuile[8], teast[8];
+	int south=northI+10;
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+
+	out.fn_idx +=8;
+  ptr=tfncall[out.fn_idx];
+	
+  // level 0: same steps as the macro, but keyed on the eastcolor argument
+  tcouleur[0]=eastcolor + 10*in.tdam[northI];
+  ttuileN[0]=ttileN[tcouleur[0]];
+  LOOP(tfn1[0], ttuileN[0]) {
+    tfn2[0]=t14[tcouleur[0]][tfn1[0]];
+    ttuile[0]=tfn2[0] & 0xff;
+    ifnz(tmp.ttiles[ttuile[0]]) {
+      teast[0]=(tfn2[0]>>8) & 0xff;
+      in.tdam[south +0]=tfn2[0] >> 16;
+      tmp.ttiles[ttuile[0]]=0;
+      
+			// (*ptr)(teast[0], northI+1);
+			// expanded form of level 1, kept for reference:
+			/*
+			tcouleur[1]=teast[0] + 10*in.tdam[northI +1];
+			ttuileN[1]=ttileN[tcouleur[1]];
+			LOOP(tfn1[1], ttuileN[1]) {
+				tfn2[1]=t14[tcouleur[1]][tfn1[1]];
+				ttuile[1]=tfn2[1] & 0xff;
+				ifnz(tmp.ttiles[ttuile[1]]) {
+					teast[1]=(tfn2[1]>>8) & 0xff;
+					in.tdam[south +1]=tfn2[1] >> 16;
+					tmp.ttiles[ttuile[1]]=0;
+					out.fn_idx++;
+			*/
+			macro_innerrow_loop(1)
+				macro_innerrow_loop(2)
+					macro_innerrow_loop(3)
+						macro_innerrow_loop(4)
+							macro_innerrow_loop(5)
+								macro_innerrow_loop(6)
+									macro_innerrow_loop(7)
+									
+									(*ptr)(teast[7], northI + 8);
+									
+									macro_innerrow_loopz(7)
+								macro_innerrow_loopz(6)
+							macro_innerrow_loopz(5)
+						macro_innerrow_loopz(4)
+					macro_innerrow_loopz(3)
+				macro_innerrow_loopz(2)
+			macro_innerrow_loopz(1)
+      
+      tmp.ttiles[ttuile[0]]=1;
+    }
+  }
+
+	out.fn_idx -=8;
+}
+
+
+// Watch out: level idx keys on teast[idx - 1]; everything else is standard.
+// Same as macro_innerrow_loop(idx) except that the loop bound is read from
+// ttileN_upd[] instead of ttileN[]. It leaves two braces open, to be closed
+// by macro_innerrow_loopz(idx).
+#define macro_innerrow_loop2(idx)\
+			macro_globaltrace(out.fn_idx - 8 + idx);\
+			tcouleur[idx]=teast[idx - 1] + 10*in.tdam[northI +idx];\
+			ttuileN[idx]=ttileN_upd[tcouleur[idx]];\
+			LOOP(tfn1[idx], ttuileN[idx]) {\
+				tfn2[idx]=t14[tcouleur[idx]][tfn1[idx]];\
+				ttuile[idx]=tfn2[idx] & 0xff;\
+				ifnz(tmp.ttiles[ttuile[idx]]) {\
+					teast[idx]=(tfn2[idx]>>8) & 0xff;\
+					in.tdam[south + idx]=tfn2[idx] >> 16;\
+					tmp.ttiles[ttuile[idx]]=0;
+
+
+// Variant of InnerRow() that bounds every candidate loop with ttileN_upd[]
+// instead of ttileN[]: 8 consecutive inner squares (northI .. northI+7) are
+// handled in one function, level 0 written out and levels 1..7 opened by
+// macro_innerrow_loop2() and closed by macro_innerrow_loopz().
+// All 8 levels now read ttileN_upd[]; previously levels 1..7 expanded
+// macro_innerrow_loop() and so mixed ttileN[] with ttileN_upd[].
+// eastcolor : colour handed over by the square to the west of the first one.
+// northI    : in.tdam[] index of the first square.
+// out.fn_idx is advanced by 8 for the duration of the call; the successor is
+// called with the east colour of the 8th square and northI+8.
+void InnerRow2(const int eastcolor, const int northI) {
+  int tcouleur[8], ttuileN[8], tfn1[8], tfn2[8], ttuile[8], teast[8];
+	int south=northI+10;
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+
+	out.fn_idx +=8;
+  ptr=tfncall[out.fn_idx];
+	
+  // level 0: same steps as the macro, but keyed on the eastcolor argument
+  tcouleur[0]=eastcolor + 10*in.tdam[northI];
+  ttuileN[0]=ttileN_upd[tcouleur[0]];
+  LOOP(tfn1[0], ttuileN[0]) {
+    tfn2[0]=t14[tcouleur[0]][tfn1[0]];
+    ttuile[0]=tfn2[0] & 0xff;
+    ifnz(tmp.ttiles[ttuile[0]]) {
+      teast[0]=(tfn2[0]>>8) & 0xff;
+      in.tdam[south +0]=tfn2[0] >> 16;
+      tmp.ttiles[ttuile[0]]=0;
+
+			macro_innerrow_loop2(1)
+				macro_innerrow_loop2(2)
+					macro_innerrow_loop2(3)
+						macro_innerrow_loop2(4)
+							macro_innerrow_loop2(5)
+								macro_innerrow_loop2(6)
+									macro_innerrow_loop2(7)
+									
+									(*ptr)(teast[7], northI + 8);
+									
+									macro_innerrow_loopz(7)
+								macro_innerrow_loopz(6)
+							macro_innerrow_loopz(5)
+						macro_innerrow_loopz(4)
+					macro_innerrow_loopz(3)
+				macro_innerrow_loopz(2)
+			macro_innerrow_loopz(1)
+      
+      tmp.ttiles[ttuile[0]]=1;
+    }
+  }
+
+	out.fn_idx -=8;
+}
+
+
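+/* total flop, or a bug once again
+   NOTE(review): in the loop below, `&tfn2 + fn1*4` advances by whole tfn2
+   arrays (pointer to int64_t[4]), not by 4-byte ints, so every fn1 > 0 reads
+   past the buffer; this is a likely cause of the failure.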
+void InnerTile0(const int eastcolor, const int northI) {
+  int64_t tfn2[4];
+  int couleur, tuileN, fn1, fn2, tuile, east;
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  ptr=tfncall[out.fn_idx+1];
+
+  couleur=eastcolor + 10*in.tdam[northI];
+
+  tfn2[0]=*(int64_t *)&t14[couleur][0];
+  tfn2[1]=*(int64_t *)&t14[couleur][2];
+  tfn2[2]=*(int64_t *)&t14[couleur][4];
+  tfn2[3]=*(int64_t *)&t14[couleur][6];
+
+  tuileN=ttileN[couleur];
+  LOOP1(tuileN) {
+    fn2=*(int *)(&tfn2 + fn1*4); //t14[couleur][fn1];
+    tuile=fn2 & 0xff;
+    ifnz(tmp.ttiles[tuile]) {
+      east=(fn2>>8) & 0xff;
+      in.tdam[northI+10]=fn2>>16;
+      tmp.ttiles[tuile]=0;
+      out.fn_idx++;
+      (*ptr)(east, northI+1);
+      out.fn_idx--;
+      tmp.ttiles[tuile]=1;
+    }
+  }
+}
+*/
+
+// Placeholder handler used to pad tfncall[]: it only records a trace for the
+// current out.fn_idx and ignores both arguments.
+void Special_Debug(const int north, const int northI) {
+  macro_globaltrace(out.fn_idx);
+}
+
+//#######################################
+
+// Unpack the 32 bits of 'tiles' into dest[0..31], least significant bit
+// first, one int (0 or 1) per bit.
+// Kept out of line on purpose (noinline): we may need some room.
+void __attribute__ ((noinline)) Input_Copy(int tiles, int *dest) {
+  int *end=dest + 32;
+
+  while(dest != end) {
+    *dest++=tiles & 1;
+    tiles>>=1;
+  }
+}
+
+//void __attribute__((interrupt)) null_isr() { return; }
+
+//#######################################
+
+// Device-side entry point: load the tile availability bitmaps from shared
+// memory, rebuild the border lookup tables, run the search from the first
+// tfncall[] entry, report completion through out.cmd, then idle the core.
+int main(void) {
+e_start:; // target of the disabled 'goto e_start' at the end of main
+  
+  int fn1, westcolor, eastcolor, tiles;
+
+  volatile signed int *inputP  = (void *)SHARED_IN;  // pointer for input
+  //volatile signed int *cmdP    = (void *)SHARED_CMD; // pointer for output command
+
+  // init compute kernel
+  // Three 32-bit words, one availability bit per tile, expanded into one int
+  // flag per tile in tmp.ttiles[].
+  tiles=*(inputP+0); Input_Copy(tiles, &tmp.ttiles[ 0]); // 1st 32 tiles
+  tiles=*(inputP+1); Input_Copy(tiles, &tmp.ttiles[32]);
+  tiles=*(inputP+2); Input_Copy(tiles, &tmp.ttiles[VOIDTILE+1]); // 32 borders
+  tmp.ttiles[VOIDTILE]=0; // VOIDTILE == 64
+    
+  // reset the per-key entry counters of the border tables
+  LOOP1(4)
+    tborderwestN[fn1]=0;
+  LOOP1(40)
+    tbordereastN[fn1]=0;
+
+  // register every available border tile in both the west and east tables
+  LOOP1(32) {
+    if(tmp.ttiles[64+1 +fn1]) {
+      // west tables are keyed by tbordureG[]
+      westcolor=tbordureG[fn1];
+      tborderwestT[westcolor] [tborderwestN[westcolor]]=fn1;
+      tborderwestE[westcolor] [tborderwestN[westcolor]]=tbordureI[fn1];
+      tborderwestS[westcolor] [tborderwestN[westcolor]]=tbordureD[fn1];
+      tborderwestN[westcolor]++;
+      
+      // east tables are keyed by tbordureI[]*4 + tbordureD[]
+      // NOTE(review): no bounds check against the [4][9] / [40][4] table
+      // sizes; presumably guaranteed by the tile set -- confirm.
+      eastcolor=tbordureI[fn1]*4 + tbordureD[fn1];
+      tbordereastT[eastcolor] [tbordereastN[eastcolor]]=fn1;
+      tbordereastS[eastcolor] [tbordereastN[eastcolor]]=tbordureG[fn1];
+      tbordereastN[eastcolor]++;
+    }
+  }
+
+  // start the recursive search at depth 0 (first tfncall[] entry)
+  out.fn_idx=0;
+  BorderWest(in.tdam[C1N], C1N);
+
+  out.cmd=CMD_DONE; // *cmdP=CMD_DONE;
+  
+  //return 0;
+  __asm__ __volatile__ ("idle"); // experience: can you idle an Epiphany core until ARM wakes it up ?  Answer: empirically, yes ; use e_start() to reload the core
+  //goto e_start; // wake by IVT # 0
+  
+}
diff --git a/paralle2/x86_build.sh b/paralle2/x86_build.sh
new file mode 100644
index 0000000..0113851
--- /dev/null
+++ b/paralle2/x86_build.sh
@@ -0,0 +1,3 @@
+# cross compiling on x86_64 host, assuming /opt/adapteva as default path
+
+e-gcc -Ofast -mfp-mode=int -mshort-calls -m1reg-r63 -T /opt/adapteva/esdk/bsps/current/internal.ldf src/e_e2g.c -o bin/e_e2g.elf -le-lib
diff --git a/paralle2/x86_buildasm.sh b/paralle2/x86_buildasm.sh
new file mode 100644
index 0000000..20836b0
--- /dev/null
+++ b/paralle2/x86_buildasm.sh
@@ -0,0 +1,12 @@
+# cross compiling on x86_64 host
+
+# -mshort-calls: OK
+# -msmall16: still broken
+# -m1reg-r63: OK
+# -mfp-mode=int: OK
+
+echo Cross compiling on x86_64 host
+echo.
+echo.
+e-gcc -Ofast -mfp-mode=int -mshort-calls -m1reg-r63 -T /opt/adapteva/esdk/bsps/current/internal.ldf src/e_e2g.c -S -le-lib
+echo.
diff --git a/paralle3/README.md b/paralle3/README.md
new file mode 100644
index 0000000..928cca0
--- /dev/null
+++ b/paralle3/README.md
@@ -0,0 +1,45 @@
+# A 10x10 Eternity II solver
+
+## Build and Run
+
+Build the data BEFORE anything else; this creates bin/bench.bin, a collection of benchmarks:
+
+ ./build_data.sh
+ ./run_data.sh
+
+ ./build.sh
+ ./run.sh value
+
+ Want assembly output? Use ./buildasm.sh
+ Cross compiling on an x86_64 host? You can use ./x86*.sh  
+ 
+## Benchmarks
+
+All programs are full C, sometimes with some assembly.
+Mn/s/W = Million nodes per second per Watt
+
+
+GPU OpenCL                      : not even a tenth of a modest x86 core, using a Radeon 5770 graphics card. The numerous branches make it a dead end, not to mention the watts.
+Parallella, one ARM A9 core     :    6 Mn/s ;   3.0 W ;  2    Mn/s/W
+My high-end computer, one core  :  166 Mn/s ;  72.0 W ;  2.3  Mn/s/W ; x86_64, Fedora Core 23, i7 5820k
+Raspberry Pi 3                  :                        8-10 Mn/s/W iirc ; A53, 4-core, 1.2 GHz
+My high-end computer, 12 threads: 1470 Mn/s ; 140.0 W ; 10.0  Mn/s/W
+Odroid XU4                      :  245 Mn/s ;  15.7 W ; 15.6 Mn/s/W ; 8-core ; deeply optimized, not much margin
+Parallella 16-core Epiphany     :  103 Mn/s ;   5.0 W ; 20.6 Mn/s/W ; unplug the Ethernet cable (only needed for ssh on the headless Parabuntu distro) to save 0.6 W
+
+
+So...
+To my knowledge, Parallella is currently the most energy-efficient platform for this highly recursive task,
+even though the task does *not* use any floating point!
+
+Eagerly awaiting the 1024-core Epiphany V...
+
+## Author
+
+DonQuichotteComputers at gmail dot com
+2017
+
+## License
+
+BSD-3 clause.
+
diff --git a/paralle3/build.sh b/paralle3/build.sh
new file mode 100644
index 0000000..b74c16d
--- /dev/null
+++ b/paralle3/build.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+set -e
+
+ESDK=${EPIPHANY_HOME}
+ELIBS=${ESDK}/tools/host/lib
+EINCS=${ESDK}/tools/host/include
+ELDF=${ESDK}/bsps/current/internal.ldf
+
+SCRIPT=$(readlink -f "$0")
+EXEPATH=$(dirname "$SCRIPT")
+cd $EXEPATH
+
+CROSS_PREFIX=
+case $(uname -p) in
+	arm*)
+		# Use native arm compiler (no cross prefix)
+		CROSS_PREFIX=
+		;;
+	   *)
+		# Use cross compiler
+		CROSS_PREFIX="arm-linux-gnueabihf-"
+		;;
+esac
+
+# Create output dir
+mkdir -p bin
+
+# Build HOST side application
+${CROSS_PREFIX}gcc src/e2g.c -o bin/e2g.elf -I ${EINCS} -L ${ELIBS} -le-hal -le-loader
+
+# Build DEVICE side program
+# -msmall16 still does not work with 2016.11 ESDK and gcc 5.4
+#-mshort-calls still works :D  
+
+#e-gcc -Ofast -T ${ELDF} -msmall16 src/e_e2g.c -o bin/e_e2g.elf -le-lib
+#e-gcc 5.4 makes poor use of the option -mfp-mode=int
+#  the option -mfp-iarith slows DOWN my program -- more than 20 % :/ 
+
+ e-gcc -Ofast -T ${ELDF} -mfp-mode=int -mshort-calls -m1reg-r63 src/e_e2g.c -o bin/e_e2g.elf -le-lib
+
+# trick to get the spare room usage: epiphany-elf-size your_program.elf ; with internal.ldf the value of 'dec' cannot be beyond 32767
+#
+#parallella@parallella:~/parallella-examples/tmp$ epiphany-elf-size bin/e_e2g.elf 
+#   text	   data	    bss	    dec	    hex	filename
+#  18730	   2148	   2808	  23686	   5c86	bin/e_e2g.elf
+#
diff --git a/paralle3/build_data.sh b/paralle3/build_data.sh
new file mode 100644
index 0000000..587ae30
--- /dev/null
+++ b/paralle3/build_data.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+set -e
+
+ESDK=${EPIPHANY_HOME}
+ELIBS=${ESDK}/tools/host/lib
+EINCS=${ESDK}/tools/host/include
+ELDF=${ESDK}/bsps/current/internal.ldf
+
+SCRIPT=$(readlink -f "$0")
+EXEPATH=$(dirname "$SCRIPT")
+cd $EXEPATH
+
+CROSS_PREFIX=
+case $(uname -p) in
+	arm*)
+		# Use native arm compiler (no cross prefix)
+		CROSS_PREFIX=
+		;;
+	   *)
+		# Use cross compiler
+		CROSS_PREFIX="arm-linux-gnueabihf-"
+		;;
+esac
+
+# Create output dir
+mkdir -p bin
+
+# Build HOST side application
+${CROSS_PREFIX}gcc src/build_data.c -o bin/build_data.elf -I ${EINCS} -L ${ELIBS}
diff --git a/paralle3/buildasm.sh b/paralle3/buildasm.sh
new file mode 100644
index 0000000..d513d73
--- /dev/null
+++ b/paralle3/buildasm.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+set -e
+
+ESDK=${EPIPHANY_HOME}
+ELIBS=${ESDK}/tools/host/lib
+EINCS=${ESDK}/tools/host/include
+ELDF=${ESDK}/bsps/current/internal.ldf
+
+SCRIPT=$(readlink -f "$0")
+EXEPATH=$(dirname "$SCRIPT")
+cd $EXEPATH
+
+CROSS_PREFIX=
+case $(uname -p) in
+	arm*)
+		# Use native arm compiler (no cross prefix)
+		CROSS_PREFIX=
+		;;
+	   *)
+		# Use cross compiler
+		CROSS_PREFIX="arm-linux-gnueabihf-"
+		;;
+esac
+
+# Create output dir
+mkdir -p bin
+
+# Build HOST side application
+${CROSS_PREFIX}gcc src/e2g.c -o bin/e2g.elf -I ${EINCS} -L ${ELIBS} -le-hal -le-loader
+
+# Build DEVICE side program
+# -msmall16 still does not work with 2016.11 ESDK and gcc 5.4
+#-mshort-calls still works :D  
+
+#e-gcc -Ofast -T ${ELDF} -msmall16 src/e_e2g.c -o bin/e_e2g.elf -le-lib
+#e-gcc 5.4 makes poor use of the option -mfp-mode=int
+#  the option -mfp-iarith slows DOWN my program -- more than 20 % :/ 
+
+ e-gcc -Ofast -T ${ELDF} -mfp-mode=int -mshort-calls -m1reg-r63 -mfp-iarith src/e_e2g.c -S -le-lib
diff --git a/paralle3/readme.MD b/paralle3/readme.MD
new file mode 100644
index 0000000..aa4f88f
--- /dev/null
+++ b/paralle3/readme.MD
@@ -0,0 +1 @@
+ah
diff --git a/paralle3/run.sh b/paralle3/run.sh
new file mode 100644
index 0000000..3c78ee8
--- /dev/null
+++ b/paralle3/run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#
+# Run the host application on one benchmark and time it.
+# Usage: ./run.sh numeric-value
+
+set -e
+
+BENCH_INDEX=""
+
+# Exactly one non-negative integer is expected; diagnostics go to stderr.
+if [ $# -lt 1 ]; then
+	echo "Usage:  ./run.sh numeric-value" >&2
+	exit 1
+fi
+
+if [[ ! "$1" =~ ^[0-9]+$ ]]; then
+	echo "ERROR:  value must be numeric" >&2
+	echo "Usage:  ./run.sh numeric-value" >&2
+	exit 1
+fi
+
+BENCH_INDEX=$1
+
+# Work from the script's own directory so the relative bin/ path resolves.
+cd "$(dirname "$(readlink -f "$0")")"
+
+time bin/e2g.elf "${BENCH_INDEX}"
+
diff --git a/paralle3/run_data.sh b/paralle3/run_data.sh
new file mode 100644
index 0000000..7da8aff
--- /dev/null
+++ b/paralle3/run_data.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+set -e
+
+bin/build_data.elf
+
+echo "Building data: done.  Now you can run './build.sh'"
diff --git a/paralle3/src/C_common2.h b/paralle3/src/C_common2.h
new file mode 100644
index 0000000..ebea59a
--- /dev/null
+++ b/paralle3/src/C_common2.h
@@ -0,0 +1,46 @@
+// avoid stdint.h
+#define uint8_t		unsigned char
+#define uint16_t	unsigned short
+#define uint32_t	unsigned int
+#define uint64_t	unsigned long long // <!> unsigned long = 4 bytes under ARM
+
+// 'signed char', not plain 'char': the signedness of plain char is
+// implementation-defined and it is unsigned by default on ARM.
+#define int8_t		signed char
+#define int16_t		short
+#define int32_t		int
+#define int64_t		long long
+
+// my semantic
+#define uc     unsigned char
+#define ull    unsigned long long
+#define pf      printf
+// NB: 'print' and 'pfv' already end with a semicolon.
+#define print   printf("\n");
+#define pfv(x)  printf("v: %d\n",x);
+// LOOPn(x): count the variable fnN (declared by the caller) from 0 to x-1.
+#define LOOP1(x) for(fn1=0;fn1<(x);fn1++)
+#define LOOP2(x) for(fn2=0;fn2<(x);fn2++)
+#define LOOP3(x) for(fn3=0;fn3<(x);fn3++)
+#define LOOP4(x) for(fn4=0;fn4<(x);fn4++)
+#define LOOP5(x) for(fn5=0;fn5<(x);fn5++)
+#define LOOP6(x) for(fn6=0;fn6<(x);fn6++)
+#define LOOP7(x) for(fn7=0;fn7<(x);fn7++)
+#define LOOP8(x) for(fn8=0;fn8<(x);fn8++)
+// LOOP(x,y): count the lvalue x from 0 to y-1. Both arguments are
+// parenthesised so that expressions such as 'a ? b : c' expand correctly.
+#define LOOP(x,y) for((x)=0;(x)<(y);(x)++)
+
+// #######################################
+// think x86 asm, think jz... setz... cmovz... think flags...
+#define ifz(x) if(!(x))
+#define ifnz(x) if(x)
+#define ife(x) if(!(x))
+#define ifne(x) if(x)
+// booleans
+#define ifzbool32(x,bit) ifz((x)&(1U<<(bit))) 
+#define ifbool32(x,bit)  if((x)&(1U<<(bit))) 
+#define ifzbool64(x,bit) ifz((uint64_t)(x)&(1ULL<<(bit)))
+#define ifbool64(x,bit)  if((uint64_t)(x)&(1ULL<<(bit)))
+
+// __asm__ __volatile__ ("idle");// rest!
+
+/* reminder
+#define ALIGN(x)   __attribute__ ((aligned (x))) 
+#define PACKED     __attribute__ ((packed)) 
+#define SECTION(x) __attribute__ ((section (x)))
+*/
diff --git a/paralle3/src/build_data.c b/paralle3/src/build_data.c
new file mode 100644
index 0000000..deed086
--- /dev/null
+++ b/paralle3/src/build_data.c
@@ -0,0 +1,39 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#define BENCH_INIT 0    // not referenced in this file
+#define BENCH_MAX  1023 // not referenced in this file
+// record layout as decoded by the host (e2g.c): bytes 0-7 -> tuile2do, 8-11 -> bordertuile2do, 12-15 -> 8 nibbles for tdam[C2N..C9N], byte 16 -> low nibble = tdam[C1N]+1, high nibble = tdam[C10N]+1
+const unsigned char tbench[16][17]={ // 16 bench records of 17 bytes each ; main() dumps the whole table repeatedly into bin/bench.bin
+  { 0xFF,0xED,0xDF,0xFC,0xFF,0xF4,0xFF,0xFF,0x25,0x7E,0xEB,0xEF,0x79,0x02,0x05,0x84,0x43 },
+  { 0xE5,0xFF,0xFB,0xFB,0xEF,0xFE,0xFF,0xFB,0xF5,0xFA,0x56,0xBE,0x39,0x19,0x17,0x15,0x44 },
+  { 0xFE,0xBF,0xFF,0xDF,0xEC,0xFB,0xFF,0xFE,0x8D,0x1F,0xAF,0xBF,0x82,0x23,0x45,0x50,0x23 },
+  { 0xFF,0xF5,0xFF,0xFF,0x6E,0xFF,0xFF,0x3B,0x5C,0x0F,0xDF,0xF7,0x32,0x79,0x03,0x97,0x23 },
+  { 0xBF,0xFF,0xFF,0xF9,0x7F,0xF7,0xDE,0xDF,0x3C,0x3E,0x9D,0xFF,0x55,0x58,0x03,0x46,0x33 },
+  { 0xE6,0xD7,0xDF,0xFF,0xFF,0xFF,0xEF,0xFD,0xDD,0x6E,0x5B,0x6F,0x79,0x13,0x30,0x83,0x43 },
+  { 0xFE,0xFB,0xF3,0xEB,0xFF,0x7F,0xFD,0xFF,0xF9,0xE7,0xC8,0xFD,0x63,0x00,0x35,0x33,0x34 },
+  { 0xEF,0xF7,0xFD,0xF7,0xFE,0xF1,0xFF,0xFF,0xF8,0xBB,0x4F,0xAF,0x27,0x50,0x62,0x30,0x13 },
+  { 0xFE,0xF7,0xF7,0xF0,0xFB,0xFF,0xFF,0xFF,0xF9,0x3E,0x5F,0x4F,0x55,0x53,0x10,0x57,0x43 },
+  { 0xBF,0xF6,0xDF,0xEF,0xFF,0xFE,0xBF,0xF7,0x6C,0x9E,0x9B,0xFF,0x33,0x22,0x22,0x98,0x23 },
+  { 0xFF,0xEF,0x4F,0xBB,0xBF,0xBF,0xFF,0xFF,0x68,0xFC,0xEF,0xE7,0x69,0x31,0x02,0x93,0x13 },
+  { 0xBB,0xFD,0xF7,0xFF,0x9F,0xFF,0xFB,0xFE,0xE1,0xFB,0xAD,0xDD,0x22,0x37,0x15,0x60,0x34 },
+  { 0x7F,0xCE,0xFF,0xBD,0xFF,0xFE,0xEF,0xFF,0xE5,0x7A,0x78,0xFF,0x13,0x50,0x07,0x00,0x34 },
+  { 0xFC,0xFF,0x7F,0xFE,0xDF,0xFD,0x3F,0xFF,0x64,0xFA,0xE7,0x7F,0x82,0x58,0x99,0x05,0x14 },
+  { 0xFF,0x7B,0xF7,0xB3,0xFF,0xFF,0xFE,0xBF,0xB7,0x36,0xEC,0xFE,0x64,0x57,0x11,0x06,0x44 },
+  { 0xFD,0xEF,0xDF,0xFB,0x7F,0xFF,0xF2,0xFF,0xF2,0x73,0x3F,0x3F,0x19,0x96,0x07,0x19,0x14 },
+};
+
+int main(void) {
+	unsigned i, j;
+  FILE *f;
+  
+  f=fopen("bin/bench.bin", "wb");
+  if(f == NULL) { printf("Error writing bin/bench.bin\n"); exit(-1); }
+  
+  for(i=0; i<2048; i++)
+    j=fwrite(tbench, 17*16, 1, f);
+  fclose(f);
+  
+  return 0;
+}
diff --git a/paralle3/src/e2g.c b/paralle3/src/e2g.c
new file mode 100644
index 0000000..93e1ba6
--- /dev/null
+++ b/paralle3/src/e2g.c
@@ -0,0 +1,199 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <e-hal.h> // HOST side ; mandatory
+
+#include "C_common2.h"  // common definitions for C
+#include "e2g_common.h" // common definitions for EII project
+
+#define BENCH_MIN 0    //min bench to start with ; default start when argv[1] is absent
+#define BENCH_MAX 1024 //max bench to start with ; argv[1] is only accepted below this value
+#define BENCH_N   1 //16   //16 to solve per core ; compared with the per-core completion counter in main()
+#define BENCH_LIMIT 10000 //stop after x benchs done ; main() leaves its polling loop once this many results were read
+
+//#######################################
+
+void Epiphany_Boot(e_platform_t *epiphany) { // init the HAL, reset the chip, fill *epiphany ; exits on any failure
+  if(e_init(NULL) != E_OK) { fprintf(stderr, "e_init() failed\n"); exit(-1); } // initialise the system, establish connection to the device
+  if(e_reset_system() != E_OK) { fprintf(stderr, "e_reset_system() failed\n"); exit(-1); } // reset the Epiphany chip
+  if(e_get_platform_info(epiphany) != E_OK) { fprintf(stderr, "e_get_platform_info() failed\n"); exit(-1); } // the caller sizes its workgroup from rows/cols
+}
+
+//#######################################
+//printf("%-2d"...) -> left align
+//GOAL: print the 8x10 array of node counters (rows C..J, columns 1..10) ; a zero counter shows the square name instead
+void Node_Board_Print2(uint *tsolN) {
+  int row, col;
+  uint *cell=tsolN; // walks the counters row after row
+
+  printf("\n");
+  for(col=1; col<=9; col++)
+    printf("              %d", col);
+  printf("             %2d\n", col); // col is 10 here ; the last header takes 2 characters
+  //pf("  -  %c\n", 'A');
+  //pf("  -  %c\n", 'B');
+  
+  for(row=0; row<8; row++) {
+    const int letter='C'+row; // the first displayed row is C
+
+    for(col=1; col<=10; col++, cell++) {
+      if(*cell != 0) {
+        printf("   %012u", *cell);
+        continue;
+      }
+      printf("       %c%d     ", letter, col);
+      if(col < 10) printf(" "); // no extra padding after the last column
+    }
+    printf("  -  %c\n", letter);
+  }
+}
+
+//#######################################
+//Print one result block (total of the counters, 'res', then the board of counters) ; returns total + res
+int64_t Output_Print(Soutput out) {
+  int64_t nodes=0;
+  int k=DAM_SZ;
+  
+  while(k-- > 0) nodes+=out.globaltsolN[k];
+  printf("\n   %012llu nodes\n", nodes);
+  printf("\n   res: %09u\n", out.globalres);
+
+  Node_Board_Print2(out.globaltsolN);
+
+  return out.globalres + nodes;
+}
+
+//#######################################
+//HOST program: loads the bench data, boots the chip, feeds every core and polls their results ; argv[1] = optional bench # to start with
+int main(int argc, char *argv[]) {
+  // Epiphany input/output through shared RAM ; details: e2g_common.h
+  Sio     fromio;//Sio *fromio=(Sio *)malloc(sizeof(Sio));
+  int64_t l1=0;
+	int row, col, i, j, fn1, bench_start=BENCH_MIN, toccN[CORE_N]={0}, benchlimit=0;
+  e_platform_t epiphany;// Epiphany platform configuration
+	e_epiphany_t dev;
+  FILE *fin;
+  char *tbench=(char *)malloc(MAX_CORE_N * 17 * 16);
+  ifz(tbench) { printf("Error: not enough memory for the bench buffer\n"); exit(-1); }
+	if(argc > 1) {
+		i=atoi(argv[1]); if(i >= BENCH_MIN && i < BENCH_MAX) bench_start=i; // a negative value would seek and index before the data
+  }
+
+  //get data
+  fin=fopen("./bin/bench.bin", "rb");
+  ifz(fin) { printf("Error reading file bin/bench.bin ; did you generate it with build_data.sh ?\n"); exit(-1); }
+  if(fseek(fin, 17 * bench_start, SEEK_SET) != 0 || fread(tbench, MAX_CORE_N * 17 * 16, 1, fin) != 1) { // never run on a partly filled buffer
+    printf("Error reading file bin/bench.bin ; did you generate it with build_data.sh ?\n"); fclose(fin); exit(-1); }
+  fclose(fin);
+
+  printf("\n\nEternity II running under Parallella :) \n\n\n");
+	
+  Epiphany_Boot(&epiphany);
+
+	// Create a workgroup using all of the cores	
+	if(e_open(&dev, 0, 0, epiphany.rows, epiphany.cols) != E_OK) { fprintf(stderr, "e_open() failed\n"); exit(-1); }
+	e_reset_group(&dev);
+
+	// Load the device code into each core of the chip, and don't start it yet ; polling cores that got no program would never end
+	if(e_load_group("bin/e_e2g.elf", &dev, 0, 0, epiphany.rows, epiphany.cols, E_FALSE) != E_OK) { fprintf(stderr, "Error loading bin/e_e2g.elf\n"); exit(-1); }
+
+	// Write the initial input block (Sio) of every core at its local offset SHARED_IN
+	i=0;
+  for(row=0;row<epiphany.rows;row++) {
+		for(col=0;col<epiphany.cols;col++) {
+      fromio.out.cmd=CMD_INIT;
+      LOOP1(DAM_SZ) 
+        fromio.in.tdam[fn1]=0;
+      j=17 * bench_start; // for demo purpose: same record for every core. NOTE(review): tbench already starts at file offset 17*bench_start (fseek above), so the offset is applied twice - confirm
+
+      LOOP1(8)
+        fromio.in.tdam[C2N + fn1] = (tbench[j + 12 + (fn1/2)] >> (4 * (fn1&1))) & 15; //format: 2 nibbles per byte
+      fromio.in.tdam[C1N] =(tbench[j + 16] & 15) - 1;
+      fromio.in.tdam[C10N]=(tbench[j + 16] >> 4) - 1;
+      memcpy(&fromio.in.tuile2do,       &tbench[j + 0], sizeof(fromio.in.tuile2do));       // records are 17 bytes long: the source is
+      memcpy(&fromio.in.bordertuile2do, &tbench[j + 8], sizeof(fromio.in.bordertuile2do)); // not aligned, so no pointer-cast load here
+LOOP1(10) pf("%u ", fromio.in.tdam[C1N+fn1]); print
+
+      pf("0x%016llX tiles\n",      (ull)fromio.in.tuile2do);
+      pf("0x%08X       borders\n", (unsigned)fromio.in.bordertuile2do);
+      pf("sz(io) = %u\n", (unsigned)sizeof(Sio));
+      LOOP1(DAM_SZ) fromio.out.globaltsolN[fn1]=0;
+      fromio.out.globalres=0;
+      
+			e_write(&dev, row, col, SHARED_IN, &fromio, sizeof(Sio));
+pf("i %u ; in written ; C1N = %u\n", i, fromio.in.tdam[C1N]);
+
+      i++;
+		}
+	}
+
+	// Start all of the cores
+  pf("Some results in a minute... starting the core workgroup...\n\n");
+	e_start_group(&dev);
+  pf("... core workgroup started ; the whole test will last about 120 seconds...\n\n");
+
+	while(1) {
+		usleep(100000);
+    //pf("fromio.out.cmd: 0x%08X\n", fromio.out.cmd);
+		int done = 0;
+
+		// wait for the cores to complete their work
+		i=0;
+    for(row=0;row<epiphany.rows && benchlimit < BENCH_LIMIT;row++) { // the inner 'break' below must end the whole scan
+			for(col=0;col<epiphany.cols;col++) {
+				if(toccN[i] > BENCH_N) { done++; i++; continue; } // final result already reported: do not print and add it again
+				if(e_read(&dev, row, col, SHARED_CMD, &fromio.out.cmd, sizeof(uint)) != sizeof(uint))
+					fprintf(stderr, "\n\nFailed to read\n\n\n");
+
+				if ( fromio.out.cmd != CMD_INIT) { //== CMD_DONE) {
+          if(e_read(&dev, row, col, SHARED_OUT, &fromio.out, sizeof(uint) * (DAM_SZ+1)) != sizeof(uint) * (DAM_SZ+1))
+            fprintf(stderr, "\n\nFailed to read 2\n\n\n");
+          l1 += Output_Print(fromio.out);
+          pf("Crunched %015llu nodes. bench # %u ; cmd output = 0x%08X\n\n", (ull)l1, benchlimit, fromio.out.cmd);
+          
+          benchlimit++;
+          if(benchlimit >= BENCH_LIMIT) break;
+          if(toccN[i] == BENCH_N) { // last run of this core: flag it (> BENCH_N) so that the next polls skip it
+            toccN[i]++; done++; }
+          else {
+            toccN[i]++;
+            pf("core %4u: done %2u times ; cmd 0x%08X.\n", i, toccN[i], fromio.out.cmd);
+
+            fromio.out.cmd=CMD_INIT;
+            LOOP1(DAM_SZ) 
+              fromio.in.tdam[fn1]=0;
+            j=17 * bench_start; // for demo purpose
+            
+            LOOP1(8)
+              fromio.in.tdam[C2N + fn1] = (tbench[j + 12 + (fn1/2)] >> (4 * (fn1&1))) & 15; //format: 2 nibbles per byte
+            fromio.in.tdam[C1N] =(tbench[j + 16] & 15) - 1;
+            fromio.in.tdam[C10N]=(tbench[j + 16] >> 4) - 1;
+            memcpy(&fromio.in.tuile2do,       &tbench[j + 0], sizeof(fromio.in.tuile2do));       // unaligned source, see above
+            memcpy(&fromio.in.bordertuile2do, &tbench[j + 8], sizeof(fromio.in.bordertuile2do));
+            LOOP1(DAM_SZ) fromio.out.globaltsolN[fn1]=0;
+            fromio.out.globalres=0;
+
+            e_write(&dev, row, col, SHARED_IN, &fromio, sizeof(Sio));
+pf("i %u ; in written again ; C1N = %u\n", i, fromio.in.tdam[C1N]);
+//OBSOLETE ! esdk doc too :/    e_reset_core(&dev, row, col);
+            e_start(&dev, row, col);
+          }
+        }
+          
+        i++;
+			}
+		}
+
+		if ( done >= CORE_N ) // some benchmarks are lengthy
+			break;
+
+    if(benchlimit >= BENCH_LIMIT) break;
+	}
+	e_close(&dev); free(tbench); // release the workgroup and the bench buffer
+	e_finalize();
+  pf("Crunched %015llu nodes.\n\n", (ull)l1);
+
+	return 0;
+}
diff --git a/paralle3/src/e2g_common.h b/paralle3/src/e2g_common.h
new file mode 100644
index 0000000..2fd506d
--- /dev/null
+++ b/paralle3/src/e2g_common.h
@@ -0,0 +1,132 @@
+// 2017/01/28: 103 Mn/s (Million nodes per second) C version compared to previous 81 Mn/s assembly version.
+//   ELF support instead of SREC. Quicker load. File input. No more UNsigned integers, no more char loads, no more "ctz" and "popcount" instructions.
+//   Removed bug from multiple 0x6000 section inputs.
+//   Eagerly waiting for Epiphany V...
+
+#define CORE_N 16       //change it if needed ; our choice for standard 16-core Epiphany
+#define STATS           //undefine STATS to get full performance (from 111.2 to 103.4 s with a 16-core Parallella)
+#define MAX_CORE_N 1024 //Epiphany V ready ;)
+
+// specific to the project: number of int slots in Sinput.tdam and Soutput.globaltsolN
+#define DAM_SZ 90
+
+// to DEVICE ; the three exchange structures are laid out with 4-byte packing
+#pragma pack(push, 4)
+typedef struct S_input {
+	int64_t tuile2do;
+	int bordertuile2do;
+	int tdam[DAM_SZ];
+  int east;
+}Sinput;
+// from DEVICE
+typedef struct S_output {
+	int globaltsolN[DAM_SZ]; //int64_t is twice as long to execute, you need at least 6 ic to increment a 64-bit memory value :/
+  int globalres;
+  int cmd;
+  int fn_idx;
+}Soutput;
+// shared MEMORY: 'in' immediately followed by 'out' (the SHARED_* offsets rely on that order)
+typedef struct S_io {
+  Sinput  in;
+  Soutput out;
+}Sio;
+#pragma pack(pop) // restore the includer's packing: pack(4) must not leak into whatever follows this header
+typedef struct S_tmp { // tmp variables for DEVICE, trying a workaround for the -msmall16 compilation option
+  //int fn_idx;
+  int ttiles[64 + 1 + 32];
+  int j9e;
+  int j1n;
+}Stmp;
+
+// global offset for shared RAM
+#define SHARED_RAM (0x01000000)
+
+// a whole forum post for that ; emits two alignment directives (.balignw 4, then .balignl 8) with explicit fill patterns
+#define PERFECT_ALIGN8 __asm__ (".balignw 4, 0x01a2\n"); __asm__ (".balignl 8, 0xfc02fcef\n");
+
+// Epiphany local offsets ; the host passes them to e_read() / e_write(), they mirror the layout of Sio
+#define SHARED_IN  0x6000
+#define SHARED_OUT (SHARED_IN  + sizeof(Sinput))
+#define SHARED_RES (SHARED_OUT + DAM_SZ*4) // offset for result (Soutput.globalres, right after the DAM_SZ counters ; assumes 4-byte int)
+#define SHARED_CMD (SHARED_OUT + DAM_SZ*4 + 4) // offset for 'cmd' (Soutput.cmd, right after globalres)
+#define R_IDX      (SHARED_OUT + sizeof(Soutput))
+
+// commands for the Epiphany core ; stored in Soutput.cmd
+#define CMD_INIT 0x80000000 // host init ; the host treats any other value as "core has produced a result"
+#define CMD_DONE 0x40000000 // eCore did the job properly (probably ; some bug might crush this word but it's highly improbable)
+
+// specific to the project: per-level visit counters ; 'niveau' (= level) indexes out.globaltsolN, 'out' must resolve where the macro is used
+#ifdef STATS
+  #define macro_globaltrace(niveau) out.globaltsolN[niveau]++;
+#else
+  #define macro_globaltrace(niveau)
+#endif
+// same counter, but always compiled in (independent of STATS)
+#define macro_globaltrace2(niveau) out.globaltsolN[niveau]++;
+
+#define NORTH 0
+#define EAST  1
+#define SOUTH 2
+#define WEST  3
+// square indices: 10 per board row, B1 = 0, C1 = 10 ... J10 = 89
+#define B1N   0
+#define C1N  10
+#define C2N  11
+#define C3N  12
+#define C4N  13
+#define C5N  14
+#define C6N  15
+#define C7N  16
+#define C8N  17
+#define C9N  18
+#define C10N 19
+#define D1N  20 //etc (rows E and F are not spelled out in this header)
+
+#define G1N  50
+#define G2N  51
+#define G3N  52
+#define G4N  53
+#define G5N  54
+#define G6N  55
+#define G7N  56
+#define G8N  57
+#define G9N  58
+#define G10N 59
+
+#define H1N  60
+#define H2N  61
+#define H3N  62
+#define H4N  63
+#define H5N  64
+#define H6N  65
+#define H7N  66
+#define H8N  67
+#define H9N  68
+#define H10N 69
+
+#define I1N  70
+#define I2N  71
+#define I3N  72
+#define I4N  73
+#define I5N  74
+#define I6N  75
+#define I7N  76
+#define I8N  77
+#define I9N  78
+#define I10N 79
+
+#define J1N  80
+#define J2N  81
+#define J3N  82
+#define J4N  83
+#define J5N  84
+#define J6N  85
+#define J7N  86
+#define J8N  87
+#define J9N  88
+#define J10N 89
+
+#define BORDERCOLOR_D 0
+#define BORDERCOLOR_G 4
+#define BORDERCOLOR_I 9
+#define BORDERCOLOR_N 19 // 19 colors ; 1st one is empty, colors 1-4 stand for D (droite = right), 5-8 for G (gauche = left), 9-18 for I (interieur = inner)
diff --git a/paralle3/src/e_e2g.c b/paralle3/src/e_e2g.c
new file mode 100644
index 0000000..9e281d7
--- /dev/null
+++ b/paralle3/src/e_e2g.c
@@ -0,0 +1,891 @@
+#include "e-lib.h" // mandatory even for a minimalist design -- e_get_coreid(), e_read(), e_write()
+
+//from notzed on the forum, "...gcc extended inline asm, 'cc' clobber_php.htm"
+//volatile needed, the compiler may mix code without taking care of the condition flags :/ 
+unsigned int bitrev(unsigned int val) { // returns what the 'bitr' instruction produces for val (presumably the bit-reversed word, as the name says - TODO confirm in the ISA manual)
+  unsigned int res;
+
+  __asm__ volatile ("bitr %[res],%[val]"
+    : [res] "=r" (res) // output: any general register
+    : [val] "r" (val)  // input: any general register
+    : "cc");           // the condition flags are declared as clobbered
+
+  return res;
+}
+
+//#include "C_common2.h"  // common definitions for C ; the macros below are a local copy of that header
+// avoid stdint.h
+#define uint8_t		unsigned char
+#define uint16_t	unsigned short
+#define uint32_t	unsigned int
+#define uint64_t	unsigned long long // <!> unsigned long = 4 bytes under 32-bit ARM
+
+#define int8_t		char // NOTE(review): signedness of plain char depends on the target ABI - confirm before relying on negative values
+#define int16_t		short
+#define int32_t		int
+#define int64_t		long long
+
+// my semantic ; <!> 'print' and 'pfv' already end with a ';' and each LOOPn needs a variable named fnN in scope
+#define uc     unsigned char
+#define ull    unsigned long long
+#define pf      printf
+#define print   printf("\n");
+#define pfv(x)  printf("v: %d\n",(x));
+#define LOOP1(x) for(fn1=0;fn1<(x);fn1++)
+#define LOOP2(x) for(fn2=0;fn2<(x);fn2++)
+#define LOOP3(x) for(fn3=0;fn3<(x);fn3++)
+#define LOOP4(x) for(fn4=0;fn4<(x);fn4++)
+#define LOOP5(x) for(fn5=0;fn5<(x);fn5++)
+#define LOOP6(x) for(fn6=0;fn6<(x);fn6++)
+#define LOOP7(x) for(fn7=0;fn7<(x);fn7++)
+#define LOOP8(x) for(fn8=0;fn8<(x);fn8++)
+#define LOOP(x,y) for((x)=0;(x)<(y);(x)++) // arguments are parenthesized so that an expression bound (a ? b : c) is compared as a whole
+
+// #######################################
+// think x86 asm, think jz... setz... cmovz... think flags...
+#define ifz(x) if(!(x))
+#define ifnz(x) if(x)
+#define ife(x) if(!(x))
+#define ifne(x) if(x)
+// booleans: test a single bit of a 32-bit / 64-bit value
+#define ifzbool32(x,bit) ifz((x)&(1U<<(bit))) 
+#define ifbool32(x,bit)  if((x)&(1U<<(bit))) 
+#define ifzbool64(x,bit) ifz((uint64_t)(x)&(1ULL<<(bit)))
+#define ifbool64(x,bit)  if((uint64_t)(x)&(1ULL<<(bit)))
+
+#include "e2g_common.h" // common definitions for EII project
+
+//#######################################
+//INPUT/OUTPUT DATA
+
+/* previous code was:
+    
+   volatile Sinput  in  SECTION(".data_bank3"); // SHARED_IN
+   volatile Soutput out SECTION(".data_bank3"); // SHARED_OUT
+  
+  this way of coding is BAD: the linker will NOT necessarily place 'in' at offset 0x6000 and 'out' just AFTER 'in' (actually it places 'out' BEFORE 'in' !)
+  
+   => ONE reliable way of coding is ONE structure for exchanging with the rest of the world
+*/
+
+volatile Sio  io  SECTION(".data_bank3"); // single exchange structure with the host, which addresses it at local offset SHARED_IN (presumably the start of .data_bank3 - TODO confirm against the linker script)
+
+#define in  io.in // <!> textual aliases: from here on every identifier 'in' / 'out' of this file is rewritten to io.in / io.out
+#define out io.out
+
+//#######################################
+
+Stmp    tmp; // solver scratch state: tile availability flags (ttiles) plus j9e and j1n
+
+//#######################################
+//THE 'COMPUTE KERNEL'
+//#include "e2c_solver.c"
+
+void BorderWest(const int, const int); // every handler below shares this signature: it is the element type of the tfncall dispatch table
+void InnerTile0(const int, const int);
+void InnerTile1(const int, const int);
+void InnerTile(const int, const int);
+void InnerRow(const int, const int);
+void InnerRow2(const int, const int); // with tinner_Upd
+void BorderEast(const int, const int);
+void BorderEastUpdate(const int, const int);
+void Special_H10(const int, const int);
+void Special_I1(const int, const int);
+void Special_I10(const int, const int);
+void BorderEastBottom(const int, const int);
+void Special_J2(const int, const int);
+void Special_Debug(const int, const int); // for debugging purpose
+
+void __attribute__ ((noinline)) Input_Copy(int, int *); // must never be inlined (noinline attribute)
+
+//#######################################
+//STATIC DATA
+
+const int tlscouleur_B2016[BORDERCOLOR_N+1]={   // one 32-bit word per border color, laid out as 1 + 4 + 4 + 10 + 1 entries (presumably a tile mask per color - TODO confirm)
+  0x00000000, 
+  0x0870809A, 0x91032001, 0x42845140, 0x24080E24, 
+  0x000000FF, 0x00007F00, 0x00FF8000, 0xFF000000, 
+  0x00000001, 0x00018300, 0x03000406, 0x00000008, 0x00020830, 0x04041040, 0x38080000, 0x40100000, 0x00202000, 0x80C04080, 
+  0x00000000
+};
+
+//<!> colors 0-3 for D and G, 0-9 for I ; three parallel arrays of 32 entries
+const int tbordureD[32]={ 0x01, 0x00, 0x03, 0x00, 0x00, 0x03, 0x02, 0x00, 0x02, 0x03, 0x03, 0x03, 0x02, 0x01, 0x02, 0x00, 0x01, 0x01, 0x02, 0x03, 0x00, 0x00, 0x00, 0x02, 0x01, 0x02, 0x03, 0x00, 0x01, 0x03, 0x02, 0x01 };
+const int tbordureG[32]={ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 };
+const int tbordureI[32]={ 0x00, 0x02, 0x02, 0x03, 0x04, 0x04, 0x05, 0x09, 0x01, 0x01, 0x02, 0x04, 0x05, 0x08, 0x09, 0x01, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x02, 0x02, 0x05, 0x06, 0x06, 0x06, 0x07, 0x09 };
+// ttileN[c] = number of entries InnerTile() scans in t14[c], with c = east color + 10 * north color
+const int ttileN[100]={
+   1,  4,  3,  2,  1,  4,  4,  5,  2,  3,  5,  2,  5,  3,  0,  3,  3,  2,  0,  3,
+   1,  5,  3,  5,  2,  0,  1,  2,  3,  3,  6,  1,  4,  3,  6,  1,  2,  1,  3,  2,
+   2,  2,  2,  2,  4,  1,  0,  1,  4,  6,  5,  3,  2,  1,  1,  2,  4,  1,  1,  4,
+   3,  0,  2,  5,  3,  5,  0,  1,  4,  1,  1,  4,  1,  5,  2,  4,  1,  6,  2,  0,
+   0,  4,  1,  0,  4,  2,  7,  4,  3,  1,  5,  1,  2,  3,  1,  2,  2,  3,  4,  0
+};
+
+// color format: G + 4*D (i.e tbordureG + 4*tbordureD) ; tGDN[c] = number of (tile, east color) pairs stored in tGD[c]
+const int tGDN[16]={ 4, 0, 4, 1, 1, 1, 2, 3, 1, 3, 2, 2, 2, 3, 1, 2 };
+// tGD[c] = flat list of pairs: even slots hold a tile number, odd slots the matching east color (see Special_I1)
+const int tGD[16][8]={ 
+  {  1, 2,  3, 3,  4, 4,  7, 9 },
+  {  },
+  { 15, 1, 20, 7, 21, 8, 22, 9 },
+  { 27, 6, },
+  {  0, 0, },
+  { 13, 8, },
+  { 16, 1, 17, 4, },
+  { 24, 2, 28, 6, 31, 9, },
+  {  6, 5, },
+  {  8, 1, 12, 5, 14, 9, },
+  { 18, 5, 23, 9, },
+  { 25, 2, 30, 7, },
+  {  2, 2,  5, 4, },
+  {  9, 1, 10, 2, 11, 4, },
+  { 19, 6, },
+  { 26, 5, 29, 6, }
+};
+
+#define VOIDTILE 64 // same value as the 0x40 filler found in the t14 rows
+#define VOIDSOUTH 0
+
+//tcount(11) = 2 tcount(19) = 2 tcount(22) = 3  tcount(27) = 2  tcount(36) = 2  tcount(38) = 2
+//const int tbordereast_uniquecolor[20]={ 1, 4, 5, 6, 7, 8, 9, 10, 12, 16, 17, 23, 24, 25, 28, 30, 32, 33, 37, }; //19 actually
+//const int tbordereast_uniquetile[20]={ 0, 0, 4, 1, 24, 0, 27, 28, 0, 15, 17, 0, 29, 20, 0, 30, 0, 0, 0, }; //19 actually
+
+//from tmpbordure*.c ; one row per (east color + 10 * north color), unused slots are filled with 0x00000040
+const int t14[100][8]={ // format: LSB = tile, 2nd byte = east, 3rd byte = south, MSB = 0
+  { 0x00060500, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000901, 0x00020308, 0x00010613, 0x0009030C, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000506, 0x00010915, 0x0004030A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0004040D, 0x00080510, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010307, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050103, 0x00050918, 0x00000206, 0x00020102, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000712, 0x00080511, 0x00050000, 0x00020614, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00040917, 0x00000612, 0x00030309, 0x00070104, 0x0007050F, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00070105, 0x00010916, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000101, 0x0006030B, 0x0006040E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050503, 0x00090001, 0x00080705, 0x00070704, 0x00050202, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050919, 0x00060013, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00070823, 0x00030008, 0x0001051A, 0x00090720, 0x0001071B, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000407, 0x0006021C, 0x0004041E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00060822, 0x00090119, 0x0001021A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000113, 0x0007051F, 0x00050821, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0006031D, 0x0001021B, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000816, 0x00000215, 0x0003000C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050006, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00090015, 0x0005011A, 0x00000502, 0x0003061C, 0x0007011B, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00030325, 0x00020924, 0x00090224, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00030225, 0x00000108, 0x00020826, 0x00020927, 0x00040328, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007072C, 0x0003000A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000614, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00080123, 0x00050429, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0006052B, 0x0006042A, 0x00020326, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020224, 0x00020327, 0x00070120, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0009060B, 0x00070309, 0x00010208, 0x00040107, 0x0001090C, 0x0002040A, 0x00000040, 0x00000040 },
+  { 0x0007061D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020325, 0x00080226, 0x00090227, 0x00030428, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020225, 0x0007072D, 0x00000709, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0004000D, 0x00080732, 0x00040935, 0x0005062F, 0x00030228, 0x0004011E, 0x00000040, 0x00000040 },
+  { 0x00090734, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00080631, 0x0002011C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007032D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00040936, 0x00050010, 0x0006052E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050630, 0x00080733, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0003040D, 0x0009060E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00030007, 0x0003041E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00070529, 0x0008062A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020328, 0x0000020A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0000030D, 0x00090335, 0x00080737, 0x0001031E, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0006032F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007022C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00070332, 0x00060839, 0x00070437, 0x0006093C, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000717, 0x00030435, 0x0008083B, 0x00030836, 0x00050638, 0x0007083A, 0x00000040, 0x00000040 },
+  { 0x00060811, 0x00020006, 0x00000600, 0x00030810, 0x0007070F, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000503, 0x0002011A, 0x0006071F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0008062B, 0x00010002, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0008062E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020729, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010003, 0x00090018, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00080122, 0x00030930, 0x0003042F, 0x00040938, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007073E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010621, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000518, 0x00070334, 0x00010119, 0x0008063D, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00070012, 0x00010113, 0x00060214, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00060014, 0x0001031C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00090530, 0x0000090B, 0x00060831, 0x0001071D, 0x0004052F, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0000090E, 0x0002082A, 0x00090538, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0002082B, 0x00000000, 0x0009083D, 0x0003082E, 0x00080121, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0005011F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010522, 0x00050011, 0x00060331, 0x00040839, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0004083C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00060012, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000805, 0x00000704, 0x00020920, 0x0002011B, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0004072C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00040832, 0x0003072D, 0x00030009, 0x00050934, 0x00090833, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00090017, 0x00040837, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007073E, 0x00040229, 0x0001061F, 0x0000070F, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0003011D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0007053E, 0x0005073E, 0x0003032D, 0x00010004, 0x0002042C, 0x0005000F, 0x00000040, 0x00000040 },
+  { 0x00010223, 0x0004093A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00050622, 0x00020723, 0x00090016, 0x00060521, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00030226, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00080639, 0x0009083B, 0x00090336, 0x0009073A, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000611, 0x00000310, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0005022B, 0x00030631, 0x00080439, 0x0005093D, 0x0009043C, 0x0004022A, 0x0005032E, 0x00000040 },
+  { 0x00030432, 0x00010005, 0x00040437, 0x00030933, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0004093B, 0x0008093F, 0x0009083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0008083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010001, 0x00070417, 0x00080116, 0x00050518, 0x00020115, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00010519, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00020224, 0x00030227, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00040435, 0x00080436, 0x0000010C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0008063C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00060330, 0x00060438, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0003000B, 0x0004000E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00030534, 0x00010220, 0x0008043A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x0008043B, 0x00070333, 0x0006053D, 0x0008083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040 },
+  { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }
+};
+
+ALIGN(8)
+int ttileN_upd[100]={0}; // ? does not work yet
+
+//Rebuilds ttileN_upd: a plain copy of ttileN whose entry 1 is then overwritten with tmp.ttiles[0]
+static inline void TileN_Update(void) {
+  int idx=100;
+  while(idx--) ttileN_upd[idx]=ttileN[idx];
+  ttileN_upd[1]=tmp.ttiles[0];
+	//template : tmp.ttileN_upd[1]=tmp.ttiles[0];
+	/*
+  ttileN_upd[1]=tmp.ttiles[0];
+	ttileN_upd[4]=tmp.ttiles[15];
+	ttileN_upd[5]=tmp.ttiles[16];
+	ttileN_upd[6]=tmp.ttiles[8];
+	ttileN_upd[7]=tmp.ttiles[9];
+	ttileN_upd[8]=tmp.ttiles[1];
+	ttileN_upd[9]=tmp.ttiles[24];
+	ttileN_upd[10]=tmp.ttiles[25];
+	ttileN_upd[12]=tmp.ttiles[3];
+	ttileN_upd[16]=tmp.ttiles[4];
+	ttileN_upd[17]=tmp.ttiles[17];
+	ttileN_upd[23]=tmp.ttiles[26];
+	ttileN_upd[24]=tmp.ttiles[27];
+	ttileN_upd[25]=tmp.ttiles[28];
+	ttileN_upd[28]=tmp.ttiles[20];
+	ttileN_upd[30]=tmp.ttiles[30];
+	ttileN_upd[32]=tmp.ttiles[21];
+	ttileN_upd[33]=tmp.ttiles[13];
+	ttileN_upd[37]=tmp.ttiles[31];
+  */
+}
+
+//typedef void (*ptrFonction) (const signed int, const signed int);
+
+ALIGN(8) // dispatch table of the search: one handler per step, each handler calls tfncall[out.fn_idx+1] to go one step deeper
+void (* tfncall[78]) (const int, const int) ={ // 7 rows of 10 entries (C to I) followed by the 8 entries J9 down to J2
+   BorderWest,
+   InnerRow,
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 BorderEast, // C10
+    //Special_Debug, 
+  
+   BorderWest,
+   InnerRow, // replaces InnerTile() * 8
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   BorderEast, //	 BorderEastUpdate, // D10   // ? fail
+  
+   BorderWest,
+   InnerRow, 
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 BorderEast, // E10
+  
+   BorderWest,
+   InnerRow, 
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 BorderEast, // F10
+  
+//Special_Debug,
+  
+   BorderWest,
+   InnerRow, 
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 BorderEast, // G10
+  
+   BorderWest,
+   InnerRow, 
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 Special_H10,    // H10
+   
+   Special_I1, 
+   InnerRow, 
+   Special_Debug, // only for clean stats
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+   Special_Debug,
+	 Special_I10,    // I10
+   
+   BorderEastBottom, // J9
+   BorderEastBottom, // J8
+   BorderEastBottom, // J7
+   BorderEastBottom, // J6
+   BorderEastBottom, // J5
+   BorderEastBottom, // J4
+   BorderEastBottom, // J3
+   Special_J2, // J2, last square
+};
+
+//dynamic
+int tborderwestN[4]={0};   // per color: number of entries BorderWest() scans in the three tborderwest* lists below
+int tbordereastN[40]={0};
+
+int tborderwestT[4][9]={0}; // per color: candidate tile numbers
+int tborderwestE[4][9]={0}; // per color: east color of the candidate at the same position
+int tborderwestS[4][9]={0}; // per color: south color of the candidate at the same position
+
+int tbordereastT[40][4]={0};
+int tbordereastS[40][4]={0};
+
+// sandwiched BorderWest ; J1N=0 or 3 for this specific problem
+// Scans the pairs of tGD[north + 4*0] (with tmp.j9e=3) then of tGD[north + 4*3] (with tmp.j9e=0) ; every free tile is taken, tried, then released
+void Special_I1(const int north, const int northI) {
+  void (*next)(const int, const int);
+  const int *pair, *last;
+  int *slot;
+  macro_globaltrace2(out.fn_idx);
+  next=tfncall[out.fn_idx+1];
+
+  // 1st pass
+  tmp.j9e=3;
+  pair=tGD[north + 4*0];
+  last=pair + 2*tGDN[north + 4*0];
+  for(; pair < last; pair+=2) { // pair[0] = tile, pair[1] = its east color
+    slot=&tmp.ttiles[64+1 + pair[0]];
+    if(*slot == 0) continue; // tile already in use
+    *slot=0;
+    out.fn_idx++;
+    (*next)(pair[1], northI+1);
+    out.fn_idx--;
+    *slot=1; // give the tile back
+  }
+
+  // 2nd pass
+  tmp.j9e=0;
+  pair=tGD[north + 4*3];
+  last=pair + 2*tGDN[north + 4*3];
+  for(; pair < last; pair+=2) {
+    slot=&tmp.ttiles[64+1 + pair[0]];
+    if(*slot == 0) continue; // tile already in use
+    *slot=0;
+    out.fn_idx++;
+    (*next)(pair[1], northI+1);
+    out.fn_idx--;
+    *slot=1; // give the tile back
+  }
+}
+
+void BorderWest(const int north, const int northI) { // West-border step: tries every free border tile listed for key 'north'
+  int couleur, tuileN, fn1, tuile, east;
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  ptr=tfncall[out.fn_idx+1]; // next step of the dispatch table
+  out.fn_idx++; // bumped once for the whole loop; restored before returning
+  // north: key into the tborderwest* tables; northI: index of this square in in.tdam
+  couleur=north; // in.tdam[C1N];
+  tuileN=tborderwestN[couleur];
+  LOOP1(tuileN) {
+    tuile=tborderwestT[couleur][fn1];
+    ifnz(tmp.ttiles[64+1 + tuile]) { // border-tile flags start at offset 64+1; non-zero presumably means "still free"
+      east=tborderwestE[couleur][fn1];
+      in.tdam[northI+10]=tborderwestS[couleur][fn1]; // record the value for the slot 10 entries ahead (read back as in.tdam[northI] by a later step)
+      tmp.ttiles[64+1 + tuile]=0; // mark the tile as used while recursing
+      (*ptr)(east, northI+1);
+      tmp.ttiles[64+1 + tuile]=1; // backtrack: release the tile
+    }
+  }
+
+  out.fn_idx--;
+}
+
+void InnerTile(const int eastcolor, const int northI) { // Inner-square step: tries every free tile listed in t14 for the key built from eastcolor and in.tdam[northI]
+  int couleur, tuileN, fn1, fn2, tuile, east;
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  ptr=tfncall[out.fn_idx+1]; // next step of the dispatch table
+  // eastcolor: value handed over by the previous step; northI: index of this square in in.tdam
+  couleur=eastcolor + 10*in.tdam[northI]; // lookup key combining both constraints
+  tuileN=ttileN[couleur]; // number of candidates stored for this key
+  LOOP1(tuileN) {
+    fn2=t14[couleur][fn1]; // packed entry: bits 0-7 tile index, bits 8-15 east value, bits 16+ value for in.tdam[northI+10]
+    tuile=fn2 & 0xff;
+    ifnz(tmp.ttiles[tuile]) { // non-zero presumably means "still free"
+      east=(fn2>>8) & 0xff;
+      in.tdam[northI+10]=fn2>>16; // record the value for the slot 10 entries ahead
+      tmp.ttiles[tuile]=0; // mark the tile as used while recursing
+      out.fn_idx++;
+      (*ptr)(east, northI+1);
+      out.fn_idx--;
+      tmp.ttiles[tuile]=1; // backtrack: release the tile
+    }
+  }
+}
+
+void BorderEast(const int eastcolor, const int northI) { // East-border step: tries every free border tile listed for key eastcolor*4 + in.tdam[northI]
+  int couleur, tuileN, fn1, tuile;
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  ptr=tfncall[out.fn_idx+1]; // next step of the dispatch table
+  out.fn_idx++; // bumped once for the whole loop; restored before returning
+  // Same key layout as the one main() uses to fill tbordereast* (tbordureI*4 + tbordureD)
+  couleur=eastcolor*4 + in.tdam[northI];
+  tuileN=tbordereastN[couleur];
+  LOOP1(tuileN) {
+    tuile=tbordereastT[couleur][fn1];
+    ifnz(tmp.ttiles[64+1 + tuile]) { // border-tile flags start at offset 64+1; non-zero presumably means "still free"
+      in.tdam[northI+10]=tbordereastS[couleur][fn1]; // record the value for the slot 10 entries ahead
+      tmp.ttiles[64+1 + tuile]=0; // mark the tile as used while recursing
+      (*ptr)(in.tdam[northI+1], northI+1); // the next step receives the value already stored at in.tdam[northI+1]
+      tmp.ttiles[64+1 + tuile]=1; // backtrack: release the tile
+    }
+  }
+
+  out.fn_idx--;
+}
+
+// BorderEast for the bottom line, from J9 down to (but excluding) J2; the only differences from BorderEast are the lines tagged '//#'. Note the swapped parameter order: (northI, eastcolor).
+void BorderEastBottom(const int northI, const int eastcolor) {
+  int couleur, tuileN, fn1, tuile, nexteast; // NOTE(review): 'nexteast' is never used in this function
+  void (*ptr)(const int, const int);
+  macro_globaltrace2(out.fn_idx);
+  ptr=tfncall[out.fn_idx+1]; // next step of the dispatch table
+  // Key roles are swapped compared with BorderEast: in.tdam[northI] is the *4 part here
+  couleur=in.tdam[northI] * 4 + eastcolor; //#
+  tuileN=tbordereastN[couleur];
+  LOOP1(tuileN) {
+    tuile=tbordereastT[couleur][fn1];
+    ifnz(tmp.ttiles[64+1 + tuile]) { // border-tile flags start at offset 64+1; non-zero presumably means "still free"
+      tmp.ttiles[64+1 + tuile]=0; // mark the tile as used while recursing
+      out.fn_idx++;
+      (*ptr)(northI-1, tbordureG[tuile]); //# walks backwards (northI-1) and hands tbordureG[tuile] to the next step
+      out.fn_idx--;
+      tmp.ttiles[64+1 + tuile]=1; // backtrack: release the tile
+    }
+  }
+}
+
+// BorderEastBottom with the final check for J2, the last square: instead of recursing, counts one solution in out.globalres for every free matching border tile whose tbordureG value is 1.
+void Special_J2(const int northI, const int eastcolor) {
+  int couleur, tuileN, fn1, tuile;
+  // No next-step pointer is fetched here: J2 is the last entry of the visible dispatch
+  macro_globaltrace2(out.fn_idx);
+  // table, so tfncall[out.fn_idx+1] would be a read past that entry (and nothing is called).
+
+  couleur=in.tdam[northI] * 4 + eastcolor; //# same key layout as BorderEastBottom
+  tuileN=tbordereastN[couleur];
+  LOOP1(tuileN) {
+    tuile=tbordereastT[couleur][fn1];
+    ifnz(tmp.ttiles[64+1 + tuile]) { // border-tile flags start at offset 64+1; non-zero presumably means "still free"
+      if(tbordureG[tuile] != 1) continue; // tdam[J1E] == 1 for this specific problem
+      
+      out.globalres++; // O_O reach this point after about 10^17 nodes...
+      
+    }
+  }
+}
+
+// BorderEast with an I10 check-up: same search as BorderEast, but candidates whose tbordureG value is 0 are skipped and in.tdam[northI+10] is not written.
+void Special_H10(const int eastcolor, const int northI) {
+  int couleur, tuileN, fn1, tuile;
+  void (*ptr)(const int, const int);
+  macro_globaltrace2(out.fn_idx);
+  ptr=tfncall[out.fn_idx+1]; // next step of the dispatch table
+  out.fn_idx++; // bumped once for the whole loop; restored before returning
+
+  couleur=eastcolor*4 + in.tdam[northI]; // same key layout as BorderEast
+  tuileN=tbordereastN[couleur];
+  LOOP1(tuileN) {
+    tuile=tbordereastT[couleur][fn1];
+    ifnz(tmp.ttiles[64+1 + tuile]) { // border-tile flags start at offset 64+1; non-zero presumably means "still free"
+      if(tbordureG[tuile] == 0) continue; // borders 0/1/x/0 do not exist on this specific problem
+      //in.tdam[northI+10]=tbordereastS[couleur][fn1];
+      
+      tmp.ttiles[64+1 + tuile]=0; // mark the tile as used while recursing
+      (*ptr)(in.tdam[northI+1], northI+1);
+      tmp.ttiles[64+1 + tuile]=1; // backtrack: release the tile
+    }
+  }
+
+  out.fn_idx--;
+}
+
+// BorderEast with the strong I10 constraint: only candidates whose tbordureG value is 2 are kept, and the next step is called with the (northI, eastcolor) argument order used by BorderEastBottom.
+void Special_I10(const int eastcolor, const int northI) {
+  int couleur, tuileN, fn1, tuile;
+  void (*ptr)(const int, const int);
+  macro_globaltrace2(out.fn_idx);
+  ptr=tfncall[out.fn_idx+1]; // next step of the dispatch table
+  out.fn_idx++; // bumped once for the whole loop; restored before returning
+
+  couleur=eastcolor*4 + in.tdam[northI]; // same key layout as BorderEast
+  tuileN=tbordereastN[couleur];
+  LOOP1(tuileN) {
+    tuile=tbordereastT[couleur][fn1];
+    ifnz(tmp.ttiles[64+1 + tuile]) { // border-tile flags start at offset 64+1; non-zero presumably means "still free"
+      if(tbordureG[tuile] != 2) continue; // J10N == 2 for this specific problem
+      
+      tmp.ttiles[64+1 + tuile]=0; // mark the tile as used while recursing
+      (*ptr)(northI+9, tmp.j9e); // first argument is an in.tdam index (northI+9), second is the value saved by Special_I1
+      tmp.ttiles[64+1 + tuile]=1; // backtrack: release the tile
+    }
+  }
+
+  out.fn_idx--;
+}
+
+// BorderEast variant that calls TileN_Update() before searching. Open question from the author: why does it NOT work? It should be 10 % faster.
+void BorderEastUpdate(const int eastcolor, const int northI) {
+  int couleur, tuileN, fn1, tuile;
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  ptr=tfncall[out.fn_idx+1]; // next step of the dispatch table
+
+  TileN_Update(); // defined elsewhere; presumably refreshes ttileN_upd - TODO confirm
+
+  out.fn_idx++; // bumped once for the whole loop; restored before returning
+
+  couleur=eastcolor*4 + in.tdam[northI]; // same key layout as BorderEast
+  tuileN=tbordereastN[couleur];
+  LOOP1(tuileN) {
+    tuile=tbordereastT[couleur][fn1];
+    ifnz(tmp.ttiles[64+1 + tuile]) { // border-tile flags start at offset 64+1; non-zero presumably means "still free"
+      in.tdam[northI+10]=tbordereastS[couleur][fn1]; // record the value for the slot 10 entries ahead
+      tmp.ttiles[64+1 + tuile]=0; // mark the tile as used while recursing
+      (*ptr)(in.tdam[northI+1], northI+1);
+      tmp.ttiles[64+1 + tuile]=1; // backtrack: release the tile
+    }
+  }
+
+  out.fn_idx--;
+}
+
+void InnerTile0(const int eastcolor, const int northI) { // Inner-square step; statement-for-statement the same as InnerTile (counts read from ttileN)
+  int couleur, tuileN, fn1, fn2, tuile, east;
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  ptr=tfncall[out.fn_idx+1]; // next step of the dispatch table
+  // eastcolor: value handed over by the previous step; northI: index of this square in in.tdam
+  couleur=eastcolor + 10*in.tdam[northI]; // lookup key combining both constraints
+  tuileN=ttileN[couleur]; // number of candidates stored for this key
+  LOOP1(tuileN) {
+    fn2=t14[couleur][fn1]; // packed entry: bits 0-7 tile index, bits 8-15 east value, bits 16+ value for in.tdam[northI+10]
+    tuile=fn2 & 0xff;
+    ifnz(tmp.ttiles[tuile]) { // non-zero presumably means "still free"
+      east=(fn2>>8) & 0xff;
+      in.tdam[northI+10]=fn2>>16; // record the value for the slot 10 entries ahead
+      tmp.ttiles[tuile]=0; // mark the tile as used while recursing
+      out.fn_idx++;
+      (*ptr)(east, northI+1);
+      out.fn_idx--;
+      tmp.ttiles[tuile]=1; // backtrack: release the tile
+    }
+  }
+}
+
+void InnerTile1(const int eastcolor, const int northI) { // Inner-square step that reads its candidate count from ttileN_upd instead of ttileN
+  int couleur, tuileN, fn1, fn2, tuile, east;
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  ptr=tfncall[out.fn_idx+1]; // next step of the dispatch table
+  out.fn_idx++; // bumped once for the whole loop (InnerTile0 does it around each call instead)
+  // eastcolor: value handed over by the previous step; northI: index of this square in in.tdam
+  couleur=eastcolor + 10*in.tdam[northI]; // lookup key combining both constraints
+  tuileN=ttileN_upd[couleur]; // count taken from the "updated" table; entries still come from t14
+  LOOP1(tuileN) {
+    fn2=t14[couleur][fn1]; // packed entry: bits 0-7 tile index, bits 8-15 east value, bits 16+ value for in.tdam[northI+10]
+    tuile=fn2 & 0xff;
+    ifnz(tmp.ttiles[tuile]) { // non-zero presumably means "still free"
+      east=(fn2>>8) & 0xff;
+      in.tdam[northI+10]=fn2>>16; // record the value for the slot 10 entries ahead
+      tmp.ttiles[tuile]=0; // mark the tile as used while recursing
+      (*ptr)(east, northI+1);
+      tmp.ttiles[tuile]=1; // backtrack: release the tile
+    }
+  }
+
+  out.fn_idx--;
+}
+
+// Opens nesting level idx of InnerRow. Watch out: the key uses teast[idx - 1] (east value chosen at the previous level); the rest is the standard inner-tile step. Must be closed by macro_innerrow_loopz(idx).
+#define macro_innerrow_loop(idx)\
+			macro_globaltrace(out.fn_idx - 8 + idx);\
+			tcouleur[idx]=teast[idx - 1] + 10*in.tdam[northI +idx];\
+			ttuileN[idx]=ttileN[tcouleur[idx]];\
+			LOOP(tfn1[idx], ttuileN[idx]) {\
+				tfn2[idx]=t14[tcouleur[idx]][tfn1[idx]];\
+				ttuile[idx]=tfn2[idx] & 0xff;\
+				ifnz(tmp.ttiles[ttuile[idx]]) {\
+					teast[idx]=(tfn2[idx]>>8) & 0xff;\
+					in.tdam[south + idx]=tfn2[idx] >> 16;\
+					tmp.ttiles[ttuile[idx]]=0;
+// Closes level idx: releases the tile taken at that level, then closes the ifnz and LOOP braces opened by the macro above.
+#define macro_innerrow_loopz(idx) tmp.ttiles[ttuile[idx]]=1; } }
+
+
+void InnerRow(const int eastcolor, const int northI) { // Handles 8 consecutive inner squares in one call: level 0 is written out, levels 1..7 are nested via macro_innerrow_loop
+  int tcouleur[8], ttuileN[8], tfn1[8], tfn2[8], ttuile[8], teast[8]; // per-level key, count, loop index, packed entry, tile index, east value
+	int south=northI+10; // base index of the in.tdam slots written by the 8 levels
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  // Skip 8 table slots at once: the step called after level 7 is the one stored 8 entries after this one
+	out.fn_idx +=8;
+  ptr=tfncall[out.fn_idx];
+	// Level 0: same search as InnerTile, using eastcolor as handed over by the caller
+  tcouleur[0]=eastcolor + 10*in.tdam[northI];
+  ttuileN[0]=ttileN[tcouleur[0]];
+  LOOP(tfn1[0], ttuileN[0]) {
+    tfn2[0]=t14[tcouleur[0]][tfn1[0]]; // packed entry: bits 0-7 tile index, bits 8-15 east value, bits 16+ value for in.tdam[south]
+    ttuile[0]=tfn2[0] & 0xff;
+    ifnz(tmp.ttiles[ttuile[0]]) { // non-zero presumably means "still free"
+      teast[0]=(tfn2[0]>>8) & 0xff;
+      in.tdam[south +0]=tfn2[0] >> 16;
+      tmp.ttiles[ttuile[0]]=0; // mark the tile as used while the inner levels run
+      
+			// (*ptr)(teast[0], northI+1);
+			/* earlier hand-written form of level 1 (unlike the macro it bumps out.fn_idx and has no trace call):
+			tcouleur[1]=teast[0] + 10*in.tdam[northI +1];
+			ttuileN[1]=ttileN[tcouleur[1]];
+			LOOP(tfn1[1], ttuileN[1]) {
+				tfn2[1]=t14[tcouleur[1]][tfn1[1]];
+				ttuile[1]=tfn2[1] & 0xff;
+				ifnz(tmp.ttiles[ttuile[1]]) {
+					teast[1]=(tfn2[1]>>8) & 0xff;
+					in.tdam[south +1]=tfn2[1] >> 16;
+					tmp.ttiles[ttuile[1]]=0;
+					out.fn_idx++;
+			*/
+			macro_innerrow_loop(1)
+				macro_innerrow_loop(2)
+					macro_innerrow_loop(3)
+						macro_innerrow_loop(4)
+							macro_innerrow_loop(5)
+								macro_innerrow_loop(6)
+									macro_innerrow_loop(7)
+									// all 8 squares placed: hand over to the step fetched above
+									(*ptr)(teast[7], northI + 8);
+									
+									macro_innerrow_loopz(7)
+								macro_innerrow_loopz(6)
+							macro_innerrow_loopz(5)
+						macro_innerrow_loopz(4)
+					macro_innerrow_loopz(3)
+				macro_innerrow_loopz(2)
+			macro_innerrow_loopz(1)
+      
+      tmp.ttiles[ttuile[0]]=1; // backtrack: release the level-0 tile
+    }
+  }
+
+	out.fn_idx -=8;
+}
+
+
+// Same as macro_innerrow_loop, except that the candidate count comes from ttileN_upd. Watch out: the key uses teast[idx - 1]; the rest is the standard step. Must be closed by macro_innerrow_loopz(idx).
+#define macro_innerrow_loop2(idx)\
+			macro_globaltrace(out.fn_idx - 8 + idx);\
+			tcouleur[idx]=teast[idx - 1] + 10*in.tdam[northI +idx];\
+			ttuileN[idx]=ttileN_upd[tcouleur[idx]];\
+			LOOP(tfn1[idx], ttuileN[idx]) {\
+				tfn2[idx]=t14[tcouleur[idx]][tfn1[idx]];\
+				ttuile[idx]=tfn2[idx] & 0xff;\
+				ifnz(tmp.ttiles[ttuile[idx]]) {\
+					teast[idx]=(tfn2[idx]>>8) & 0xff;\
+					in.tdam[south + idx]=tfn2[idx] >> 16;\
+					tmp.ttiles[ttuile[idx]]=0;
+
+
+void InnerRow2(const int eastcolor, const int northI) { // InnerRow variant: 8 nested inner-square levels whose candidate counts all come from ttileN_upd
+  int tcouleur[8], ttuileN[8], tfn1[8], tfn2[8], ttuile[8], teast[8]; // per-level key, count, loop index, packed entry, tile index, east value
+	int south=northI+10; // base index of the in.tdam slots written by the 8 levels
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  // Skip 8 table slots at once: the step called after level 7 is the one stored 8 entries after this one
+	out.fn_idx +=8;
+  ptr=tfncall[out.fn_idx];
+	// Level 0: written out by hand, count read from ttileN_upd
+  tcouleur[0]=eastcolor + 10*in.tdam[northI];
+  ttuileN[0]=ttileN_upd[tcouleur[0]];
+  LOOP(tfn1[0], ttuileN[0]) {
+    tfn2[0]=t14[tcouleur[0]][tfn1[0]]; // packed entry: bits 0-7 tile index, bits 8-15 east value, bits 16+ value for in.tdam[south]
+    ttuile[0]=tfn2[0] & 0xff;
+    ifnz(tmp.ttiles[ttuile[0]]) { // non-zero presumably means "still free"
+      teast[0]=(tfn2[0]>>8) & 0xff;
+      in.tdam[south +0]=tfn2[0] >> 16;
+      tmp.ttiles[ttuile[0]]=0; // mark the tile as used while the inner levels run
+      // Levels 1..7 use macro_innerrow_loop2 so that every level reads ttileN_upd, like level 0
+			macro_innerrow_loop2(1)
+				macro_innerrow_loop2(2)
+					macro_innerrow_loop2(3)
+						macro_innerrow_loop2(4)
+							macro_innerrow_loop2(5)
+								macro_innerrow_loop2(6)
+									macro_innerrow_loop2(7)
+									// all 8 squares placed: hand over to the step fetched above
+									(*ptr)(teast[7], northI + 8);
+									
+									macro_innerrow_loopz(7)
+								macro_innerrow_loopz(6)
+							macro_innerrow_loopz(5)
+						macro_innerrow_loopz(4)
+					macro_innerrow_loopz(3)
+				macro_innerrow_loopz(2)
+			macro_innerrow_loopz(1)
+      
+      tmp.ttiles[ttuile[0]]=1; // backtrack: release the level-0 tile
+    }
+  }
+
+	out.fn_idx -=8;
+}
+
+
+/* total flop, or yet another bug
+void InnerTile0(const int eastcolor, const int northI) {
+  int64_t tfn2[4];
+  int couleur, tuileN, fn1, fn2, tuile, east;
+  void (*ptr)(const int, const int);
+  macro_globaltrace(out.fn_idx);
+  ptr=tfncall[out.fn_idx+1];
+
+  couleur=eastcolor + 10*in.tdam[northI];
+
+  tfn2[0]=*(int64_t *)&t14[couleur][0];
+  tfn2[1]=*(int64_t *)&t14[couleur][2];
+  tfn2[2]=*(int64_t *)&t14[couleur][4];
+  tfn2[3]=*(int64_t *)&t14[couleur][6];
+
+  tuileN=ttileN[couleur];
+  LOOP1(tuileN) {
+    fn2=*(int *)(&tfn2 + fn1*4); //t14[couleur][fn1];
+    tuile=fn2 & 0xff;
+    ifnz(tmp.ttiles[tuile]) {
+      east=(fn2>>8) & 0xff;
+      in.tdam[northI+10]=fn2>>16;
+      tmp.ttiles[tuile]=0;
+      out.fn_idx++;
+      (*ptr)(east, northI+1);
+      out.fn_idx--;
+      tmp.ttiles[tuile]=1;
+    }
+  }
+}
+*/
+
+void Special_Debug(const int north, const int northI) { // Placeholder step used as table filler ("only for clean stats"); both arguments are ignored
+  macro_globaltrace(out.fn_idx); // emits the trace for the current slot; no next step is called from here
+}
+
+//#######################################
+
+// Unpacks 'tiles' bit by bit, least-significant bit first, into dest[fn1] as 0/1 flags (32 iterations). Inlining of this trivial function is prevented on purpose: we may need some room.
+void __attribute__ ((noinline)) Input_Copy(int tiles, int *dest) {
+  int fn1;
+  LOOP1(32) { // presumably iterates fn1 over 0..31 (LOOP1 is defined elsewhere)
+    dest[fn1]=tiles & 1; // flag for the current lowest bit
+    tiles>>=1; // move on to the next bit
+  }
+}
+
+//void __attribute__((interrupt)) null_isr() { return; }
+
+//#######################################
+
+int main(void) { // Kernel entry point: loads the tile flags, rebuilds the border tables, runs the search, then reports CMD_DONE and idles
+e_start:; // only referenced by the commented-out 'goto e_start' at the end
+  
+  int fn1, westcolor, eastcolor, tiles;
+
+  volatile signed int *inputP  = (void *)SHARED_IN;  // pointer for input
+  //volatile signed int *cmdP    = (void *)SHARED_CMD; // pointer for output command
+
+  // init compute kernel: three input words, each unpacked into 32 flags of tmp.ttiles
+  tiles=*(inputP+0); Input_Copy(tiles, &tmp.ttiles[ 0]); // 1st 32 tiles
+  tiles=*(inputP+1); Input_Copy(tiles, &tmp.ttiles[32]); // next 32 tiles
+  tiles=*(inputP+2); Input_Copy(tiles, &tmp.ttiles[VOIDTILE+1]); // 32 borders
+  tmp.ttiles[VOIDTILE]=0; // VOIDTILE == 64
+    
+  LOOP1(4) // reset the per-key entry counts before refilling the tables
+    tborderwestN[fn1]=0;
+  LOOP1(40)
+    tbordereastN[fn1]=0;
+  // Register every flagged border tile in both lookup tables
+  LOOP1(32) {
+    if(tmp.ttiles[64+1 +fn1]) {
+      westcolor=tbordureG[fn1]; // west table key
+      tborderwestT[westcolor] [tborderwestN[westcolor]]=fn1;
+      tborderwestE[westcolor] [tborderwestN[westcolor]]=tbordureI[fn1];
+      tborderwestS[westcolor] [tborderwestN[westcolor]]=tbordureD[fn1];
+      tborderwestN[westcolor]++;
+      
+      eastcolor=tbordureI[fn1]*4 + tbordureD[fn1]; // east table key
+      tbordereastT[eastcolor] [tbordereastN[eastcolor]]=fn1;
+      tbordereastS[eastcolor] [tbordereastN[eastcolor]]=tbordureG[fn1];
+      tbordereastN[eastcolor]++;
+    }
+  }
+
+  out.fn_idx=0; // start at the first slot of the dispatch table
+  BorderWest(in.tdam[C1N], C1N); // run the whole recursive search from square C1
+
+  out.cmd=CMD_DONE; // *cmdP=CMD_DONE;
+  
+  //return 0;
+  __asm__ __volatile__ ("idle"); // experience: can you idle an Epiphany core until ARM wakes it up ?  Answer: empirically, yes ; use e_start() to reload the core
+  //goto e_start; // wake by IVT # 0
+  
+}
diff --git a/paralle3/x86_build.sh b/paralle3/x86_build.sh
new file mode 100644
index 0000000..0113851
--- /dev/null
+++ b/paralle3/x86_build.sh
@@ -0,0 +1,3 @@
+# cross compiling on x86_64 host, assuming /opt/adapteva as default path
+
+e-gcc -Ofast -mfp-mode=int -mshort-calls -m1reg-r63 -T /opt/adapteva/esdk/bsps/current/internal.ldf src/e_e2g.c -o bin/e_e2g.elf -le-lib
diff --git a/paralle3/x86_buildasm.sh b/paralle3/x86_buildasm.sh
new file mode 100644
index 0000000..20836b0
--- /dev/null
+++ b/paralle3/x86_buildasm.sh
@@ -0,0 +1,12 @@
+# cross compiling on x86_64 host
+
+# -mshort-calls: OK
+# -msmall16: still broken
+# -m1reg-r63: OK
+# -mfp-mode=int: OK
+
+echo Cross compiling on x86_64 host
+echo.
+echo.
+e-gcc -Ofast -mfp-mode=int -mshort-calls -m1reg-r63 -T /opt/adapteva/esdk/bsps/current/internal.ldf src/e_e2g.c -S -le-lib
+echo.