parallella · DonQuichotteComputers · May 17, 2016 · Jan 28, 2017 · Jan 28, 2017 · Jan 28, 2017
diff --git a/paralle2/README.md b/paralle2/README.md
@@ -1,22 +1,45 @@
-Pseudo Eternity II solver under Parallella !
+# A 10x10 Eternity II solver
 
-Demo with a 10x10 subproblem with 16 cores running in parallel.
+## Build and Run
 
-A single eCore reaches 3 Mn/s, that's a tremendous 48 Mn/s for 4.7 W !
-My high-end computer reaches 130 Mn/s with a single core and heavy optimizations, but it consumes 72 W.
+Build the data first; this creates bin/bench.bin, a set of benchmarks:
 
-This is only the beginning with a basic algorithm, I'm confident we can reach 80 Mn/s with a single 16-core Parallella :D
-For your information a single ARM core with the same basic algorithm reaches 6 Mn/s.
-This kind of problem would be a good candidate to run under the unused parts of the FPGA... that's another story. For experts.
+ ./build_data.sh
+ ./run_data.sh
 
-***
+ ./build.sh
+ ./run.sh value
 
-How to run the demo under a 16-core Parallella:
-./build.sh e2
-time ./run.sh e2
+ Want assembly output? Use ./buildasm.sh
+ Cross-compiling for an x86_64 platform? You can use ./x86*.sh  
+
+## Benchmarks
 
-Please check you use a 16-core Epiphany like mine: E16G301, a Kickstarter model with Zynq 7020 and a 'headless' configuration.
-If not the case, you may adapt the sources: src/e2.c and src/e_e2.c
-This kind of problem is a perfect candidate for clusters too.
-You can easily tweak this code if you have a 64-core Parallella (if you don't know how to code it, just sell it to me lol)
+All programs are written in plain C, sometimes with some assembly.
+Mn/s/W = Million nodes per second per Watt
+
+
+GPU OpenCL                      : not even a tenth of a modest x86 core with a Radeon 5770 graphics card. The numerous branches make it a dead end, not to mention the watts.
+Parallella, one ARM A9 core     :    6 Mn/s ;   3.0 W ;  2    Mn/s/W
+My high-end computer, one core  :  166 Mn/s ;  72.0 W ;  2.3  Mn/s/W ; x86_64, Fedora Core 23, i7 5820k
+Raspberry Pi 3                  :                        8-10 Mn/s/W iirc ; A53, 4-core, 1.2 GHz
+My high-end computer, 12 threads: 1470 Mn/s ; 140.0 W ; 10.0  Mn/s/W
+Odroid XU4                      :  245 Mn/s ;  15.7 W ; 15.6 Mn/s/W ; 8-core ; deeply optimized, not much margin
+Parallella 16-core Epiphany     :  103 Mn/s ;   5.0 W ; 20.6 Mn/s/W ; unplug the Ethernet cable to save 0.6 W (the cable is only there for ssh access to the headless Parabuntu distro)
+
+
+So...
+To my knowledge, Parallella is today the most energy-efficient platform for this highly recursive task...
+although it does *not* use any floating-point arithmetic!
+
+Eagerly awaiting the 1024-core Epiphany V...
+
+## Author
+
+DonQuichotteComputers at gmail dot com
+2017
+
+## License
+
+BSD 3-Clause.
 
diff --git a/paralle2/build.sh b/paralle2/build.sh
@@ -3,8 +3,8 @@
 set -e
 
 ESDK=${EPIPHANY_HOME}
-ELIBS="-L ${ESDK}/tools/host/lib"
-EINCS="-I ${ESDK}/tools/host/include"
+ELIBS=${ESDK}/tools/host/lib
+EINCS=${ESDK}/tools/host/include
 ELDF=${ESDK}/bsps/current/internal.ldf
 
 SCRIPT=$(readlink -f "$0")
@@ -23,14 +23,25 @@ case $(uname -p) in
 		;;
 esac
 
+# Create output dir
+mkdir -p bin
+
 # Build HOST side application
-${CROSS_PREFIX}gcc -Ofast src/$1.c -o Debug/$1.elf ${EINCS} ${ELIBS} -le-hal -le-loader -lpthread
+${CROSS_PREFIX}gcc src/e2g.c -o bin/e2g.elf -I ${EINCS} -L ${ELIBS} -le-hal -le-loader
 
 # Build DEVICE side program
-#for speed optimization replace $2... with -Ofast
-#e-gcc --help=optimizers gives you hints
-e-gcc -T ${ELDF} -Ofast $2 $3 $4 $5 $6 $7 $8 $9 src/e_$1.c -o Debug/e_$1.elf -le-lib
+# -msmall16 still does not work with 2016.11 ESDK and gcc 5.4
+#-mshort-calls still works :D  
+
+#e-gcc -Ofast -T ${ELDF} -msmall16 src/e_e2g.c -o bin/e_e2g.elf -le-lib
+#e-gcc 5.4 makes poor use of the option -mfp-mode=int
+#  the option -mfp-iarith slows DOWN my program -- more than 20 % :/ 
 
-# Convert ebinary to SREC file
-e-objcopy --srec-forceS3 --output-target srec Debug/e_$1.elf Debug/e_$1.srec
+ e-gcc -Ofast -T ${ELDF} -mfp-mode=int -mshort-calls -m1reg-r63 src/e_e2g.c -o bin/e_e2g.elf -le-lib
 
+# trick to get the spare room usage: epiphany-elf-size your_program.elf ; with internal.ldf the value of 'dec' cannot be beyond 32767
+#
+#parallella@parallella:~/parallella-examples/tmp$ epiphany-elf-size bin/e_e2g.elf 
+#   text	   data	    bss	    dec	    hex	filename
+#  18730	   2148	   2808	  23686	   5c86	bin/e_e2g.elf
+#
diff --git a/paralle2/build_data.sh b/paralle2/build_data.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Builds the host-side data generator bin/build_data.elf from src/build_data.c.
+set -e
+
+ESDK=${EPIPHANY_HOME}
+ELIBS=${ESDK}/tools/host/lib
+EINCS=${ESDK}/tools/host/include
+ELDF=${ESDK}/bsps/current/internal.ldf
+
+SCRIPT=$(readlink -f "$0")
+EXEPATH=$(dirname "$SCRIPT")
+cd "$EXEPATH"  # quoted: an unquoted path containing spaces would be split into several arguments
+
+CROSS_PREFIX=
+case $(uname -p) in
+	arm*)
+		# Use native arm compiler (no cross prefix)
+		CROSS_PREFIX=
+		;;
+	   *)
+		# Use cross compiler
+		CROSS_PREFIX="arm-linux-gnueabihf-"
+		;;
+esac
+
+# Create output dir
+mkdir -p bin
+
+# Build HOST side application (paths quoted for the same reason as the cd above)
+"${CROSS_PREFIX}gcc" src/build_data.c -o bin/build_data.elf -I "${EINCS}" -L "${ELIBS}"
diff --git a/paralle2/buildasm.sh b/paralle2/buildasm.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Builds the host program bin/e2g.elf, then compiles the device program src/e_e2g.c to assembly (-S).
+set -e
+
+ESDK=${EPIPHANY_HOME}
+ELIBS=${ESDK}/tools/host/lib
+EINCS=${ESDK}/tools/host/include
+ELDF=${ESDK}/bsps/current/internal.ldf
+
+SCRIPT=$(readlink -f "$0")
+EXEPATH=$(dirname "$SCRIPT")
+cd "$EXEPATH"  # quoted: an unquoted path containing spaces would be split into several arguments
+
+CROSS_PREFIX=
+case $(uname -p) in
+	arm*)
+		# Use native arm compiler (no cross prefix)
+		CROSS_PREFIX=
+		;;
+	   *)
+		# Use cross compiler
+		CROSS_PREFIX="arm-linux-gnueabihf-"
+		;;
+esac
+
+# Create output dir
+mkdir -p bin
+
+# Build HOST side application (paths quoted for the same reason as the cd above)
+"${CROSS_PREFIX}gcc" src/e2g.c -o bin/e2g.elf -I "${EINCS}" -L "${ELIBS}" -le-hal -le-loader
+
+# Build DEVICE side program
+# -msmall16 still does not work with 2016.11 ESDK and gcc 5.4
+# -mshort-calls still works :D
+
+#e-gcc -Ofast -T ${ELDF} -msmall16 src/e_e2g.c -o bin/e_e2g.elf -le-lib
+# e-gcc 5.4 makes poor use of the option -mfp-mode=int
+#  the option -mfp-iarith slows DOWN my program -- more than 20 % :/
+
+e-gcc -Ofast -T "${ELDF}" -mfp-mode=int -mshort-calls -m1reg-r63 -mfp-iarith src/e_e2g.c -S -le-lib
diff --git a/paralle2/doc/tutorial1b-Parallella_starter_kit_SDK2016.3.1.pdf b/paralle2/doc/tutorial1b-Parallella_starter_kit_SDK2016.3.1.pdf
diff --git a/paralle2/run.sh b/paralle2/run.sh
@@ -2,8 +2,20 @@
 
 set -e
 
+BENCH_INDEX=""
 
-cd Debug
+# Validate the first argument: it must be present and consist only of decimal digits.
+if [ $# -lt 1 ]; then
+	echo "Usage:  ./run.sh numeric-value"
+	exit 1
+fi
+
+if [[ ! "$1" =~ ^[0-9]+$ ]]; then
+	echo "ERROR:  value must be numeric"
+	echo "Usage:  ./run.sh numeric-value"
+	exit 1
+fi
+BENCH_INDEX=$1
 
-./$1.elf $2 $3 $4 $5 $6 $7 $8 $9
+time bin/e2g.elf ${BENCH_INDEX}
 
diff --git a/paralle2/run_data.sh b/paralle2/run_data.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Runs the data generator; it writes bin/bench.bin relative to the current directory.
+set -e
+cd "$(dirname "$(readlink -f "$0")")"  # run from the script's own directory, like the build scripts do
+bin/build_data.elf
+
+echo "Building data: done.  Now you can run './build.sh'"
diff --git a/paralle2/src/C_common2.h b/paralle2/src/C_common2.h
@@ -0,0 +1,46 @@
+// Fixed-width integer names defined as macros so that stdint.h does not have to be included.
+#define uint8_t		unsigned char
+#define uint16_t	unsigned short
+#define uint32_t	unsigned int
+#define uint64_t	unsigned long long // <!> unsigned long = 4 bytes under ARM
+// NOTE(review): the signedness of plain char is implementation-defined, so int8_t below may be unsigned on some targets - confirm intent.
+#define int8_t		char
+#define int16_t		short
+#define int32_t		int
+#define int64_t		long long
+
+// Personal shorthands. LOOPn(x) counts the caller-declared variable fnN from 0 to x-1; LOOP(x,y) counts x from 0 to y-1.
+#define uc     unsigned char
+#define ull    unsigned long long
+#define pf      printf
+#define print   printf("\n");          // expansion already ends with ';'
+#define pfv(x)  printf("v: %d\n",x);   // expansion already ends with ';'
+#define LOOP1(x) for(fn1=0;fn1<(x);fn1++)
+#define LOOP2(x) for(fn2=0;fn2<(x);fn2++)
+#define LOOP3(x) for(fn3=0;fn3<(x);fn3++)
+#define LOOP4(x) for(fn4=0;fn4<(x);fn4++)
+#define LOOP5(x) for(fn5=0;fn5<(x);fn5++)
+#define LOOP6(x) for(fn6=0;fn6<(x);fn6++)
+#define LOOP7(x) for(fn7=0;fn7<(x);fn7++)
+#define LOOP8(x) for(fn8=0;fn8<(x);fn8++)
+#define LOOP(x,y) for((x)=0;(x)<(y);(x)++) // arguments parenthesized so a bound such as c?3:5 is compared as a whole
+
+// #######################################
+// Conditionals named after x86 flag tests (jz, setz, cmovz...): ifz/ife run the body when x is zero, ifnz/ifne when x is non-zero.
+#define ifz(x) if(!(x))
+#define ifnz(x) if(x)
+#define ife(x) if(!(x))
+#define ifne(x) if(x)
+// Single-bit tests: ifbool* runs the body when bit number 'bit' of x is set, ifzbool* when it is clear (32-bit or 64-bit mask).
+#define ifzbool32(x,bit) ifz((x)&(1U<<(bit))) 
+#define ifbool32(x,bit)  if((x)&(1U<<(bit))) 
+#define ifzbool64(x,bit) ifz((uint64_t)(x)&(1ULL<<(bit)))
+#define ifbool64(x,bit)  if((uint64_t)(x)&(1ULL<<(bit)))
+
+// __asm__ __volatile__ ("idle"); // rest!
+
+/* reminder
+#define ALIGN(x)   __attribute__ ((aligned (x))) 
+#define PACKED     __attribute__ ((packed)) 
+#define SECTION(x) __attribute__ ((section (x)))
+*/
diff --git a/paralle2/src/build_data.c b/paralle2/src/build_data.c
@@ -0,0 +1,39 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#define BENCH_INIT 0
+#define BENCH_MAX  1023 // NOTE(review): BENCH_INIT and BENCH_MAX are not referenced anywhere in this file - confirm they are still needed
+// tbench: 16 records of 17 bytes each; main() writes the whole table to bin/bench.bin. The meaning of the bytes is not documented here.
+const unsigned char tbench[16][17]={
+  { 0xFF,0xED,0xDF,0xFC,0xFF,0xF4,0xFF,0xFF,0x25,0x7E,0xEB,0xEF,0x79,0x02,0x05,0x84,0x43 },
+  { 0xE5,0xFF,0xFB,0xFB,0xEF,0xFE,0xFF,0xFB,0xF5,0xFA,0x56,0xBE,0x39,0x19,0x17,0x15,0x44 },
+  { 0xFE,0xBF,0xFF,0xDF,0xEC,0xFB,0xFF,0xFE,0x8D,0x1F,0xAF,0xBF,0x82,0x23,0x45,0x50,0x23 },
+  { 0xFF,0xF5,0xFF,0xFF,0x6E,0xFF,0xFF,0x3B,0x5C,0x0F,0xDF,0xF7,0x32,0x79,0x03,0x97,0x23 },
+  { 0xBF,0xFF,0xFF,0xF9,0x7F,0xF7,0xDE,0xDF,0x3C,0x3E,0x9D,0xFF,0x55,0x58,0x03,0x46,0x33 },
+  { 0xE6,0xD7,0xDF,0xFF,0xFF,0xFF,0xEF,0xFD,0xDD,0x6E,0x5B,0x6F,0x79,0x13,0x30,0x83,0x43 },
+  { 0xFE,0xFB,0xF3,0xEB,0xFF,0x7F,0xFD,0xFF,0xF9,0xE7,0xC8,0xFD,0x63,0x00,0x35,0x33,0x34 },
+  { 0xEF,0xF7,0xFD,0xF7,0xFE,0xF1,0xFF,0xFF,0xF8,0xBB,0x4F,0xAF,0x27,0x50,0x62,0x30,0x13 },
+  { 0xFE,0xF7,0xF7,0xF0,0xFB,0xFF,0xFF,0xFF,0xF9,0x3E,0x5F,0x4F,0x55,0x53,0x10,0x57,0x43 },
+  { 0xBF,0xF6,0xDF,0xEF,0xFF,0xFE,0xBF,0xF7,0x6C,0x9E,0x9B,0xFF,0x33,0x22,0x22,0x98,0x23 },
+  { 0xFF,0xEF,0x4F,0xBB,0xBF,0xBF,0xFF,0xFF,0x68,0xFC,0xEF,0xE7,0x69,0x31,0x02,0x93,0x13 },
+  { 0xBB,0xFD,0xF7,0xFF,0x9F,0xFF,0xFB,0xFE,0xE1,0xFB,0xAD,0xDD,0x22,0x37,0x15,0x60,0x34 },
+  { 0x7F,0xCE,0xFF,0xBD,0xFF,0xFE,0xEF,0xFF,0xE5,0x7A,0x78,0xFF,0x13,0x50,0x07,0x00,0x34 },
+  { 0xFC,0xFF,0x7F,0xFE,0xDF,0xFD,0x3F,0xFF,0x64,0xFA,0xE7,0x7F,0x82,0x58,0x99,0x05,0x14 },
+  { 0xFF,0x7B,0xF7,0xB3,0xFF,0xFF,0xFE,0xBF,0xB7,0x36,0xEC,0xFE,0x64,0x57,0x11,0x06,0x44 },
+  { 0xFD,0xEF,0xDF,0xFB,0x7F,0xFF,0xF2,0xFF,0xF2,0x73,0x3F,0x3F,0x19,0x96,0x07,0x19,0x14 },
+};
+
+/* Writes 2048 copies of the tbench table to bin/bench.bin; returns 0 on success, exits with -1 on any I/O error. */
+int main(void) {
+  unsigned i;
+  FILE *f = fopen("bin/bench.bin", "wb");
+  if(f == NULL) { printf("Error writing bin/bench.bin\n"); exit(-1); }
+  for(i=0; i<2048; i++) {
+    /* a short write (e.g. disk full) must not pass silently */
+    if(fwrite(tbench, sizeof tbench, 1, f) != 1) { printf("Error writing bin/bench.bin\n"); fclose(f); exit(-1); }
+  }
+  /* fclose flushes buffered data, so its result matters for a stream opened for writing */
+  if(fclose(f) != 0) { printf("Error writing bin/bench.bin\n"); exit(-1); }
+  return 0;
+}