diff --git a/paralle2/README.md b/paralle2/README.md index d165035..928cca0 100644 --- a/paralle2/README.md +++ b/paralle2/README.md @@ -1,22 +1,45 @@ -Pseudo Eternity II solver under Parallella ! +# A 10x10 Eternity II solver -Demo with a 10x10 subproblem with 16 cores running in parallel. +## Build and Run -A single eCore reaches 3 Mn/s, that's a tremendous 48 Mn/s for 4.7 W ! -My high-end computer reaches 130 Mn/s with a single core and heavy optimizations, but it consumes 72 W. +Build the data first (this creates bin/bench.bin, a set of benchmarks): -This is only the beginning with a basic algorithm, I'm confident we can reach 80 Mn/s with a single 16-core Parallella :D -For your information a single ARM core with the same basic algorithm reaches 6 Mn/s. -This kind of problem would be a good candidate to run under the unused parts of the FPGA... that's another story. For experts. + ./build_data.sh + ./run_data.sh -*** + ./build.sh + ./run.sh value -How to run the demo under a 16-core Parallella: -./build.sh e2 -time ./run.sh e2 + Want assembly output? Use ./buildasm.sh + Cross-compiling for an x86_64 platform? You can use ./x86*.sh + +## Benchmarks -Please check you use a 16-core Epiphany like mine: E16G301, a Kickstarter model with Zynq 7020 and a 'headless' configuration. -If not the case, you may adapt the sources: src/e2.c and src/e_e2.c -This kind of problem is a perfect candidate for clusters too. -You can easily tweak this code if you have a 64-core Parallella (if you don't know how to code it, just sell it to me lol) +All programs are written in plain C, sometimes with some assembly. +Mn/s/W = Million nodes per second per Watt + + +GPU OpenCL: with a Radeon 5770 graphics card, not even a tenth of the throughput of a modest x86 core. The numerous branches are a dead end, not to mention the watts. 
+Parallella, one ARM A9 core : 6 Mn/s ; 3.0 W ; 2 Mn/s/W +My high-end computer, one core : 166 Mn/s ; 72.0 W ; 2.3 Mn/s/W ; x86_64, Fedora Core 23, i7 5820k +Raspberry Pi 3 : 8-10 Mn/s/W iirc ; A53, 4-core, 1.2 GHz +My high-end computer, 12 threads: 1470 Mn/s ; 140.0 W ; 10.0 Mn/s/W +Odroid XU4 : 245 Mn/s ; 15.7 W ; 15.6 Mn/s/W ; 8-core ; deeply optimized, not much margin +Parallella 16-core Epiphany : 103 Mn/s ; 5.0 W ; 20.6 Mn/s/W ; remove the Ethernet cable to save 0.6 W (it is used for ssh with the headless Parabuntu distro) + + +So... +To my knowledge, Parallella is today the most energy-efficient platform for this highly recursive task... +although it does *not* use any floating-point arithmetic! + +Eagerly awaiting the 1024-core Epiphany V... + +## Author + +DonQuichotteComputers at gmail dot com +2017 + +## License + +BSD 3-Clause. diff --git a/paralle2/build.sh b/paralle2/build.sh index a7d2d11..b74c16d 100644 --- a/paralle2/build.sh +++ b/paralle2/build.sh @@ -3,8 +3,8 @@ set -e ESDK=${EPIPHANY_HOME} -ELIBS="-L ${ESDK}/tools/host/lib" -EINCS="-I ${ESDK}/tools/host/include" +ELIBS=${ESDK}/tools/host/lib +EINCS=${ESDK}/tools/host/include ELDF=${ESDK}/bsps/current/internal.ldf SCRIPT=$(readlink -f "$0") @@ -23,14 +23,25 @@ case $(uname -p) in ;; esac +# Create output dir +mkdir -p bin + # Build HOST side application -${CROSS_PREFIX}gcc -Ofast src/$1.c -o Debug/$1.elf ${EINCS} ${ELIBS} -le-hal -le-loader -lpthread +${CROSS_PREFIX}gcc src/e2g.c -o bin/e2g.elf -I ${EINCS} -L ${ELIBS} -le-hal -le-loader # Build DEVICE side program -#for speed optimization replace $2... 
with -Ofast -#e-gcc --help=optimizers gives you hints -e-gcc -T ${ELDF} -Ofast $2 $3 $4 $5 $6 $7 $8 $9 src/e_$1.c -o Debug/e_$1.elf -le-lib +# -msmall16 still does not work with 2016.11 ESDK and gcc 5.4 +#-mshort-calls still works :D + +#e-gcc -Ofast -T ${ELDF} -msmall16 src/e_e2g.c -o bin/e_e2g.elf -le-lib +#e-gcc 5.4 makes poor use of the option -mfp-mode=int +# the option -mfp-iarith slows DOWN my program -- more than 20 % :/ -# Convert ebinary to SREC file -e-objcopy --srec-forceS3 --output-target srec Debug/e_$1.elf Debug/e_$1.srec + e-gcc -Ofast -T ${ELDF} -mfp-mode=int -mshort-calls -m1reg-r63 src/e_e2g.c -o bin/e_e2g.elf -le-lib +# trick to get the spare room usage: epiphany-elf-size your_program.elf ; with internal.ldf the value of 'dec' cannot be beyond 32767 +# +#parallella@parallella:~/parallella-examples/tmp$ epiphany-elf-size bin/e_e2g.elf +# text data bss dec hex filename +# 18730 2148 2808 23686 5c86 bin/e_e2g.elf +# diff --git a/paralle2/build_data.sh b/paralle2/build_data.sh new file mode 100644 index 0000000..587ae30 --- /dev/null +++ b/paralle2/build_data.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e + +ESDK=${EPIPHANY_HOME} +ELIBS=${ESDK}/tools/host/lib +EINCS=${ESDK}/tools/host/include +ELDF=${ESDK}/bsps/current/internal.ldf + +SCRIPT=$(readlink -f "$0") +EXEPATH=$(dirname "$SCRIPT") +cd $EXEPATH + +CROSS_PREFIX= +case $(uname -p) in + arm*) + # Use native arm compiler (no cross prefix) + CROSS_PREFIX= + ;; + *) + # Use cross compiler + CROSS_PREFIX="arm-linux-gnueabihf-" + ;; +esac + +# Create output dir +mkdir -p bin + +# Build HOST side application +${CROSS_PREFIX}gcc src/build_data.c -o bin/build_data.elf -I ${EINCS} -L ${ELIBS} diff --git a/paralle2/buildasm.sh b/paralle2/buildasm.sh new file mode 100644 index 0000000..d513d73 --- /dev/null +++ b/paralle2/buildasm.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -e + +ESDK=${EPIPHANY_HOME} +ELIBS=${ESDK}/tools/host/lib +EINCS=${ESDK}/tools/host/include +ELDF=${ESDK}/bsps/current/internal.ldf + 
+SCRIPT=$(readlink -f "$0") +EXEPATH=$(dirname "$SCRIPT") +cd $EXEPATH + +CROSS_PREFIX= +case $(uname -p) in + arm*) + # Use native arm compiler (no cross prefix) + CROSS_PREFIX= + ;; + *) + # Use cross compiler + CROSS_PREFIX="arm-linux-gnueabihf-" + ;; +esac + +# Create output dir +mkdir -p bin + +# Build HOST side application +${CROSS_PREFIX}gcc src/e2g.c -o bin/e2g.elf -I ${EINCS} -L ${ELIBS} -le-hal -le-loader + +# Build DEVICE side program +# -msmall16 still does not work with 2016.11 ESDK and gcc 5.4 +#-mshort-calls still works :D + +#e-gcc -Ofast -T ${ELDF} -msmall16 src/e_e2g.c -o bin/e_e2g.elf -le-lib +#e-gcc 5.4 makes poor use of the option -mfp-mode=int +# the option -mfp-iarith slows DOWN my program -- more than 20 % :/ + + e-gcc -Ofast -T ${ELDF} -mfp-mode=int -mshort-calls -m1reg-r63 -mfp-iarith src/e_e2g.c -S -le-lib diff --git a/paralle2/doc/tutorial1b-Parallella_starter_kit_SDK2016.3.1.pdf b/paralle2/doc/tutorial1b-Parallella_starter_kit_SDK2016.3.1.pdf new file mode 100644 index 0000000..31324b1 Binary files /dev/null and b/paralle2/doc/tutorial1b-Parallella_starter_kit_SDK2016.3.1.pdf differ diff --git a/paralle2/run.sh b/paralle2/run.sh index e332839..3c78ee8 100644 --- a/paralle2/run.sh +++ b/paralle2/run.sh @@ -2,8 +2,20 @@ set -e +BENCH_INDEX="" -cd Debug +if [ $# -lt 1 ]; then + echo "Usage: ./run.sh numeric-value" + exit 1 +else + if [[ ! "$1" =~ ^[0-9]+$ ]]; then + echo "ERROR: value must be numeric" + echo "Usage: ./run.sh numeric-value" + exit 1 + else + BENCH_INDEX=$1 + fi +fi -./$1.elf $2 $3 $4 $5 $6 $7 $8 $9 +time bin/e2g.elf ${BENCH_INDEX} diff --git a/paralle2/run_data.sh b/paralle2/run_data.sh new file mode 100644 index 0000000..7da8aff --- /dev/null +++ b/paralle2/run_data.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -e + +bin/build_data.elf + +echo "Building data: done. 
Now you can run './build.sh'" diff --git a/paralle2/src/C_common2.h b/paralle2/src/C_common2.h new file mode 100644 index 0000000..ebea59a --- /dev/null +++ b/paralle2/src/C_common2.h @@ -0,0 +1,46 @@ +// avoid stdint.h +#define uint8_t unsigned char +#define uint16_t unsigned short +#define uint32_t unsigned int +#define uint64_t unsigned long long // unsigned long = 4 bytes under ARM + +#define int8_t char +#define int16_t short +#define int32_t int +#define int64_t long long + +// my semantic +#define uc unsigned char +#define ull unsigned long long +#define pf printf +#define print printf("\n"); +#define pfv(x) printf("v: %d\n",x); +#define LOOP1(x) for(fn1=0;fn1<(x);fn1++) +#define LOOP2(x) for(fn2=0;fn2<(x);fn2++) +#define LOOP3(x) for(fn3=0;fn3<(x);fn3++) +#define LOOP4(x) for(fn4=0;fn4<(x);fn4++) +#define LOOP5(x) for(fn5=0;fn5<(x);fn5++) +#define LOOP6(x) for(fn6=0;fn6<(x);fn6++) +#define LOOP7(x) for(fn7=0;fn7<(x);fn7++) +#define LOOP8(x) for(fn8=0;fn8<(x);fn8++) +#define LOOP(x,y) for(x=0;x +#include +#include + +#define BENCH_INIT 0 +#define BENCH_MAX 1023 + +const unsigned char tbench[16][17]={ + { 0xFF,0xED,0xDF,0xFC,0xFF,0xF4,0xFF,0xFF,0x25,0x7E,0xEB,0xEF,0x79,0x02,0x05,0x84,0x43 }, + { 0xE5,0xFF,0xFB,0xFB,0xEF,0xFE,0xFF,0xFB,0xF5,0xFA,0x56,0xBE,0x39,0x19,0x17,0x15,0x44 }, + { 0xFE,0xBF,0xFF,0xDF,0xEC,0xFB,0xFF,0xFE,0x8D,0x1F,0xAF,0xBF,0x82,0x23,0x45,0x50,0x23 }, + { 0xFF,0xF5,0xFF,0xFF,0x6E,0xFF,0xFF,0x3B,0x5C,0x0F,0xDF,0xF7,0x32,0x79,0x03,0x97,0x23 }, + { 0xBF,0xFF,0xFF,0xF9,0x7F,0xF7,0xDE,0xDF,0x3C,0x3E,0x9D,0xFF,0x55,0x58,0x03,0x46,0x33 }, + { 0xE6,0xD7,0xDF,0xFF,0xFF,0xFF,0xEF,0xFD,0xDD,0x6E,0x5B,0x6F,0x79,0x13,0x30,0x83,0x43 }, + { 0xFE,0xFB,0xF3,0xEB,0xFF,0x7F,0xFD,0xFF,0xF9,0xE7,0xC8,0xFD,0x63,0x00,0x35,0x33,0x34 }, + { 0xEF,0xF7,0xFD,0xF7,0xFE,0xF1,0xFF,0xFF,0xF8,0xBB,0x4F,0xAF,0x27,0x50,0x62,0x30,0x13 }, + { 0xFE,0xF7,0xF7,0xF0,0xFB,0xFF,0xFF,0xFF,0xF9,0x3E,0x5F,0x4F,0x55,0x53,0x10,0x57,0x43 }, + { 
0xBF,0xF6,0xDF,0xEF,0xFF,0xFE,0xBF,0xF7,0x6C,0x9E,0x9B,0xFF,0x33,0x22,0x22,0x98,0x23 }, + { 0xFF,0xEF,0x4F,0xBB,0xBF,0xBF,0xFF,0xFF,0x68,0xFC,0xEF,0xE7,0x69,0x31,0x02,0x93,0x13 }, + { 0xBB,0xFD,0xF7,0xFF,0x9F,0xFF,0xFB,0xFE,0xE1,0xFB,0xAD,0xDD,0x22,0x37,0x15,0x60,0x34 }, + { 0x7F,0xCE,0xFF,0xBD,0xFF,0xFE,0xEF,0xFF,0xE5,0x7A,0x78,0xFF,0x13,0x50,0x07,0x00,0x34 }, + { 0xFC,0xFF,0x7F,0xFE,0xDF,0xFD,0x3F,0xFF,0x64,0xFA,0xE7,0x7F,0x82,0x58,0x99,0x05,0x14 }, + { 0xFF,0x7B,0xF7,0xB3,0xFF,0xFF,0xFE,0xBF,0xB7,0x36,0xEC,0xFE,0x64,0x57,0x11,0x06,0x44 }, + { 0xFD,0xEF,0xDF,0xFB,0x7F,0xFF,0xF2,0xFF,0xF2,0x73,0x3F,0x3F,0x19,0x96,0x07,0x19,0x14 }, +}; + +int main(void) { + unsigned i, j; + FILE *f; + + f=fopen("bin/bench.bin", "wb"); + if(f == NULL) { printf("Error writing bin/bench.bin\n"); exit(-1); } + + for(i=0; i<2048; i++) + j=fwrite(tbench, 17*16, 1, f); + fclose(f); + + return 0; +} diff --git a/paralle2/src/e2g.c b/paralle2/src/e2g.c new file mode 100644 index 0000000..93e1ba6 --- /dev/null +++ b/paralle2/src/e2g.c @@ -0,0 +1,199 @@ +#include +#include +#include +#include +#include +#include // HOST side ; mandatory + +#include "C_common2.h" // common definitions for C +#include "e2g_common.h" // common definitions for EII project + +#define BENCH_MIN 0 //min bench to start with +#define BENCH_MAX 1024 //max bench to start with +#define BENCH_N 1 //16 //16 to solve per core +#define BENCH_LIMIT 10000 //stop after x benchs done ; not implemented actually + +//####################################### + +void Epiphany_Boot(e_platform_t *epiphany) { + e_init(NULL); // initialise the system, establish connection to the device + e_reset_system(); // reset the Epiphany chip + e_get_platform_info(epiphany);// get the configuration info for the parallella platform +} + +//####################################### +//printf("%-2d"...) 
-> left align +//GOAL: display an array of node numbers +void Node_Board_Print2(uint *tsolN) { + int fn1, fn2, fn3=0, place; + + print + LOOP2(9) + pf(" %d", 1+fn2); + pf(" %2d\n", 1+fn2); + //pf(" - %c\n", 'A'); + //pf(" - %c\n", 'B'); + + LOOP1(8) { + LOOP2(10) { + place=(fn1+2)*16 + fn2;//+2 because basis is C1 + ifnz(tsolN[fn3]) { + pf(" %012u", tsolN[fn3]); + } + else { + pf(" %c%d ", 'C'+fn1, 1+fn2);//'C' because basis is C1 + if(fn2!=9) pf(" "); + } + fn3++; + } + pf(" - %c\n", 'C'+fn1);//basis is C1 + } +} + +//####################################### +//print out result +int64_t Output_Print(Soutput out) { + int64_t l1=0; + int fn1; + + LOOP1(DAM_SZ) l1+=out.globaltsolN[fn1]; + pf("\n %012llu nodes\n", l1); + pf("\n res: %09u\n", out.globalres); + + Node_Board_Print2(out.globaltsolN); + + return l1 + out.globalres; +} + +//####################################### + +int main(int argc, char *argv[]) { + // Epiphany input/output through shared RAM ; details: e2g_common.h + Sio fromio;//Sio *fromio=(Sio *)malloc(sizeof(Sio)); + int64_t l1=0; + int row, col, i, j, fn1, fn2, bench_start=BENCH_MIN, toccN[CORE_N]={0}, benchlimit=0; + e_platform_t epiphany;// Epiphany platform configuration + e_epiphany_t dev; + FILE *fin; + char *tbench=(char *)malloc(MAX_CORE_N * 17 * 16);; + + if(argc > 1) { + i=atoi(argv[1]); if(i < BENCH_MAX) bench_start=i; + } + + //get data + fin=fopen("./bin/bench.bin", "rb"); + ifz(fin) { printf("Error reading file bin/bench.bin ; did you generate it with build_data.sh ?\n"); exit(-1); } + fseek(fin, 17 * bench_start, 0); + i=fread(tbench, MAX_CORE_N * 17 * 16, 1, fin); + fclose(fin); + + printf("\n\nEternity II running under Parallella :) \n\n\n"); + + Epiphany_Boot(&epiphany); + + // Create a workgroup using all of the cores + e_open(&dev, 0, 0, epiphany.rows, epiphany.cols); + e_reset_group(&dev); + + // Load the device code into each core of the chip, and don't start it yet + e_load_group("bin/e_e2g.elf", &dev, 0, 0, epiphany.rows, 
epiphany.cols, E_FALSE); + + // Set the maximum per core test value on each core at address 0x7020 + i=0; + for(row=0;row> (4 * (fn1&1))) & 15; //format: 2 nibbles per byte + fromio.in.tdam[C1N] =(tbench[j + 16] & 15) - 1; + fromio.in.tdam[C10N]=(tbench[j + 16] >> 4) - 1; + fromio.in.tuile2do= *(uint64_t *)&tbench[j + 0]; + fromio.in.bordertuile2do=*(uint *)&tbench[j + 8]; +LOOP1(10) pf("%u ", fromio.in.tdam[C1N+fn1]); print + + pf("0x%016llX tiles\n", fromio.in.tuile2do); + pf("0x%08X borders\n", fromio.in.bordertuile2do); + pf("sz(io) = %u\n", sizeof(Sio)); + LOOP1(DAM_SZ) fromio.out.globaltsolN[fn1]=0; + fromio.out.globalres=0; + + e_write(&dev, row, col, SHARED_IN, &fromio, sizeof(Sio)); +pf("i %u ; in written ; C1N = %u\n", i, fromio.in.tdam[C1N]); + + i++; + } + } + + // Start all of the cores + pf("Some results in a minute... starting the core workgroup...\n\n"); + e_start_group(&dev); + pf("... core workgroup started ; the whole test will last about 120 seconds...\n\n"); + + while(1) { + usleep(100000); + //pf("fromio.out.cmd: 0x%08X\n", fromio.out.cmd); + int done = 0; + + // wait for the cores to complete their work + i=0; + for(row=0;row= BENCH_LIMIT) break; + if(toccN[i] == BENCH_N) + done++; + else { + toccN[i]++; + pf("core %4u: done %2u times ; cmd 0x%08X.\n", i, toccN[i], fromio.out.cmd); + + fromio.out.cmd=CMD_INIT; + LOOP1(DAM_SZ) + fromio.in.tdam[fn1]=0; + j=17 * bench_start; // for demo purpose + + LOOP1(8) + fromio.in.tdam[C2N + fn1] = (tbench[j + 12 + (fn1/2)] >> (4 * (fn1&1))) & 15; //format: 2 nibbles per byte + fromio.in.tdam[C1N] =(tbench[j + 16] & 15) - 1; + fromio.in.tdam[C10N]=(tbench[j + 16] >> 4) - 1; + fromio.in.tuile2do= *(uint64_t *)&tbench[j + 0]; + fromio.in.bordertuile2do=*(uint *)&tbench[j + 8]; + LOOP1(DAM_SZ) fromio.out.globaltsolN[fn1]=0; + fromio.out.globalres=0; + + e_write(&dev, row, col, SHARED_IN, &fromio, sizeof(Sio)); +pf("i %u ; in written again ; C1N = %u\n", i, fromio.in.tdam[C1N]); +//OBSOLETE ! 
esdk doc too :/ e_reset_core(&dev, row, col); + e_start(&dev, row, col); + } + } + + i++; + } + } + + if ( done >= CORE_N ) // some benchmarks are lengthy + break; + + if(benchlimit >= BENCH_LIMIT) break; + } + + e_finalize(); + pf("Crunched %015llu nodes.\n\n", l1); + + return 0; +} diff --git a/paralle2/src/e2g_common.h b/paralle2/src/e2g_common.h new file mode 100644 index 0000000..2fd506d --- /dev/null +++ b/paralle2/src/e2g_common.h @@ -0,0 +1,132 @@ +// 2017/01/28: 103 Mn/s (Million nodes per second) C version compared to previous 81 Mn/s assembly version. +// ELF support instead of SREC. Quicker load. File input. No more UNsigned integers, no more char loads, no more "ctz" and "popcount" instructions. +// Removed bug from multiple 0x6000 section inputs. +// Eagerly waiting for Epiphany V... + +#define CORE_N 16 //change it if needed ; our choice for standard 16-core Epiphany +#define STATS //undefine STATS to get full performance (from 111.2 to 103.4 s with a 16-core Parallella) +#define MAX_CORE_N 1024 //Epiphany V ready ;) + +// specific to the project +#define DAM_SZ 90 + +// to DEVICE +#pragma pack(4) +typedef struct S_input { + int64_t tuile2do; + int bordertuile2do; + int tdam[DAM_SZ]; + int east; +}Sinput; +// from DEVICE +typedef struct S_output { + int globaltsolN[DAM_SZ]; //int64_t is twice as long to execute, you need at least 6 ic to increment a 64-bit memory value :/ + int globalres; + int cmd; + int fn_idx; +}Soutput; +// shared MEMORY +typedef struct S_io { + Sinput in; + Soutput out; +}Sio; +// tmp variables for DEVICE, trying a workaround for the -msmall16 compilation option +typedef struct S_tmp { + //int fn_idx; + int ttiles[64 + 1 + 32]; + int j9e; + int j1n; +}Stmp; + +// global offset for shared RAM +#define SHARED_RAM (0x01000000) + +// a whole forum post for that +#define PERFECT_ALIGN8 __asm__ (".balignw 4, 0x01a2\n"); __asm__ (".balignl 8, 0xfc02fcef\n"); + +// Epiphany local offsets +#define SHARED_IN 0x6000 +#define SHARED_OUT 
(SHARED_IN + sizeof(Sinput)) +#define SHARED_RES (SHARED_OUT + DAM_SZ*4) // offset for result +#define SHARED_CMD (SHARED_OUT + DAM_SZ*4 + 4) // offset for 'cmd' +#define R_IDX (SHARED_OUT + sizeof(Soutput)) + +// commands for the Epiphany core +#define CMD_INIT 0x80000000 // host init +#define CMD_DONE 0x40000000 // eCore did the job properly (probably ; some bug might crush this word but it's highly improbable) + +// specific to the project +#ifdef STATS + #define macro_globaltrace(niveau) out.globaltsolN[niveau]++; +#else + #define macro_globaltrace(niveau) +#endif + +#define macro_globaltrace2(niveau) out.globaltsolN[niveau]++; + +#define NORTH 0 +#define EAST 1 +#define SOUTH 2 +#define WEST 3 + +#define B1N 0 +#define C1N 10 +#define C2N 11 +#define C3N 12 +#define C4N 13 +#define C5N 14 +#define C6N 15 +#define C7N 16 +#define C8N 17 +#define C9N 18 +#define C10N 19 +#define D1N 20 //etc + +#define G1N 50 +#define G2N 51 +#define G3N 52 +#define G4N 53 +#define G5N 54 +#define G6N 55 +#define G7N 56 +#define G8N 57 +#define G9N 58 +#define G10N 59 + +#define H1N 60 +#define H2N 61 +#define H3N 62 +#define H4N 63 +#define H5N 64 +#define H6N 65 +#define H7N 66 +#define H8N 67 +#define H9N 68 +#define H10N 69 + +#define I1N 70 +#define I2N 71 +#define I3N 72 +#define I4N 73 +#define I5N 74 +#define I6N 75 +#define I7N 76 +#define I8N 77 +#define I9N 78 +#define I10N 79 + +#define J1N 80 +#define J2N 81 +#define J3N 82 +#define J4N 83 +#define J5N 84 +#define J6N 85 +#define J7N 86 +#define J8N 87 +#define J9N 88 +#define J10N 89 + +#define BORDERCOLOR_D 0 +#define BORDERCOLOR_G 4 +#define BORDERCOLOR_I 9 +#define BORDERCOLOR_N 19 // 19 colors ; 1st one is empty, colors 1-4 stand for D(roite), 5-8 for G(auche), 9-18 for I(nterieur) diff --git a/paralle2/src/e_e2g.c b/paralle2/src/e_e2g.c new file mode 100644 index 0000000..9e281d7 --- /dev/null +++ b/paralle2/src/e_e2g.c @@ -0,0 +1,891 @@ +#include "e-lib.h" // mandatory even for a minimalist design -- 
e_get_coreid(), e_read(), e_write() + +//from notzed on the forum, "...gcc extended inline asm, 'cc' clobber_php.htm" +//volatile needed, the compiler may mix code without taking care of the condition flags :/ +unsigned int bitrev(unsigned int val) { + unsigned int res; + + __asm__ volatile ("bitr %[res],%[val]" + : [res] "=r" (res) + : [val] "r" (val) + : "cc"); + + return res; +} + +//#include "C_common2.h" // common definitions for C +// avoid stdint.h +#define uint8_t unsigned char +#define uint16_t unsigned short +#define uint32_t unsigned int +#define uint64_t unsigned long long // unsigned long = 4 bytes under 32-bit ARM + +#define int8_t char +#define int16_t short +#define int32_t int +#define int64_t long long + +// my semantic +#define uc unsigned char +#define ull unsigned long long +#define pf printf +#define print printf("\n"); +#define pfv(x) printf("v: %d\n",x); +#define LOOP1(x) for(fn1=0;fn1<(x);fn1++) +#define LOOP2(x) for(fn2=0;fn2<(x);fn2++) +#define LOOP3(x) for(fn3=0;fn3<(x);fn3++) +#define LOOP4(x) for(fn4=0;fn4<(x);fn4++) +#define LOOP5(x) for(fn5=0;fn5<(x);fn5++) +#define LOOP6(x) for(fn6=0;fn6<(x);fn6++) +#define LOOP7(x) for(fn7=0;fn7<(x);fn7++) +#define LOOP8(x) for(fn8=0;fn8<(x);fn8++) +#define LOOP(x,y) for(x=0;x ONE reliable way of coding is ONE structure for exchanging with the rest of the world +*/ + +volatile Sio io SECTION(".data_bank3"); + +#define in io.in +#define out io.out + +//####################################### + +Stmp tmp; + +//####################################### +//THE 'COMPUTE KERNEL' +//#include "e2c_solver.c" + +void BorderWest(const int, const int); +void InnerTile0(const int, const int); +void InnerTile1(const int, const int); +void InnerTile(const int, const int); +void InnerRow(const int, const int); +void InnerRow2(const int, const int); // with tinner_Upd +void BorderEast(const int, const int); +void BorderEastUpdate(const int, const int); +void Special_H10(const int, const int); +void Special_I1(const 
int, const int); +void Special_I10(const int, const int); +void BorderEastBottom(const int, const int); +void Special_J2(const int, const int); +void Special_Debug(const int, const int); // for debugging purpose + +void __attribute__ ((noinline)) Input_Copy(int, int *); + +//####################################### +//STATIC DATA + +const int tlscouleur_B2016[BORDERCOLOR_N+1]={ + 0x00000000, + 0x0870809A, 0x91032001, 0x42845140, 0x24080E24, + 0x000000FF, 0x00007F00, 0x00FF8000, 0xFF000000, + 0x00000001, 0x00018300, 0x03000406, 0x00000008, 0x00020830, 0x04041040, 0x38080000, 0x40100000, 0x00202000, 0x80C04080, + 0x00000000 +}; + +// colors 0-3 for D and G, 0-9 for I +const int tbordureD[32]={ 0x01, 0x00, 0x03, 0x00, 0x00, 0x03, 0x02, 0x00, 0x02, 0x03, 0x03, 0x03, 0x02, 0x01, 0x02, 0x00, 0x01, 0x01, 0x02, 0x03, 0x00, 0x00, 0x00, 0x02, 0x01, 0x02, 0x03, 0x00, 0x01, 0x03, 0x02, 0x01 }; +const int tbordureG[32]={ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 }; +const int tbordureI[32]={ 0x00, 0x02, 0x02, 0x03, 0x04, 0x04, 0x05, 0x09, 0x01, 0x01, 0x02, 0x04, 0x05, 0x08, 0x09, 0x01, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x02, 0x02, 0x05, 0x06, 0x06, 0x06, 0x07, 0x09 }; + +const int ttileN[100]={ + 1, 4, 3, 2, 1, 4, 4, 5, 2, 3, 5, 2, 5, 3, 0, 3, 3, 2, 0, 3, + 1, 5, 3, 5, 2, 0, 1, 2, 3, 3, 6, 1, 4, 3, 6, 1, 2, 1, 3, 2, + 2, 2, 2, 2, 4, 1, 0, 1, 4, 6, 5, 3, 2, 1, 1, 2, 4, 1, 1, 4, + 3, 0, 2, 5, 3, 5, 0, 1, 4, 1, 1, 4, 1, 5, 2, 4, 1, 6, 2, 0, + 0, 4, 1, 0, 4, 2, 7, 4, 3, 1, 5, 1, 2, 3, 1, 2, 2, 3, 4, 0 +}; + +// color format: G + 4*D (i.e tbordureG + 4*tbordureD) +const int tGDN[16]={ 4, 0, 4, 1, 1, 1, 2, 3, 1, 3, 2, 2, 2, 3, 1, 2 }; + +const int tGD[16][8]={ + { 1, 2, 3, 3, 4, 4, 7, 9 }, + { }, + { 15, 1, 20, 7, 21, 8, 22, 9 }, + { 27, 6, }, + { 0, 0, }, + { 13, 8, }, + { 16, 1, 17, 4, }, + { 24, 2, 28, 6, 31, 9, }, + { 6, 
5, }, + { 8, 1, 12, 5, 14, 9, }, + { 18, 5, 23, 9, }, + { 25, 2, 30, 7, }, + { 2, 2, 5, 4, }, + { 9, 1, 10, 2, 11, 4, }, + { 19, 6, }, + { 26, 5, 29, 6, } +}; + +#define VOIDTILE 64 +#define VOIDSOUTH 0 + +//tcount(11) = 2 tcount(19) = 2 tcount(22) = 3 tcount(27) = 2 tcount(36) = 2 tcount(38) = 2 +//const int tbordereast_uniquecolor[20]={ 1, 4, 5, 6, 7, 8, 9, 10, 12, 16, 17, 23, 24, 25, 28, 30, 32, 33, 37, }; //19 actually +//const int tbordereast_uniquetile[20]={ 0, 0, 4, 1, 24, 0, 27, 28, 0, 15, 17, 0, 29, 20, 0, 30, 0, 0, 0, }; //19 actually + +//from tmpbordure*.c +const int t14[100][8]={ // format: LSB = tile, 2nd byte = east, 3rd byte = south, MSB = 0 + { 0x00060500, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000901, 0x00020308, 0x00010613, 0x0009030C, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000506, 0x00010915, 0x0004030A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0004040D, 0x00080510, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010307, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050103, 0x00050918, 0x00000206, 0x00020102, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000712, 0x00080511, 0x00050000, 0x00020614, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00040917, 0x00000612, 0x00030309, 0x00070104, 0x0007050F, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00070105, 0x00010916, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000101, 0x0006030B, 0x0006040E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050503, 0x00090001, 0x00080705, 0x00070704, 0x00050202, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050919, 0x00060013, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00070823, 0x00030008, 0x0001051A, 0x00090720, 0x0001071B, 0x00000040, 0x00000040, 0x00000040 }, + { 
0x00000407, 0x0006021C, 0x0004041E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00060822, 0x00090119, 0x0001021A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000113, 0x0007051F, 0x00050821, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0006031D, 0x0001021B, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000816, 0x00000215, 0x0003000C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050006, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00090015, 0x0005011A, 0x00000502, 0x0003061C, 0x0007011B, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00030325, 0x00020924, 0x00090224, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00030225, 0x00000108, 0x00020826, 0x00020927, 0x00040328, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007072C, 0x0003000A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000614, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00080123, 0x00050429, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0006052B, 0x0006042A, 0x00020326, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00020224, 0x00020327, 0x00070120, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0009060B, 0x00070309, 0x00010208, 0x00040107, 0x0001090C, 0x0002040A, 0x00000040, 0x00000040 }, + { 0x0007061D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00020325, 0x00080226, 0x00090227, 0x00030428, 0x00000040, 
0x00000040, 0x00000040, 0x00000040 }, + { 0x00020225, 0x0007072D, 0x00000709, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0004000D, 0x00080732, 0x00040935, 0x0005062F, 0x00030228, 0x0004011E, 0x00000040, 0x00000040 }, + { 0x00090734, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00080631, 0x0002011C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007032D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00040936, 0x00050010, 0x0006052E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050630, 0x00080733, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0003040D, 0x0009060E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00030007, 0x0003041E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00070529, 0x0008062A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00020328, 0x0000020A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0000030D, 0x00090335, 0x00080737, 0x0001031E, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0006032F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007022C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00070332, 0x00060839, 0x00070437, 0x0006093C, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000717, 0x00030435, 0x0008083B, 0x00030836, 0x00050638, 0x0007083A, 0x00000040, 0x00000040 }, + { 0x00060811, 0x00020006, 0x00000600, 0x00030810, 0x0007070F, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000503, 0x0002011A, 0x0006071F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0008062B, 
0x00010002, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0008062E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00020729, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010003, 0x00090018, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00080122, 0x00030930, 0x0003042F, 0x00040938, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007073E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010621, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000518, 0x00070334, 0x00010119, 0x0008063D, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00070012, 0x00010113, 0x00060214, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00060014, 0x0001031C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00090530, 0x0000090B, 0x00060831, 0x0001071D, 0x0004052F, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0000090E, 0x0002082A, 0x00090538, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0002082B, 0x00000000, 0x0009083D, 0x0003082E, 0x00080121, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0005011F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010522, 0x00050011, 0x00060331, 0x00040839, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0004083C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00060012, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000805, 0x00000704, 0x00020920, 0x0002011B, 0x00000040, 0x00000040, 
0x00000040, 0x00000040 }, + { 0x0004072C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00040832, 0x0003072D, 0x00030009, 0x00050934, 0x00090833, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00090017, 0x00040837, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007073E, 0x00040229, 0x0001061F, 0x0000070F, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0003011D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007053E, 0x0005073E, 0x0003032D, 0x00010004, 0x0002042C, 0x0005000F, 0x00000040, 0x00000040 }, + { 0x00010223, 0x0004093A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050622, 0x00020723, 0x00090016, 0x00060521, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00030226, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00080639, 0x0009083B, 0x00090336, 0x0009073A, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000611, 0x00000310, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0005022B, 0x00030631, 0x00080439, 0x0005093D, 0x0009043C, 0x0004022A, 0x0005032E, 0x00000040 }, + { 0x00030432, 0x00010005, 0x00040437, 0x00030933, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0004093B, 0x0008093F, 0x0009083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0008083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010001, 0x00070417, 0x00080116, 0x00050518, 0x00020115, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010519, 0x00000040, 
0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00020224, 0x00030227, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00040435, 0x00080436, 0x0000010C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0008063C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00060330, 0x00060438, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0003000B, 0x0004000E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00030534, 0x00010220, 0x0008043A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0008043B, 0x00070333, 0x0006053D, 0x0008083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 } +}; + +ALIGN(8) +int ttileN_upd[100]={0}; // ? does not work yet + +static inline void TileN_Update(void) { + int fn1; + LOOP1(100) ttileN_upd[fn1]=ttileN[fn1]; + + ttileN_upd[1]=tmp.ttiles[0]; + //modele : tmp.ttileN_upd[1]=tmp.ttiles[0]; + /* + ttileN_upd[1]=tmp.ttiles[0]; + ttileN_upd[4]=tmp.ttiles[15]; + ttileN_upd[5]=tmp.ttiles[16]; + ttileN_upd[6]=tmp.ttiles[8]; + ttileN_upd[7]=tmp.ttiles[9]; + ttileN_upd[8]=tmp.ttiles[1]; + ttileN_upd[9]=tmp.ttiles[24]; + ttileN_upd[10]=tmp.ttiles[25]; + ttileN_upd[12]=tmp.ttiles[3]; + ttileN_upd[16]=tmp.ttiles[4]; + ttileN_upd[17]=tmp.ttiles[17]; + ttileN_upd[23]=tmp.ttiles[26]; + ttileN_upd[24]=tmp.ttiles[27]; + ttileN_upd[25]=tmp.ttiles[28]; + ttileN_upd[28]=tmp.ttiles[20]; + ttileN_upd[30]=tmp.ttiles[30]; + ttileN_upd[32]=tmp.ttiles[21]; + ttileN_upd[33]=tmp.ttiles[13]; + ttileN_upd[37]=tmp.ttiles[31]; + */ +} + +//typedef void (*ptrFonction) (const signed int, const signed int); + +ALIGN(8) +void (* tfncall[78]) (const int, const int) ={ + BorderWest, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + 
Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + BorderEast, // C10 + //Special_Debug, + + BorderWest, + InnerRow, // replaces InnerTile() * 8 + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + BorderEast, // BorderEastUpdate, // D10 // ? fail + + BorderWest, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + BorderEast, // E10 + + BorderWest, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + BorderEast, // F10 + +//Special_Debug, + + BorderWest, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + BorderEast, // G10 + + BorderWest, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_H10, // H10 + + Special_I1, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_I10, // I10 + + BorderEastBottom, // J9 + BorderEastBottom, // J8 + BorderEastBottom, // J7 + BorderEastBottom, // J6 + BorderEastBottom, // J5 + BorderEastBottom, // J4 + BorderEastBottom, // J3 + Special_J2, // J2, last square +}; + +//dynamic +int tborderwestN[4]={0}; +int tbordereastN[40]={0}; + +int tborderwestT[4][9]={0}; +int tborderwestE[4][9]={0}; +int tborderwestS[4][9]={0}; + +int tbordereastT[40][4]={0}; +int tbordereastS[40][4]={0}; + +// sandwiched BorderWest ; J1N=0 or 3 for this specific problem +void Special_I1(const int north, const int northI) { + int couleur, tuileN, fn1, tuile, east; + void (*ptr)(const int, const int); + 
macro_globaltrace2(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=north + 4*0; + tmp.j9e=3; + tuileN=tGDN[couleur]; + LOOP1(tuileN) { + tuile=tGD[couleur][fn1*2 + 0]; + ifnz(tmp.ttiles[64+1 + tuile]) { + east=tGD[couleur][fn1*2 + 1]; + tmp.ttiles[64+1 + tuile]=0; + out.fn_idx++; + (*ptr)(east, northI+1); + out.fn_idx--; + tmp.ttiles[64+1 + tuile]=1; + } + } + + couleur=north + 4*3; + tmp.j9e=0; + tuileN=tGDN[couleur]; + LOOP1(tuileN) { + tuile=tGD[couleur][fn1*2 + 0]; + ifnz(tmp.ttiles[64+1 + tuile]) { + east=tGD[couleur][fn1*2 + 1]; + tmp.ttiles[64+1 + tuile]=0; + out.fn_idx++; + (*ptr)(east, northI+1); + out.fn_idx--; + tmp.ttiles[64+1 + tuile]=1; + } + } +} + +void BorderWest(const int north, const int northI) { + int couleur, tuileN, fn1, tuile, east; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + out.fn_idx++; + + couleur=north; // in.tdam[C1N]; + tuileN=tborderwestN[couleur]; + LOOP1(tuileN) { + tuile=tborderwestT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + east=tborderwestE[couleur][fn1]; + in.tdam[northI+10]=tborderwestS[couleur][fn1]; + tmp.ttiles[64+1 + tuile]=0; + (*ptr)(east, northI+1); + tmp.ttiles[64+1 + tuile]=1; + } + } + + out.fn_idx--; +} + +void InnerTile(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, fn2, tuile, east; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=eastcolor + 10*in.tdam[northI]; + tuileN=ttileN[couleur]; + LOOP1(tuileN) { + fn2=t14[couleur][fn1]; + tuile=fn2 & 0xff; + ifnz(tmp.ttiles[tuile]) { + east=(fn2>>8) & 0xff; + in.tdam[northI+10]=fn2>>16; + tmp.ttiles[tuile]=0; + out.fn_idx++; + (*ptr)(east, northI+1); + out.fn_idx--; + tmp.ttiles[tuile]=1; + } + } +} + +void BorderEast(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, tuile; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + out.fn_idx++; + + 
couleur=eastcolor*4 + in.tdam[northI]; + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + in.tdam[northI+10]=tbordereastS[couleur][fn1]; + tmp.ttiles[64+1 + tuile]=0; + (*ptr)(in.tdam[northI+1], northI+1); + tmp.ttiles[64+1 + tuile]=1; + } + } + + out.fn_idx--; +} + +// BorderEast for the bottom line ; from J9 to J2 excluded ; only change to these tags '//#' +void BorderEastBottom(const int northI, const int eastcolor) { + int couleur, tuileN, fn1, tuile, nexteast; + void (*ptr)(const int, const int); + macro_globaltrace2(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=in.tdam[northI] * 4 + eastcolor; //# + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + tmp.ttiles[64+1 + tuile]=0; + out.fn_idx++; + (*ptr)(northI-1, tbordureG[tuile]); //# + out.fn_idx--; + tmp.ttiles[64+1 + tuile]=1; + } + } +} + +// BorderEastBottom with final check for J2 the last square +void Special_J2(const int northI, const int eastcolor) { + int couleur, tuileN, fn1, tuile, nexteast; + void (*ptr)(const int, const int); + macro_globaltrace2(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=in.tdam[northI] * 4 + eastcolor; //# + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + if(tbordureG[tuile] != 1) continue; // tdam[J1E] == 1 for this specific problem + + out.globalres++; // O_O reach this point after about 10^17 nodes... 
+ + } + } +} + +// BorderEast with I10 checkup +void Special_H10(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, tuile; + void (*ptr)(const int, const int); + macro_globaltrace2(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + out.fn_idx++; + + couleur=eastcolor*4 + in.tdam[northI]; + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + if(tbordureG[tuile] == 0) continue; // borders 0/1/x/0 do not exist on this specific problem + //in.tdam[northI+10]=tbordereastS[couleur][fn1]; + + tmp.ttiles[64+1 + tuile]=0; + (*ptr)(in.tdam[northI+1], northI+1); + tmp.ttiles[64+1 + tuile]=1; + } + } + + out.fn_idx--; +} + +// BorderEast with I10 strong constraint +void Special_I10(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, tuile; + void (*ptr)(const int, const int); + macro_globaltrace2(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + out.fn_idx++; + + couleur=eastcolor*4 + in.tdam[northI]; + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + if(tbordureG[tuile] != 2) continue; // J10N == 2 for this specific problem + + tmp.ttiles[64+1 + tuile]=0; + (*ptr)(northI+9, tmp.j9e); + tmp.ttiles[64+1 + tuile]=1; + } + } + + out.fn_idx--; +} + +//??? why does it NOT work ? 
should be 10 % faster +void BorderEastUpdate(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, tuile; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + TileN_Update(); + + out.fn_idx++; + + couleur=eastcolor*4 + in.tdam[northI]; + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + in.tdam[northI+10]=tbordereastS[couleur][fn1]; + tmp.ttiles[64+1 + tuile]=0; + (*ptr)(in.tdam[northI+1], northI+1); + tmp.ttiles[64+1 + tuile]=1; + } + } + + out.fn_idx--; +} + +void InnerTile0(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, fn2, tuile, east; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=eastcolor + 10*in.tdam[northI]; + tuileN=ttileN[couleur]; + LOOP1(tuileN) { + fn2=t14[couleur][fn1]; + tuile=fn2 & 0xff; + ifnz(tmp.ttiles[tuile]) { + east=(fn2>>8) & 0xff; + in.tdam[northI+10]=fn2>>16; + tmp.ttiles[tuile]=0; + out.fn_idx++; + (*ptr)(east, northI+1); + out.fn_idx--; + tmp.ttiles[tuile]=1; + } + } +} + +void InnerTile1(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, fn2, tuile, east; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + out.fn_idx++; + + couleur=eastcolor + 10*in.tdam[northI]; + tuileN=ttileN_upd[couleur]; + LOOP1(tuileN) { + fn2=t14[couleur][fn1]; + tuile=fn2 & 0xff; + ifnz(tmp.ttiles[tuile]) { + east=(fn2>>8) & 0xff; + in.tdam[northI+10]=fn2>>16; + tmp.ttiles[tuile]=0; + (*ptr)(east, northI+1); + tmp.ttiles[tuile]=1; + } + } + + out.fn_idx--; +} + +//point de vigilance : teast[idx - 1] ; le reste : std +#define macro_innerrow_loop(idx)\ + macro_globaltrace(out.fn_idx - 8 + idx);\ + tcouleur[idx]=teast[idx - 1] + 10*in.tdam[northI +idx];\ + ttuileN[idx]=ttileN[tcouleur[idx]];\ + LOOP(tfn1[idx], ttuileN[idx]) {\ + tfn2[idx]=t14[tcouleur[idx]][tfn1[idx]];\ + 
ttuile[idx]=tfn2[idx] & 0xff;\ + ifnz(tmp.ttiles[ttuile[idx]]) {\ + teast[idx]=(tfn2[idx]>>8) & 0xff;\ + in.tdam[south + idx]=tfn2[idx] >> 16;\ + tmp.ttiles[ttuile[idx]]=0; + +#define macro_innerrow_loopz(idx) tmp.ttiles[ttuile[idx]]=1; } } + + +void InnerRow(const int eastcolor, const int northI) { + int tcouleur[8], ttuileN[8], tfn1[8], tfn2[8], ttuile[8], teast[8]; + int south=northI+10; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + + out.fn_idx +=8; + ptr=tfncall[out.fn_idx]; + + tcouleur[0]=eastcolor + 10*in.tdam[northI]; + ttuileN[0]=ttileN[tcouleur[0]]; + LOOP(tfn1[0], ttuileN[0]) { + tfn2[0]=t14[tcouleur[0]][tfn1[0]]; + ttuile[0]=tfn2[0] & 0xff; + ifnz(tmp.ttiles[ttuile[0]]) { + teast[0]=(tfn2[0]>>8) & 0xff; + in.tdam[south +0]=tfn2[0] >> 16; + tmp.ttiles[ttuile[0]]=0; + + // (*ptr)(teast[0], northI+1); + /* + tcouleur[1]=teast[0] + 10*in.tdam[northI +1]; + ttuileN[1]=ttileN[tcouleur[1]]; + LOOP(tfn1[1], ttuileN[1]) { + tfn2[1]=t14[tcouleur[1]][tfn1[1]]; + ttuile[1]=tfn2[1] & 0xff; + ifnz(tmp.ttiles[ttuile[1]]) { + teast[1]=(tfn2[1]>>8) & 0xff; + in.tdam[south +1]=tfn2[1] >> 16; + tmp.ttiles[ttuile[1]]=0; + out.fn_idx++; + */ + macro_innerrow_loop(1) + macro_innerrow_loop(2) + macro_innerrow_loop(3) + macro_innerrow_loop(4) + macro_innerrow_loop(5) + macro_innerrow_loop(6) + macro_innerrow_loop(7) + + (*ptr)(teast[7], northI + 8); + + macro_innerrow_loopz(7) + macro_innerrow_loopz(6) + macro_innerrow_loopz(5) + macro_innerrow_loopz(4) + macro_innerrow_loopz(3) + macro_innerrow_loopz(2) + macro_innerrow_loopz(1) + + tmp.ttiles[ttuile[0]]=1; + } + } + + out.fn_idx -=8; +} + + +//point de vigilance : teast[idx - 1] ; le reste : std +#define macro_innerrow_loop2(idx)\ + macro_globaltrace(out.fn_idx - 8 + idx);\ + tcouleur[idx]=teast[idx - 1] + 10*in.tdam[northI +idx];\ + ttuileN[idx]=ttileN_upd[tcouleur[idx]];\ + LOOP(tfn1[idx], ttuileN[idx]) {\ + tfn2[idx]=t14[tcouleur[idx]][tfn1[idx]];\ + ttuile[idx]=tfn2[idx] & 0xff;\ + 
ifnz(tmp.ttiles[ttuile[idx]]) {\ + teast[idx]=(tfn2[idx]>>8) & 0xff;\ + in.tdam[south + idx]=tfn2[idx] >> 16;\ + tmp.ttiles[ttuile[idx]]=0; + + +void InnerRow2(const int eastcolor, const int northI) { // InnerRow variant: all 8 unrolled columns take their candidate count from ttileN_upd + int tcouleur[8], ttuileN[8], tfn1[8], tfn2[8], ttuile[8], teast[8]; + int south=northI+10; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + + out.fn_idx +=8; + ptr=tfncall[out.fn_idx]; + + tcouleur[0]=eastcolor + 10*in.tdam[northI]; + ttuileN[0]=ttileN_upd[tcouleur[0]]; + LOOP(tfn1[0], ttuileN[0]) { + tfn2[0]=t14[tcouleur[0]][tfn1[0]]; + ttuile[0]=tfn2[0] & 0xff; + ifnz(tmp.ttiles[ttuile[0]]) { + teast[0]=(tfn2[0]>>8) & 0xff; + in.tdam[south +0]=tfn2[0] >> 16; + tmp.ttiles[ttuile[0]]=0; + + macro_innerrow_loop2(1) + macro_innerrow_loop2(2) + macro_innerrow_loop2(3) + macro_innerrow_loop2(4) + macro_innerrow_loop2(5) + macro_innerrow_loop2(6) + macro_innerrow_loop2(7) + + (*ptr)(teast[7], northI + 8); + + macro_innerrow_loopz(7) + macro_innerrow_loopz(6) + macro_innerrow_loopz(5) + macro_innerrow_loopz(4) + macro_innerrow_loopz(3) + macro_innerrow_loopz(2) + macro_innerrow_loopz(1) + + tmp.ttiles[ttuile[0]]=1; + } + } + + out.fn_idx -=8; +} + + +/* total flop, or yet another bug (NOTE: &tfn2 + fn1*4 below steps in whole-array units, not ints) +void InnerTile0(const int eastcolor, const int northI) { + int64_t tfn2[4]; + int couleur, tuileN, fn1, fn2, tuile, east; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=eastcolor + 10*in.tdam[northI]; + + tfn2[0]=*(int64_t *)&t14[couleur][0]; + tfn2[1]=*(int64_t *)&t14[couleur][2]; + tfn2[2]=*(int64_t *)&t14[couleur][4]; + tfn2[3]=*(int64_t *)&t14[couleur][6]; + + tuileN=ttileN[couleur]; + LOOP1(tuileN) { + fn2=*(int *)(&tfn2 + fn1*4); //t14[couleur][fn1]; + tuile=fn2 & 0xff; + ifnz(tmp.ttiles[tuile]) { + east=(fn2>>8) & 0xff; + in.tdam[northI+10]=fn2>>16; + tmp.ttiles[tuile]=0; + out.fn_idx++; + (*ptr)(east, northI+1); + out.fn_idx--; + tmp.ttiles[tuile]=1; + } + } +} +*/ + +void Special_Debug(const int north, 
const int northI) { + macro_globaltrace(out.fn_idx); +} + +//####################################### + +// prevent inlining this trivial function: we may need some room +void __attribute__ ((noinline)) Input_Copy(int tiles, int *dest) { + int fn1; + LOOP1(32) { + dest[fn1]=tiles & 1; + tiles>>=1; + } +} + +//void __attribute__((interrupt)) null_isr() { return; } + +//####################################### + +int main(void) { +e_start:; + + int fn1, westcolor, eastcolor, tiles; + + volatile signed int *inputP = (void *)SHARED_IN; // pointer for input + //volatile signed int *cmdP = (void *)SHARED_CMD; // pointer for output command + + // init compute kernel + tiles=*(inputP+0); Input_Copy(tiles, &tmp.ttiles[ 0]); // 1st 32 tiles + tiles=*(inputP+1); Input_Copy(tiles, &tmp.ttiles[32]); + tiles=*(inputP+2); Input_Copy(tiles, &tmp.ttiles[VOIDTILE+1]); // 32 borders + tmp.ttiles[VOIDTILE]=0; // VOIDTILE == 64 + + LOOP1(4) + tborderwestN[fn1]=0; + LOOP1(40) + tbordereastN[fn1]=0; + + LOOP1(32) { + if(tmp.ttiles[64+1 +fn1]) { + westcolor=tbordureG[fn1]; + tborderwestT[westcolor] [tborderwestN[westcolor]]=fn1; + tborderwestE[westcolor] [tborderwestN[westcolor]]=tbordureI[fn1]; + tborderwestS[westcolor] [tborderwestN[westcolor]]=tbordureD[fn1]; + tborderwestN[westcolor]++; + + eastcolor=tbordureI[fn1]*4 + tbordureD[fn1]; + tbordereastT[eastcolor] [tbordereastN[eastcolor]]=fn1; + tbordereastS[eastcolor] [tbordereastN[eastcolor]]=tbordureG[fn1]; + tbordereastN[eastcolor]++; + } + } + + out.fn_idx=0; + BorderWest(in.tdam[C1N], C1N); + + out.cmd=CMD_DONE; // *cmdP=CMD_DONE; + + //return 0; + __asm__ __volatile__ ("idle"); // experience: can you idle an Epiphany core until ARM wakes it up ? 
Answer: empirically, yes ; use e_start() to reload the core + //goto e_start; // wake by IVT # 0 + +} diff --git a/paralle2/x86_build.sh b/paralle2/x86_build.sh new file mode 100644 index 0000000..0113851 --- /dev/null +++ b/paralle2/x86_build.sh @@ -0,0 +1,3 @@ +# cross compiling on x86_64 host, assuming /opt/adapteva as default path + +e-gcc -Ofast -mfp-mode=int -mshort-calls -m1reg-r63 -T /opt/adapteva/esdk/bsps/current/internal.ldf src/e_e2g.c -o bin/e_e2g.elf -le-lib diff --git a/paralle2/x86_buildasm.sh b/paralle2/x86_buildasm.sh new file mode 100644 index 0000000..20836b0 --- /dev/null +++ b/paralle2/x86_buildasm.sh @@ -0,0 +1,12 @@ +# cross compiling on x86_64 host + +# -mshort-calls: OK +# -msmall16: still broken +# -m1reg-r63: OK +# -mfp-mode=int: OK + +echo Cross compiling on x86_64 host +echo. +echo. +e-gcc -Ofast -mfp-mode=int -mshort-calls -m1reg-r63 -T /opt/adapteva/esdk/bsps/current/internal.ldf src/e_e2g.c -S -le-lib +echo. diff --git a/paralle3/README.md b/paralle3/README.md new file mode 100644 index 0000000..928cca0 --- /dev/null +++ b/paralle3/README.md @@ -0,0 +1,45 @@ +# A 10x10 Eternity II solver + +## Build and Run + +#data BEFORE ; will create bin/bench.bin - a bunch of benchs + + ./build_data.sh + ./run_data.sh + + ./build.sh + ./run.sh value + + Wanting an assembly output ? Use ./buildasm.sh + Cross compiling for an x86_64 platform ? You can use ./x86*.sh + +## Benchmarks + +All programs are full C, sometimes with some assembly. +Mn/s/W = Million nodes per second per Watt + + +GPU OpenCL : not even a tenth of a modest x86 core with a Radeon 5770 graphics card. The numerous branches are a dead end, not to talk about the watts. 
+Parallella, one ARM A9 core : 6 Mn/s ; 3.0 W ; 2 Mn/s/W +My high-end computer, one core : 166 Mn/s ; 72.0 W ; 2.3 Mn/s/W ; x86_64, Fedora Core 23, i7 5820k +Raspberry Pi 3 : 8-10 Mn/s/W iirc ; A53, 4-core, 1.2 GHz +My high-end computer, 12 threads: 1470 Mn/s ; 140.0 W ; 10.0 Mn/s/W +Odroid XU4 : 245 Mn/s ; 15.7 W ; 15.6 Mn/s/W ; 8-core ; heavily optimized, little headroom left +Parallella 16-core Epiphany : 103 Mn/s ; 5.0 W ; 20.6 Mn/s/W ; unplug the Ethernet cable (used for ssh with the headless Parabuntu distro) to save 0.6 W + + +So... +To my knowledge, the Parallella is currently the most energy-efficient platform for this highly recursive task... +although it does *not* use any floating-point arithmetic! + +Eagerly awaiting the 1024-core Epiphany V... + +## Author + +DonQuichotteComputers at gmail dot com +2017 + +## License + +BSD 3-Clause. + diff --git a/paralle3/build.sh b/paralle3/build.sh new file mode 100644 index 0000000..b74c16d --- /dev/null +++ b/paralle3/build.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +set -e + +ESDK=${EPIPHANY_HOME} +ELIBS=${ESDK}/tools/host/lib +EINCS=${ESDK}/tools/host/include +ELDF=${ESDK}/bsps/current/internal.ldf + +SCRIPT=$(readlink -f "$0") +EXEPATH=$(dirname "$SCRIPT") +cd $EXEPATH + +CROSS_PREFIX= +case $(uname -p) in + arm*) + # Use native arm compiler (no cross prefix) + CROSS_PREFIX= + ;; + *) + # Use cross compiler + CROSS_PREFIX="arm-linux-gnueabihf-" + ;; +esac + +# Create output dir +mkdir -p bin + +# Build HOST side application +${CROSS_PREFIX}gcc src/e2g.c -o bin/e2g.elf -I ${EINCS} -L ${ELIBS} -le-hal -le-loader + +# Build DEVICE side program +# -msmall16 still does not work with 2016.11 ESDK and gcc 5.4 +#-mshort-calls still works :D + +#e-gcc -Ofast -T ${ELDF} -msmall16 src/e_e2g.c -o bin/e_e2g.elf -le-lib +#e-gcc 5.4 makes poor use of the option -mfp-mode=int +# the option -mfp-iarith slows DOWN my program -- more than 20 % :/ + + e-gcc -Ofast -T ${ELDF} -mfp-mode=int -mshort-calls -m1reg-r63 src/e_e2g.c -o bin/e_e2g.elf -le-lib + +# trick to get the 
spare room usage: epiphany-elf-size your_program.elf ; with internal.ldf the value of 'dec' cannot be beyond 32767 +# +#parallella@parallella:~/parallella-examples/tmp$ epiphany-elf-size bin/e_e2g.elf +# text data bss dec hex filename +# 18730 2148 2808 23686 5c86 bin/e_e2g.elf +# diff --git a/paralle3/build_data.sh b/paralle3/build_data.sh new file mode 100644 index 0000000..587ae30 --- /dev/null +++ b/paralle3/build_data.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e + +ESDK=${EPIPHANY_HOME} +ELIBS=${ESDK}/tools/host/lib +EINCS=${ESDK}/tools/host/include +ELDF=${ESDK}/bsps/current/internal.ldf + +SCRIPT=$(readlink -f "$0") +EXEPATH=$(dirname "$SCRIPT") +cd $EXEPATH + +CROSS_PREFIX= +case $(uname -p) in + arm*) + # Use native arm compiler (no cross prefix) + CROSS_PREFIX= + ;; + *) + # Use cross compiler + CROSS_PREFIX="arm-linux-gnueabihf-" + ;; +esac + +# Create output dir +mkdir -p bin + +# Build HOST side application +${CROSS_PREFIX}gcc src/build_data.c -o bin/build_data.elf -I ${EINCS} -L ${ELIBS} diff --git a/paralle3/buildasm.sh b/paralle3/buildasm.sh new file mode 100644 index 0000000..d513d73 --- /dev/null +++ b/paralle3/buildasm.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -e + +ESDK=${EPIPHANY_HOME} +ELIBS=${ESDK}/tools/host/lib +EINCS=${ESDK}/tools/host/include +ELDF=${ESDK}/bsps/current/internal.ldf + +SCRIPT=$(readlink -f "$0") +EXEPATH=$(dirname "$SCRIPT") +cd $EXEPATH + +CROSS_PREFIX= +case $(uname -p) in + arm*) + # Use native arm compiler (no cross prefix) + CROSS_PREFIX= + ;; + *) + # Use cross compiler + CROSS_PREFIX="arm-linux-gnueabihf-" + ;; +esac + +# Create output dir +mkdir -p bin + +# Build HOST side application +${CROSS_PREFIX}gcc src/e2g.c -o bin/e2g.elf -I ${EINCS} -L ${ELIBS} -le-hal -le-loader + +# Build DEVICE side program +# -msmall16 still does not work with 2016.11 ESDK and gcc 5.4 +#-mshort-calls still works :D + +#e-gcc -Ofast -T ${ELDF} -msmall16 src/e_e2g.c -o bin/e_e2g.elf -le-lib +#e-gcc 5.4 makes poor use of the option 
-mfp-mode=int +# the option -mfp-iarith slows DOWN my program -- more than 20 % :/ + + e-gcc -Ofast -T ${ELDF} -mfp-mode=int -mshort-calls -m1reg-r63 -mfp-iarith src/e_e2g.c -S -le-lib diff --git a/paralle3/readme.MD b/paralle3/readme.MD new file mode 100644 index 0000000..aa4f88f --- /dev/null +++ b/paralle3/readme.MD @@ -0,0 +1 @@ +ah diff --git a/paralle3/run.sh b/paralle3/run.sh new file mode 100644 index 0000000..3c78ee8 --- /dev/null +++ b/paralle3/run.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -e + +BENCH_INDEX="" # numeric argument forwarded unchanged to bin/e2g.elf + +if [ $# -lt 1 ]; then + echo "Usage: ./run.sh numeric-value" + exit 1 +else + if [[ ! "$1" =~ ^[0-9]+$ ]]; then + echo "ERROR: value must be numeric" + echo "Usage: ./run.sh numeric-value" + exit 1 + else + BENCH_INDEX=$1 + fi +fi + +time bin/e2g.elf ${BENCH_INDEX} + diff --git a/paralle3/run_data.sh b/paralle3/run_data.sh new file mode 100644 index 0000000..7da8aff --- /dev/null +++ b/paralle3/run_data.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -e + +bin/build_data.elf + +echo "Building data: done. 
Now you can run './build.sh'" diff --git a/paralle3/src/C_common2.h b/paralle3/src/C_common2.h new file mode 100644 index 0000000..ebea59a --- /dev/null +++ b/paralle3/src/C_common2.h @@ -0,0 +1,46 @@ +// avoid stdint.h +#define uint8_t unsigned char +#define uint16_t unsigned short +#define uint32_t unsigned int +#define uint64_t unsigned long long // unsigned long = 4 bytes under ARM + +#define int8_t char +#define int16_t short +#define int32_t int +#define int64_t long long + +// my semantic +#define uc unsigned char +#define ull unsigned long long +#define pf printf +#define print printf("\n"); +#define pfv(x) printf("v: %d\n",x); +#define LOOP1(x) for(fn1=0;fn1<(x);fn1++) +#define LOOP2(x) for(fn2=0;fn2<(x);fn2++) +#define LOOP3(x) for(fn3=0;fn3<(x);fn3++) +#define LOOP4(x) for(fn4=0;fn4<(x);fn4++) +#define LOOP5(x) for(fn5=0;fn5<(x);fn5++) +#define LOOP6(x) for(fn6=0;fn6<(x);fn6++) +#define LOOP7(x) for(fn7=0;fn7<(x);fn7++) +#define LOOP8(x) for(fn8=0;fn8<(x);fn8++) +#define LOOP(x,y) for(x=0;x +#include +#include + +#define BENCH_INIT 0 +#define BENCH_MAX 1023 + +const unsigned char tbench[16][17]={ + { 0xFF,0xED,0xDF,0xFC,0xFF,0xF4,0xFF,0xFF,0x25,0x7E,0xEB,0xEF,0x79,0x02,0x05,0x84,0x43 }, + { 0xE5,0xFF,0xFB,0xFB,0xEF,0xFE,0xFF,0xFB,0xF5,0xFA,0x56,0xBE,0x39,0x19,0x17,0x15,0x44 }, + { 0xFE,0xBF,0xFF,0xDF,0xEC,0xFB,0xFF,0xFE,0x8D,0x1F,0xAF,0xBF,0x82,0x23,0x45,0x50,0x23 }, + { 0xFF,0xF5,0xFF,0xFF,0x6E,0xFF,0xFF,0x3B,0x5C,0x0F,0xDF,0xF7,0x32,0x79,0x03,0x97,0x23 }, + { 0xBF,0xFF,0xFF,0xF9,0x7F,0xF7,0xDE,0xDF,0x3C,0x3E,0x9D,0xFF,0x55,0x58,0x03,0x46,0x33 }, + { 0xE6,0xD7,0xDF,0xFF,0xFF,0xFF,0xEF,0xFD,0xDD,0x6E,0x5B,0x6F,0x79,0x13,0x30,0x83,0x43 }, + { 0xFE,0xFB,0xF3,0xEB,0xFF,0x7F,0xFD,0xFF,0xF9,0xE7,0xC8,0xFD,0x63,0x00,0x35,0x33,0x34 }, + { 0xEF,0xF7,0xFD,0xF7,0xFE,0xF1,0xFF,0xFF,0xF8,0xBB,0x4F,0xAF,0x27,0x50,0x62,0x30,0x13 }, + { 0xFE,0xF7,0xF7,0xF0,0xFB,0xFF,0xFF,0xFF,0xF9,0x3E,0x5F,0x4F,0x55,0x53,0x10,0x57,0x43 }, + { 
0xBF,0xF6,0xDF,0xEF,0xFF,0xFE,0xBF,0xF7,0x6C,0x9E,0x9B,0xFF,0x33,0x22,0x22,0x98,0x23 }, + { 0xFF,0xEF,0x4F,0xBB,0xBF,0xBF,0xFF,0xFF,0x68,0xFC,0xEF,0xE7,0x69,0x31,0x02,0x93,0x13 }, + { 0xBB,0xFD,0xF7,0xFF,0x9F,0xFF,0xFB,0xFE,0xE1,0xFB,0xAD,0xDD,0x22,0x37,0x15,0x60,0x34 }, + { 0x7F,0xCE,0xFF,0xBD,0xFF,0xFE,0xEF,0xFF,0xE5,0x7A,0x78,0xFF,0x13,0x50,0x07,0x00,0x34 }, + { 0xFC,0xFF,0x7F,0xFE,0xDF,0xFD,0x3F,0xFF,0x64,0xFA,0xE7,0x7F,0x82,0x58,0x99,0x05,0x14 }, + { 0xFF,0x7B,0xF7,0xB3,0xFF,0xFF,0xFE,0xBF,0xB7,0x36,0xEC,0xFE,0x64,0x57,0x11,0x06,0x44 }, + { 0xFD,0xEF,0xDF,0xFB,0x7F,0xFF,0xF2,0xFF,0xF2,0x73,0x3F,0x3F,0x19,0x96,0x07,0x19,0x14 }, +}; + +int main(void) { + unsigned i, j; + FILE *f; + + f=fopen("bin/bench.bin", "wb"); + if(f == NULL) { printf("Error writing bin/bench.bin\n"); exit(-1); } + + for(i=0; i<2048; i++) + j=fwrite(tbench, 17*16, 1, f); + fclose(f); + + return 0; +} diff --git a/paralle3/src/e2g.c b/paralle3/src/e2g.c new file mode 100644 index 0000000..93e1ba6 --- /dev/null +++ b/paralle3/src/e2g.c @@ -0,0 +1,199 @@ +#include +#include +#include +#include +#include +#include // HOST side ; mandatory + +#include "C_common2.h" // common definitions for C +#include "e2g_common.h" // common definitions for EII project + +#define BENCH_MIN 0 //min bench to start with +#define BENCH_MAX 1024 //max bench to start with +#define BENCH_N 1 //16 //16 to solve per core +#define BENCH_LIMIT 10000 //stop after x benchs done ; not implemented actually + +//####################################### + +void Epiphany_Boot(e_platform_t *epiphany) { + e_init(NULL); // initialise the system, establish connection to the device + e_reset_system(); // reset the Epiphany chip + e_get_platform_info(epiphany);// get the configuration info for the parallella platform +} + +//####################################### +//printf("%-2d"...) 
-> left align +//GOAL: display an array of node numbers +void Node_Board_Print2(uint *tsolN) { + int fn1, fn2, fn3=0, place; + + print + LOOP2(9) + pf(" %d", 1+fn2); + pf(" %2d\n", 1+fn2); + //pf(" - %c\n", 'A'); + //pf(" - %c\n", 'B'); + + LOOP1(8) { + LOOP2(10) { + place=(fn1+2)*16 + fn2;//+2 because basis is C1 + ifnz(tsolN[fn3]) { + pf(" %012u", tsolN[fn3]); + } + else { + pf(" %c%d ", 'C'+fn1, 1+fn2);//'C' because basis is C1 + if(fn2!=9) pf(" "); + } + fn3++; + } + pf(" - %c\n", 'C'+fn1);//basis is C1 + } +} + +//####################################### +//print out result +int64_t Output_Print(Soutput out) { + int64_t l1=0; + int fn1; + + LOOP1(DAM_SZ) l1+=out.globaltsolN[fn1]; + pf("\n %012llu nodes\n", l1); + pf("\n res: %09u\n", out.globalres); + + Node_Board_Print2(out.globaltsolN); + + return l1 + out.globalres; +} + +//####################################### + +int main(int argc, char *argv[]) { + // Epiphany input/output through shared RAM ; details: e2g_common.h + Sio fromio;//Sio *fromio=(Sio *)malloc(sizeof(Sio)); + int64_t l1=0; + int row, col, i, j, fn1, fn2, bench_start=BENCH_MIN, toccN[CORE_N]={0}, benchlimit=0; + e_platform_t epiphany;// Epiphany platform configuration + e_epiphany_t dev; + FILE *fin; + char *tbench=(char *)malloc(MAX_CORE_N * 17 * 16);; + + if(argc > 1) { + i=atoi(argv[1]); if(i < BENCH_MAX) bench_start=i; + } + + //get data + fin=fopen("./bin/bench.bin", "rb"); + ifz(fin) { printf("Error reading file bin/bench.bin ; did you generate it with build_data.sh ?\n"); exit(-1); } + fseek(fin, 17 * bench_start, 0); + i=fread(tbench, MAX_CORE_N * 17 * 16, 1, fin); + fclose(fin); + + printf("\n\nEternity II running under Parallella :) \n\n\n"); + + Epiphany_Boot(&epiphany); + + // Create a workgroup using all of the cores + e_open(&dev, 0, 0, epiphany.rows, epiphany.cols); + e_reset_group(&dev); + + // Load the device code into each core of the chip, and don't start it yet + e_load_group("bin/e_e2g.elf", &dev, 0, 0, epiphany.rows, 
epiphany.cols, E_FALSE); + + // Set the maximum per core test value on each core at address 0x7020 + i=0; + for(row=0;row> (4 * (fn1&1))) & 15; //format: 2 nibbles per byte + fromio.in.tdam[C1N] =(tbench[j + 16] & 15) - 1; + fromio.in.tdam[C10N]=(tbench[j + 16] >> 4) - 1; + fromio.in.tuile2do= *(uint64_t *)&tbench[j + 0]; + fromio.in.bordertuile2do=*(uint *)&tbench[j + 8]; +LOOP1(10) pf("%u ", fromio.in.tdam[C1N+fn1]); print + + pf("0x%016llX tiles\n", fromio.in.tuile2do); + pf("0x%08X borders\n", fromio.in.bordertuile2do); + pf("sz(io) = %u\n", sizeof(Sio)); + LOOP1(DAM_SZ) fromio.out.globaltsolN[fn1]=0; + fromio.out.globalres=0; + + e_write(&dev, row, col, SHARED_IN, &fromio, sizeof(Sio)); +pf("i %u ; in written ; C1N = %u\n", i, fromio.in.tdam[C1N]); + + i++; + } + } + + // Start all of the cores + pf("Some results in a minute... starting the core workgroup...\n\n"); + e_start_group(&dev); + pf("... core workgroup started ; the whole test will last about 120 seconds...\n\n"); + + while(1) { + usleep(100000); + //pf("fromio.out.cmd: 0x%08X\n", fromio.out.cmd); + int done = 0; + + // wait for the cores to complete their work + i=0; + for(row=0;row= BENCH_LIMIT) break; + if(toccN[i] == BENCH_N) + done++; + else { + toccN[i]++; + pf("core %4u: done %2u times ; cmd 0x%08X.\n", i, toccN[i], fromio.out.cmd); + + fromio.out.cmd=CMD_INIT; + LOOP1(DAM_SZ) + fromio.in.tdam[fn1]=0; + j=17 * bench_start; // for demo purpose + + LOOP1(8) + fromio.in.tdam[C2N + fn1] = (tbench[j + 12 + (fn1/2)] >> (4 * (fn1&1))) & 15; //format: 2 nibbles per byte + fromio.in.tdam[C1N] =(tbench[j + 16] & 15) - 1; + fromio.in.tdam[C10N]=(tbench[j + 16] >> 4) - 1; + fromio.in.tuile2do= *(uint64_t *)&tbench[j + 0]; + fromio.in.bordertuile2do=*(uint *)&tbench[j + 8]; + LOOP1(DAM_SZ) fromio.out.globaltsolN[fn1]=0; + fromio.out.globalres=0; + + e_write(&dev, row, col, SHARED_IN, &fromio, sizeof(Sio)); +pf("i %u ; in written again ; C1N = %u\n", i, fromio.in.tdam[C1N]); +//OBSOLETE ! 
esdk doc too :/ e_reset_core(&dev, row, col); + e_start(&dev, row, col); + } + } + + i++; + } + } + + if ( done >= CORE_N ) // some benchmarks are lengthy + break; + + if(benchlimit >= BENCH_LIMIT) break; + } + + e_finalize(); + pf("Crunched %015llu nodes.\n\n", l1); + + return 0; +} diff --git a/paralle3/src/e2g_common.h b/paralle3/src/e2g_common.h new file mode 100644 index 0000000..2fd506d --- /dev/null +++ b/paralle3/src/e2g_common.h @@ -0,0 +1,132 @@ +// 2017/01/28: 103 Mn/s (Million nodes per second) C version compared to previous 81 Mn/s assembly version. +// ELF support instead of SREC. Quicker load. File input. No more UNsigned integers, no more char loads, no more "ctz" and "popcount" instructions. +// Removed bug from multiple 0x6000 section inputs. +// Eagerly waiting for Epiphany V... + +#define CORE_N 16 //change it if needed ; our choice for standard 16-core Epiphany +#define STATS //undefine STATS to get full performance (from 111.2 to 103.4 s with a 16-core Parallella) +#define MAX_CORE_N 1024 //Epiphany V ready ;) + +// specific to the project +#define DAM_SZ 90 + +// to DEVICE +#pragma pack(4) +typedef struct S_input { + int64_t tuile2do; + int bordertuile2do; + int tdam[DAM_SZ]; + int east; +}Sinput; +// from DEVICE +typedef struct S_output { + int globaltsolN[DAM_SZ]; //int64_t is twice as long to execute, you need at least 6 ic to increment a 64-bit memory value :/ + int globalres; + int cmd; + int fn_idx; +}Soutput; +// shared MEMORY +typedef struct S_io { + Sinput in; + Soutput out; +}Sio; +// tmp variables for DEVICE, trying a workaround for the -msmall16 compilation option +typedef struct S_tmp { + //int fn_idx; + int ttiles[64 + 1 + 32]; + int j9e; + int j1n; +}Stmp; + +// global offset for shared RAM +#define SHARED_RAM (0x01000000) + +// a whole forum post for that +#define PERFECT_ALIGN8 __asm__ (".balignw 4, 0x01a2\n"); __asm__ (".balignl 8, 0xfc02fcef\n"); + +// Epiphany local offsets +#define SHARED_IN 0x6000 +#define SHARED_OUT 
(SHARED_IN + sizeof(Sinput)) +#define SHARED_RES (SHARED_OUT + DAM_SZ*4) // offset for result +#define SHARED_CMD (SHARED_OUT + DAM_SZ*4 + 4) // offset for 'cmd' +#define R_IDX (SHARED_OUT + sizeof(Soutput)) + +// commands for the Epiphany core +#define CMD_INIT 0x80000000 // host init +#define CMD_DONE 0x40000000 // eCore did the job properly (probably ; some bug might crush this word but it's highly improbable) + +// specific to the project +#ifdef STATS + #define macro_globaltrace(niveau) out.globaltsolN[niveau]++; +#else + #define macro_globaltrace(niveau) +#endif + +#define macro_globaltrace2(niveau) out.globaltsolN[niveau]++; + +#define NORTH 0 +#define EAST 1 +#define SOUTH 2 +#define WEST 3 + +#define B1N 0 +#define C1N 10 +#define C2N 11 +#define C3N 12 +#define C4N 13 +#define C5N 14 +#define C6N 15 +#define C7N 16 +#define C8N 17 +#define C9N 18 +#define C10N 19 +#define D1N 20 //etc + +#define G1N 50 +#define G2N 51 +#define G3N 52 +#define G4N 53 +#define G5N 54 +#define G6N 55 +#define G7N 56 +#define G8N 57 +#define G9N 58 +#define G10N 59 + +#define H1N 60 +#define H2N 61 +#define H3N 62 +#define H4N 63 +#define H5N 64 +#define H6N 65 +#define H7N 66 +#define H8N 67 +#define H9N 68 +#define H10N 69 + +#define I1N 70 +#define I2N 71 +#define I3N 72 +#define I4N 73 +#define I5N 74 +#define I6N 75 +#define I7N 76 +#define I8N 77 +#define I9N 78 +#define I10N 79 + +#define J1N 80 +#define J2N 81 +#define J3N 82 +#define J4N 83 +#define J5N 84 +#define J6N 85 +#define J7N 86 +#define J8N 87 +#define J9N 88 +#define J10N 89 + +#define BORDERCOLOR_D 0 +#define BORDERCOLOR_G 4 +#define BORDERCOLOR_I 9 +#define BORDERCOLOR_N 19 // 19 colors ; 1st one is empty, colors 1-4 stand for D(roite), 5-8 for G(auche), 9-18 for I(nterieur) diff --git a/paralle3/src/e_e2g.c b/paralle3/src/e_e2g.c new file mode 100644 index 0000000..9e281d7 --- /dev/null +++ b/paralle3/src/e_e2g.c @@ -0,0 +1,891 @@ +#include "e-lib.h" // mandatory even for a minimalist design -- 
/*
 * Thin wrapper around the Epiphany "bitr" instruction: returns the result
 * the instruction produces for `val`.
 *
 * Credits: notzed on the Parallella forum (gcc extended inline asm, "cc" clobber).
 * - "volatile" is required: per the original note, the compiler may otherwise
 *   move code around without taking care of the condition flags.
 * - the "cc" clobber declares that the condition flags are modified.
 */
unsigned int bitrev(unsigned int val) {
  unsigned int reversed;

  __asm__ volatile (
    "bitr %[dst],%[src]"
    : [dst] "=r" (reversed)
    : [src] "r" (val)
    : "cc"
  );

  return reversed;
}
int, const int); +void Special_I10(const int, const int); +void BorderEastBottom(const int, const int); +void Special_J2(const int, const int); +void Special_Debug(const int, const int); // for debugging purpose + +void __attribute__ ((noinline)) Input_Copy(int, int *); + +//####################################### +//STATIC DATA + +const int tlscouleur_B2016[BORDERCOLOR_N+1]={ + 0x00000000, + 0x0870809A, 0x91032001, 0x42845140, 0x24080E24, + 0x000000FF, 0x00007F00, 0x00FF8000, 0xFF000000, + 0x00000001, 0x00018300, 0x03000406, 0x00000008, 0x00020830, 0x04041040, 0x38080000, 0x40100000, 0x00202000, 0x80C04080, + 0x00000000 +}; + +// colors 0-3 for D and G, 0-9 for I +const int tbordureD[32]={ 0x01, 0x00, 0x03, 0x00, 0x00, 0x03, 0x02, 0x00, 0x02, 0x03, 0x03, 0x03, 0x02, 0x01, 0x02, 0x00, 0x01, 0x01, 0x02, 0x03, 0x00, 0x00, 0x00, 0x02, 0x01, 0x02, 0x03, 0x00, 0x01, 0x03, 0x02, 0x01 }; +const int tbordureG[32]={ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 }; +const int tbordureI[32]={ 0x00, 0x02, 0x02, 0x03, 0x04, 0x04, 0x05, 0x09, 0x01, 0x01, 0x02, 0x04, 0x05, 0x08, 0x09, 0x01, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x02, 0x02, 0x05, 0x06, 0x06, 0x06, 0x07, 0x09 }; + +const int ttileN[100]={ + 1, 4, 3, 2, 1, 4, 4, 5, 2, 3, 5, 2, 5, 3, 0, 3, 3, 2, 0, 3, + 1, 5, 3, 5, 2, 0, 1, 2, 3, 3, 6, 1, 4, 3, 6, 1, 2, 1, 3, 2, + 2, 2, 2, 2, 4, 1, 0, 1, 4, 6, 5, 3, 2, 1, 1, 2, 4, 1, 1, 4, + 3, 0, 2, 5, 3, 5, 0, 1, 4, 1, 1, 4, 1, 5, 2, 4, 1, 6, 2, 0, + 0, 4, 1, 0, 4, 2, 7, 4, 3, 1, 5, 1, 2, 3, 1, 2, 2, 3, 4, 0 +}; + +// color format: G + 4*D (i.e tbordureG + 4*tbordureD) +const int tGDN[16]={ 4, 0, 4, 1, 1, 1, 2, 3, 1, 3, 2, 2, 2, 3, 1, 2 }; + +const int tGD[16][8]={ + { 1, 2, 3, 3, 4, 4, 7, 9 }, + { }, + { 15, 1, 20, 7, 21, 8, 22, 9 }, + { 27, 6, }, + { 0, 0, }, + { 13, 8, }, + { 16, 1, 17, 4, }, + { 24, 2, 28, 6, 31, 9, }, + { 6, 
5, }, + { 8, 1, 12, 5, 14, 9, }, + { 18, 5, 23, 9, }, + { 25, 2, 30, 7, }, + { 2, 2, 5, 4, }, + { 9, 1, 10, 2, 11, 4, }, + { 19, 6, }, + { 26, 5, 29, 6, } +}; + +#define VOIDTILE 64 +#define VOIDSOUTH 0 + +//tcount(11) = 2 tcount(19) = 2 tcount(22) = 3 tcount(27) = 2 tcount(36) = 2 tcount(38) = 2 +//const int tbordereast_uniquecolor[20]={ 1, 4, 5, 6, 7, 8, 9, 10, 12, 16, 17, 23, 24, 25, 28, 30, 32, 33, 37, }; //19 actually +//const int tbordereast_uniquetile[20]={ 0, 0, 4, 1, 24, 0, 27, 28, 0, 15, 17, 0, 29, 20, 0, 30, 0, 0, 0, }; //19 actually + +//from tmpbordure*.c +const int t14[100][8]={ // format: LSB = tile, 2nd byte = east, 3rd byte = south, MSB = 0 + { 0x00060500, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000901, 0x00020308, 0x00010613, 0x0009030C, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000506, 0x00010915, 0x0004030A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0004040D, 0x00080510, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010307, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050103, 0x00050918, 0x00000206, 0x00020102, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000712, 0x00080511, 0x00050000, 0x00020614, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00040917, 0x00000612, 0x00030309, 0x00070104, 0x0007050F, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00070105, 0x00010916, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000101, 0x0006030B, 0x0006040E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050503, 0x00090001, 0x00080705, 0x00070704, 0x00050202, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050919, 0x00060013, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00070823, 0x00030008, 0x0001051A, 0x00090720, 0x0001071B, 0x00000040, 0x00000040, 0x00000040 }, + { 
0x00000407, 0x0006021C, 0x0004041E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00060822, 0x00090119, 0x0001021A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000113, 0x0007051F, 0x00050821, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0006031D, 0x0001021B, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000816, 0x00000215, 0x0003000C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050006, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00090015, 0x0005011A, 0x00000502, 0x0003061C, 0x0007011B, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00030325, 0x00020924, 0x00090224, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00030225, 0x00000108, 0x00020826, 0x00020927, 0x00040328, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007072C, 0x0003000A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000614, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00080123, 0x00050429, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0006052B, 0x0006042A, 0x00020326, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00020224, 0x00020327, 0x00070120, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0009060B, 0x00070309, 0x00010208, 0x00040107, 0x0001090C, 0x0002040A, 0x00000040, 0x00000040 }, + { 0x0007061D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00020325, 0x00080226, 0x00090227, 0x00030428, 0x00000040, 
0x00000040, 0x00000040, 0x00000040 }, + { 0x00020225, 0x0007072D, 0x00000709, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0004000D, 0x00080732, 0x00040935, 0x0005062F, 0x00030228, 0x0004011E, 0x00000040, 0x00000040 }, + { 0x00090734, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00080631, 0x0002011C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007032D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00040936, 0x00050010, 0x0006052E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050630, 0x00080733, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0003040D, 0x0009060E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00030007, 0x0003041E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00070529, 0x0008062A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00020328, 0x0000020A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0000030D, 0x00090335, 0x00080737, 0x0001031E, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0006032F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007022C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00070332, 0x00060839, 0x00070437, 0x0006093C, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000717, 0x00030435, 0x0008083B, 0x00030836, 0x00050638, 0x0007083A, 0x00000040, 0x00000040 }, + { 0x00060811, 0x00020006, 0x00000600, 0x00030810, 0x0007070F, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000503, 0x0002011A, 0x0006071F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0008062B, 
0x00010002, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0008062E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00020729, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010003, 0x00090018, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00080122, 0x00030930, 0x0003042F, 0x00040938, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007073E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010621, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000518, 0x00070334, 0x00010119, 0x0008063D, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00070012, 0x00010113, 0x00060214, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00060014, 0x0001031C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00090530, 0x0000090B, 0x00060831, 0x0001071D, 0x0004052F, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0000090E, 0x0002082A, 0x00090538, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0002082B, 0x00000000, 0x0009083D, 0x0003082E, 0x00080121, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0005011F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010522, 0x00050011, 0x00060331, 0x00040839, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0004083C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00060012, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000805, 0x00000704, 0x00020920, 0x0002011B, 0x00000040, 0x00000040, 
0x00000040, 0x00000040 }, + { 0x0004072C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00040832, 0x0003072D, 0x00030009, 0x00050934, 0x00090833, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00090017, 0x00040837, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007073E, 0x00040229, 0x0001061F, 0x0000070F, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0003011D, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0007053E, 0x0005073E, 0x0003032D, 0x00010004, 0x0002042C, 0x0005000F, 0x00000040, 0x00000040 }, + { 0x00010223, 0x0004093A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00050622, 0x00020723, 0x00090016, 0x00060521, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00030226, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00080639, 0x0009083B, 0x00090336, 0x0009073A, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000611, 0x00000310, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0005022B, 0x00030631, 0x00080439, 0x0005093D, 0x0009043C, 0x0004022A, 0x0005032E, 0x00000040 }, + { 0x00030432, 0x00010005, 0x00040437, 0x00030933, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0004093B, 0x0008093F, 0x0009083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0008083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010001, 0x00070417, 0x00080116, 0x00050518, 0x00020115, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00010519, 0x00000040, 
0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00020224, 0x00030227, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00040435, 0x00080436, 0x0000010C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0008063C, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00060330, 0x00060438, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0003000B, 0x0004000E, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00030534, 0x00010220, 0x0008043A, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x0008043B, 0x00070333, 0x0006053D, 0x0008083F, 0x00000040, 0x00000040, 0x00000040, 0x00000040 }, + { 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040, 0x00000040 } +}; + +ALIGN(8) +int ttileN_upd[100]={0}; // ? does not work yet + +static inline void TileN_Update(void) { + int fn1; + LOOP1(100) ttileN_upd[fn1]=ttileN[fn1]; + + ttileN_upd[1]=tmp.ttiles[0]; + //modele : tmp.ttileN_upd[1]=tmp.ttiles[0]; + /* + ttileN_upd[1]=tmp.ttiles[0]; + ttileN_upd[4]=tmp.ttiles[15]; + ttileN_upd[5]=tmp.ttiles[16]; + ttileN_upd[6]=tmp.ttiles[8]; + ttileN_upd[7]=tmp.ttiles[9]; + ttileN_upd[8]=tmp.ttiles[1]; + ttileN_upd[9]=tmp.ttiles[24]; + ttileN_upd[10]=tmp.ttiles[25]; + ttileN_upd[12]=tmp.ttiles[3]; + ttileN_upd[16]=tmp.ttiles[4]; + ttileN_upd[17]=tmp.ttiles[17]; + ttileN_upd[23]=tmp.ttiles[26]; + ttileN_upd[24]=tmp.ttiles[27]; + ttileN_upd[25]=tmp.ttiles[28]; + ttileN_upd[28]=tmp.ttiles[20]; + ttileN_upd[30]=tmp.ttiles[30]; + ttileN_upd[32]=tmp.ttiles[21]; + ttileN_upd[33]=tmp.ttiles[13]; + ttileN_upd[37]=tmp.ttiles[31]; + */ +} + +//typedef void (*ptrFonction) (const signed int, const signed int); + +ALIGN(8) +void (* tfncall[78]) (const int, const int) ={ + BorderWest, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + 
Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + BorderEast, // C10 + //Special_Debug, + + BorderWest, + InnerRow, // replaces InnerTile() * 8 + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + BorderEast, // BorderEastUpdate, // D10 // ? fail + + BorderWest, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + BorderEast, // E10 + + BorderWest, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + BorderEast, // F10 + +//Special_Debug, + + BorderWest, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + BorderEast, // G10 + + BorderWest, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_H10, // H10 + + Special_I1, + InnerRow, + Special_Debug, // only for clean stats + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_Debug, + Special_I10, // I10 + + BorderEastBottom, // J9 + BorderEastBottom, // J8 + BorderEastBottom, // J7 + BorderEastBottom, // J6 + BorderEastBottom, // J5 + BorderEastBottom, // J4 + BorderEastBottom, // J3 + Special_J2, // J2, last square +}; + +//dynamic +int tborderwestN[4]={0}; +int tbordereastN[40]={0}; + +int tborderwestT[4][9]={0}; +int tborderwestE[4][9]={0}; +int tborderwestS[4][9]={0}; + +int tbordereastT[40][4]={0}; +int tbordereastS[40][4]={0}; + +// sandwiched BorderWest ; J1N=0 or 3 for this specific problem +void Special_I1(const int north, const int northI) { + int couleur, tuileN, fn1, tuile, east; + void (*ptr)(const int, const int); + 
macro_globaltrace2(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=north + 4*0; + tmp.j9e=3; + tuileN=tGDN[couleur]; + LOOP1(tuileN) { + tuile=tGD[couleur][fn1*2 + 0]; + ifnz(tmp.ttiles[64+1 + tuile]) { + east=tGD[couleur][fn1*2 + 1]; + tmp.ttiles[64+1 + tuile]=0; + out.fn_idx++; + (*ptr)(east, northI+1); + out.fn_idx--; + tmp.ttiles[64+1 + tuile]=1; + } + } + + couleur=north + 4*3; + tmp.j9e=0; + tuileN=tGDN[couleur]; + LOOP1(tuileN) { + tuile=tGD[couleur][fn1*2 + 0]; + ifnz(tmp.ttiles[64+1 + tuile]) { + east=tGD[couleur][fn1*2 + 1]; + tmp.ttiles[64+1 + tuile]=0; + out.fn_idx++; + (*ptr)(east, northI+1); + out.fn_idx--; + tmp.ttiles[64+1 + tuile]=1; + } + } +} + +void BorderWest(const int north, const int northI) { + int couleur, tuileN, fn1, tuile, east; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + out.fn_idx++; + + couleur=north; // in.tdam[C1N]; + tuileN=tborderwestN[couleur]; + LOOP1(tuileN) { + tuile=tborderwestT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + east=tborderwestE[couleur][fn1]; + in.tdam[northI+10]=tborderwestS[couleur][fn1]; + tmp.ttiles[64+1 + tuile]=0; + (*ptr)(east, northI+1); + tmp.ttiles[64+1 + tuile]=1; + } + } + + out.fn_idx--; +} + +void InnerTile(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, fn2, tuile, east; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=eastcolor + 10*in.tdam[northI]; + tuileN=ttileN[couleur]; + LOOP1(tuileN) { + fn2=t14[couleur][fn1]; + tuile=fn2 & 0xff; + ifnz(tmp.ttiles[tuile]) { + east=(fn2>>8) & 0xff; + in.tdam[northI+10]=fn2>>16; + tmp.ttiles[tuile]=0; + out.fn_idx++; + (*ptr)(east, northI+1); + out.fn_idx--; + tmp.ttiles[tuile]=1; + } + } +} + +void BorderEast(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, tuile; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + out.fn_idx++; + + 
couleur=eastcolor*4 + in.tdam[northI]; + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + in.tdam[northI+10]=tbordereastS[couleur][fn1]; + tmp.ttiles[64+1 + tuile]=0; + (*ptr)(in.tdam[northI+1], northI+1); + tmp.ttiles[64+1 + tuile]=1; + } + } + + out.fn_idx--; +} + +// BorderEast for the bottom line ; from J9 to J2 excluded ; only change to these tags '//#' +void BorderEastBottom(const int northI, const int eastcolor) { + int couleur, tuileN, fn1, tuile, nexteast; + void (*ptr)(const int, const int); + macro_globaltrace2(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=in.tdam[northI] * 4 + eastcolor; //# + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + tmp.ttiles[64+1 + tuile]=0; + out.fn_idx++; + (*ptr)(northI-1, tbordureG[tuile]); //# + out.fn_idx--; + tmp.ttiles[64+1 + tuile]=1; + } + } +} + +// BorderEastBottom with final check for J2 the last square +void Special_J2(const int northI, const int eastcolor) { + int couleur, tuileN, fn1, tuile, nexteast; + void (*ptr)(const int, const int); + macro_globaltrace2(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=in.tdam[northI] * 4 + eastcolor; //# + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + if(tbordureG[tuile] != 1) continue; // tdam[J1E] == 1 for this specific problem + + out.globalres++; // O_O reach this point after about 10^17 nodes... 
+ + } + } +} + +// BorderEast with I10 checkup +void Special_H10(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, tuile; + void (*ptr)(const int, const int); + macro_globaltrace2(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + out.fn_idx++; + + couleur=eastcolor*4 + in.tdam[northI]; + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + if(tbordureG[tuile] == 0) continue; // borders 0/1/x/0 do not exist on this specific problem + //in.tdam[northI+10]=tbordereastS[couleur][fn1]; + + tmp.ttiles[64+1 + tuile]=0; + (*ptr)(in.tdam[northI+1], northI+1); + tmp.ttiles[64+1 + tuile]=1; + } + } + + out.fn_idx--; +} + +// BorderEast with I10 strong constraint +void Special_I10(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, tuile; + void (*ptr)(const int, const int); + macro_globaltrace2(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + out.fn_idx++; + + couleur=eastcolor*4 + in.tdam[northI]; + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + if(tbordureG[tuile] != 2) continue; // J10N == 2 for this specific problem + + tmp.ttiles[64+1 + tuile]=0; + (*ptr)(northI+9, tmp.j9e); + tmp.ttiles[64+1 + tuile]=1; + } + } + + out.fn_idx--; +} + +//??? why does it NOT work ? 
should be 10 % faster +void BorderEastUpdate(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, tuile; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + TileN_Update(); + + out.fn_idx++; + + couleur=eastcolor*4 + in.tdam[northI]; + tuileN=tbordereastN[couleur]; + LOOP1(tuileN) { + tuile=tbordereastT[couleur][fn1]; + ifnz(tmp.ttiles[64+1 + tuile]) { + in.tdam[northI+10]=tbordereastS[couleur][fn1]; + tmp.ttiles[64+1 + tuile]=0; + (*ptr)(in.tdam[northI+1], northI+1); + tmp.ttiles[64+1 + tuile]=1; + } + } + + out.fn_idx--; +} + +void InnerTile0(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, fn2, tuile, east; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=eastcolor + 10*in.tdam[northI]; + tuileN=ttileN[couleur]; + LOOP1(tuileN) { + fn2=t14[couleur][fn1]; + tuile=fn2 & 0xff; + ifnz(tmp.ttiles[tuile]) { + east=(fn2>>8) & 0xff; + in.tdam[northI+10]=fn2>>16; + tmp.ttiles[tuile]=0; + out.fn_idx++; + (*ptr)(east, northI+1); + out.fn_idx--; + tmp.ttiles[tuile]=1; + } + } +} + +void InnerTile1(const int eastcolor, const int northI) { + int couleur, tuileN, fn1, fn2, tuile, east; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + out.fn_idx++; + + couleur=eastcolor + 10*in.tdam[northI]; + tuileN=ttileN_upd[couleur]; + LOOP1(tuileN) { + fn2=t14[couleur][fn1]; + tuile=fn2 & 0xff; + ifnz(tmp.ttiles[tuile]) { + east=(fn2>>8) & 0xff; + in.tdam[northI+10]=fn2>>16; + tmp.ttiles[tuile]=0; + (*ptr)(east, northI+1); + tmp.ttiles[tuile]=1; + } + } + + out.fn_idx--; +} + +//point de vigilance : teast[idx - 1] ; le reste : std +#define macro_innerrow_loop(idx)\ + macro_globaltrace(out.fn_idx - 8 + idx);\ + tcouleur[idx]=teast[idx - 1] + 10*in.tdam[northI +idx];\ + ttuileN[idx]=ttileN[tcouleur[idx]];\ + LOOP(tfn1[idx], ttuileN[idx]) {\ + tfn2[idx]=t14[tcouleur[idx]][tfn1[idx]];\ + 
/*
 * Places the 8 inner cells of one row in a single function (replaces eight
 * chained InnerTile() steps).
 *
 * eastcolor : colour handed over by the cell on the left of the first inner cell
 * northI    : in.tdam index of the first inner cell ; south = northI + 10 is
 *             where the colours for the row below are written
 *
 * Column 0 is written out by hand ; columns 1..7 are opened by
 * macro_innerrow_loop(idx) and closed, in reverse order, by
 * macro_innerrow_loopz(idx). The per-column state lives in the local arrays
 * (tcouleur, ttuileN, tfn1, tfn2, ttuile, teast) that the macros reference by
 * name, so these names must not be changed.
 *
 * out.fn_idx is advanced by 8 for the whole call, and the step stored at that
 * table slot is called once the 8 cells are filled.
 */
void InnerRow(const int eastcolor, const int northI) {
  int tcouleur[8], ttuileN[8], tfn1[8], tfn2[8], ttuile[8], teast[8];
  int south=northI+10;
  void (*ptr)(const int, const int);
  macro_globaltrace(out.fn_idx);

  // skip the 8 table slots covered by this function
  out.fn_idx +=8;
  ptr=tfncall[out.fn_idx];

  // column 0: same lookup as InnerTile()
  tcouleur[0]=eastcolor + 10*in.tdam[northI];
  ttuileN[0]=ttileN[tcouleur[0]];
  LOOP(tfn1[0], ttuileN[0]) {
    tfn2[0]=t14[tcouleur[0]][tfn1[0]];
    ttuile[0]=tfn2[0] & 0xff;
    ifnz(tmp.ttiles[ttuile[0]]) {
      teast[0]=(tfn2[0]>>8) & 0xff;
      in.tdam[south +0]=tfn2[0] >> 16;
      tmp.ttiles[ttuile[0]]=0;

      // (*ptr)(teast[0], northI+1);
      /* hand-written form of what macro_innerrow_loop(1) opens:
      tcouleur[1]=teast[0] + 10*in.tdam[northI +1];
      ttuileN[1]=ttileN[tcouleur[1]];
      LOOP(tfn1[1], ttuileN[1]) {
        tfn2[1]=t14[tcouleur[1]][tfn1[1]];
        ttuile[1]=tfn2[1] & 0xff;
        ifnz(tmp.ttiles[ttuile[1]]) {
          teast[1]=(tfn2[1]>>8) & 0xff;
          in.tdam[south +1]=tfn2[1] >> 16;
          tmp.ttiles[ttuile[1]]=0;
          out.fn_idx++;
      */
      // columns 1..7: each macro opens a loop and a test that stay open
      macro_innerrow_loop(1)
      macro_innerrow_loop(2)
      macro_innerrow_loop(3)
      macro_innerrow_loop(4)
      macro_innerrow_loop(5)
      macro_innerrow_loop(6)
      macro_innerrow_loop(7)

      // row filled: hand over to the step stored after the 8 skipped slots
      (*ptr)(teast[7], northI + 8);

      // backtrack: release each tile and close the braces, innermost first
      macro_innerrow_loopz(7)
      macro_innerrow_loopz(6)
      macro_innerrow_loopz(5)
      macro_innerrow_loopz(4)
      macro_innerrow_loopz(3)
      macro_innerrow_loopz(2)
      macro_innerrow_loopz(1)

      tmp.ttiles[ttuile[0]]=1;
    }
  }

  out.fn_idx -=8;
}
ifnz(tmp.ttiles[ttuile[idx]]) {\ + teast[idx]=(tfn2[idx]>>8) & 0xff;\ + in.tdam[south + idx]=tfn2[idx] >> 16;\ + tmp.ttiles[ttuile[idx]]=0; + + +void InnerRow2(const int eastcolor, const int northI) { + int tcouleur[8], ttuileN[8], tfn1[8], tfn2[8], ttuile[8], teast[8]; + int south=northI+10; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + + out.fn_idx +=8; + ptr=tfncall[out.fn_idx]; + + tcouleur[0]=eastcolor + 10*in.tdam[northI]; + ttuileN[0]=ttileN_upd[tcouleur[0]]; + LOOP(tfn1[0], ttuileN[0]) { + tfn2[0]=t14[tcouleur[0]][tfn1[0]]; + ttuile[0]=tfn2[0] & 0xff; + ifnz(tmp.ttiles[ttuile[0]]) { + teast[0]=(tfn2[0]>>8) & 0xff; + in.tdam[south +0]=tfn2[0] >> 16; + tmp.ttiles[ttuile[0]]=0; + + macro_innerrow_loop(1) + macro_innerrow_loop(2) + macro_innerrow_loop(3) + macro_innerrow_loop(4) + macro_innerrow_loop(5) + macro_innerrow_loop(6) + macro_innerrow_loop(7) + + (*ptr)(teast[7], northI + 8); + + macro_innerrow_loopz(7) + macro_innerrow_loopz(6) + macro_innerrow_loopz(5) + macro_innerrow_loopz(4) + macro_innerrow_loopz(3) + macro_innerrow_loopz(2) + macro_innerrow_loopz(1) + + tmp.ttiles[ttuile[0]]=1; + } + } + + out.fn_idx -=8; +} + + +/* super BIDE ou BUG de nouveau +void InnerTile0(const int eastcolor, const int northI) { + int64_t tfn2[4]; + int couleur, tuileN, fn1, fn2, tuile, east; + void (*ptr)(const int, const int); + macro_globaltrace(out.fn_idx); + ptr=tfncall[out.fn_idx+1]; + + couleur=eastcolor + 10*in.tdam[northI]; + + tfn2[0]=*(int64_t *)&t14[couleur][0]; + tfn2[1]=*(int64_t *)&t14[couleur][2]; + tfn2[2]=*(int64_t *)&t14[couleur][4]; + tfn2[3]=*(int64_t *)&t14[couleur][6]; + + tuileN=ttileN[couleur]; + LOOP1(tuileN) { + fn2=*(int *)(&tfn2 + fn1*4); //t14[couleur][fn1]; + tuile=fn2 & 0xff; + ifnz(tmp.ttiles[tuile]) { + east=(fn2>>8) & 0xff; + in.tdam[northI+10]=fn2>>16; + tmp.ttiles[tuile]=0; + out.fn_idx++; + (*ptr)(east, northI+1); + out.fn_idx--; + tmp.ttiles[tuile]=1; + } + } +} +*/ + +void Special_Debug(const int north, 
const int northI) { + macro_globaltrace(out.fn_idx); +} + +//####################################### + +// prevent inlining this trivial function: we may need some room +void __attribute__ ((noinline)) Input_Copy(int tiles, int *dest) { + int fn1; + LOOP1(32) { + dest[fn1]=tiles & 1; + tiles>>=1; + } +} + +//void __attribute__((interrupt)) null_isr() { return; } + +//####################################### + +int main(void) { +e_start:; + + int fn1, westcolor, eastcolor, tiles; + + volatile signed int *inputP = (void *)SHARED_IN; // pointer for input + //volatile signed int *cmdP = (void *)SHARED_CMD; // pointer for output command + + // init compute kernel + tiles=*(inputP+0); Input_Copy(tiles, &tmp.ttiles[ 0]); // 1st 32 tiles + tiles=*(inputP+1); Input_Copy(tiles, &tmp.ttiles[32]); + tiles=*(inputP+2); Input_Copy(tiles, &tmp.ttiles[VOIDTILE+1]); // 32 borders + tmp.ttiles[VOIDTILE]=0; // VOIDTILE == 64 + + LOOP1(4) + tborderwestN[fn1]=0; + LOOP1(40) + tbordereastN[fn1]=0; + + LOOP1(32) { + if(tmp.ttiles[64+1 +fn1]) { + westcolor=tbordureG[fn1]; + tborderwestT[westcolor] [tborderwestN[westcolor]]=fn1; + tborderwestE[westcolor] [tborderwestN[westcolor]]=tbordureI[fn1]; + tborderwestS[westcolor] [tborderwestN[westcolor]]=tbordureD[fn1]; + tborderwestN[westcolor]++; + + eastcolor=tbordureI[fn1]*4 + tbordureD[fn1]; + tbordereastT[eastcolor] [tbordereastN[eastcolor]]=fn1; + tbordereastS[eastcolor] [tbordereastN[eastcolor]]=tbordureG[fn1]; + tbordereastN[eastcolor]++; + } + } + + out.fn_idx=0; + BorderWest(in.tdam[C1N], C1N); + + out.cmd=CMD_DONE; // *cmdP=CMD_DONE; + + //return 0; + __asm__ __volatile__ ("idle"); // experience: can you idle an Epiphany core until ARM wakes it up ? 
Answer: empirically, yes ; use e_start() to reload the core + //goto e_start; // wake by IVT # 0 + +} diff --git a/paralle3/x86_build.sh b/paralle3/x86_build.sh new file mode 100644 index 0000000..0113851 --- /dev/null +++ b/paralle3/x86_build.sh @@ -0,0 +1,3 @@ +# cross compiling on x86_64 host, assuming /opt/adapteva as default path + +e-gcc -Ofast -mfp-mode=int -mshort-calls -m1reg-r63 -T /opt/adapteva/esdk/bsps/current/internal.ldf src/e_e2g.c -o bin/e_e2g.elf -le-lib diff --git a/paralle3/x86_buildasm.sh b/paralle3/x86_buildasm.sh new file mode 100644 index 0000000..20836b0 --- /dev/null +++ b/paralle3/x86_buildasm.sh @@ -0,0 +1,12 @@ +# cross compiling on x86_64 host + +# -mshort-calls: OK +# -msmall16: still broken +# -m1reg-r63: OK +# -mfp-mode=int: OK + +echo Cross compiling on x86_64 host +echo. +echo. +e-gcc -Ofast -mfp-mode=int -mshort-calls -m1reg-r63 -T /opt/adapteva/esdk/bsps/current/internal.ldf src/e_e2g.c -S -le-lib +echo.