From 5745322bb5147f7a4c7bdf7122bd77c6fd15ef8f Mon Sep 17 00:00:00 2001 From: Florian Schmaus <flow@cs.fau.de> Date: Fri, 21 May 2021 17:04:35 +0200 Subject: [PATCH] EMPER Cactus Stack Devel --- CMakeLists.txt | 7 +- Makefile | 6 +- benchmarks/CMakeLists.txt | 17 + benchmarks/cholesky.cpp | 665 ++++ benchmarks/cilkplus/CMakeLists.txt | 44 + benchmarks/cilkplus/cilkplus.h | 34 + benchmarks/emper_continuation/CMakeLists.txt | 53 + .../emper_continuation/emper_continuation.h | 153 + benchmarks/emper_continuation/fork.h | 70 + benchmarks/emper_fiber/CMakeLists.txt | 39 + benchmarks/emper_fiber/emper_fiber.h | 39 + benchmarks/fft.cpp | 354 ++ benchmarks/fft.h | 2877 +++++++++++++++++ benchmarks/fib.cpp | 56 + benchmarks/fibril.h | 69 + benchmarks/fibril/CMakeLists.txt | 41 + benchmarks/fibril/fibrile.h | 97 + benchmarks/fibril/fibrili.h | 90 + benchmarks/fibril/fork.h | 70 + benchmarks/fibril_lf/CMakeLists.txt | 41 + benchmarks/fibril_lf/fork.h | 70 + benchmarks/heat.cpp | 205 ++ benchmarks/integrate.cpp | 79 + benchmarks/knapsack.cpp | 165 + benchmarks/lu.cpp | 458 +++ benchmarks/matmul.cpp | 142 + benchmarks/nqueens.cpp | 70 + benchmarks/openmp/CMakeLists.txt | 28 + benchmarks/openmp/openmp.h | 101 + benchmarks/quicksort.cpp | 84 + benchmarks/rectmul.cpp | 365 +++ benchmarks/serial/CMakeLists.txt | 27 + benchmarks/serial/serial.h | 18 + benchmarks/strassen.cpp | 644 ++++ benchmarks/tbb/CMakeLists.txt | 43 + benchmarks/tbb/tbb.h | 36 + benchmarks/test.h | 148 + emper/CMakeLists.txt | 1 + emper/Context.hpp | 59 +- emper/ContextManager.cpp | 40 +- emper/ContextManager.hpp | 5 +- emper/Continuation.hpp | 61 + emper/Dispatcher.cpp | 2 - emper/Dispatcher.hpp | 14 +- emper/Fiber.cpp | 2 +- emper/Fiber.hpp | 13 +- emper/Fibril.cpp | 5 + emper/Fibril.hpp | 283 ++ emper/MemoryManager.hpp | 14 +- emper/Runtime.cpp | 48 +- emper/Runtime.hpp | 19 + emper/Scheduler.hpp | 3 + emper/SynchronizedFiber.hpp | 2 +- emper/include/emper-common.h | 2 +- emper/include/emper.hpp | 11 + 
emper/lib/adt/BoundedMpmcQueue.hpp | 94 + emper/lib/adt/FibrilDeque.hpp | 85 + emper/lib/adt/FibrilLock.hpp | 25 + emper/lib/adt/LockedQueue.hpp | 11 + emper/lib/adt/WsClV3Queue.hpp | 79 + emper/lib/adt/WsClV4Queue.hpp | 84 + emper/strategies/laws/LawsDispatcher.cpp | 6 +- emper/strategies/laws/LawsScheduler.cpp | 28 + emper/strategies/laws/LawsScheduler.hpp | 3 + emper/strategies/ws/WsDispatcher.cpp | 6 +- emper/strategies/ws/WsScheduler.cpp | 29 + emper/strategies/ws/WsScheduler.hpp | 9 + run_benchmarks.sh | 128 + test.sh | 39 + tests/CMakeLists.txt | 21 + tests/ContinuationSyncTest.cpp | 68 + tests/ContinuationVariableParameterTest.cpp | 99 + tests/CppApiTest.cpp | 4 +- tests/CppContinuationApiTest.cpp | 41 + tests/SimpleContinuationFibTest.cpp | 94 + tests/SimpleContinuationLawsTest.cpp | 106 + tests/SimpleFibTest.cpp | 25 +- time.sh | 30 + 78 files changed, 9164 insertions(+), 39 deletions(-) create mode 100644 benchmarks/CMakeLists.txt create mode 100644 benchmarks/cholesky.cpp create mode 100644 benchmarks/cilkplus/CMakeLists.txt create mode 100644 benchmarks/cilkplus/cilkplus.h create mode 100644 benchmarks/emper_continuation/CMakeLists.txt create mode 100644 benchmarks/emper_continuation/emper_continuation.h create mode 100644 benchmarks/emper_continuation/fork.h create mode 100644 benchmarks/emper_fiber/CMakeLists.txt create mode 100644 benchmarks/emper_fiber/emper_fiber.h create mode 100644 benchmarks/fft.cpp create mode 100644 benchmarks/fft.h create mode 100644 benchmarks/fib.cpp create mode 100644 benchmarks/fibril.h create mode 100644 benchmarks/fibril/CMakeLists.txt create mode 100644 benchmarks/fibril/fibrile.h create mode 100644 benchmarks/fibril/fibrili.h create mode 100644 benchmarks/fibril/fork.h create mode 100644 benchmarks/fibril_lf/CMakeLists.txt create mode 100644 benchmarks/fibril_lf/fork.h create mode 100644 benchmarks/heat.cpp create mode 100644 benchmarks/integrate.cpp create mode 100644 benchmarks/knapsack.cpp create mode 100644 
benchmarks/lu.cpp create mode 100644 benchmarks/matmul.cpp create mode 100644 benchmarks/nqueens.cpp create mode 100644 benchmarks/openmp/CMakeLists.txt create mode 100644 benchmarks/openmp/openmp.h create mode 100644 benchmarks/quicksort.cpp create mode 100644 benchmarks/rectmul.cpp create mode 100644 benchmarks/serial/CMakeLists.txt create mode 100644 benchmarks/serial/serial.h create mode 100644 benchmarks/strassen.cpp create mode 100644 benchmarks/tbb/CMakeLists.txt create mode 100644 benchmarks/tbb/tbb.h create mode 100644 benchmarks/test.h create mode 100644 emper/Continuation.hpp create mode 100644 emper/Fibril.cpp create mode 100644 emper/Fibril.hpp create mode 100644 emper/lib/adt/BoundedMpmcQueue.hpp create mode 100644 emper/lib/adt/FibrilDeque.hpp create mode 100644 emper/lib/adt/FibrilLock.hpp create mode 100644 emper/lib/adt/WsClV3Queue.hpp create mode 100644 emper/lib/adt/WsClV4Queue.hpp create mode 100755 run_benchmarks.sh create mode 100755 test.sh create mode 100644 tests/ContinuationSyncTest.cpp create mode 100644 tests/ContinuationVariableParameterTest.cpp create mode 100644 tests/CppContinuationApiTest.cpp create mode 100644 tests/SimpleContinuationFibTest.cpp create mode 100644 tests/SimpleContinuationLawsTest.cpp create mode 100755 time.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 48af4ac6..0d1951ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,9 +56,12 @@ endmacro() emper_option(WORKER_SLEEP "Enable sleeping worker support") emper_option(LOCKED_WS_QUEUE "Use a fully locked queue for work-stealing") +emper_option(LOCKED_FIBRIL "Use a fully locked Fibril. Only works with locked work-stealing queues") emper_option(OVERFLOW_QUEUE "Use a overflow queue in case the primary queue is full") emper_option(LOCKED_MPSC_QUEUE "Use the locked variant for the MPSC queue") emper_option(STATS "Collect stats and print them at the end of the execution") +emper_option(MADVISE "Use madvise(MADV_DONTNEED) to unmap unused stack pages. 
Bound memory consumption") +emper_option(CM_WITH_MEMORY_MANAGER "Use context manager with a memory manager") # Macro to add files to a var. Can even be used in subdirectories. # Source: http://stackoverflow.com/a/7049380/194894 @@ -108,12 +111,14 @@ add_library(c_emper STATIC ${C_EMPER_SOURCE}) # set_property(TARGET c_emper PROPERTY INTERPROCEDURAL_OPTIMIZATION True) target_link_libraries(c_emper emper) -add_subdirectory("lib") +#add_subdirectory("lib") add_subdirectory("apps") add_subdirectory("tests") +add_subdirectory("benchmarks") + add_subdirectory("eval") file(GLOB ALL_SOURCE_FILES *.cpp) diff --git a/Makefile b/Makefile index 0af760e4..1c4c50d5 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,11 @@ debug release relwithdebug: rm -f build ln -rs build-$@ build cd build-$@; \ - [[ -f CMakeCache.txt ]] || cmake -DCMAKE_BUILD_TYPE=$@ .. \ + [[ -f CMakeCache.txt ]] || cmake -DCMAKE_BUILD_TYPE=$@ \ + -DEMPER_CM_WITH_MEMORY_MANAGER=OFF \ + -DEMPER_LOCKED_WS_QUEUE=OFF \ + -DEMPER_LOCKED_FIBRIL=OFF \ + -DEMPER_MADVISE=OFF .. 
\ && make $(COMMON_MAKE_ARGS) reldebug: relwithdebug diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt new file mode 100644 index 00000000..715f4ed1 --- /dev/null +++ b/benchmarks/CMakeLists.txt @@ -0,0 +1,17 @@ + +add_subdirectory(tbb) + +add_subdirectory(openmp) + +add_subdirectory(serial) + +add_subdirectory(emper_continuation) + +add_subdirectory(emper_fiber) + +add_subdirectory(fibril) + +add_subdirectory(fibril_lf) + +#add_subdirectory(cilkplus) + diff --git a/benchmarks/cholesky.cpp b/benchmarks/cholesky.cpp new file mode 100644 index 00000000..701f8b3f --- /dev/null +++ b/benchmarks/cholesky.cpp @@ -0,0 +1,665 @@ +/* + * Sparse Cholesky code with little blocks at the leaves of the Quad tree + * Keith Randall -- Aske Plaat + * + * This code should run with any square sparse real symmetric matrix + * from MatrixMarket (http://math.nist.gov/MatrixMarket) + * + * run with `cholesky -f george-liu.mtx' for a given matrix, or + * `cholesky -n 1000 -z 10000' for a 1000x1000 random matrix with 10000 + * nonzeros (caution: random matrices produce lots of fill). + */ +/* + * Copyright (c) 2000 Massachusetts Institute of Technology + * Copyright (c) 2000 Matteo Frigo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <math.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include "test.h" + +/*************************************************************\ + * Basic types + \*************************************************************/ + +typedef double Real; + +#define BLOCK_DEPTH 2 /* logarithm base 2 of BLOCK_SIZE */ +#define BLOCK_SIZE (1<<BLOCK_DEPTH) /* 4 seems to be the optimum */ + +typedef Real Block[BLOCK_SIZE][BLOCK_SIZE]; + +#define BLOCK(B,I,J) (B[I][J]) + +#define _00 0 +#define _01 1 +#define _10 2 +#define _11 3 + +#define TR_00 _00 +#define TR_01 _10 +#define TR_10 _01 +#define TR_11 _11 + +typedef struct InternalNode { + struct InternalNode *child[4]; +} InternalNode; + +typedef struct { + Block block; +} LeafNode; + +typedef InternalNode *Matrix; + +static Matrix A, R; +static int depth; + +#ifndef BENCHMARK +int n = 2000; +static int nonzeros = 10000; +#else +int n = 4000; +static int nonzeros = 40000; +#endif + +/*************************************************************\ + * Linear algebra on blocks + \*************************************************************/ + +/* + * elem_daxmy - Compute y' = y - ax where a is a Real and x and y are + * vectors of n Reals; y is updated in place. + */ +static void elem_daxmy(Real a, Real * x, Real * y, int n) +{ + for (n--; n >= 0; n--) + y[n] -= a * x[n]; +} + +/* + * block_schur_full - Compute Schur complement B' = B - A * Transpose(C) for every element of B (in place). + */ +static void block_schur_full(Block B, Block A, Block C) +{ + int i, j, k; + for (i = 0; i < BLOCK_SIZE; i++) { + for (j = 0; j < BLOCK_SIZE; j++) { + for (k = 0; k < BLOCK_SIZE; k++) { + BLOCK(B, i, j) -= BLOCK(A, i, k) * BLOCK(C, j, k); + } + } + } +} + +/* + * block_schur_half - Compute Schur complement B' = B - A * Transpose(C), updating only the lower triangle (j <= i) of B. 
+ */ +static void block_schur_half(Block B, Block A, Block C) +{ + int i, j, k; + + /* + * printf("schur half\n"); + */ + /* Compute Schur complement; the j <= i bound restricts the update to the lower triangle. */ + for (i = 0; i < BLOCK_SIZE; i++) { + for (j = 0; j <= i /* BLOCK_SIZE */ ; j++) { + for (k = 0; k < BLOCK_SIZE; k++) { + BLOCK(B, i, j) -= BLOCK(A, i, k) * BLOCK(C, j, k); + } + } + } +} + +/* + * block_backsub - Perform substitution to solve for B' in + * B' * Transpose(U) = B; B is overwritten and only U[i][k] with k <= i is read. + */ +static void block_backsub(Block B, Block U) +{ + int i, j, k; + + /* Perform backward substitution. */ + for (i = 0; i < BLOCK_SIZE; i++) { + for (j = 0; j < BLOCK_SIZE; j++) { + for (k = 0; k < i; k++) { + BLOCK(B, j, i) -= BLOCK(U, i, k) * BLOCK(B, j, k); /* U is indexed [i][k], i.e. used transposed */ + } + BLOCK(B, j, i) /= BLOCK(U, i, i); + } + } +} + +/* + * xblock_backsub - Perform forward substitution to solve for B' in + * LB' = B. NOTE(review): no caller is visible here; the (void) self-reference presumably only silences an unused-function warning. + */ +static void xblock_backsub(Block B, Block L) +{ + int i, k; + (void) xblock_backsub; + + /* Perform forward substitution. */ + for (i = 0; i < BLOCK_SIZE; i++) + for (k = 0; k <= i; k++) { + BLOCK(B, i, k) /= BLOCK(L, k, k); + elem_daxmy(BLOCK(L, i, k), &BLOCK(B, k, 0), + &BLOCK(B, i, 0), BLOCK_SIZE - k); + } +} + +/* + * block_cholesky - Factor block B in place; prints an error and exits with status 9 if a pivot B[k][k] is negative. + */ +static void block_cholesky(Block B) +{ + int i, j, k; + + for (k = 0; k < BLOCK_SIZE; k++) { + Real x; + if (BLOCK(B, k, k) < 0.0) { + printf("sqrt error: %f\n", BLOCK(B, k, k)); + printf("matrix is probably not numerically stable\n"); + exit(9); + } + x = sqrt(BLOCK(B, k, k)); + for (i = k; i < BLOCK_SIZE; i++) { + BLOCK(B, i, k) /= x; + } + for (j = k + 1; j < BLOCK_SIZE; j++) { + for (i = j; i < BLOCK_SIZE; i++) { + BLOCK(B, i, j) -= BLOCK(B, i, k) * BLOCK(B, j, k); + if (j > i && BLOCK(B, i, j) != 0.0) { + printf("Upper not empty\n"); + } + } + } + } +} + +/* + * block_zero - zero block B. 
+ */ +static void block_zero(Block B) +{ + int i, k; + + for (i = 0; i < BLOCK_SIZE; i++) { + for (k = 0; k < BLOCK_SIZE; k++) { + BLOCK(B, i, k) = 0.0; + } + } +} + +/*************************************************************\ + * Allocation and initialization + \*************************************************************/ + + /* + * Create new leaf nodes (BLOCK_SIZE x BLOCK_SIZE submatrices) + */ +static inline InternalNode *new_block_leaf(void) +{ + LeafNode *leaf = (LeafNode*) malloc(sizeof(LeafNode)); + if (leaf == NULL) { + printf("out of memory!\n"); + exit(1); + } + return (InternalNode *) leaf; +} + +/* + * Create internal node in quadtree representation + */ +static inline InternalNode *new_internal(InternalNode * a00, InternalNode * a01, + InternalNode * a10, InternalNode * a11) +{ + InternalNode *node = (InternalNode*) malloc(sizeof(InternalNode)); + if (node == NULL) { + printf("out of memory!\n"); + exit(1); + } + node->child[_00] = a00; + node->child[_01] = a01; + node->child[_10] = a10; + node->child[_11] = a11; + return node; +} + +/* + * Duplicate matrix. Resulting matrix may be laid out in memory + * better than source matrix. + */ +fibril static Matrix copy_matrix(int depth, Matrix a) +{ + Matrix r; + + if (!a) + return a; + + if (depth == BLOCK_DEPTH) { + LeafNode *A = (LeafNode *) a; + LeafNode *R; + r = new_block_leaf(); + R = (LeafNode *) r; + memcpy(R->block, A->block, sizeof(Block)); + } else { + Matrix r00, r01, r10, r11; + + depth--; + + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, &r00, copy_matrix, (depth, a->child[_00])); + fibril_fork(&fr, &r01, copy_matrix, (depth, a->child[_01])); + fibril_fork(&fr, &r10, copy_matrix, (depth, a->child[_10])); + r11 = copy_matrix(depth, a->child[_11]); + fibril_join(&fr); + + r = new_internal(r00, r01, r10, r11); + } + return r; +} + +/* + * Deallocate matrix. 
+ */ +void free_matrix(int depth, Matrix a) +{ + if (a == NULL) + return; + if (depth == BLOCK_DEPTH) { + free(a); + } else { + depth--; + free_matrix(depth, a->child[_00]); + free_matrix(depth, a->child[_01]); + free_matrix(depth, a->child[_10]); + free_matrix(depth, a->child[_11]); + free(a); + } +} + +/*************************************************************\ + * Simple matrix operations + \*************************************************************/ + + /* + * Get matrix element at row r, column c. + */ +static Real get_matrix(int depth, Matrix a, int r, int c) +{ + if (a == NULL) + return 0.0; + + if (depth == BLOCK_DEPTH) { + LeafNode *A = (LeafNode *) a; + return BLOCK(A->block, r, c); + } else { + int mid; + + depth--; + mid = 1 << depth; + + if (r < mid) { + if (c < mid) + return get_matrix(depth, a->child[_00], r, c); + else + return get_matrix(depth, a->child[_01], r, c - mid); + } else { + if (c < mid) + return get_matrix(depth, a->child[_10], r - mid, c); + else + return get_matrix(depth, a->child[_11], r - mid, c - mid); + } + } +} + +/* + * Set matrix element at row r, column c to value. 
+ */ +static Matrix set_matrix(int depth, Matrix a, int r, int c, Real value) +{ + if (depth == BLOCK_DEPTH) { + LeafNode *A; + if (a == NULL) { + a = new_block_leaf(); + A = (LeafNode *) a; + block_zero(A->block); + } else { + A = (LeafNode *) a; + } + BLOCK(A->block, r, c) = value; + } else { + int mid; + + if (a == NULL) + a = new_internal(NULL, NULL, NULL, NULL); + + depth--; + mid = 1 << depth; + + if (r < mid) { + if (c < mid) + a->child[_00] = set_matrix(depth, a->child[_00], + r, c, value); + else + a->child[_01] = set_matrix(depth, a->child[_01], + r, c - mid, value); + } else { + if (c < mid) + a->child[_10] = set_matrix(depth, a->child[_10], + r - mid, c, value); + else + a->child[_11] = set_matrix(depth, a->child[_11], + r - mid, c - mid, value); + } + } + return a; +} + +/* + * Compute sum of squares of elements of matrix; a NULL subtree contributes 0. + */ +static Real mag(int depth, Matrix a) +{ + Real res = 0.0; + if (!a) + return res; + + if (depth == BLOCK_DEPTH) { + LeafNode *A = (LeafNode *) a; + int i, j; + for (i = 0; i < BLOCK_SIZE; i++) + for (j = 0; j < BLOCK_SIZE; j++) + res += BLOCK(A->block, i, j) * BLOCK(A->block, i, j); + } else { + depth--; + res += mag(depth, a->child[_00]); + res += mag(depth, a->child[_01]); + res += mag(depth, a->child[_10]); + res += mag(depth, a->child[_11]); + } + return res; +} + +/*************************************************************\ + * Cholesky algorithm + \*************************************************************/ + + /* + * Perform R -= A * Transpose(B) and return r; when r is NULL its nodes are allocated on demand. + * if lower is nonzero, update only lower-triangular part of R + */ +fibril static +Matrix mul_and_subT(int depth, int lower, Matrix a, Matrix b, Matrix r) +{ + if (depth == BLOCK_DEPTH) { + LeafNode *A = (LeafNode *) a; + LeafNode *B = (LeafNode *) b; + LeafNode *R; + + if (r == NULL) { + r = new_block_leaf(); + R = (LeafNode *) r; + block_zero(R->block); + } else + R = (LeafNode *) r; + + if (lower) + block_schur_half(R->block, A->block, B->block); + else + 
block_schur_full(R->block, A->block, B->block); + } else { + Matrix r00, r01, r10, r11; + + depth--; + + if (r != NULL) { + r00 = r->child[_00]; + r01 = r->child[_01]; + r10 = r->child[_10]; + r11 = r->child[_11]; + } else { + r00 = NULL; + r01 = NULL; + r10 = NULL; + r11 = NULL; + } + + fibril_t fr; + fibril_init(&fr); + + if (a->child[_00] && b->child[TR_00]) + fibril_fork(&fr, &r00, mul_and_subT, (depth, lower, + a->child[_00], b->child[TR_00], + r00)); + + if (!lower && a->child[_00] && b->child[TR_01]) + fibril_fork(&fr, &r01, mul_and_subT, (depth, 0, + a->child[_00], b->child[TR_01], + r01)); + + if (a->child[_10] && b->child[TR_00]) + fibril_fork(&fr, &r10, mul_and_subT, (depth, 0, + a->child[_10], b->child[TR_00], + r10)); + + if (a->child[_10] && b->child[TR_01]) + fibril_fork(&fr, &r11, mul_and_subT, (depth, lower, + a->child[_10], b->child[TR_01], + r11)); + + fibril_join(&fr); + + if (a->child[_01] && b->child[TR_10]) + fibril_fork(&fr, &r00, mul_and_subT, (depth, lower, + a->child[_01], b->child[TR_10], + r00)); + + if (!lower && a->child[_01] && b->child[TR_11]) + fibril_fork(&fr, &r01, mul_and_subT, (depth, 0, + a->child[_01], b->child[TR_11], + r01)); + + if (a->child[_11] && b->child[TR_10]) + fibril_fork(&fr, &r10, mul_and_subT, (depth, 0, + a->child[_11], b->child[TR_10], + r10)); + + if (a->child[_11] && b->child[TR_11]) + fibril_fork(&fr, &r11, mul_and_subT, (depth, lower, + a->child[_11], b->child[TR_11], + r11)); + + fibril_join(&fr); + + if (r == NULL) { + if (r00 || r01 || r10 || r11) + r = new_internal(r00, r01, r10, r11); + } else { + r->child[_00] = r00; + r->child[_01] = r01; + r->child[_10] = r10; + r->child[_11] = r11; + } + } + return r; +} + +/* + * Perform substitution to solve for B in B * Transpose(L) = A + * Returns B in place of A. 
+ */ +fibril static Matrix backsub(int depth, Matrix a, Matrix l) +{ + if (depth == BLOCK_DEPTH) { + LeafNode *A = (LeafNode *) a; + LeafNode *L = (LeafNode *) l; + block_backsub(A->block, L->block); + } else { + Matrix a00, a01, a10, a11; + Matrix l00, l10, l11; + + depth--; + + a00 = a->child[_00]; + a01 = a->child[_01]; + a10 = a->child[_10]; + a11 = a->child[_11]; + + l00 = l->child[_00]; + l10 = l->child[_10]; + l11 = l->child[_11]; + + fibril_t fr; + fibril_init(&fr); + + if (a00) + fibril_fork(&fr, &a00, backsub, (depth, a00, l00)); + if (a10) + fibril_fork(&fr, &a10, backsub, (depth, a10, l00)); + + fibril_join(&fr); + + if (a00 && l10) + fibril_fork(&fr, &a01, mul_and_subT, (depth, 0, a00, l10, a01)); + if (a10 && l10) + fibril_fork(&fr, &a11, mul_and_subT, (depth, 0, a10, l10, a11)); + + fibril_join(&fr); + + if (a01) + fibril_fork(&fr, &a01, backsub, (depth, a01, l11)); + if (a11) + fibril_fork(&fr, &a11, backsub, (depth, a11, l11)); + + fibril_join(&fr); + + a->child[_00] = a00; + a->child[_01] = a01; + a->child[_10] = a10; + a->child[_11] = a11; + } + + return a; +} + +/* + * Compute Cholesky factorization of A. 
+ */ +fibril static Matrix cholesky(int depth, Matrix a) +{ + if (depth == BLOCK_DEPTH) { + LeafNode *A = (LeafNode *) a; + block_cholesky(A->block); + } else { + Matrix a00, a10, a11; + + depth--; + + a00 = a->child[_00]; + a10 = a->child[_10]; + a11 = a->child[_11]; + + if (!a10) { + fibril_t fr; + fibril_init(&fr); + fibril_fork(&fr, &a00, cholesky, (depth, a00)); + a11 = cholesky(depth, a11); + fibril_join(&fr); + } else { + a00 = cholesky(depth, a00); + a10 = backsub(depth, a10, a00); + a11 = mul_and_subT(depth, 1, a10, a10, a11); + a11 = cholesky(depth, a11); + } + a->child[_00] = a00; + a->child[_10] = a10; + a->child[_11] = a11; + } + return a; +} + +static int logarithm(int size) +{ + int k = 0; + + while ((1 << k) < size) + k++; + return k; +} + +void init() +{ + /* generate random matrix */ + depth = logarithm(n); + + /* diagonal elements */ + int i; + for (i = 0; i < n; i++) + A = set_matrix(depth, A, i, i, 1.0); + + /* off-diagonal elements */ + for (i = 0; i < nonzeros - n; i++) { + int r, c; + + do { + r = rand() % n; + c = rand() % n; + } while (r <= c || get_matrix(depth, A, r, c) != 0.0); + + A = set_matrix(depth, A, r, c, 0.1); + } + + /* extend to power of two n with identity matrix */ + for (i = n; i < (1 << depth); i++) { + A = set_matrix(depth, A, i, i, 1.0); + } +} + +void prep() +{ + free_matrix(depth, R); + R = copy_matrix(depth, A); +} + +void test() +{ + R = cholesky(depth, R); +} + +int verify() +{ + int fail = 0; + +#ifndef BENCHMARK + /* test - make sure R * Transpose(R) == A */ + /* compute || A - R * Transpose(R) || */ + A = mul_and_subT(depth, 1, R, R, A); + Real error = mag(depth, A); + fail = (error > 0.00001); +#endif + + free_matrix(depth, A); + free_matrix(depth, R); + return fail; +} diff --git a/benchmarks/cilkplus/CMakeLists.txt b/benchmarks/cilkplus/CMakeLists.txt new file mode 100644 index 00000000..18d21761 --- /dev/null +++ b/benchmarks/cilkplus/CMakeLists.txt @@ -0,0 +1,44 @@ + +add_definitions(-DFIBRIL_CILKPLUS) + 
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcilkplus") + +find_library(CILKRTS_LIB cilkrts /srv/scratch/uh15efil/intel-cilk-runtime/build/lib) +find_library(DL_LIB NAMES dl) + + +add_executable(cholesky_cilkplus ../cholesky.cpp) +target_link_libraries(cholesky_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") + +add_executable(fft_cilkplus ../fft.cpp) +target_link_libraries(fft_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") + +add_executable(fib_cilkplus ../fib.cpp) +target_link_libraries(fib_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") + +add_executable(heat_cilkplus ../heat.cpp) +target_link_libraries(heat_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") + +add_executable(integrate_cilkplus ../integrate.cpp) +target_link_libraries(integrate_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") + +add_executable(knapsack_cilkplus ../knapsack.cpp) +target_link_libraries(knapsack_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") + +add_executable(lu_cilkplus ../lu.cpp) +target_link_libraries(lu_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") + +add_executable(matmul_cilkplus ../matmul.cpp) +target_link_libraries(matmul_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") + +add_executable(nqueens_cilkplus ../nqueens.cpp) +target_link_libraries(nqueens_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") + +add_executable(quicksort_cilkplus ../quicksort.cpp) +target_link_libraries(quicksort_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") + +add_executable(rectmul_cilkplus ../rectmul.cpp) +target_link_libraries(rectmul_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") + +add_executable(strassen_cilkplus ../strassen.cpp) +target_link_libraries(strassen_cilkplus "${CILKRTS_LIB}" "${DL_LIB}") diff --git a/benchmarks/cilkplus/cilkplus.h b/benchmarks/cilkplus/cilkplus.h new file mode 100644 index 00000000..722d72c1 --- /dev/null +++ b/benchmarks/cilkplus/cilkplus.h @@ -0,0 +1,34 @@ +#ifndef CILKPLUS_H +#define CILKPLUS_H + +#include <thread> +#include <stdio.h> +#include <cilk/cilk.h> +#include <cilk/cilk_api.h> + +#define fibril +#define fibril_t __attribute__((unused)) int +#define 
fibril_init(fp) +#define fibril_join(fp) cilk_sync + +#define fibril_fork_nrt(fp, fn, ag) cilk_spawn fn ag +#define fibril_fork_wrt(fp, rt, fn, ag) *rt = cilk_spawn fn ag + + +#define _nthreads(_n) [](int n) -> int { \ + int nprocs = std::thread::hardware_concurrency(); \ + if (n > 0 && n < nprocs) \ + return n; \ + return nprocs; \ +}(_n) + +#define fibril_rt_init(n) do { \ + char nprocs[32]; \ + snprintf(nprocs, 32, "%d", _nthreads(n)); \ + __cilkrts_set_param("nworkers", nprocs); \ + __cilkrts_set_param("stack size", "0x800000"); \ +} while (0); +#define fibril_rt_exit() (__cilkrts_end_cilk()) +#define fibril_rt_nprocs() (__cilkrts_get_nworkers()) + +#endif /* end of include guard: CILKPLUS_H */ diff --git a/benchmarks/emper_continuation/CMakeLists.txt b/benchmarks/emper_continuation/CMakeLists.txt new file mode 100644 index 00000000..d6473281 --- /dev/null +++ b/benchmarks/emper_continuation/CMakeLists.txt @@ -0,0 +1,53 @@ + +add_definitions(-DFIBRIL_EMPER_CONTINUATION) + + +add_executable(cholesky_emper_continuation ../cholesky.cpp) +target_link_libraries(cholesky_emper_continuation Threads::Threads emper) + +add_executable(fft_emper_continuation ../fft.cpp) +target_link_libraries(fft_emper_continuation Threads::Threads emper) + +add_executable(fib_emper_continuation ../fib.cpp) +target_link_libraries(fib_emper_continuation Threads::Threads emper) + +add_executable(heat_emper_continuation ../heat.cpp) +target_link_libraries(heat_emper_continuation Threads::Threads emper) + +add_executable(integrate_emper_continuation ../integrate.cpp) +target_link_libraries(integrate_emper_continuation Threads::Threads emper) + +add_executable(knapsack_emper_continuation ../knapsack.cpp) +target_link_libraries(knapsack_emper_continuation Threads::Threads emper) + +add_executable(lu_emper_continuation ../lu.cpp) +target_link_libraries(lu_emper_continuation Threads::Threads emper) + +add_executable(matmul_emper_continuation ../matmul.cpp) 
+target_link_libraries(matmul_emper_continuation Threads::Threads emper) + +add_executable(nqueens_emper_continuation ../nqueens.cpp) +target_link_libraries(nqueens_emper_continuation Threads::Threads emper) + +add_executable(quicksort_emper_continuation ../quicksort.cpp) +target_link_libraries(quicksort_emper_continuation Threads::Threads emper) + +add_executable(rectmul_emper_continuation ../rectmul.cpp) +target_link_libraries(rectmul_emper_continuation Threads::Threads emper) + +add_executable(strassen_emper_continuation ../strassen.cpp) +target_link_libraries(strassen_emper_continuation Threads::Threads emper) + + +add_test(cholesky cholesky_emper_continuation) +add_test(fft fft_emper_continuation) +add_test(fib fib_emper_continuation) +add_test(heat heat_emper_continuation) +add_test(integrate integrate_emper_continuation) +add_test(knapsack knapsack_emper_continuation) +add_test(lu lu_emper_continuation) +add_test(matmul matmul_emper_continuation) +add_test(nqueens nqueens_emper_continuation) +add_test(quicksort quicksort_emper_continuation) +add_test(rectmul rectmul_emper_continuation) +add_test(strassen strassen_emper_continuation) diff --git a/benchmarks/emper_continuation/emper_continuation.h b/benchmarks/emper_continuation/emper_continuation.h new file mode 100644 index 00000000..495fb836 --- /dev/null +++ b/benchmarks/emper_continuation/emper_continuation.h @@ -0,0 +1,153 @@ +#ifndef EMPER_CONTINUATION_H +#define EMPER_CONTINUATION_H + + + +#include <thread> + +//#include "fork.h" +#include "emper.hpp" + + + +#if 0 +class StackFibril { +private: + //Fibril *f; + //char memory[sizeof(Fibril) + alignof(Fibril)]; + char memory[sizeof(Fibril)]; + +public: + __attribute__((always_inline)) + inline StackFibril() { + //char *addr = (char*) ((uintptr_t) (memory + alignof(Fibril) - 1) & ~(alignof(Fibril) - 1)); + //f = new (addr) Fibril(); + new (memory) Fibril(); + } + + __attribute__((always_inline)) + inline ~StackFibril() { + //f->~Fibril(); + ((Fibril*) 
memory)->~Fibril(); + } + + __attribute__((always_inline)) + inline Fibril* operator->() const noexcept { + //return f; + return (Fibril*) memory; + } + + __attribute__((always_inline)) + inline Fibril& operator*() const { + //return *f; + return *((Fibril*) memory); + } + +}; + + +#define fibril_t StackFibril +#define fibril_init(fp) +#define fibril_join(fp) (*fp)->join(); + + +#if 1 +#include "fork.h" +#define fibril_fork_nrt(fp, fn, ag) do { \ + auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f) __attribute__((noinline, hot, optimize(3))) { \ + (*f)->cont.ip = __builtin_return_address(0); \ + Runtime* runtime = Runtime::getRuntime(); \ + runtime->pushBottom(**f); \ + fn(_fibril_args ag); \ + if (!runtime->popBottom()) { \ + (*f)->resume(); \ + } \ + }; \ + membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \ +} while (0); + +#define fibril_fork_wrt(fp, rt, fn, ag) do { \ + auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f, __typeof__(rt) p) __attribute__((noinline, hot, optimize(3))) { \ + (*f)->cont.ip = __builtin_return_address(0); \ + Runtime* runtime = Runtime::getRuntime(); \ + runtime->pushBottom(**f); \ + *p = fn(_fibril_args ag); \ + if (!runtime->popBottom()) { \ + (*f)->resume(); \ + } \ + }; \ + membar(_fibril_##fn##_fork(_fibril_expand ag fp, rt)); \ +} while (0); +#else +#define _fibril_expand(...) \ + _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__) +#define _fibril_expand_(n, ...) \ + _fibril_concat(_fibril_expand_, n)(__VA_ARGS__) +#define _fibril_expand_16(...) __VA_ARGS__ +#define _fibril_expand_15(...) __VA_ARGS__ +#define _fibril_expand_14(...) __VA_ARGS__ +#define _fibril_expand_13(...) __VA_ARGS__ +#define _fibril_expand_12(...) __VA_ARGS__ +#define _fibril_expand_11(...) __VA_ARGS__ +#define _fibril_expand_10(...) __VA_ARGS__ +#define _fibril_expand_9( ...) __VA_ARGS__ +#define _fibril_expand_8( ...) __VA_ARGS__ +#define _fibril_expand_7( ...) __VA_ARGS__ +#define _fibril_expand_6( ...) 
__VA_ARGS__ +#define _fibril_expand_5( ...) __VA_ARGS__ +#define _fibril_expand_4( ...) __VA_ARGS__ +#define _fibril_expand_3( ...) __VA_ARGS__ +#define _fibril_expand_2( ...) __VA_ARGS__ +#define _fibril_expand_1( ...) __VA_ARGS__ +#define _fibril_expand_0() + +#define fibril_fork_nrt(fp, fn, ag) (*fp)->fork(fn, _fibril_expand ag) +#define fibril_fork_wrt(fp, rt, fn, ag) (*fp)->fork(rt, fn, _fibril_expand ag) +#endif +#endif + +#define fibril_t Fibril +#define fibril_init(fp) +#define fibril_join(fp) (fp)->join(); + + +#include "fork.h" + +#define fibril_fork_nrt(fp, fn, ag) do { \ + auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f) __attribute__((noinline, hot, optimize(3))) { \ + (f)->cont.ip = __builtin_return_address(0); \ + Runtime* runtime = Runtime::getRuntime(); \ + runtime->pushBottom(*f); \ + fn(_fibril_args ag); \ + if (!runtime->popBottom()) { \ + (f)->resume(); \ + } \ + }; \ + membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \ +} while (0); + +#define fibril_fork_wrt(fp, rt, fn, ag) do { \ + auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f, __typeof__(rt) p) __attribute__((noinline, hot, optimize(3))) { \ + (f)->cont.ip = __builtin_return_address(0); \ + Runtime* runtime = Runtime::getRuntime(); \ + runtime->pushBottom(*f); \ + *p = fn(_fibril_args ag); \ + if (!runtime->popBottom()) { \ + (f)->resume(); \ + } \ + }; \ + membar(_fibril_##fn##_fork(_fibril_expand ag fp, rt)); \ +} while (0); + +#define _nthreads(_n) [](int n) -> int { \ + int nprocs = std::thread::hardware_concurrency(); \ + if (n > 0 && n < nprocs) \ + return n; \ + return nprocs; \ +}(_n) + +#define fibril_rt_init(n) Runtime runtime(_nthreads(n)); runtime.executeAndWait([&] () { +#define fibril_rt_exit() }); +#define fibril_rt_nprocs() runtime.getWorkerCount() + +#endif /* end of include guard: EMPER_CONTINUATION_H */ diff --git a/benchmarks/emper_continuation/fork.h b/benchmarks/emper_continuation/fork.h new file mode 100644 index 00000000..8ab080b0 --- 
#ifndef FIBRIL_FORK_H
#define FIBRIL_FORK_H

/*
 * Argument-list helpers for the fibril_fork_* macros.
 *
 * Each family takes up to 16 arguments and dispatches on the value produced
 * by _fibril_nth(...) (defined elsewhere; expected to be the argument count)
 * by pasting it onto the family prefix with _fibril_concat (also defined
 * elsewhere). Arguments are numbered from the right: the last argument is
 * always "a1".
 */

/*
 * _fibril_defs(x, y, z) -> __typeof__(x) a3, __typeof__(y) a2, __typeof__(z) a1,
 * Parameter declarations, one per argument, each followed by a comma so that
 * further parameters can be appended directly.
 */
#define _fibril_defs(...) \
  _fibril_defs_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_defs_(n, ...) \
  _fibril_concat(_fibril_defs_, n)(__VA_ARGS__)
#define _fibril_defs_0()
#define _fibril_defs_1(a)       __typeof__(a) a1,
#define _fibril_defs_2(a, ...)  __typeof__(a) a2,  _fibril_defs_1(__VA_ARGS__)
#define _fibril_defs_3(a, ...)  __typeof__(a) a3,  _fibril_defs_2(__VA_ARGS__)
#define _fibril_defs_4(a, ...)  __typeof__(a) a4,  _fibril_defs_3(__VA_ARGS__)
#define _fibril_defs_5(a, ...)  __typeof__(a) a5,  _fibril_defs_4(__VA_ARGS__)
#define _fibril_defs_6(a, ...)  __typeof__(a) a6,  _fibril_defs_5(__VA_ARGS__)
#define _fibril_defs_7(a, ...)  __typeof__(a) a7,  _fibril_defs_6(__VA_ARGS__)
#define _fibril_defs_8(a, ...)  __typeof__(a) a8,  _fibril_defs_7(__VA_ARGS__)
#define _fibril_defs_9(a, ...)  __typeof__(a) a9,  _fibril_defs_8(__VA_ARGS__)
#define _fibril_defs_10(a, ...) __typeof__(a) a10, _fibril_defs_9(__VA_ARGS__)
#define _fibril_defs_11(a, ...) __typeof__(a) a11, _fibril_defs_10(__VA_ARGS__)
#define _fibril_defs_12(a, ...) __typeof__(a) a12, _fibril_defs_11(__VA_ARGS__)
#define _fibril_defs_13(a, ...) __typeof__(a) a13, _fibril_defs_12(__VA_ARGS__)
#define _fibril_defs_14(a, ...) __typeof__(a) a14, _fibril_defs_13(__VA_ARGS__)
#define _fibril_defs_15(a, ...) __typeof__(a) a15, _fibril_defs_14(__VA_ARGS__)
#define _fibril_defs_16(a, ...) __typeof__(a) a16, _fibril_defs_15(__VA_ARGS__)

/*
 * _fibril_args(x, y, z) -> a3, a2, a1
 * The names declared by _fibril_defs, in call order and without a trailing
 * comma, for forwarding to the real callee.
 */
#define _fibril_args(...) \
  _fibril_args_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_args_(n, ...) \
  _fibril_concat(_fibril_args_, n)(__VA_ARGS__)
#define _fibril_args_0()
#define _fibril_args_1(a)       a1
#define _fibril_args_2(a, ...)  a2,  _fibril_args_1(__VA_ARGS__)
#define _fibril_args_3(a, ...)  a3,  _fibril_args_2(__VA_ARGS__)
#define _fibril_args_4(a, ...)  a4,  _fibril_args_3(__VA_ARGS__)
#define _fibril_args_5(a, ...)  a5,  _fibril_args_4(__VA_ARGS__)
#define _fibril_args_6(a, ...)  a6,  _fibril_args_5(__VA_ARGS__)
#define _fibril_args_7(a, ...)  a7,  _fibril_args_6(__VA_ARGS__)
#define _fibril_args_8(a, ...)  a8,  _fibril_args_7(__VA_ARGS__)
#define _fibril_args_9(a, ...)  a9,  _fibril_args_8(__VA_ARGS__)
#define _fibril_args_10(a, ...) a10, _fibril_args_9(__VA_ARGS__)
#define _fibril_args_11(a, ...) a11, _fibril_args_10(__VA_ARGS__)
#define _fibril_args_12(a, ...) a12, _fibril_args_11(__VA_ARGS__)
#define _fibril_args_13(a, ...) a13, _fibril_args_12(__VA_ARGS__)
#define _fibril_args_14(a, ...) a14, _fibril_args_13(__VA_ARGS__)
#define _fibril_args_15(a, ...) a15, _fibril_args_14(__VA_ARGS__)
#define _fibril_args_16(a, ...) a16, _fibril_args_15(__VA_ARGS__)

/*
 * _fibril_expand(x, y, z) -> x, y, z,
 * The caller's original argument expressions followed by a comma (nothing at
 * all for an empty list), so one more argument can be appended at the call.
 */
#define _fibril_expand(...) \
  _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_expand_(n, ...) \
  _fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
#define _fibril_expand_0()
#define _fibril_expand_1(...)  __VA_ARGS__,
#define _fibril_expand_2(...)  __VA_ARGS__,
#define _fibril_expand_3(...)  __VA_ARGS__,
#define _fibril_expand_4(...)  __VA_ARGS__,
#define _fibril_expand_5(...)  __VA_ARGS__,
#define _fibril_expand_6(...)  __VA_ARGS__,
#define _fibril_expand_7(...)  __VA_ARGS__,
#define _fibril_expand_8(...)  __VA_ARGS__,
#define _fibril_expand_9(...)  __VA_ARGS__,
#define _fibril_expand_10(...) __VA_ARGS__,
#define _fibril_expand_11(...) __VA_ARGS__,
#define _fibril_expand_12(...) __VA_ARGS__,
#define _fibril_expand_13(...) __VA_ARGS__,
#define _fibril_expand_14(...) __VA_ARGS__,
#define _fibril_expand_15(...) __VA_ARGS__,
#define _fibril_expand_16(...) __VA_ARGS__,

#endif /* end of include guard: FIBRIL_FORK_H */
/*
 * fibril API implemented with plain EMPER fibers.
 *
 * A "fibril" is the project's CPS object: every fork bumps its counter and
 * schedules a new Fiber; the fiber signals the CPS object when the forked call
 * has finished, and fibril_join() waits on it.
 */
#define fibril_t CPS
#define fibril_init(fp)
/* Wait on the CPS object. The expansion already ends in ';'. */
#define fibril_join(fp) (*fp).wait();

/*
 * Fork `fn ag` (ag is the parenthesised argument list) without keeping a
 * result. fp is copied into a local first so the by-value lambda capture
 * holds the pointer value itself. The expansion already ends in ';', so the
 * macro must be used as a stand-alone statement.
 */
#define fibril_fork_nrt(fp, fn, ag) do { \
  (*fp).incrementCounterByOne(); \
  __typeof__(fp) fpp = fp; \
  Runtime::getRuntime()->schedule(*Fiber::from([=] () {fn ag; (*fpp).signalAndExit(); })); \
} while (0);
/*
 * Same as fibril_fork_nrt, but the value returned by fn is stored through the
 * pointer rt (copied into rtp for the by-value capture).
 */
#define fibril_fork_wrt(fp, rt, fn, ag) do { \
  (*fp).incrementCounterByOne(); \
  __typeof__(fp) fpp = fp; \
  __typeof__(rt) rtp = rt; \
  Runtime::getRuntime()->schedule(*Fiber::from([=] () { *rtp = fn ag; (*fpp).signalAndExit(); })); \
} while (0);


/*
 * _nthreads(n) -- clamp a requested worker count to the machine.
 *
 * Evaluates to n when 0 < n < (hardware thread count), otherwise to the
 * hardware thread count. std::thread::hardware_concurrency() is allowed to
 * return 0 when the count cannot be determined; in that case the count is
 * treated as 1 so the result is never 0 (a runtime with zero workers could
 * not execute anything).
 */
#define _nthreads(_n) [](int n) -> int { \
  const unsigned int hw = std::thread::hardware_concurrency(); \
  const int nprocs = hw > 0 ? static_cast<int>(hw) : 1; \
  if (n > 0 && n < nprocs) \
    return n; \
  return nprocs; \
}(_n)

/*
 * fibril_rt_init(n) opens a lambda that fibril_rt_exit() closes: everything
 * written between the two runs inside runtime.executeAndWait(...).
 */
#define fibril_rt_init(n) Runtime runtime(_nthreads(n)); runtime.executeAndWait([&] () {
#define fibril_rt_exit() });
#define fibril_rt_nprocs() runtime.getWorkerCount()
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include "test.h" +#include "fft.h" + +#ifdef BENCHMARK +int n = 26; +#else +int n = 12; +#endif + +static int size; +static COMPLEX *in, *out, *cp, *W; +static const REAL pi = 3.1415926535897932384626434; + +/* + * compute the W coefficients (that is, powers of the root of 1) + * and store them into an array. + */ +fibril static void compute_w_coefficients(int n, int a, int b, COMPLEX * W) +{ + //register double twoPiOverN; + //register int k; + //register REAL s, c; + double twoPiOverN; + int k; + REAL s, c; + + if (b - a < 128) { + twoPiOverN = 2.0 * pi / n; + for (k = a; k <= b; ++k) { + c = cos(twoPiOverN * k); + c_re(W[k]) = c_re(W[n - k]) = c; + s = sin(twoPiOverN * k); + c_im(W[k]) = -s; + c_im(W[n - k]) = s; + } + } else { + int ab = (a + b) / 2; + + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, compute_w_coefficients, (n, a, ab, W)); + compute_w_coefficients(n, ab + 1, b, W); + + fibril_join(&fr); + } +} + +/* + * Determine (in a stupid way) if n is divisible by eight, then by four, else + * find the smallest prime factor of n. 
/*
 * Pick the radix for the next FFT stage of length n.
 *
 * Returns 1 for n < 2; 8 for a fixed list of power-of-two sizes; otherwise the
 * largest of 16, 8, 4, 2 that divides n; otherwise the smallest odd divisor
 * of n found by trial division, or n itself when n is prime.
 */
static int factor(int n)
{
  if (n < 2)
    return 1;

  /* Sizes that are deliberately split with radix 8 instead of radix 16. */
  switch (n) {
  case 64:
  case 128:
  case 256:
  case 1024:
  case 2048:
  case 4096:
    return 8;
  default:
    break;
  }

  /* Largest supported power-of-two radix: test the low bits for 16, 8, 4, 2. */
  for (int radix = 16; radix >= 2; radix >>= 1) {
    if ((n & (radix - 1)) == 0)
      return radix;
  }

#if 0
  /* radix-32 is too big --- wait for processors with more registers
   * :-) */
  if ((n & 31) == 0 && n > 256)
    return 32;
#endif

  /* n is odd here: try odd candidates below n (the original authors chose
   * this over computing a square root). */
  for (int candidate = 3; candidate < n; candidate += 2) {
    if (n % candidate == 0)
      return candidate;
  }

  /* No divisor found: n is prime. */
  return n;
}
+ * + * n: size of the input + * in: pointer to input + * out: pointer to output + * factors: list of factors of n, precomputed + * W: twiddle factors + * nW: size of W, that is, size of the original transform + * + */ +fibril static void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, + COMPLEX * W, int nW) +{ + int r, m; + + /* special cases */ + if (n == 32) { + fft_base_32(in, out); + return; + } + if (n == 16) { + fft_base_16(in, out); + return; + } + if (n == 8) { + fft_base_8(in, out); + return; + } + if (n == 4) { + fft_base_4(in, out); + return; + } + if (n == 2) { + fft_base_2(in, out); + return; + } + /* the cases n == 3, n == 5, and maybe 7 should be implemented as well */ + + r = *factors; + m = n / r; + + if (r < n) { + /* split the DFT of length n into r DFTs of length n/r, and recurse */ + if (r == 32) + fft_unshuffle_32(0, m, in, out, m); + else if (r == 16) + fft_unshuffle_16(0, m, in, out, m); + else if (r == 8) + fft_unshuffle_8(0, m, in, out, m); + else if (r == 4) + fft_unshuffle_4(0, m, in, out, m); + else if (r == 2) + fft_unshuffle_2(0, m, in, out, m); + else + unshuffle(0, m, in, out, r, m); + + fibril_t fr; + fibril_init(&fr); + + int k; + for(k = 0; k < n; k += m) { + fibril_fork(&fr, fft_aux, (m, out + k, in + k, factors + 1, W, nW)); + } + + fibril_join(&fr); + } + + /* now multiply by the twiddle factors, and perform m FFTs of length r */ + if (r == 2) + fft_twiddle_2(0, m, in, out, W, nW, nW / n, m); + else if (r == 4) + fft_twiddle_4(0, m, in, out, W, nW, nW / n, m); + else if (r == 8) + fft_twiddle_8(0, m, in, out, W, nW, nW / n, m); + else if (r == 16) + fft_twiddle_16(0, m, in, out, W, nW, nW / n, m); + else if (r == 32) + fft_twiddle_32(0, m, in, out, W, nW, nW / n, m); + else + fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m); + + return; +} + +/* + * user interface for fft_aux + */ +static void fft(int n, COMPLEX * in, COMPLEX * out) +{ + int factors[40]; /* allows FFTs up to at least 3^40 */ + int *p = factors; + 
int l = n; + int r; + + compute_w_coefficients(n, 0, n / 2, W); + + /** + * find factors of n, first 8, then 4 and then primes in ascending + * order. + */ + do { + r = factor(l); + *p++ = r; + l /= r; + } while (l > 1); + + fft_aux(n, in, out, factors, W, n); + return; +} + +/**************************************************************** + * END OF FFT ALGORITHM + ****************************************************************/ + +/* tests */ + +static void fft_alt(int n, COMPLEX * in, COMPLEX * out) +{ + int i, j; + COMPLEX sum; + COMPLEX w; + (void) fft_alt; + + for (j = 0; j < n; ++j) { + c_re(sum) = c_im(sum) = 0.0; + + for (i = 0; i < n; ++i) { + c_re(w) = cos((2.0 * pi * (i * j % n)) / n); + c_im(w) = -sin((2.0 * pi * (i * j % n)) / n); + c_re(sum) += c_re(in[i]) * c_re(w) - c_im(in[i]) * c_im(w); + c_im(sum) += c_im(in[i]) * c_re(w) + c_re(in[i]) * c_im(w); + } + + out[j] = sum; + } + + return; +} + +void init() +{ + size = (1 << n); + out = (COMPLEX*) malloc(sizeof(COMPLEX) * size); + in = (COMPLEX*) malloc(sizeof(COMPLEX) * size); + W = (COMPLEX*) malloc(sizeof(COMPLEX) * (size + 1)); + + int i; + for (i = 0; i < size; ++i) { + c_re(in[i]) = drand48(); + c_im(in[i]) = drand48(); + } +} + +void prep() +{ + if (cp == NULL) + cp = (COMPLEX*) malloc(sizeof(COMPLEX) * size); + + memcpy(cp, in, sizeof(COMPLEX) * size); +} + +void test() +{ + fft(size, cp, out); +} + +#ifdef BENCHMARK +int verify(void) { return 0; } +#else +int verify(void) +{ + COMPLEX * expect = (COMPLEX*) malloc(sizeof(COMPLEX) * size); + + fft_alt(size, in, expect); + + /* compute the relative error */ + double error = 0.0; + + int i; + for (i = 0; i < size; ++i) { + double a = sqrt( + (c_re(out[i]) - c_re(expect[i])) * (c_re(out[i]) - c_re(expect[i])) + + (c_im(out[i]) - c_im(expect[i])) * (c_im(out[i]) - c_im(expect[i]))); + double d = sqrt( + c_re(expect[i]) * c_re(expect[i]) + c_im(expect[i]) * c_im(expect[i])); + + if (d < -1.0e-10 || d > 1.0e-10) a /= d; + if (a > error) error = a; 
+ } + + if (error > 1e-3) { + printf("size=%d error=%e\n", size, error); + return 1; + } else { + return 0; + } +} +#endif + diff --git a/benchmarks/fft.h b/benchmarks/fft.h new file mode 100644 index 00000000..7d9debf8 --- /dev/null +++ b/benchmarks/fft.h @@ -0,0 +1,2877 @@ +#ifndef FFT_H +#define FFT_H + +/* our real numbers */ +typedef float REAL; + +/* Complex numbers and operations */ +typedef struct { + REAL re, im; +} COMPLEX; + +#define c_re(c) ((c).re) +#define c_im(c) ((c).im) + +static void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out, + COMPLEX * W, int r, int m, + int nW, int nWdnti, int nWdntm) +{ + int j, k; + COMPLEX *jp, *kp; + + for (k = 0, kp = out; k < r; ++k, kp += m) { + REAL r0, i0, rt, it, rw, iw; + int l1 = nWdnti + nWdntm * k; + int l0; + + r0 = i0 = 0.0; + for (j = 0, jp = in, l0 = 0; j < r; ++j, jp += m) { + rw = c_re(W[l0]); + iw = c_im(W[l0]); + rt = c_re(*jp); + it = c_im(*jp); + r0 += rt * rw - it * iw; + i0 += rt * iw + it * rw; + l0 += l1; + if (l0 > nW) + l0 -= nW; + } + c_re(*kp) = r0; + c_im(*kp) = i0; + } +} + +fibril static void fft_twiddle_gen(int i, int i1, + COMPLEX * in, COMPLEX * out, + COMPLEX * W, + int nW, int nWdn, int r, int m) +{ + if (i == i1 - 1) { + fft_twiddle_gen1(in + i, out + i, W, + r, m, nW, nWdn * i, nWdn * m); + } else { + int i2 = (i + i1) / 2; + + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, fft_twiddle_gen, (i, i2, in, out, W, nW, nWdn, r, m)); + fft_twiddle_gen(i2, i1, in, out, W, nW, nWdn, r, m); + + fibril_join(&fr); + } +} + +/* machine-generated code begins here */ +static void fft_base_2(COMPLEX * in, COMPLEX * out) +{ + REAL r1_0, i1_0; + REAL r1_1, i1_1; + r1_0 = c_re(in[0]); + i1_0 = c_im(in[0]); + r1_1 = c_re(in[1]); + i1_1 = c_im(in[1]); + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[1]) = (r1_0 - r1_1); + c_im(out[1]) = (i1_0 - i1_1); +} + +fibril static void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out, + COMPLEX * W, int nW, int nWdn, int m) 
+{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + r1_0 = c_re(jp[0 * m]); + i1_0 = c_im(jp[0 * m]); + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r1_1 = ((wr * tmpr) - (wi * tmpi)); + i1_1 = ((wi * tmpr) + (wr * tmpi)); + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[1 * m]) = (r1_0 - r1_1); + c_im(kp[1 * m]) = (i1_0 - i1_1); + } + } + } else { + int ab = (a + b) / 2; + + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, fft_twiddle_2, (a, ab, in, out, W, nW, nWdn, m)); + fft_twiddle_2(ab, b, in, out, W, nW, nWdn, m); + + fibril_join(&fr); + } +} + +fibril static void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 2; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, fft_unshuffle_2, (a, ab, in, out, m)); + fft_unshuffle_2(ab, b, in, out, m); + + fibril_join(&fr); + } +} + +static void fft_base_4(COMPLEX * in, COMPLEX * out) +{ + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(in[0]); + i2_0 = c_im(in[0]); + r2_2 = c_re(in[2]); + i2_2 = c_im(in[2]); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + r2_1 = c_re(in[1]); + i2_1 = c_im(in[1]); + r2_3 = c_re(in[3]); + i2_3 = c_im(in[3]); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[2]) = (r1_0 - r1_1); + c_im(out[2]) = (i1_0 
/*
 * Radix-4 twiddle pass over columns a..b-1 (machine-generated butterfly).
 *
 * For column i the four inputs in[i + k*m] (k = 0..3) are read; inputs 1..3
 * are first multiplied (complex multiply) by W[k * l1] with l1 = nWdn * i,
 * then combined by two levels of add/subtract butterflies and written to
 * out[i + k*m]. The parameter nW is not used by the serial path; it is only
 * passed on to the recursive calls.
 *
 * Ranges of 128 or more columns are halved: the lower half is forked, the
 * upper half runs in the caller, and both are joined before returning.
 */
fibril static void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out,
    COMPLEX * W, int nW, int nWdn, int m)
{
  int l1, i;
  COMPLEX *jp, *kp;
  REAL tmpr, tmpi, wr, wi;
  if ((b - a) < 128) {
    /* l1 tracks nWdn * i and kp tracks out + i as i advances */
    for (i = a, l1 = nWdn * i, kp = out + i; i < b;
        i++, l1 += nWdn, kp++) {
      jp = in + i;
      {
        REAL r1_0, i1_0;
        REAL r1_1, i1_1;
        REAL r1_2, i1_2;
        REAL r1_3, i1_3;
        {
          /* even inputs: 0 (no twiddle) and 2 (times W[2 * l1]) */
          REAL r2_0, i2_0;
          REAL r2_2, i2_2;
          r2_0 = c_re(jp[0 * m]);
          i2_0 = c_im(jp[0 * m]);
          wr = c_re(W[2 * l1]);
          wi = c_im(W[2 * l1]);
          tmpr = c_re(jp[2 * m]);
          tmpi = c_im(jp[2 * m]);
          r2_2 = ((wr * tmpr) - (wi * tmpi));
          i2_2 = ((wi * tmpr) + (wr * tmpi));
          r1_0 = (r2_0 + r2_2);
          i1_0 = (i2_0 + i2_2);
          r1_2 = (r2_0 - r2_2);
          i1_2 = (i2_0 - i2_2);
        }
        {
          /* odd inputs: 1 (times W[1 * l1]) and 3 (times W[3 * l1]) */
          REAL r2_1, i2_1;
          REAL r2_3, i2_3;
          wr = c_re(W[1 * l1]);
          wi = c_im(W[1 * l1]);
          tmpr = c_re(jp[1 * m]);
          tmpi = c_im(jp[1 * m]);
          r2_1 = ((wr * tmpr) - (wi * tmpi));
          i2_1 = ((wi * tmpr) + (wr * tmpi));
          wr = c_re(W[3 * l1]);
          wi = c_im(W[3 * l1]);
          tmpr = c_re(jp[3 * m]);
          tmpi = c_im(jp[3 * m]);
          r2_3 = ((wr * tmpr) - (wi * tmpi));
          i2_3 = ((wi * tmpr) + (wr * tmpi));
          r1_1 = (r2_1 + r2_3);
          i1_1 = (i2_1 + i2_3);
          r1_3 = (r2_1 - r2_3);
          i1_3 = (i2_1 - i2_3);
        }
        /* final butterflies; outputs 1 and 3 use the odd difference with its
         * real and imaginary parts swapped (one of them negated) */
        c_re(kp[0 * m]) = (r1_0 + r1_1);
        c_im(kp[0 * m]) = (i1_0 + i1_1);
        c_re(kp[2 * m]) = (r1_0 - r1_1);
        c_im(kp[2 * m]) = (i1_0 - i1_1);
        c_re(kp[1 * m]) = (r1_2 + i1_3);
        c_im(kp[1 * m]) = (i1_2 - r1_3);
        c_re(kp[3 * m]) = (r1_2 - i1_3);
        c_im(kp[3 * m]) = (i1_2 + r1_3);
      }
    }
  } else {
    int ab = (a + b) / 2;

    fibril_t fr;
    fibril_init(&fr);

    fibril_fork(&fr, fft_twiddle_4, (a, ab, in, out, W, nW, nWdn, m));
    fft_twiddle_4(ab, b, in, out, W, nW, nWdn, m);

    fibril_join(&fr);
  }
}
/*
 * Fixed-size kernel for 8 complex points (machine-generated).
 *
 * Reads in[0..7] and writes out[0..7] through three levels of add/subtract
 * butterflies; the variable prefixes r3_/i3_, r2_/i2_, r1_/i1_ name the real
 * and imaginary parts at each level. The constant 0.707106781187 is
 * 1/sqrt(2) rounded to 12 digits.
 * NOTE(review): looks like an unnormalised forward 8-point DFT -- confirm
 * against fft_alt.
 */
static void fft_base_8(COMPLEX * in, COMPLEX * out)
{
  REAL tmpr, tmpi;
  {
    REAL r1_0, i1_0;
    REAL r1_1, i1_1;
    REAL r1_2, i1_2;
    REAL r1_3, i1_3;
    REAL r1_4, i1_4;
    REAL r1_5, i1_5;
    REAL r1_6, i1_6;
    REAL r1_7, i1_7;
    {
      /* even-indexed inputs 0, 2, 4, 6 */
      REAL r2_0, i2_0;
      REAL r2_2, i2_2;
      REAL r2_4, i2_4;
      REAL r2_6, i2_6;
      {
        REAL r3_0, i3_0;
        REAL r3_4, i3_4;
        r3_0 = c_re(in[0]);
        i3_0 = c_im(in[0]);
        r3_4 = c_re(in[4]);
        i3_4 = c_im(in[4]);
        r2_0 = (r3_0 + r3_4);
        i2_0 = (i3_0 + i3_4);
        r2_4 = (r3_0 - r3_4);
        i2_4 = (i3_0 - i3_4);
      }
      {
        REAL r3_2, i3_2;
        REAL r3_6, i3_6;
        r3_2 = c_re(in[2]);
        i3_2 = c_im(in[2]);
        r3_6 = c_re(in[6]);
        i3_6 = c_im(in[6]);
        r2_2 = (r3_2 + r3_6);
        i2_2 = (i3_2 + i3_6);
        r2_6 = (r3_2 - r3_6);
        i2_6 = (i3_2 - i3_6);
      }
      r1_0 = (r2_0 + r2_2);
      i1_0 = (i2_0 + i2_2);
      r1_4 = (r2_0 - r2_2);
      i1_4 = (i2_0 - i2_2);
      r1_2 = (r2_4 + i2_6);
      i1_2 = (i2_4 - r2_6);
      r1_6 = (r2_4 - i2_6);
      i1_6 = (i2_4 + r2_6);
    }
    {
      /* odd-indexed inputs 1, 3, 5, 7 */
      REAL r2_1, i2_1;
      REAL r2_3, i2_3;
      REAL r2_5, i2_5;
      REAL r2_7, i2_7;
      {
        REAL r3_1, i3_1;
        REAL r3_5, i3_5;
        r3_1 = c_re(in[1]);
        i3_1 = c_im(in[1]);
        r3_5 = c_re(in[5]);
        i3_5 = c_im(in[5]);
        r2_1 = (r3_1 + r3_5);
        i2_1 = (i3_1 + i3_5);
        r2_5 = (r3_1 - r3_5);
        i2_5 = (i3_1 - i3_5);
      }
      {
        REAL r3_3, i3_3;
        REAL r3_7, i3_7;
        r3_3 = c_re(in[3]);
        i3_3 = c_im(in[3]);
        r3_7 = c_re(in[7]);
        i3_7 = c_im(in[7]);
        r2_3 = (r3_3 + r3_7);
        i2_3 = (i3_3 + i3_7);
        r2_7 = (r3_3 - r3_7);
        i2_7 = (i3_3 - i3_7);
      }
      r1_1 = (r2_1 + r2_3);
      i1_1 = (i2_1 + i2_3);
      r1_5 = (r2_1 - r2_3);
      i1_5 = (i2_1 - i2_3);
      r1_3 = (r2_5 + i2_7);
      i1_3 = (i2_5 - r2_7);
      r1_7 = (r2_5 - i2_7);
      i1_7 = (i2_5 + r2_7);
    }
    /* combine the even and odd halves; tmpr/tmpi hold the odd term after its
     * multiplication by a 1/sqrt(2)-scaled factor */
    c_re(out[0]) = (r1_0 + r1_1);
    c_im(out[0]) = (i1_0 + i1_1);
    c_re(out[4]) = (r1_0 - r1_1);
    c_im(out[4]) = (i1_0 - i1_1);
    tmpr = (0.707106781187 * (r1_3 + i1_3));
    tmpi = (0.707106781187 * (i1_3 - r1_3));
    c_re(out[1]) = (r1_2 + tmpr);
    c_im(out[1]) = (i1_2 + tmpi);
    c_re(out[5]) = (r1_2 - tmpr);
    c_im(out[5]) = (i1_2 - tmpi);
    c_re(out[2]) = (r1_4 + i1_5);
    c_im(out[2]) = (i1_4 - r1_5);
    c_re(out[6]) = (r1_4 - i1_5);
    c_im(out[6]) = (i1_4 + r1_5);
    tmpr = (0.707106781187 * (i1_7 - r1_7));
    tmpi = (0.707106781187 * (r1_7 + i1_7));
    c_re(out[3]) = (r1_6 + tmpr);
    c_im(out[3]) = (i1_6 - tmpi);
    c_re(out[7]) = (r1_6 - tmpr);
    c_im(out[7]) = (i1_6 + tmpi);
  }
}
c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r3_6 = ((wr * tmpr) - (wi * tmpi)); + i3_6 = ((wi * tmpr) + (wr * tmpi)); + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r3_1 = ((wr * tmpr) - (wi * tmpi)); + i3_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r3_5 = ((wr * tmpr) - (wi * tmpi)); + i3_5 = ((wi * tmpr) + (wr * tmpi)); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r3_3 = ((wr * tmpr) - (wi * tmpi)); + i3_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r3_7 = ((wr * tmpr) - (wi * tmpi)); + i3_7 = ((wi * tmpr) + (wr * tmpi)); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 - i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[4 * m]) = (r1_0 - r1_1); + c_im(kp[4 * m]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = (0.707106781187 * (i1_3 - r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[5 * m]) = 
(r1_2 - tmpr); + c_im(kp[5 * m]) = (i1_2 - tmpi); + c_re(kp[2 * m]) = (r1_4 + i1_5); + c_im(kp[2 * m]) = (i1_4 - r1_5); + c_re(kp[6 * m]) = (r1_4 - i1_5); + c_im(kp[6 * m]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 - tmpi); + c_re(kp[7 * m]) = (r1_6 - tmpr); + c_im(kp[7 * m]) = (i1_6 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, fft_twiddle_8, (a, ab, in, out, W, nW, nWdn, m)); + fft_twiddle_8(ab, b, in, out, W, nW, nWdn, m); + + fibril_join(&fr); + } +} + +fibril static void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 8; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, fft_unshuffle_8, (a, ab, in, out, m)); + fft_unshuffle_8(ab, b, in, out, m); + + fibril_join(&fr); + } +} + +static void fft_base_16(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = 
c_re(in[0]); + i4_0 = c_im(in[0]); + r4_8 = c_re(in[8]); + i4_8 = c_im(in[8]); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + r4_4 = c_re(in[4]); + i4_4 = c_im(in[4]); + r4_12 = c_re(in[12]); + i4_12 = c_im(in[12]); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + r4_2 = c_re(in[2]); + i4_2 = c_im(in[2]); + r4_10 = c_re(in[10]); + i4_10 = c_im(in[10]); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + r4_6 = c_re(in[6]); + i4_6 = c_im(in[6]); + r4_14 = c_re(in[14]); + i4_14 = c_im(in[14]); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; 
+ REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + r4_1 = c_re(in[1]); + i4_1 = c_im(in[1]); + r4_9 = c_re(in[9]); + i4_9 = c_im(in[9]); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + r4_5 = c_re(in[5]); + i4_5 = c_im(in[5]); + r4_13 = c_re(in[13]); + i4_13 = c_im(in[13]); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + r4_3 = c_re(in[3]); + i4_3 = c_im(in[3]); + r4_11 = c_re(in[11]); + i4_11 = c_im(in[11]); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + r4_7 = c_re(in[7]); + i4_7 = c_im(in[7]); + r4_15 = c_re(in[15]); + i4_15 = c_im(in[15]); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - 
r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[8]) = (r1_0 - r1_1); + c_im(out[8]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[9]) = (r1_2 - tmpr); + c_im(out[9]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(out[2]) = (r1_4 + tmpr); + c_im(out[2]) = (i1_4 + tmpi); + c_re(out[10]) = (r1_4 - tmpr); + c_im(out[10]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 + tmpi); + c_re(out[11]) = (r1_6 - tmpr); + c_im(out[11]) = (i1_6 - tmpi); + c_re(out[4]) = (r1_8 + i1_9); + c_im(out[4]) = (i1_8 - r1_9); + c_re(out[12]) = (r1_8 - i1_9); + c_im(out[12]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(out[5]) = (r1_10 + tmpr); + c_im(out[5]) = (i1_10 - tmpi); + c_re(out[13]) = (r1_10 - tmpr); + c_im(out[13]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(out[6]) = (r1_12 + tmpr); + c_im(out[6]) = (i1_12 - tmpi); + c_re(out[14]) = (r1_12 - tmpr); + c_im(out[14]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(out[7]) = (r1_14 + tmpr); + c_im(out[7]) = (i1_14 - tmpi); + c_re(out[15]) = (r1_14 - tmpr); + c_im(out[15]) = (i1_14 + tmpi); + } +} + 
+fibril static void fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, + COMPLEX * W, int nW, int nWdn, int m) +{ +/* fft_twiddle_16: for every index i in [a, b), take the 16 inputs jp[k * m] (k = 0..15, jp pointing at in[i]), multiply inputs 1..15 by W[k * l1] where l1 == nWdn * i (input 0 is used as is), push them through the unrolled 16-point butterfly network below and store the results to kp[k * m] (kp pointing at out[i]). A range of 128 or more indices is split in half: one half is handed to fibril_fork, the other is handled by a direct recursive call, and fibril_join follows. nW is never read here, only passed on to the recursive calls. Naming: rN_k and iN_k are paired temporaries (the r values originate from c_re reads, the i values from c_im reads) and N is the nesting depth of the block that declares them. */ int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(jp[0 * m]); + i4_0 = c_im(jp[0 * m]); +/* complex product of W[8 * l1] and jp[8 * m]; the same load-then-multiply pattern is repeated for each of the other twiddled inputs */ wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r4_8 = ((wr * tmpr) - (wi * tmpi)); + i4_8 = ((wi * tmpr) + (wr * tmpi)); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r4_4 = ((wr * tmpr) - (wi * tmpi)); + i4_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r4_12 = ((wr * tmpr) - (wi * tmpi)); + i4_12 = ((wi * tmpr) + (wr * tmpi)); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, 
i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r4_2 = ((wr * tmpr) - (wi * tmpi)); + i4_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r4_10 = ((wr * tmpr) - (wi * tmpi)); + i4_10 = ((wi * tmpr) + (wr * tmpi)); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r4_6 = ((wr * tmpr) - (wi * tmpi)); + i4_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r4_14 = ((wr * tmpr) - (wi * tmpi)); + i4_14 = ((wi * tmpr) + (wr * tmpi)); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + 
REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r4_1 = ((wr * tmpr) - (wi * tmpi)); + i4_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r4_9 = ((wr * tmpr) - (wi * tmpi)); + i4_9 = ((wi * tmpr) + (wr * tmpi)); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r4_5 = ((wr * tmpr) - (wi * tmpi)); + i4_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r4_13 = ((wr * tmpr) - (wi * tmpi)); + i4_13 = ((wi * tmpr) + (wr * tmpi)); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r4_3 = ((wr * tmpr) - (wi * tmpi)); + i4_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r4_11 = ((wr * tmpr) - (wi * tmpi)); + i4_11 = ((wi * tmpr) + (wr * tmpi)); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = 
c_im(jp[7 * m]); + r4_7 = ((wr * tmpr) - (wi * tmpi)); + i4_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r4_15 = ((wr * tmpr) - (wi * tmpi)); + i4_15 = ((wi * tmpr) + (wr * tmpi)); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[8 * m]) = (r1_0 - r1_1); + c_im(kp[8 * m]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[9 * m]) = (r1_2 - tmpr); + c_im(kp[9 * m]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[10 * m]) = (r1_4 - tmpr); + c_im(kp[10 * m]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + 
c_re(kp[11 * m]) = (r1_6 - tmpr); + c_im(kp[11 * m]) = (i1_6 - tmpi); + c_re(kp[4 * m]) = (r1_8 + i1_9); + c_im(kp[4 * m]) = (i1_8 - r1_9); + c_re(kp[12 * m]) = (r1_8 - i1_9); + c_im(kp[12 * m]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 - tmpi); + c_re(kp[13 * m]) = (r1_10 - tmpr); + c_im(kp[13 * m]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 - tmpi); + c_re(kp[14 * m]) = (r1_12 - tmpr); + c_im(kp[14 * m]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 - tmpi); + c_re(kp[15 * m]) = (r1_14 - tmpr); + c_im(kp[15 * m]) = (i1_14 + tmpi); + } + } + } else { + int ab = (a + b) / 2; +/* fft_twiddle_16, large range: [a, ab) is passed to fibril_fork and [ab, b) is processed by the direct call; fibril_join comes before the function returns. NOTE(review): the exact fork/join semantics live in the fibril headers, which are not visible here. */ + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, fft_twiddle_16, (a, ab, in, out, W, nW, nWdn, m)); + fft_twiddle_16(ab, b, in, out, W, nW, nWdn, m); + + fibril_join(&fr); + } +} +/* fft_unshuffle_16: strided copy from in to out. For every index i in [a, b) the 16 consecutive elements starting at in[16 * i] are written to jp[k * m], k = 0..15, with jp pointing at out[i]. Ranges of 128 or more indices are halved and recursed on through fibril_fork / fibril_join. Parameters: a, b - half-open index range; in - source array, only read (accessed through a const pointer); out - destination array; m - element stride between successive outputs of one group. No return value. */ +fibril static void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 16; + for (i = a; i < b; ++i) { +/* eight unrolled two-element copies; jp is recomputed from i at the top of each iteration, which is why the last pair does not advance it */ jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +/* large range: left half [a, ab) goes through fibril_fork, right half [ab, b) is handled by direct recursion, then fibril_join */ + fibril_t fr; + fibril_init(&fr); + + 
fibril_fork(&fr, fft_unshuffle_16, (a, ab, in, out, m)); + fft_unshuffle_16(ab, b, in, out, m); + + fibril_join(&fr); + } +} +/* fft_base_32: straight-line 32-point butterfly network with no loops. Reads in[0] .. in[31] through c_re / c_im and writes out[0] .. out[31]. tmpr / tmpi are scratch values reused by every rotation step. The decimal literals further down are rounded twiddle constants: 0.707106781187 ~ cos(pi/4); 0.923879532511 and 0.382683432365 ~ cos and sin of pi/8; 0.980785280403 and 0.195090322016 ~ cos and sin of pi/16; 0.831469612303 and 0.55557023302 ~ cos and sin of 3 pi/16. NOTE(review): the transform direction / sign convention is not stated in this function - confirm against the caller. */ +static void fft_base_32(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(in[0]); + i5_0 = c_im(in[0]); + r5_16 = c_re(in[16]); + i5_16 = c_im(in[16]); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + r5_8 = c_re(in[8]); + i5_8 = c_im(in[8]); + r5_24 = c_re(in[24]); + i5_24 = c_im(in[24]); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + 
i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + r5_4 = c_re(in[4]); + i5_4 = c_im(in[4]); + r5_20 = c_re(in[20]); + i5_20 = c_im(in[20]); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + r5_12 = c_re(in[12]); + i5_12 = c_im(in[12]); + r5_28 = c_re(in[28]); + i5_28 = c_im(in[28]); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + r5_2 = c_re(in[2]); + i5_2 = c_im(in[2]); + r5_18 = c_re(in[18]); + i5_18 = c_im(in[18]); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = 
(i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + r5_10 = c_re(in[10]); + i5_10 = c_im(in[10]); + r5_26 = c_re(in[26]); + i5_26 = c_im(in[26]); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + r5_6 = c_re(in[6]); + i5_6 = c_im(in[6]); + r5_22 = c_re(in[22]); + i5_22 = c_im(in[22]); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + r5_14 = c_re(in[14]); + i5_14 = c_im(in[14]); + r5_30 = c_re(in[30]); + i5_30 = c_im(in[30]); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = 
((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, 
i5_1; + REAL r5_17, i5_17; + r5_1 = c_re(in[1]); + i5_1 = c_im(in[1]); + r5_17 = c_re(in[17]); + i5_17 = c_im(in[17]); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + r5_9 = c_re(in[9]); + i5_9 = c_im(in[9]); + r5_25 = c_re(in[25]); + i5_25 = c_im(in[25]); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + r5_5 = c_re(in[5]); + i5_5 = c_im(in[5]); + r5_21 = c_re(in[21]); + i5_21 = c_im(in[21]); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + r5_13 = c_re(in[13]); + i5_13 = c_im(in[13]); + r5_29 = c_re(in[29]); + i5_29 = c_im(in[29]); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - 
tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + r5_3 = c_re(in[3]); + i5_3 = c_im(in[3]); + r5_19 = c_re(in[19]); + i5_19 = c_im(in[19]); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + r5_11 = c_re(in[11]); + i5_11 = c_im(in[11]); + r5_27 = c_re(in[27]); + i5_27 = c_im(in[27]); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + r5_7 = c_re(in[7]); + i5_7 = c_im(in[7]); + r5_23 = c_re(in[23]); + i5_23 = c_im(in[23]); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + r5_15 = c_re(in[15]); + i5_15 = c_im(in[15]); + r5_31 = c_re(in[31]); + i5_31 = c_im(in[31]); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 
- r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + 
c_re(out[16]) = (r1_0 - r1_1); + c_im(out[16]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[17]) = (r1_2 - tmpr); + c_im(out[17]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(out[2]) = (r1_4 + tmpr); + c_im(out[2]) = (i1_4 + tmpi); + c_re(out[18]) = (r1_4 - tmpr); + c_im(out[18]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 + tmpi); + c_re(out[19]) = (r1_6 - tmpr); + c_im(out[19]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(out[4]) = (r1_8 + tmpr); + c_im(out[4]) = (i1_8 + tmpi); + c_re(out[20]) = (r1_8 - tmpr); + c_im(out[20]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(out[5]) = (r1_10 + tmpr); + c_im(out[5]) = (i1_10 + tmpi); + c_re(out[21]) = (r1_10 - tmpr); + c_im(out[21]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(out[6]) = (r1_12 + tmpr); + c_im(out[6]) = (i1_12 + tmpi); + c_re(out[22]) = (r1_12 - tmpr); + c_im(out[22]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(out[7]) = (r1_14 + tmpr); + c_im(out[7]) = (i1_14 + tmpi); + c_re(out[23]) = (r1_14 - tmpr); + c_im(out[23]) = (i1_14 - tmpi); + c_re(out[8]) = (r1_16 + i1_17); + c_im(out[8]) = (i1_16 - r1_17); + c_re(out[24]) = (r1_16 - i1_17); + c_im(out[24]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - 
(0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(out[9]) = (r1_18 + tmpr); + c_im(out[9]) = (i1_18 - tmpi); + c_re(out[25]) = (r1_18 - tmpr); + c_im(out[25]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(out[10]) = (r1_20 + tmpr); + c_im(out[10]) = (i1_20 - tmpi); + c_re(out[26]) = (r1_20 - tmpr); + c_im(out[26]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(out[11]) = (r1_22 + tmpr); + c_im(out[11]) = (i1_22 - tmpi); + c_re(out[27]) = (r1_22 - tmpr); + c_im(out[27]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(out[12]) = (r1_24 + tmpr); + c_im(out[12]) = (i1_24 - tmpi); + c_re(out[28]) = (r1_24 - tmpr); + c_im(out[28]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(out[13]) = (r1_26 + tmpr); + c_im(out[13]) = (i1_26 - tmpi); + c_re(out[29]) = (r1_26 - tmpr); + c_im(out[29]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(out[14]) = (r1_28 + tmpr); + c_im(out[14]) = (i1_28 - tmpi); + c_re(out[30]) = (r1_28 - tmpr); + c_im(out[30]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(out[15]) = (r1_30 + tmpr); + c_im(out[15]) = (i1_30 - tmpi); + c_re(out[31]) = (r1_30 - tmpr); + c_im(out[31]) = (i1_30 + tmpi); + } +} + +fibril static void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out, + COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = 
nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(jp[0 * m]); + i5_0 = c_im(jp[0 * m]); + wr = c_re(W[16 * l1]); + wi = c_im(W[16 * l1]); + tmpr = c_re(jp[16 * m]); + tmpi = c_im(jp[16 * m]); + r5_16 = ((wr * tmpr) - (wi * tmpi)); + i5_16 = ((wi * tmpr) + (wr * tmpi)); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r5_8 = ((wr * tmpr) - (wi * tmpi)); + i5_8 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[24 * l1]); + wi = c_im(W[24 * l1]); + tmpr = c_re(jp[24 * m]); + tmpi = 
c_im(jp[24 * m]); + r5_24 = ((wr * tmpr) - (wi * tmpi)); + i5_24 = ((wi * tmpr) + (wr * tmpi)); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r5_4 = ((wr * tmpr) - (wi * tmpi)); + i5_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[20 * l1]); + wi = c_im(W[20 * l1]); + tmpr = c_re(jp[20 * m]); + tmpi = c_im(jp[20 * m]); + r5_20 = ((wr * tmpr) - (wi * tmpi)); + i5_20 = ((wi * tmpr) + (wr * tmpi)); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r5_12 = ((wr * tmpr) - (wi * tmpi)); + i5_12 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[28 * l1]); + wi = c_im(W[28 * l1]); + tmpr = c_re(jp[28 * m]); + tmpi = c_im(jp[28 * m]); + r5_28 = ((wr * tmpr) - (wi * tmpi)); + i5_28 = ((wi * tmpr) + (wr * tmpi)); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - 
tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r5_2 = ((wr * tmpr) - (wi * tmpi)); + i5_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[18 * l1]); + wi = c_im(W[18 * l1]); + tmpr = c_re(jp[18 * m]); + tmpi = c_im(jp[18 * m]); + r5_18 = ((wr * tmpr) - (wi * tmpi)); + i5_18 = ((wi * tmpr) + (wr * tmpi)); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r5_10 = ((wr * tmpr) - (wi * tmpi)); + i5_10 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[26 * l1]); + wi = c_im(W[26 * l1]); + tmpr = c_re(jp[26 * m]); + tmpi = c_im(jp[26 * m]); + r5_26 = ((wr * tmpr) - (wi * tmpi)); + i5_26 = ((wi * tmpr) + (wr * tmpi)); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + wr = c_re(W[6 * l1]); + wi = 
c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r5_6 = ((wr * tmpr) - (wi * tmpi)); + i5_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[22 * l1]); + wi = c_im(W[22 * l1]); + tmpr = c_re(jp[22 * m]); + tmpi = c_im(jp[22 * m]); + r5_22 = ((wr * tmpr) - (wi * tmpi)); + i5_22 = ((wi * tmpr) + (wr * tmpi)); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r5_14 = ((wr * tmpr) - (wi * tmpi)); + i5_14 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[30 * l1]); + wi = c_im(W[30 * l1]); + tmpr = c_re(jp[30 * m]); + tmpi = c_im(jp[30 * m]); + r5_30 = ((wr * tmpr) - (wi * tmpi)); + i5_30 = ((wi * tmpr) + (wr * tmpi)); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = 
(r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); 
+ r5_1 = ((wr * tmpr) - (wi * tmpi)); + i5_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[17 * l1]); + wi = c_im(W[17 * l1]); + tmpr = c_re(jp[17 * m]); + tmpi = c_im(jp[17 * m]); + r5_17 = ((wr * tmpr) - (wi * tmpi)); + i5_17 = ((wi * tmpr) + (wr * tmpi)); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r5_9 = ((wr * tmpr) - (wi * tmpi)); + i5_9 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[25 * l1]); + wi = c_im(W[25 * l1]); + tmpr = c_re(jp[25 * m]); + tmpi = c_im(jp[25 * m]); + r5_25 = ((wr * tmpr) - (wi * tmpi)); + i5_25 = ((wi * tmpr) + (wr * tmpi)); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r5_5 = ((wr * tmpr) - (wi * tmpi)); + i5_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[21 * l1]); + wi = c_im(W[21 * l1]); + tmpr = c_re(jp[21 * m]); + tmpi = c_im(jp[21 * m]); + r5_21 = ((wr * tmpr) - (wi * tmpi)); + i5_21 = ((wi * tmpr) + (wr * tmpi)); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r5_13 = ((wr * tmpr) - (wi * tmpi)); + i5_13 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[29 * l1]); + wi = c_im(W[29 * l1]); + tmpr = c_re(jp[29 * m]); + tmpi = c_im(jp[29 
* m]); + r5_29 = ((wr * tmpr) - (wi * tmpi)); + i5_29 = ((wi * tmpr) + (wr * tmpi)); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r5_3 = ((wr * tmpr) - (wi * tmpi)); + i5_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[19 * l1]); + wi = c_im(W[19 * l1]); + tmpr = c_re(jp[19 * m]); + tmpi = c_im(jp[19 * m]); + r5_19 = ((wr * tmpr) - (wi * tmpi)); + i5_19 = ((wi * tmpr) + (wr * tmpi)); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r5_11 = ((wr * tmpr) - (wi * tmpi)); + i5_11 = ((wi * tmpr) + (wr * tmpi)); + wr = 
c_re(W[27 * l1]); + wi = c_im(W[27 * l1]); + tmpr = c_re(jp[27 * m]); + tmpi = c_im(jp[27 * m]); + r5_27 = ((wr * tmpr) - (wi * tmpi)); + i5_27 = ((wi * tmpr) + (wr * tmpi)); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r5_7 = ((wr * tmpr) - (wi * tmpi)); + i5_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[23 * l1]); + wi = c_im(W[23 * l1]); + tmpr = c_re(jp[23 * m]); + tmpi = c_im(jp[23 * m]); + r5_23 = ((wr * tmpr) - (wi * tmpi)); + i5_23 = ((wi * tmpr) + (wr * tmpi)); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r5_15 = ((wr * tmpr) - (wi * tmpi)); + i5_15 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[31 * l1]); + wi = c_im(W[31 * l1]); + tmpr = c_re(jp[31 * m]); + tmpi = c_im(jp[31 * m]); + r5_31 = ((wr * tmpr) - (wi * tmpi)); + i5_31 = ((wi * tmpr) + (wr * tmpi)); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 
* (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + 
i1_1); + c_re(kp[16 * m]) = (r1_0 - r1_1); + c_im(kp[16 * m]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[17 * m]) = (r1_2 - tmpr); + c_im(kp[17 * m]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[18 * m]) = (r1_4 - tmpr); + c_im(kp[18 * m]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[19 * m]) = (r1_6 - tmpr); + c_im(kp[19 * m]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(kp[4 * m]) = (r1_8 + tmpr); + c_im(kp[4 * m]) = (i1_8 + tmpi); + c_re(kp[20 * m]) = (r1_8 - tmpr); + c_im(kp[20 * m]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 + tmpi); + c_re(kp[21 * m]) = (r1_10 - tmpr); + c_im(kp[21 * m]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 + tmpi); + c_re(kp[22 * m]) = (r1_12 - tmpr); + c_im(kp[22 * m]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 + tmpi); + c_re(kp[23 * m]) = (r1_14 - tmpr); + c_im(kp[23 * m]) = (i1_14 - tmpi); + c_re(kp[8 * m]) = (r1_16 + i1_17); + c_im(kp[8 * m]) = (i1_16 - r1_17); + 
c_re(kp[24 * m]) = (r1_16 - i1_17); + c_im(kp[24 * m]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(kp[9 * m]) = (r1_18 + tmpr); + c_im(kp[9 * m]) = (i1_18 - tmpi); + c_re(kp[25 * m]) = (r1_18 - tmpr); + c_im(kp[25 * m]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(kp[10 * m]) = (r1_20 + tmpr); + c_im(kp[10 * m]) = (i1_20 - tmpi); + c_re(kp[26 * m]) = (r1_20 - tmpr); + c_im(kp[26 * m]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(kp[11 * m]) = (r1_22 + tmpr); + c_im(kp[11 * m]) = (i1_22 - tmpi); + c_re(kp[27 * m]) = (r1_22 - tmpr); + c_im(kp[27 * m]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(kp[12 * m]) = (r1_24 + tmpr); + c_im(kp[12 * m]) = (i1_24 - tmpi); + c_re(kp[28 * m]) = (r1_24 - tmpr); + c_im(kp[28 * m]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(kp[13 * m]) = (r1_26 + tmpr); + c_im(kp[13 * m]) = (i1_26 - tmpi); + c_re(kp[29 * m]) = (r1_26 - tmpr); + c_im(kp[29 * m]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(kp[14 * m]) = (r1_28 + tmpr); + c_im(kp[14 * m]) = (i1_28 - tmpi); + c_re(kp[30 * m]) = (r1_28 - tmpr); + c_im(kp[30 * m]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(kp[15 * m]) = (r1_30 + tmpr); + c_im(kp[15 * m]) = (i1_30 - tmpi); + c_re(kp[31 * m]) = (r1_30 - tmpr); + c_im(kp[31 * m]) = (i1_30 + tmpi); + } + } + } else { + int ab = (a + b) 
/ 2;
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_twiddle_32, (a, ab, in, out, W, nW, nWdn, m));
+    fft_twiddle_32(ab, b, in, out, W, nW, nWdn, m);
+
+    fibril_join(&fr);
+  }
+}
+
+fibril static void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m) /* scatter step: for every i in [a, b) the inputs starting at in + 32*i are stored at out + i with a stride of m */
+{
+  int i;
+  const COMPLEX *ip; /* read cursor into in */
+  COMPLEX *jp; /* write cursor into out */
+  if ((b - a) < 128) { /* fewer than 128 indices: copy serially */
+    ip = in + a * 32; /* index i owns the 32 inputs that start at in + 32*i */
+    for (i = a; i < b; ++i) {
+      jp = out + i;
+      jp[0] = ip[0]; /* unrolled steps: each one copies two inputs to slots m apart, then moves ip by 2 and jp by 2*m */
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2; /* final step: jp is not advanced because it is reassigned at the top of the loop */
+    }
+  } else { /* 128 or more indices: split the range at its midpoint */
+    int ab = (a + b) / 2;
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_unshuffle_32, (a, ab, in, out, m)); /* [a, ab) goes through fibril_fork */
+    fft_unshuffle_32(ab, b, in, out, m); /* [ab, b) is a plain recursive call */
+
+    fibril_join(&fr); /* join the forked call before returning */
+  }
+}
+
+/* end of machine-generated code */
+
+#endif /* end of include guard: FFT_H */
diff --git a/benchmarks/fib.cpp b/benchmarks/fib.cpp
new file mode 100644
index 00000000..37cbd0e0
--- /dev/null
+++ b/benchmarks/fib.cpp
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include "test.h"
+
+int n = 42; /* benchmark input: index of the Fibonacci number to compute */
+int m; /* result slot: written by test(), compared by verify() */
+
+static int fib_fast(int n) /* iterative reference implementation used by verify() */
+{
+  if (n < 2) return n;
+
+  int i = 2, x = 0, y = 0, z = 1;
+
+  do {
+    x = y;
+    y = z;
+    z = x + y;
+  } while (i++ < n);
+
+  return z;
+}
+
+fibril int fib(int n) /* naive recursive Fibonacci used as the fork/join workload */
+{
+  if (n < 2) return n;
+
+  int x, y;
+  fibril_t fr;
+  fibril_init(&fr);
+
+  fibril_fork(&fr, &x, fib, (n - 1)); /* forked call; its result is stored through &x */
+
+  y = fib(n - 2); /* the second operand is computed by a plain recursive call */
+  fibril_join(&fr); /* x is only read after the join */
+
+  return x + y;
+}
+
+int verify() /* returns 0 when m equals fib_fast(n); otherwise prints both values and returns 1 */
+{
+  int expect = fib_fast(n);
+
+  if (expect != m) {
+    printf("fib(%d)=%d (expected %d)\n", n, m, expect);
+    return 1;
+  }
+
+  return 0;
+}
+
+void init() {} /* intentionally empty: this benchmark needs no setup */
+void prep() {} /* intentionally empty */
+
+void test() { /* measured body: stores fib(n) in the global m */
+  m = fib(n);
+}
+
diff --git a/benchmarks/fibril.h b/benchmarks/fibril.h
new file mode 100644
index 00000000..c4360d5e
--- /dev/null
+++ b/benchmarks/fibril.h
@@ -0,0 +1,69 @@
+#ifndef FIBRIL_H
+#define FIBRIL_H
+
+#define FIBRIL_SUCCESS 0
+#define FIBRIL_FAILURE (-1) /* parenthesized so the sign cannot combine with neighbouring tokens */
+
+/**
+ * These are special arguments to fibril_rt_init().
+ * FIBRIL_NPROCS tells the runtime to fetch the number of processors
+ * from the environment variable FIBRIL_NPROCS (getenv("FIBRIL_NPROCS")).
+ * FIBRIL_NPROCS_ONLN tells the runtime to use all available processors
+ * in the system (sysconf(_SC_NPROCESSORS_ONLN)).
+ */
+#define FIBRIL_NPROCS 0
+#define FIBRIL_NPROCS_ONLN (-1) /* parenthesized for the same reason as FIBRIL_FAILURE */
+
+
+
+/** Serial version. */
+#ifdef FIBRIL_SERIAL
+#include "serial/serial.h"
+
+/** Cilkplus version. */
+#elif FIBRIL_CILKPLUS
+#include "cilkplus/cilkplus.h"
+
+/** TBB version. */
+#elif FIBRIL_TBB
+#include "tbb/tbb.h"
+
+/** OpenMP version. */
+#elif FIBRIL_OPENMP
+#include "openmp/openmp.h"
+
+/** Emper continuation version. */
+#elif FIBRIL_EMPER_CONTINUATION
+#include "emper_continuation/emper_continuation.h"
+
+/** Emper fiber version. */
+#elif FIBRIL_EMPER_FIBER
+#include "emper_fiber/emper_fiber.h"
+
+/** Fibril version. */
+#elif FIBRIL_FIBRIL_LF
+#include "fibril_lf/fibrile.h" /* NOTE(review): this patch's file list adds only fibril_lf/CMakeLists.txt and fibril_lf/fork.h - confirm fibril_lf/fibrile.h exists */
+#elif FIBRIL_FIBRIL
+#include "fibril/fibrile.h"
+#endif
+
+/** fibril_fork has two versions: one with return value and one without. */
+#define fibril_fork(...) _fibril_fork_(_fibril_nth(__VA_ARGS__), __VA_ARGS__)
+#define _fibril_fork_(n, ...)
_fibril_concat(_fibril_fork_, n)(__VA_ARGS__)
+
+/** If nargs is 3, use the no-return-value version. */
+#define _fibril_fork_3(...) fibril_fork_nrt(__VA_ARGS__)
+
+/** If nargs is 4, use the with-return-value version. */
+#define _fibril_fork_4(...) fibril_fork_wrt(__VA_ARGS__)
+
+/** Helper macros to count number of arguments (0 to 16): every argument is passed twice, so the 33rd slot N lands on the count. */
+#define _fibril_nth(...) _fibril_nth_(__VA_ARGS__, ## __VA_ARGS__, \
+  16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, \
+  8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 0)
+#define _fibril_nth_(_1, _1_, _2, _2_, _3, _3_, _4, _4_, _5, _5_, \
+  _6, _6_, _7, _7_, _8, _8_, _9, _9_, _10, _10_, _11, _11_, _12, _12_, \
+  _13, _13_, _14, _14_, _15, _15_, _16, _16_, N, ...) N
+#define _fibril_concat(left, right) left##right /* plain token paste; callers pass an already expanded count */
+
+#endif /* end of include guard: FIBRIL_H */
diff --git a/benchmarks/fibril/CMakeLists.txt b/benchmarks/fibril/CMakeLists.txt
new file mode 100644
index 00000000..a2f98fd8
--- /dev/null
+++ b/benchmarks/fibril/CMakeLists.txt
@@ -0,0 +1,41 @@
+
+add_definitions(-DFIBRIL_FIBRIL)
+# The library directory can be supplied through the FIBRIL_LIBRARY_DIR environment variable; the developer path is only a fallback.
+find_library(FIBRIL_LIB fibril HINTS ENV FIBRIL_LIBRARY_DIR PATHS /home/nicolas/uni/ma/fibril/build/lib)
+
+
+add_executable(cholesky_fibril ../cholesky.cpp)
+target_link_libraries(cholesky_fibril "${FIBRIL_LIB}")
+
+add_executable(fft_fibril ../fft.cpp)
+target_link_libraries(fft_fibril "${FIBRIL_LIB}")
+
+add_executable(fib_fibril ../fib.cpp)
+target_link_libraries(fib_fibril "${FIBRIL_LIB}")
+
+add_executable(heat_fibril ../heat.cpp)
+target_link_libraries(heat_fibril "${FIBRIL_LIB}")
+
+add_executable(integrate_fibril ../integrate.cpp)
+target_link_libraries(integrate_fibril "${FIBRIL_LIB}")
+
+add_executable(knapsack_fibril ../knapsack.cpp)
+target_link_libraries(knapsack_fibril "${FIBRIL_LIB}")
+
+add_executable(lu_fibril ../lu.cpp)
+target_link_libraries(lu_fibril "${FIBRIL_LIB}")
+
+add_executable(matmul_fibril ../matmul.cpp)
+target_link_libraries(matmul_fibril "${FIBRIL_LIB}")
+
+add_executable(nqueens_fibril ../nqueens.cpp)
+target_link_libraries(nqueens_fibril "${FIBRIL_LIB}")
+
+add_executable(quicksort_fibril ../quicksort.cpp)
+target_link_libraries(quicksort_fibril "${FIBRIL_LIB}")
+
+add_executable(rectmul_fibril ../rectmul.cpp)
+target_link_libraries(rectmul_fibril "${FIBRIL_LIB}")
+
+add_executable(strassen_fibril ../strassen.cpp)
+target_link_libraries(strassen_fibril "${FIBRIL_LIB}")
diff --git a/benchmarks/fibril/fibrile.h b/benchmarks/fibril/fibrile.h
new file mode 100644
index 00000000..8d24685f
--- /dev/null
+++ b/benchmarks/fibril/fibrile.h
@@ -0,0 +1,97 @@
+#ifndef FIBRILE_H
+#define FIBRILE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "fibrili.h"
+
+/** fibril: function attribute that compiles the annotated function with the "no-omit-frame-pointer" optimize option. */
+#define fibril __attribute__((optimize("no-omit-frame-pointer")))
+
+/** fibril_t: per-call-site fork/join state; the struct itself is declared in fibrili.h. */
+typedef struct _fibril_t fibril_t;
+
+/** fibril_init: clears lock and unmapped, sets count to -1 and records the current rbp/rsp values as stack.btm/stack.top. */
+__attribute__((always_inline)) extern inline
+void fibril_init(fibril_t * frptr)
+{
+  register void * rbp asm ("rbp"); /* variables bound to the named registers, read below */
+  register void * rsp asm ("rsp");
+
+  frptr->lock = 0;
+  frptr->unmapped = 0;
+  frptr->count = -1; /* fibril_join() skips the runtime call while count is still -1 */
+  frptr->stack.btm = rbp;
+  frptr->stack.top = rsp;
+}
+
+/** fibril_join: calls fibrili_join(), wrapped in fibrili_membar(), only when count is greater than -1. */
+__attribute__((always_inline)) extern inline
+void fibril_join(fibril_t * frptr)
+{
+  if (frptr->count > -1) {
+    fibrili_membar(fibrili_join(frptr));
+  }
+}
+
+#include "fork.h"
+
+#ifdef __cplusplus
+
+/** fibril_fork_nrt (C++): fork without a return value. fp is the fibril_t pointer, fn the callee, ag the parenthesized argument list. */
+#define fibril_fork_nrt(fp, fn, ag) do { \
+  auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f) __attribute__((noinline, hot, optimize(3))) { /* parameters: one per argument, then the fibril_t pointer */ \
+    fibrili_push(f); \
+    fn(_fibril_args ag); \
+    if (!fibrili_pop()) fibrili_resume(f); /* fibrili_resume() is called only when fibrili_pop() returns 0 */ \
+  }; \
+  fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \
+} while (0)
+
+/** _fibril_fork_wrt.
*/
+#define fibril_fork_wrt(fp, rtp, fn, ag) do { \
+  auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f, __typeof__(rtp) p) __attribute__((noinline, hot, optimize(3))) { /* p receives rtp, the address for the result */ \
+    fibrili_push(f); \
+    *p = fn(_fibril_args ag); \
+    if (!fibrili_pop()) fibrili_resume(f); \
+  }; \
+  fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp, rtp)); \
+} while (0)
+
+#else
+
+/** fibril_fork_nrt (C): same steps as the C++ variant, using a function defined inside the do-block instead of a lambda. */
+#define fibril_fork_nrt(fp, fn, ag) do { \
+  __attribute__((noinline, hot, optimize(3))) \
+  void _fibril_##fn##_fork(_fibril_defs ag fibril_t * f) { \
+    fibrili_push(f); \
+    fn(_fibril_args ag); \
+    if (!fibrili_pop()) fibrili_resume(f); \
+  } \
+  fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \
+} while (0)
+
+/** fibril_fork_wrt (C): as fibril_fork_nrt, but the callee's result is stored through rtp. */
+#define fibril_fork_wrt(fp, rtp, fn, ag) do { \
+  __attribute__((noinline, hot, optimize(3))) \
+  void _fibril_##fn##_fork(_fibril_defs ag fibril_t * f, __typeof__(rtp) p) { \
+    fibrili_push(f); \
+    *p = fn(_fibril_args ag); \
+    if (!fibrili_pop()) fibrili_resume(f); \
+  } \
+  fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp, rtp)); \
+} while (0)
+
+#endif
+
+extern int fibril_rt_init(int nprocs);
+extern int fibril_rt_exit(void); /* (void) makes this a prototype when the header is compiled as C */
+extern int fibril_rt_nprocs(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* end of include guard: FIBRILE_H */
diff --git a/benchmarks/fibril/fibrili.h b/benchmarks/fibril/fibrili.h
new file mode 100644
index 00000000..11e94397
--- /dev/null
+++ b/benchmarks/fibril/fibrili.h
@@ -0,0 +1,90 @@
+#ifndef FIBRILI_H
+#define FIBRILI_H
+
+struct _fibril_t { /* fork/join state filled in by fibril_init() and fibrili_push() */
+  char lock;
+  char unmapped;
+  int count; /* fibril_init() sets -1; fibril_join() calls the runtime only when this is greater than -1 */
+  struct {
+    void * btm; /* rbp value captured by fibril_init() */
+    void * top; /* rsp value captured by fibril_init() */
+    void * ptr;
+  } stack;
+  void * pc; /* return address stored by fibrili_push() */
+};
+
+extern __thread struct _fibrili_deque_t { /* one deque per thread (__thread) */
+  char lock; /* taken with fibrili_lock() in fibrili_pop() */
+  int head;
+  int tail; /* index of the next free slot in buff */
+  void * stack;
+  void * buff[1000];
+} fibrili_deq;
+
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 7)) /* GCC 4.8 or newer; the minor number is only compared for major version 4 */
+
+#define fibrili_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST)
+#define fibrili_lock(l) do { /* spin: pause, then retry while the flag was already set */ \
+  __asm__ ( "pause" : : : "memory" ); \
+} while (__atomic_test_and_set(&(l), __ATOMIC_ACQUIRE))
+#define fibrili_unlock(l) __atomic_clear(&(l), __ATOMIC_RELEASE)
+
+#else
+#if defined(__x86_64__) || defined(_M_X64_)
+
+#define fibrili_fence() __sync_synchronize()
+#define fibrili_lock(l) do { /* same spin loop built on the __sync builtins */ \
+  __asm__ ( "pause" ::: "memory" ); \
+} while (__sync_lock_test_and_set(&(l), 1))
+#define fibrili_unlock(l) __sync_lock_release(&(l))
+
+#endif
+#endif
+
+__attribute__((noinline)) extern
+void fibrili_join(struct _fibril_t * frptr);
+__attribute__((noreturn)) extern
+void fibrili_resume(struct _fibril_t * frptr);
+
+#define fibrili_push(frptr) do { /* stores the return address in pc and appends frptr at buff[tail]. NOTE(review): tail is not checked against the 1000-entry buff - confirm fork nesting stays below that */ \
+  (frptr)->pc = __builtin_return_address(0); \
+  fibrili_deq.buff[fibrili_deq.tail++] = (frptr); \
+} while (0)
+
+__attribute__((hot)) static
+int fibrili_pop(void) /* returns 0 when the deque is empty or head has passed the decremented tail (head and tail are then reset to 0); returns 1 otherwise */
+{
+  int tail = fibrili_deq.tail;
+
+  if (tail == 0) return 0;
+
+  fibrili_deq.tail = --tail;
+
+  fibrili_fence(); /* fence between publishing the new tail and reading head */
+
+  if (fibrili_deq.head > tail) {
+    fibrili_deq.tail = tail + 1; /* undo the decrement, then re-check head while holding the lock */
+
+    fibrili_lock(fibrili_deq.lock);
+
+    if (fibrili_deq.head > tail) {
+      fibrili_deq.head = 0;
+      fibrili_deq.tail = 0;
+
+      fibrili_unlock(fibrili_deq.lock);
+      return 0;
+    }
+
+    fibrili_deq.tail = tail;
+    fibrili_unlock(fibrili_deq.lock);
+  }
+
+  return 1;
+}
+
+#define fibrili_membar(call) do { /* runs call, then an asm nop that lists rbx, r12-r15 and memory as clobbered */ \
+  call; \
+  __asm__ ( "nop" : : : "rbx", "r12", "r13", "r14", "r15", "memory" ); \
+} while (0)
+
+#endif /* end of include guard: FIBRILI_H */
diff --git a/benchmarks/fibril/fork.h b/benchmarks/fibril/fork.h
new file mode 100644
index 00000000..8ab080b0
--- /dev/null
+++ b/benchmarks/fibril/fork.h
@@ -0,0 +1,70 @@
+#ifndef FIBRIL_FORK_H
+#define FIBRIL_FORK_H
+
+#define _fibril_defs(...) /* parameter declarations "__typeof__(arg) aN," for each argument, numbered from the argument count down to 1 */ \
+  _fibril_defs_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_defs_(n, ...) \
+  _fibril_concat(_fibril_defs_, n)(__VA_ARGS__)
+#define _fibril_defs_16(a,...) __typeof__(a) a16,_fibril_defs_15(__VA_ARGS__)
+#define _fibril_defs_15(a,...) __typeof__(a) a15,_fibril_defs_14(__VA_ARGS__)
+#define _fibril_defs_14(a,...)
__typeof__(a) a14,_fibril_defs_13(__VA_ARGS__)
+#define _fibril_defs_13(a,...) __typeof__(a) a13,_fibril_defs_12(__VA_ARGS__)
+#define _fibril_defs_12(a,...) __typeof__(a) a12,_fibril_defs_11(__VA_ARGS__)
+#define _fibril_defs_11(a,...) __typeof__(a) a11,_fibril_defs_10(__VA_ARGS__)
+#define _fibril_defs_10(a,...) __typeof__(a) a10,_fibril_defs_9 (__VA_ARGS__)
+#define _fibril_defs_9(a, ...) __typeof__(a) a9, _fibril_defs_8 (__VA_ARGS__)
+#define _fibril_defs_8(a, ...) __typeof__(a) a8, _fibril_defs_7 (__VA_ARGS__)
+#define _fibril_defs_7(a, ...) __typeof__(a) a7, _fibril_defs_6 (__VA_ARGS__)
+#define _fibril_defs_6(a, ...) __typeof__(a) a6, _fibril_defs_5 (__VA_ARGS__)
+#define _fibril_defs_5(a, ...) __typeof__(a) a5, _fibril_defs_4 (__VA_ARGS__)
+#define _fibril_defs_4(a, ...) __typeof__(a) a4, _fibril_defs_3 (__VA_ARGS__)
+#define _fibril_defs_3(a, ...) __typeof__(a) a3, _fibril_defs_2 (__VA_ARGS__)
+#define _fibril_defs_2(a, ...) __typeof__(a) a2, _fibril_defs_1 (__VA_ARGS__)
+#define _fibril_defs_1(a) __typeof__(a) a1, /* the trailing comma stays: the fork macros append one more parameter after this list */
+#define _fibril_defs_0() /* no arguments: nothing is emitted */
+
+#define _fibril_args(...) /* the parameter names aN, ..., a1, in the order _fibril_defs declares them */ \
+  _fibril_args_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_args_(n, ...) /* selects _fibril_args_<n> by pasting the count onto the prefix */ \
+  _fibril_concat(_fibril_args_, n)(__VA_ARGS__)
+#define _fibril_args_16(a,...) a16,_fibril_args_15(__VA_ARGS__)
+#define _fibril_args_15(a,...) a15,_fibril_args_14(__VA_ARGS__)
+#define _fibril_args_14(a,...) a14,_fibril_args_13(__VA_ARGS__)
+#define _fibril_args_13(a,...) a13,_fibril_args_12(__VA_ARGS__)
+#define _fibril_args_12(a,...) a12,_fibril_args_11(__VA_ARGS__)
+#define _fibril_args_11(a,...) a11,_fibril_args_10(__VA_ARGS__)
+#define _fibril_args_10(a,...) a10,_fibril_args_9 (__VA_ARGS__)
+#define _fibril_args_9(a, ...) a9, _fibril_args_8 (__VA_ARGS__)
+#define _fibril_args_8(a, ...) a8, _fibril_args_7 (__VA_ARGS__)
+#define _fibril_args_7(a, ...) a7, _fibril_args_6 (__VA_ARGS__)
+#define _fibril_args_6(a, ...) a6, _fibril_args_5 (__VA_ARGS__)
+#define _fibril_args_5(a, ...)
a5, _fibril_args_4 (__VA_ARGS__)
+#define _fibril_args_4(a, ...) a4, _fibril_args_3 (__VA_ARGS__)
+#define _fibril_args_3(a, ...) a3, _fibril_args_2 (__VA_ARGS__)
+#define _fibril_args_2(a, ...) a2, _fibril_args_1 (__VA_ARGS__)
+#define _fibril_args_1(a) a1 /* last name: no trailing comma, the list ends the callee's argument list */
+#define _fibril_args_0() /* no arguments: nothing is emitted */
+
+#define _fibril_expand(...) /* the caller's argument expressions unchanged, followed by a comma when there is at least one */ \
+  _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_expand_(n, ...) /* selects _fibril_expand_<n> by pasting the count onto the prefix */ \
+  _fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
+#define _fibril_expand_16(...) __VA_ARGS__,
+#define _fibril_expand_15(...) __VA_ARGS__,
+#define _fibril_expand_14(...) __VA_ARGS__,
+#define _fibril_expand_13(...) __VA_ARGS__,
+#define _fibril_expand_12(...) __VA_ARGS__,
+#define _fibril_expand_11(...) __VA_ARGS__,
+#define _fibril_expand_10(...) __VA_ARGS__,
+#define _fibril_expand_9( ...) __VA_ARGS__,
+#define _fibril_expand_8( ...) __VA_ARGS__,
+#define _fibril_expand_7( ...) __VA_ARGS__,
+#define _fibril_expand_6( ...) __VA_ARGS__,
+#define _fibril_expand_5( ...) __VA_ARGS__,
+#define _fibril_expand_4( ...) __VA_ARGS__,
+#define _fibril_expand_3( ...) __VA_ARGS__,
+#define _fibril_expand_2( ...) __VA_ARGS__,
+#define _fibril_expand_1( ...)
__VA_ARGS__, +#define _fibril_expand_0() + +#endif /* end of include guard: FIBRIL_FORK_H */ diff --git a/benchmarks/fibril_lf/CMakeLists.txt b/benchmarks/fibril_lf/CMakeLists.txt new file mode 100644 index 00000000..891e7720 --- /dev/null +++ b/benchmarks/fibril_lf/CMakeLists.txt @@ -0,0 +1,41 @@ + +add_definitions(-DFIBRIL_FIBRIL_LF) + +find_library(FIBRIL_LF_LIB fibril /home/nicolas/uni/ma/fibril_wf/build/lib) + + +add_executable(cholesky_fibril_lf ../cholesky.cpp) +target_link_libraries(cholesky_fibril_lf "${FIBRIL_LF_LIB}") + +add_executable(fft_fibril_lf ../fft.cpp) +target_link_libraries(fft_fibril_lf "${FIBRIL_LF_LIB}") + +add_executable(fib_fibril_lf ../fib.cpp) +target_link_libraries(fib_fibril_lf "${FIBRIL_LF_LIB}") + +add_executable(heat_fibril_lf ../heat.cpp) +target_link_libraries(heat_fibril_lf "${FIBRIL_LF_LIB}") + +add_executable(integrate_fibril_lf ../integrate.cpp) +target_link_libraries(integrate_fibril_lf "${FIBRIL_LF_LIB}") + +add_executable(knapsack_fibril_lf ../knapsack.cpp) +target_link_libraries(knapsack_fibril_lf "${FIBRIL_LF_LIB}") + +add_executable(lu_fibril_lf ../lu.cpp) +target_link_libraries(lu_fibril_lf "${FIBRIL_LF_LIB}") + +add_executable(matmul_fibril_lf ../matmul.cpp) +target_link_libraries(matmul_fibril_lf "${FIBRIL_LF_LIB}") + +add_executable(nqueens_fibril_lf ../nqueens.cpp) +target_link_libraries(nqueens_fibril_lf "${FIBRIL_LF_LIB}") + +add_executable(quicksort_fibril_lf ../quicksort.cpp) +target_link_libraries(quicksort_fibril_lf "${FIBRIL_LF_LIB}") + +add_executable(rectmul_fibril_lf ../rectmul.cpp) +target_link_libraries(rectmul_fibril_lf "${FIBRIL_LF_LIB}") + +add_executable(strassen_fibril_lf ../strassen.cpp) +target_link_libraries(strassen_fibril_lf "${FIBRIL_LF_LIB}") diff --git a/benchmarks/fibril_lf/fork.h b/benchmarks/fibril_lf/fork.h new file mode 100644 index 00000000..8ab080b0 --- /dev/null +++ b/benchmarks/fibril_lf/fork.h @@ -0,0 +1,70 @@ +#ifndef FIBRIL_FORK_H +#define FIBRIL_FORK_H + +#define 
_fibril_defs(...) \ + _fibril_defs_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__) +#define _fibril_defs_(n, ...) \ + _fibril_concat(_fibril_defs_, n)(__VA_ARGS__) +#define _fibril_defs_16(a,...) __typeof__(a) a16,_fibril_defs_15(__VA_ARGS__) +#define _fibril_defs_15(a,...) __typeof__(a) a15,_fibril_defs_14(__VA_ARGS__) +#define _fibril_defs_14(a,...) __typeof__(a) a14,_fibril_defs_13(__VA_ARGS__) +#define _fibril_defs_13(a,...) __typeof__(a) a13,_fibril_defs_12(__VA_ARGS__) +#define _fibril_defs_12(a,...) __typeof__(a) a12,_fibril_defs_11(__VA_ARGS__) +#define _fibril_defs_11(a,...) __typeof__(a) a11,_fibril_defs_10(__VA_ARGS__) +#define _fibril_defs_10(a,...) __typeof__(a) a10,_fibril_defs_9 (__VA_ARGS__) +#define _fibril_defs_9(a, ...) __typeof__(a) a9, _fibril_defs_8 (__VA_ARGS__) +#define _fibril_defs_8(a, ...) __typeof__(a) a8, _fibril_defs_7 (__VA_ARGS__) +#define _fibril_defs_7(a, ...) __typeof__(a) a7, _fibril_defs_6 (__VA_ARGS__) +#define _fibril_defs_6(a, ...) __typeof__(a) a6, _fibril_defs_5 (__VA_ARGS__) +#define _fibril_defs_5(a, ...) __typeof__(a) a5, _fibril_defs_4 (__VA_ARGS__) +#define _fibril_defs_4(a, ...) __typeof__(a) a4, _fibril_defs_3 (__VA_ARGS__) +#define _fibril_defs_3(a, ...) __typeof__(a) a3, _fibril_defs_2 (__VA_ARGS__) +#define _fibril_defs_2(a, ...) __typeof__(a) a2, _fibril_defs_1 (__VA_ARGS__) +#define _fibril_defs_1(a) __typeof__(a) a1, +#define _fibril_defs_0() + +#define _fibril_args(...) \ + _fibril_args_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__) +#define _fibril_args_(n, ...) \ + _fibril_concat(_fibril_args_, n)(__VA_ARGS__) +#define _fibril_args_16(a,...) a16,_fibril_args_15(__VA_ARGS__) +#define _fibril_args_15(a,...) a15,_fibril_args_14(__VA_ARGS__) +#define _fibril_args_14(a,...) a14,_fibril_args_13(__VA_ARGS__) +#define _fibril_args_13(a,...) a13,_fibril_args_12(__VA_ARGS__) +#define _fibril_args_12(a,...) a12,_fibril_args_11(__VA_ARGS__) +#define _fibril_args_11(a,...) 
a11,_fibril_args_10(__VA_ARGS__) +#define _fibril_args_10(a,...) a10,_fibril_args_9 (__VA_ARGS__) +#define _fibril_args_9(a, ...) a9, _fibril_args_8 (__VA_ARGS__) +#define _fibril_args_8(a, ...) a8, _fibril_args_7 (__VA_ARGS__) +#define _fibril_args_7(a, ...) a7, _fibril_args_6 (__VA_ARGS__) +#define _fibril_args_6(a, ...) a6, _fibril_args_5 (__VA_ARGS__) +#define _fibril_args_5(a, ...) a5, _fibril_args_4 (__VA_ARGS__) +#define _fibril_args_4(a, ...) a4, _fibril_args_3 (__VA_ARGS__) +#define _fibril_args_3(a, ...) a3, _fibril_args_2 (__VA_ARGS__) +#define _fibril_args_2(a, ...) a2, _fibril_args_1 (__VA_ARGS__) +#define _fibril_args_1(a) a1 +#define _fibril_args_0() + +#define _fibril_expand(...) \ + _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__) +#define _fibril_expand_(n, ...) \ + _fibril_concat(_fibril_expand_, n)(__VA_ARGS__) +#define _fibril_expand_16(...) __VA_ARGS__, +#define _fibril_expand_15(...) __VA_ARGS__, +#define _fibril_expand_14(...) __VA_ARGS__, +#define _fibril_expand_13(...) __VA_ARGS__, +#define _fibril_expand_12(...) __VA_ARGS__, +#define _fibril_expand_11(...) __VA_ARGS__, +#define _fibril_expand_10(...) __VA_ARGS__, +#define _fibril_expand_9( ...) __VA_ARGS__, +#define _fibril_expand_8( ...) __VA_ARGS__, +#define _fibril_expand_7( ...) __VA_ARGS__, +#define _fibril_expand_6( ...) __VA_ARGS__, +#define _fibril_expand_5( ...) __VA_ARGS__, +#define _fibril_expand_4( ...) __VA_ARGS__, +#define _fibril_expand_3( ...) __VA_ARGS__, +#define _fibril_expand_2( ...) __VA_ARGS__, +#define _fibril_expand_1( ...) 
__VA_ARGS__, +#define _fibril_expand_0() + +#endif /* end of include guard: FIBRIL_FORK_H */ diff --git a/benchmarks/heat.cpp b/benchmarks/heat.cpp new file mode 100644 index 00000000..82da1ea4 --- /dev/null +++ b/benchmarks/heat.cpp @@ -0,0 +1,205 @@ +/* + * Heat diffusion (Jacobi-type iteration) + * + * Volker Strumpen, Boston August 1996 + * + * Copyright (c) 1996 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include "test.h" + +#define f(x,y) (sin(x)*sin(y)) +#define randa(x,t) (0.0) +#define randb(x,t) (exp(-2*(t))*sin(x)) +#define randc(y,t) (0.0) +#define randd(y,t) (exp(-2*(t))*sin(y)) +#define solu(x,y,t) (exp(-2*(t))*sin(x)*sin(y)) + +int n = 4096; + +int nx, ny, nt; /* grid rows, grid columns, number of time steps */ +double xu, xo, yu, yo, tu, to; + +double dx, dy, dt; +double dtdxsq, dtdysq; + +double ** odd; /* odd and even are the two nx-by-ny grids; each diffuse() pass reads one and writes the other */ +double ** even; + +fibril static void heat(double ** m, int il, int iu) /* fills rows [il, iu) of m with the t = 0 state; the range is halved recursively, the lower half is forked */ +{ + if (iu - il > 1) { + int im = (il + iu) / 2; + + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, heat, (m, il, im)); + heat(m, im, iu); + + fibril_join(&fr); + return; + } + + int i = il; /* leaf: exactly one row */ + int j; + double * row = m[i]; + + if (i == 0) { + for (j = 0; j < ny; ++j) { + row[j] = randc(yu + j * dy, 0); + } + } else if (i == nx - 1) { + for (j = 0; 
j < ny; ++j) { + row[j] = randd(yu + j * dy, 0); + } + } else { + row[0] = randa(xu + i * dx, 0); + for (j = 1; j < ny - 1; ++j) { + row[j] = f(xu + i * dx, yu + j * dy); + } + row[ny - 1] = randb(xu + i * dx, 0); + } +} + +fibril void diffuse(double ** out, double ** in, int il, int iu, double t) /* one time step: writes rows [il, iu) of out from in; first/last row and first/last column take the boundary values at time t */ +{ + if (iu - il > 1) { + int im = (il + iu) / 2; + + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, diffuse, (out, in, il, im, t)); + diffuse(out, in, im, iu, t); + + fibril_join(&fr); + return; + } + + int i = il; /* leaf: exactly one row */ + int j; + double * row = out[i]; + + if (i == 0) { + for (j = 0; j < ny; ++j) { + row[j] = randc(yu + j * dy, t); + } + } else if (i == nx - 1) { + for (j = 0; j < ny; ++j) { + row[j] = randd(yu + j * dy, t); + } + } else { + row[0] = randa(xu + i * dx, t); + for (j = 1; j < ny - 1; ++j) { + row[j] = in[i][j] /* interior point: previous value plus the scaled second differences in y and x */ + + dtdysq * (in[i][j + 1] - 2 * in[i][j] + in[i][j - 1]) + + dtdxsq * (in[i + 1][j] - 2 * in[i][j] + in[i - 1][j]); + } + row[ny - 1] = randb(xu + i * dx, t); + } +} + +void init() /* fixes the domain, grid spacing and step count, then allocates both grids (malloc results are not checked) */ +{ + nx = n; + ny = 1024; + nt = 100; + xu = 0.0; + xo = 1.570796326794896558; + yu = 0.0; + yo = 1.570796326794896558; + tu = 0.0; + to = 0.0000001; + + dx = (xo - xu) / (nx - 1); + dy = (yo - yu) / (ny - 1); + dt = (to - tu) / nt; + + dtdxsq = dt / (dx * dx); + dtdysq = dt / (dy * dy); + + even = (double**) malloc(sizeof(double *) * nx); + odd = (double**) malloc(sizeof(double *) * nx); + + int i; + for (i = 0; i < nx; ++i) { + even[i] = (double*) malloc(sizeof(double) * ny); + odd [i] = (double*) malloc(sizeof(double) * ny); + } +} + +void prep() /* writes the t = 0 state into even */ +{ + heat(even, 0, nx); +} + +void test() /* advances the grid by exactly nt time steps, alternating between the two buffers */ +{ + double t = tu; + int i; + + for (i = 1; i < nt; i += 2) { /* two steps per pass; "i < nt" leaves the last step of an odd nt to the branch below instead of overshooting by two */ + diffuse(odd, even, 0, nx, t += dt); + diffuse(even, odd, 0, nx, t += dt); + } + + if (nt % 2) { + diffuse(odd, even, 0, nx, t += dt); + } +} + +int verify() /* returns 1 after printing the first error measure above 1e-12 (max absolute, max relative, mean absolute, all against solu at t = to), else 0 */ +{ + double **mat; + double mae = 0.0; + double mre = 0.0; + double me = 0.0; + + mat = nt % 2 ? 
odd : even; /* the grid written by the last step */ + + int a, b; + + for (a = 0; a < nx; ++a) { + for (b = 0; b < ny; ++b) { + double tmp = fabs(mat[a][b] - solu(xu + a * dx, yu + b * dy, to)); + + me += tmp; + if (tmp > mae) mae = tmp; + if (mat[a][b] != 0.0) tmp = tmp / mat[a][b]; + if (tmp > mre) mre = tmp; + } + } + + me = me / (nx * ny); + + if (mae > 1e-12) { + printf("Local maximal absolute error %10e\n", mae); + return 1; + } if (mre > 1e-12) { + printf("Local maximal relative error %10e\n", mre); + return 1; + } if (me > 1e-12) { + printf("Global Mean absolute error %10e\n", me); + return 1; + } + + return 0; +} + diff --git a/benchmarks/integrate.cpp b/benchmarks/integrate.cpp new file mode 100644 index 00000000..3e888c72 --- /dev/null +++ b/benchmarks/integrate.cpp @@ -0,0 +1,79 @@ +#include <stdio.h> +#include "test.h" + +int n = 10000; + +static double m; +static const double epsilon = 1.0e-9; + +static double f(double x) /* integrand: x^3 + x */ +{ + return (x * x + 1.0) * x; +} + +static +double integrate_serial(double x1, double y1, double x2, double y2, double area) /* adaptive trapezoid rule on [x1, x2]; area is the caller's one-trapezoid estimate, and the interval is halved until the two-trapezoid estimate is within epsilon of it */ +{ + double half = (x2 - x1) / 2; + double x0 = x1 + half; + double y0 = f(x0); + + double area_x1x0 = (y1 + y0) / 2 * half; + double area_x0x2 = (y0 + y2) / 2 * half; + double area_x1x2 = area_x1x0 + area_x0x2; + + if (area_x1x2 - area < epsilon && area - area_x1x2 < epsilon) { + return area_x1x2; + } + + area_x1x0 = integrate_serial(x1, y1, x0, y0, area_x1x0); + area_x0x2 = integrate_serial(x0, y0, x2, y2, area_x0x2); + + return area_x1x0 + area_x0x2; +} + +static fibril +double integrate(double x1, double y1, double x2, double y2, double area) /* same recursion as integrate_serial, with the left half forked and joined before the two halves are summed */ +{ + double half = (x2 - x1) / 2; + double x0 = x1 + half; + double y0 = f(x0); + + double area_x1x0 = (y1 + y0) / 2 * half; + double area_x0x2 = (y0 + y2) / 2 * half; + double area_x1x2 = area_x1x0 + area_x0x2; + + if (area_x1x2 - area < epsilon && area - area_x1x2 < epsilon) { + return area_x1x2; + } + + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, &area_x1x0, integrate, (x1, y1, x0, y0, 
area_x1x0)); + area_x0x2 = integrate(x0, y0, x2, y2, area_x0x2); + + fibril_join(&fr); + return area_x1x0 + area_x0x2; +} + +void init() {} +void prep() {} + +void test() /* integrates f over [0, n] */ +{ + m = integrate(0, f(0), n, f(n), 0); +} + +int verify() /* recomputes the integral serially; returns 0 only if both results agree within epsilon */ +{ + double expect = integrate_serial(0, f(0), n, f(n), 0); + + if (m - expect < epsilon && expect - m < epsilon) { + return 0; + } + + printf("integrate(%d)=%lf (expected %lf)\n", n, m, expect); + return 1; +} + diff --git a/benchmarks/knapsack.cpp b/benchmarks/knapsack.cpp new file mode 100644 index 00000000..49bd5852 --- /dev/null +++ b/benchmarks/knapsack.cpp @@ -0,0 +1,165 @@ +/* + * Cilk program to solve the 0-1 knapsack problem using a branch-and-bound + * technique. + * + * Author: Matteo Frigo + */ +/* + * Copyright (c) 2000 Massachusetts Institute of Technology + * Copyright (c) 2000 Matteo Frigo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include "test.h" + +struct item { + int value; + int weight; +}; + +int n = 32; /* number of items considered; items[] below holds 32 entries */ +static int capacity = 900; +static int sol; + +static struct item items[] = { + { 15, 23 }, + { 22, 12 }, + { 17, 42 }, + { 1, 13 }, + { 32, 21 }, + { 65, 43 }, + { 23, 56 }, + { 4, 7 }, + { 4, 8 }, + { 32, 42 }, + { 51, 32 }, + { 22, 12 }, + { 17, 24 }, + { 12, 13 }, + { 23, 21 }, + { 56, 47 }, + { 23, 65 }, + { 6, 7 }, + { 4, 7 }, + { 32, 42 }, + { 22, 42 }, + { 59, 32 }, + { 23, 12 }, + { 12, 24 }, + { 12, 13 }, + { 23, 21 }, + { 39, 48 }, + { 22, 65 }, + { 6, 7 }, + { 4, 7 }, + { 33, 42 }, + { 18, 53 } +}; + +static int best_so_far = INT_MIN; + +static int compare(const void *pa, const void *pb) /* qsort comparator: the item with the higher value/weight ratio sorts first */ +{ + const struct item *a = (const struct item *) pa, *b = (const struct item *) pb; + double c = ((double) a->value / a->weight) - ((double) b->value / b->weight); + + if (c > 0) + return -1; + if (c < 0) + return 1; + return 0; +} + +/* + * return the optimal solution for n items (first is e) and + * capacity c. Value so far is v. + */ +fibril static int knapsack(struct item *e, int c, int n, int v) +{ + int with, without, best; + double ub; + + /* base case: full knapsack or no items */ + if (c < 0) + return INT_MIN; + + if (n == 0 || c == 0) + return v; /* feasible solution, with value v */ + + ub = (double) v + c * e->value / e->weight; /* upper bound: fill the remaining capacity at the first item's ratio (integer division) */ + + if (ub < best_so_far) { + /* prune ! */ + return INT_MIN; + } + + fibril_t fr; + fibril_init(&fr); + /* + * compute the best solution without the current item in the knapsack + */ + fibril_fork(&fr, &without, knapsack, (e + 1, c, n - 1, v)); + + /* compute the best solution with the current item in the knapsack */ + with = knapsack(e + 1, c - e->weight, n - 1, v + e->value); + + fibril_join(&fr); + + best = with > without ? 
with : without; + + /* + * notice the race condition here. The program is still + * correct, in the sense that the best solution so far + * is at least best_so_far. Moreover best_so_far gets updated + * when returning, so eventually it should get the right + * value. The program is highly non-deterministic. + */ + if (best > best_so_far) + best_so_far = best; + + return best; +} + +void init() +{ + /* sort the items on decreasing order of value/weight */ + qsort(items, n, sizeof(struct item), + compare); /* compare has qsort's comparator type, so no function-pointer cast is needed */ +} + +void prep() {} + +void test() +{ + sol = knapsack(items, capacity, n, 0); +} + +int verify() /* returns 0 when sol equals the hard-coded reference value, else prints both and returns 1 */ +{ + int expected = 733; + + if (sol != expected) { + printf("sol: %d (expected: %d)\n", sol, expected); + return 1; + } + + return 0; +} + diff --git a/benchmarks/lu.cpp b/benchmarks/lu.cpp new file mode 100644 index 00000000..e971e45f --- /dev/null +++ b/benchmarks/lu.cpp @@ -0,0 +1,458 @@ +/****************************************************************************\ + * LU decomposition + * Robert Blumofe + * + * Copyright (c) 1996, Robert Blumofe. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * +\****************************************************************************/ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "test.h" + +/* Define the size of a block (blocks are BLOCK_SIZE x BLOCK_SIZE doubles). */ +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 16 +#endif + +/* Define the default matrix size (16 blocks per side unless BENCHMARK is defined, then 4096). */ +#ifndef DEFAULT_SIZE +#ifndef BENCHMARK +#define DEFAULT_SIZE (16 * BLOCK_SIZE) +#else +#define DEFAULT_SIZE 4096 +#endif +#endif + +/* A block is a 2D array of doubles. */ +typedef double Block[BLOCK_SIZE][BLOCK_SIZE]; +#define BLOCK(B,I,J) (B[I][J]) + +/* A matrix is a 1D array of blocks; MATRIX(M,I,J) selects block (I,J) row-major, always using the file-scope nBlocks as the row stride. */ +typedef Block * Matrix; +#define MATRIX(M,I,J) ((M)[(I)*nBlocks+(J)]) + +/** Matrix size. */ +int n = DEFAULT_SIZE; + +/** The global matrix and a copy of the matrix. */ +static Matrix M, Msave; + +/* Matrix size in blocks. */ +static int nBlocks; + +/****************************************************************************\ + * Utility routines. + \****************************************************************************/ + +/* + * init_matrix - Fill in matrix M with random values: rand() is seeded with the fixed value 1, every element of the nb x nb blocks is set to rand() / RAND_MAX, then each diagonal element is multiplied by 10. + */ +static void init_matrix(Matrix M, int nb) +{ + int I, J, K, i, j, k; + + /* Initialize random number generator. */ + srand(1); + + /* For each element of each block, fill in random value. */ + for (I = 0; I < nb; I++) + for (J = 0; J < nb; J++) + for (i = 0; i < BLOCK_SIZE; i++) + for (j = 0; j < BLOCK_SIZE; j++) + BLOCK(MATRIX(M, I, J), i, j) = ((double)rand()) / (double)RAND_MAX; + + /* Inflate diagonal entries. */ + for (K = 0; K < nb; K++) + for (k = 0; k < BLOCK_SIZE; k++) + BLOCK(MATRIX(M, K, K), k, k) *= 10.0; +} + +/* + * print_matrix - Print matrix M. 
+ */ +static void print_matrix(Matrix M, int nb) +{ + int i, j; + (void) print_matrix; /* self-reference; presumably silences the unused-static-function warning */ + + /* Print out matrix: one text row per element row, each value as " %6.4f". */ + for (i = 0; i < nb * BLOCK_SIZE; i++) { + for (j = 0; j < nb * BLOCK_SIZE; j++) + printf(" %6.4f", + BLOCK(MATRIX(M, i / BLOCK_SIZE, j / BLOCK_SIZE), + i % BLOCK_SIZE, j % BLOCK_SIZE)); + printf("\n"); + } +} + +/* + * test_result - Check that matrix LU contains LU decomposition of M. Returns 1 when the largest |M - L*U| element exceeds 0.00001, else 0; L's diagonal is not read from LU and is taken as 1. + */ +static int test_result(Matrix LU, Matrix M, int nb) +{ + int I, J, K, i, j, k; + double diff, max_diff; + double v; + (void) test_result; /* self-reference; presumably silences the unused-static-function warning when BENCHMARK is defined */ + + /* Initialize test. */ + max_diff = 0.0; + + /* Find maximum difference between any element of LU and M. */ + for (i = 0; i < nb * BLOCK_SIZE; i++) + for (j = 0; j < nb * BLOCK_SIZE; j++) { + I = i / BLOCK_SIZE; + J = j / BLOCK_SIZE; + v = 0.0; + for (k = 0; k < i && k <= j; k++) { /* strictly-lower part of row i of L times column j of U */ + K = k / BLOCK_SIZE; + v += BLOCK(MATRIX(LU, I, K), i % BLOCK_SIZE, + k % BLOCK_SIZE) * + BLOCK(MATRIX(LU, K, J), k % BLOCK_SIZE, + j % BLOCK_SIZE); + } + if (k == i && k <= j) { /* diagonal term of L: contributes 1 * U[i][j] */ + K = k / BLOCK_SIZE; + v += BLOCK(MATRIX(LU, K, J), k % BLOCK_SIZE, + j % BLOCK_SIZE); + } + diff = fabs(BLOCK(MATRIX(M, I, J), i % BLOCK_SIZE, + j % BLOCK_SIZE) - v); + if (diff > max_diff) + max_diff = diff; + } + + /* Check maximum difference against threshold. */ + return (max_diff > 0.00001); +} + +/****************************************************************************\ + * Element operations. + \****************************************************************************/ +/* + * elem_daxmy - Compute y' = y - ax where a is a double and x and y are + * vectors of doubles. n is the number of elements updated; n <= 0 updates nothing. + */ +static void elem_daxmy(double a, double *x, double *y, int n) +{ + for (n--; n >= 0; n--) y[n] -= a * x[n]; +} + +/****************************************************************************\ + * Block operations. + \****************************************************************************/ + +/* + * block_lu - Factor block B. 
+ */ +static void block_lu(Block B) +{ + int i, k; + + /* Factor block in place; no pivoting, B[k][k] is used as the divisor as-is. */ + for (k = 0; k < BLOCK_SIZE; k++) + for (i = k + 1; i < BLOCK_SIZE; i++) { + BLOCK(B, i, k) /= BLOCK(B, k, k); + elem_daxmy(BLOCK(B, i, k), &BLOCK(B, k, k + 1), + &BLOCK(B, i, k + 1), BLOCK_SIZE - k - 1); + } +} + +/* + * block_lower_solve - Perform forward substitution to solve for B' in + * LB' = B. B is overwritten with B'; only the strictly-lower part of L is read. + */ +static void block_lower_solve(Block B, Block L) +{ + int i, k; + + /* Perform forward substitution. */ + for (i = 1; i < BLOCK_SIZE; i++) + for (k = 0; k < i; k++) + elem_daxmy(BLOCK(L, i, k), &BLOCK(B, k, 0), + &BLOCK(B, i, 0), BLOCK_SIZE); +} + +/* + * block_upper_solve - Perform forward substitution to solve for B' in + * B'U = B. B is overwritten with B', row by row, left to right. + */ +static void block_upper_solve(Block B, Block U) +{ + int i, k; + + /* Perform forward substitution. */ + for (i = 0; i < BLOCK_SIZE; i++) + for (k = 0; k < BLOCK_SIZE; k++) { + BLOCK(B, i, k) /= BLOCK(U, k, k); + elem_daxmy(BLOCK(B, i, k), &BLOCK(U, k, k + 1), + &BLOCK(B, i, k + 1), BLOCK_SIZE - k - 1); /* the length is 0 for the last column, so nothing past the row is touched */ + } +} + +/* + * block_schur - Compute Schur complement B' = B - AC. + */ +static void block_schur(Block B, Block A, Block C) +{ + int i, k; + + /* Compute Schur complement. */ + for (i = 0; i < BLOCK_SIZE; i++) + for (k = 0; k < BLOCK_SIZE; k++) + elem_daxmy(BLOCK(A, i, k), &BLOCK(C, k, 0), + &BLOCK(B, i, 0), BLOCK_SIZE); +} + + +/****************************************************************************\ + * Divide-and-conquer matrix LU decomposition. + \****************************************************************************/ + +/** + * schur - Compute M' = M - VW. nb is the side length in blocks; NOTE(review): nb is halved with integer division at every level, so an odd nb > 1 leaves the last block row/column untouched - callers appear to rely on nb being a power of two. + */ +fibril static void schur(Matrix M, Matrix V, Matrix W, int nb) +{ + Matrix M00, M01, M10, M11; + Matrix V00, V01, V10, V11; + Matrix W00, W01, W10, W11; + int hnb; + + /* Check base case. */ + if (nb == 1) { + block_schur(*M, *V, *W); + return; + } + + /* Break matrices into 4 pieces. 
*/ + hnb = nb / 2; + M00 = &MATRIX(M, 0, 0); + M01 = &MATRIX(M, 0, hnb); + M10 = &MATRIX(M, hnb, 0); + M11 = &MATRIX(M, hnb, hnb); + V00 = &MATRIX(V, 0, 0); + V01 = &MATRIX(V, 0, hnb); + V10 = &MATRIX(V, hnb, 0); + V11 = &MATRIX(V, hnb, hnb); + W00 = &MATRIX(W, 0, 0); + W01 = &MATRIX(W, 0, hnb); + W10 = &MATRIX(W, hnb, 0); + W11 = &MATRIX(W, hnb, hnb); + + /* Form Schur complement with recursive calls: two rounds of four quadrant updates; every M quadrant is written once per round, and the join separates the rounds. */ + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, schur, (M00, V00, W00, hnb)); + fibril_fork(&fr, schur, (M01, V00, W01, hnb)); + fibril_fork(&fr, schur, (M10, V10, W00, hnb)); + schur(M11, V10, W01, hnb); + fibril_join(&fr); + + fibril_fork(&fr, schur, (M00, V01, W10, hnb)); + fibril_fork(&fr, schur, (M01, V01, W11, hnb)); + fibril_fork(&fr, schur, (M10, V11, W10, hnb)); + schur(M11, V11, W11, hnb); + fibril_join(&fr); + + return; +} + +/* + * lower_solve - Compute M' where LM' = M. M is overwritten with M'. + */ +fibril static void lower_solve(Matrix M, Matrix L, int nb); + +static void aux_lower_solve(Matrix Ma, Matrix Mb, Matrix L, int nb) /* solves one block column: Ma against L00, then Mb -= L10 * Ma, then Mb against L11 */ +{ + Matrix L00, L01, L10, L11; + (void) L01; /* L01 is computed below but never used */ + + /* Break L matrix into 4 pieces. */ + L00 = &MATRIX(L, 0, 0); + L01 = &MATRIX(L, 0, nb); + L10 = &MATRIX(L, nb, 0); + L11 = &MATRIX(L, nb, nb); + + /* Solve with recursive calls. */ + lower_solve(Ma, L00, nb); + schur(Mb, L10, Ma, nb); + lower_solve(Mb, L11, nb); +} + +fibril static void lower_solve(Matrix M, Matrix L, int nb) +{ + Matrix M00, M01, M10, M11; + int hnb; + + /* Check base case. */ + if (nb == 1) { + block_lower_solve(*M, *L); + return; + } + + /* Break matrices into 4 pieces. */ + hnb = nb / 2; + M00 = &MATRIX(M, 0, 0); + M01 = &MATRIX(M, 0, hnb); + M10 = &MATRIX(M, hnb, 0); + M11 = &MATRIX(M, hnb, hnb); + + /* Solve with recursive calls: the left block column (M00, M10) is forked, the right one (M01, M11) runs in the caller. */ + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, aux_lower_solve, (M00, M10, L, hnb)); + aux_lower_solve(M01, M11, L, hnb); + + fibril_join(&fr); + + return; +} + +/* + * upper_solve - Compute M' where M'U = M. 
+ */ +fibril static void upper_solve(Matrix M, Matrix U, int nb); + +static void aux_upper_solve(Matrix Ma, Matrix Mb, Matrix U, int nb) /* solves one block row: Ma against U00, then Mb -= Ma * U01, then Mb against U11 */ +{ + Matrix U00, U01, U10, U11; + (void) U10; /* U10 is computed below but never used */ + + /* Break U matrix into 4 pieces. */ + U00 = &MATRIX(U, 0, 0); + U01 = &MATRIX(U, 0, nb); + U10 = &MATRIX(U, nb, 0); + U11 = &MATRIX(U, nb, nb); + + /* Solve with recursive calls. */ + upper_solve(Ma, U00, nb); + schur(Mb, Ma, U01, nb); + upper_solve(Mb, U11, nb); + + return; +} + +fibril static void upper_solve(Matrix M, Matrix U, int nb) +{ + Matrix M00, M01, M10, M11; + int hnb; + + /* Check base case. */ + if (nb == 1) { + block_upper_solve(*M, *U); + return; + } + + /* Break matrices into 4 pieces. */ + hnb = nb / 2; + M00 = &MATRIX(M, 0, 0); + M01 = &MATRIX(M, 0, hnb); + M10 = &MATRIX(M, hnb, 0); + M11 = &MATRIX(M, hnb, hnb); + + /* Solve with recursive calls: the top block row (M00, M01) is forked, the bottom one (M10, M11) runs in the caller. */ + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, aux_upper_solve, (M00, M01, U, hnb)); + aux_upper_solve(M10, M11, U, hnb); + + fibril_join(&fr); + + return; +} + +/* + * lu - Perform LU decomposition of matrix M, in place; nb is the side length in blocks. + */ +fibril void lu(Matrix M, int nb) +{ + Matrix M00, M01, M10, M11; + int hnb; + + /* Check base case. */ + if (nb == 1) { + block_lu(*M); + return; + } + + /* Break matrix into 4 pieces. */ + hnb = nb / 2; + M00 = &MATRIX(M, 0, 0); + M01 = &MATRIX(M, 0, hnb); + M10 = &MATRIX(M, hnb, 0); + M11 = &MATRIX(M, hnb, hnb); + + /* Decompose upper left. */ + lu(M00, hnb); + + /* Solve for upper right and lower left. */ + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, lower_solve, (M01, M00, hnb)); + upper_solve(M10, M00, hnb); + + fibril_join(&fr); + + /* Compute Schur complement of lower right. */ + schur(M11, M10, M01, hnb); + + /* Decompose lower right. 
*/ + lu(M11, hnb); + + return; +} + +void init() /* allocates and fills M; unless BENCHMARK is defined, also keeps a pristine copy in Msave (malloc results are not checked) */ +{ + nBlocks = n / BLOCK_SIZE; + M = (Matrix) malloc((size_t) n * n * sizeof(double)); /* widen before multiplying: n * n in int overflows for n >= 46341 */ + init_matrix(M, nBlocks); + (void) Msave; +#ifndef BENCHMARK + Msave = (Matrix) malloc((size_t) n * n * sizeof(double)); + memcpy((void *) Msave, (void *) M, (size_t) n * n * sizeof(double)); +#endif + +} + +void prep() /* restores M from Msave unless BENCHMARK is defined */ +{ +#ifndef BENCHMARK + memcpy((void *) M, (void *) Msave, (size_t) n * n * sizeof(double)); +#endif +} + +void test() +{ + lu(M, nBlocks); +} + +int verify() /* returns test_result() of the factored M against Msave, or 0 when BENCHMARK is defined */ +{ +#ifndef BENCHMARK + return test_result(M, Msave, nBlocks); +#else + return 0; +#endif +} diff --git a/benchmarks/matmul.cpp b/benchmarks/matmul.cpp new file mode 100644 index 00000000..74275679 --- /dev/null +++ b/benchmarks/matmul.cpp @@ -0,0 +1,142 @@ +#include <stdio.h> +#include <stdlib.h> +#include "test.h" + +int n = 2048; + +static float * a; +static float * b; +static float ** c; + +fibril static void compute(float *, int, int, float *, int, int, + float **, int, int, int); + +static void compute00(float * a, int ai, int aj, float * b, int bi, int bj, + float ** c, int ci, int cj, int n) /* computeRC runs the two compute() calls that accumulate into quadrant (R, C) of c, one after the other; n is the quadrant side */ +{ + compute(a, ai, aj, b, bi, bj, c, ci, cj, n); + compute(a, ai, aj + n, b, bi + n, bj, c, ci, cj, n); +} + +static void compute01(float * a, int ai, int aj, float * b, int bi, int bj, + float ** c, int ci, int cj, int n) +{ + compute(a, ai, aj, b, bi, bj + n, c, ci, cj + n, n); + compute(a, ai, aj + n, b, bi + n, bj + n, c, ci, cj + n, n); +} + +static void compute10(float * a, int ai, int aj, float * b, int bi, int bj, + float ** c, int ci, int cj, int n) +{ + compute(a, ai + n, aj, b, bi, bj, c, ci + n, cj, n); + compute(a, ai + n, aj + n, b, bi + n, bj, c, ci + n, cj, n); +} + +static void compute11(float * a, int ai, int aj, float * b, int bi, int bj, + float ** c, int ci, int cj, int n) +{ + compute(a, ai + n, aj, b, bi, bj + n, c, ci + n, cj + n, n); + compute(a, ai + n, aj + n, b, bi + n, bj + n, c, ci + n, cj + n, n); +} + +static void multiply(float * a, int ai, int aj, float * b, int 
bi, int bj, + float ** c, int ci, int cj) /* 2x2 base case: adds the four partial products into c[ci..ci+1][cj..cj+1]. NOTE(review): a and b are indexed with ai + aj / bi + bj (row plus column, no row stride); verify() cannot detect this because init() fills both inputs with 1.0F - confirm the intended layout */ +{ + int a0 = ai; + int a1 = ai + 1; + + float s00 = 0.0F; + float s01 = 0.0F; + float s10 = 0.0F; + float s11 = 0.0F; + + int b0 = bi; + int b1 = bi + 1; + + s00 += a[a0 + aj] * b[b0 + bj]; + s10 += a[a1 + aj] * b[b0 + bj]; + s01 += a[a0 + aj] * b[b0 + bj + 1]; + s11 += a[a1 + aj] * b[b0 + bj + 1]; + + s00 += a[a0 + aj + 1] * b[b1 + bj]; + s10 += a[a1 + aj + 1] * b[b1 + bj]; + s01 += a[a0 + aj + 1] * b[b1 + bj + 1]; + s11 += a[a1 + aj + 1] * b[b1 + bj + 1]; + + c[ci] [cj] += s00; + c[ci] [cj + 1] += s01; + c[ci + 1][cj] += s10; + c[ci + 1][cj + 1] += s11; +} + +fibril static void compute(float * a, int ai, int aj, float * b, int bi, int bj, + float ** c, int ci, int cj, int n) /* recursive n x n multiply-accumulate into c; recursion only ends at n == 2, so n must be a power of two >= 2 (any other n never reaches the base case) */ +{ + if (n == 2) { + multiply(a, ai, aj, b, bi, bj, c, ci, cj); + } else { + int h = n / 2; + + fibril_t fr; + fibril_init(&fr); + + fibril_fork(&fr, compute00, (a, ai, aj, b, bi, bj, c, ci, cj, h)); /* one task per output quadrant, so no two tasks write the same element of c */ + fibril_fork(&fr, compute10, (a, ai, aj, b, bi, bj, c, ci, cj, h)); + fibril_fork(&fr, compute01, (a, ai, aj, b, bi, bj, c, ci, cj, h)); + compute11(a, ai, aj, b, bi, bj, c, ci, cj, h); + + fibril_join(&fr); + } +} + +void init() /* allocates a and b (n * n floats each, all set to 1.0F) and c as n separately allocated rows */ +{ + a = (float*) malloc(sizeof(float) * n * n); + b = (float*) malloc(sizeof(float) * n * n); + c = (float**) malloc(sizeof(float *) * n); + + int i, j; + (void) j; + for (i = 0; i < n; ++i) { + c[i] = (float*) malloc(sizeof(float) * n); + } + + for (i = 0; i < n * n; ++i) { + a[i] = 1.0F; + } + + for (i = 0; i < n * n; ++i) { + b[i] = 1.0F; + } +} + +void prep() /* zeroes c */ +{ + int i, j; + + for (i = 0; i < n; ++i) { + for (j = 0; j < n; ++j) { + c[i][j] = 0; + } + } +} + +void test() +{ + compute(a, 0, 0, b, 0, 0, c, 0, 0, n); +} + +int verify() { /* every element of c must equal n; returns 1 on the first mismatch, else 0 */ + int i, j; + + for (i = 0; i < n; ++i) { + for (j = 0; j < n; j++) { + if (c[i][j] != n) { + printf("c[%d][%d]=%f (expected %f)\n", i, j, c[i][j], (float) n); + return 1; + } + } + } + + return 0; +} diff --git a/benchmarks/nqueens.cpp b/benchmarks/nqueens.cpp new file mode 100644 index 
00000000..784c8702 --- /dev/null +++ b/benchmarks/nqueens.cpp @@ -0,0 +1,70 @@ +#include <stdio.h> +#include "test.h" + +int n = 14; /* board size; the fixed 16-entry buffers and reference table below support 1..16 only */ +int m; + +fibril static int nqueens(const int * a, int n, int d, int i) /* a[0..d-1] holds the column of the queen in each earlier row; returns the number of full solutions after placing a queen at column i of row d (0 if that square is attacked) */ +{ + //int aa[d + 1]; + int aa[16]; + int j; + + for (j = 0; j < d; ++j) { + aa[j] = a[j]; + + int diff = a[j] - i; + int dist = d - j; + + if (diff == 0 || dist == diff || dist + diff == 0) return 0; /* same column or same diagonal */ + } + + if (d >= 0) aa[d] = i; /* d is -1 only for the root call, which places nothing */ + if (++d == n) return 1; + + //int res[n]; + int res[16]; + a = aa; + + fibril_t fr; + fibril_init(&fr); + + for (i = 0; i < n; ++i) { + fibril_fork(&fr, &res[i], nqueens, (a, n, d, i)); + } + + fibril_join(&fr); + + int sum = 0; + + for (i = 0; i < n; ++i) { + sum += res[i]; + } + + return sum; +} + +void init() {} +void prep() {} + +void test() +{ + m = (n <= 16) ? nqueens(NULL, n, -1, 0) : -1; /* n > 16 would overrun aa[16] and res[16]; verify() reports it */ +} + +int verify() /* returns 0 when m matches the known solution count for n, else 1 */ +{ + static int res[16] = { + 1, 0, 0, 2, 10, 4, 40, 92, 352, 724, 2680, + 14200, 73712, 365596, 2279184, 14772512 + }; + + int failed; + if (n < 1 || n > 16) { printf("nqueens(%d): unsupported size (expected 1..16)\n", n); return 1; } /* keeps res[n - 1] in bounds */ + if ((failed = (m != res[n - 1]))) { + printf("nqueens(%d)=%d (expected %d)\n", n, m, res[n - 1]); + } + + return failed; +} + diff --git a/benchmarks/openmp/CMakeLists.txt b/benchmarks/openmp/CMakeLists.txt new file mode 100644 index 00000000..4e5b97f0 --- /dev/null +++ b/benchmarks/openmp/CMakeLists.txt @@ -0,0 +1,28 @@ + +add_definitions(-DFIBRIL_OPENMP) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + + +add_executable(cholesky_openmp ../cholesky.cpp) + +add_executable(fft_openmp ../fft.cpp) + +add_executable(fib_openmp ../fib.cpp) + +add_executable(heat_openmp ../heat.cpp) + +add_executable(integrate_openmp ../integrate.cpp) + +add_executable(knapsack_openmp ../knapsack.cpp) + +add_executable(lu_openmp ../lu.cpp) + +add_executable(matmul_openmp ../matmul.cpp) + +add_executable(nqueens_openmp ../nqueens.cpp) + +add_executable(quicksort_openmp ../quicksort.cpp) + +add_executable(rectmul_openmp ../rectmul.cpp) + +add_executable(strassen_openmp ../strassen.cpp) diff --git 
/*
 * OpenMP backend for the fibril fork/join macros: a fork is turned into an
 * untied task and a join into a taskwait.
 */
#define fibril
#define fibril_t __attribute__((unused)) int
#define fibril_init(fp)

/* Join point: issues `#pragma omp taskwait`; the handle itself is unused. */
__attribute__((always_inline))
inline static void fibril_join(__attribute__((unused)) fibril_t *f) {
#pragma omp taskwait
}

#if 0
__attribute__((always_inline))
inline static void _omp_fork(std::function<void(void)> f) {
#pragma omp task untied firstprivate(f)
  {
    f();
  }
}

#define fibril_fork_nrt(fp, fn, ag) _omp_fork([=]{ fn ag; })

#define fibril_fork_wrt(fp, rtp, fn, ag) do { \
  __typeof__(rtp) pt = rtp; \
  _omp_fork([=]{ *pt = fn ag; }); \
} while (0)

#else

/*
 * Strip the parentheses from an argument tuple such as `(a, b, c)`.
 * _fibril_nth and _fibril_concat are not defined in this header; they are
 * expected to be provided by the including file.
 */
#define _fibril_expand(...) \
  _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_expand_(n, ...) \
  _fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
#define _fibril_expand_16(...) __VA_ARGS__
#define _fibril_expand_15(...) __VA_ARGS__
#define _fibril_expand_14(...) __VA_ARGS__
#define _fibril_expand_13(...) __VA_ARGS__
#define _fibril_expand_12(...) __VA_ARGS__
#define _fibril_expand_11(...) __VA_ARGS__
#define _fibril_expand_10(...) __VA_ARGS__
#define _fibril_expand_9( ...) __VA_ARGS__
#define _fibril_expand_8( ...) __VA_ARGS__
#define _fibril_expand_7( ...) __VA_ARGS__
#define _fibril_expand_6( ...) __VA_ARGS__
#define _fibril_expand_5( ...) __VA_ARGS__
#define _fibril_expand_4( ...) __VA_ARGS__
#define _fibril_expand_3( ...) __VA_ARGS__
#define _fibril_expand_2( ...) __VA_ARGS__
#define _fibril_expand_1( ...) __VA_ARGS__
#define _fibril_expand_0()

/*
 * Fork `f(as...)` as a task, discarding the result.
 *
 * The callee and its arguments are bundled into one closure that the task
 * receives as firstprivate, i.e. as its own copy.  With `default(shared)` the
 * task would instead refer to this wrapper's parameters, which no longer
 * exist once the wrapper has returned and the task runs deferred.
 */
template<class F, class ...As>
__attribute__((always_inline))
inline static void _omp_fork0(F f, As... as) {
  auto call = [=]() mutable { f(as...); };
#pragma omp task untied firstprivate(call)
  {
    call();
  }
}

/*
 * Fork `*r = f(as...)` as a task.  `r` is copied into the task like the other
 * arguments; the object it points to must stay alive until the join.
 */
template<class F, class R, class ...As>
__attribute__((always_inline))
inline static void _omp_fork1(F f, R r, As... as) {
  auto call = [=]() mutable { *r = f(as...); };
#pragma omp task untied firstprivate(call)
  {
    call();
  }
}

#define fibril_fork_nrt(fp, fn, ag) _omp_fork0(fn, _fibril_expand ag)
#define fibril_fork_wrt(fp, rtp, fn, ag) _omp_fork1(fn, rtp, _fibril_expand ag)
#endif


/* Thread count chosen by _omp_init(); 0 until the runtime is initialised. */
static int NTHREADS;

/* Threads in use, or the hardware concurrency before initialisation. */
int fibril_rt_nprocs() { return (NTHREADS) ? NTHREADS : std::thread::hardware_concurrency(); }

/*
 * Start a parallel region with at most `n` threads and run `f` in it.
 * `n <= 0`, or `n` above the hardware concurrency, selects the hardware
 * concurrency.
 */
__attribute__((always_inline))
inline static void _omp_init(int n, std::function<void(void)> f) {
  /* hardware_concurrency() may report 0 when the count is not computable;
   * num_threads() needs a positive value, so fall back to n (or 1). */
  const int nprocs = static_cast<int>(std::thread::hardware_concurrency());
  if (nprocs < 1) {
    NTHREADS = (n > 0) ? n : 1;
  } else if (n > 0 && n < nprocs) {
    NTHREADS = n;
  } else {
    NTHREADS = nprocs;
  }
  /* The braces form the only section, so f() is executed once. */
#pragma omp parallel sections num_threads(NTHREADS) default(shared)
  {
    f();
  }
}

/* fibril_rt_init(n) ... fibril_rt_exit() wrap the code between them in the
 * lambda that _omp_init() runs. */
#define fibril_rt_init(n) _omp_init(n, [&]() {

#define fibril_rt_exit() })
= 0; i < (size_t) n; ++i) { + size *= 10; + } + + a = (int*) malloc(sizeof(int) * size); + b = (int*) malloc(sizeof(int) * size); + + for (i = 0; i < size; ++i) { + b[i] = rand(); + } +} + +void prep() +{ + size_t i; + for (i = 0; i < size; ++i) { + a[i] = b[i]; + } +} + +void test() +{ + quicksort(a, size); +} + diff --git a/benchmarks/rectmul.cpp b/benchmarks/rectmul.cpp new file mode 100644 index 00000000..4bf5b05b --- /dev/null +++ b/benchmarks/rectmul.cpp @@ -0,0 +1,365 @@ +/* + * Program to multiply two rectangualar matrizes A(n,m) * B(m,n), where + * (n < m) and (n mod 16 = 0) and (m mod n = 0). (Otherwise fill with 0s + * to fit the shape.) + * + * written by Harald Prokop (prokop@mit.edu) Fall 97. + */ +/* + * Copyright (c) 2003 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
#define BLOCK_EDGE 16
#define BLOCK_SIZE (BLOCK_EDGE * BLOCK_EDGE)

typedef double block[BLOCK_SIZE];

#ifndef BENCHMARK
int n = 512;
#else
int n = 4096;
#endif

static block * A, * B, * R;
static int x, y, z;

/*
 * R += A * B for one BLOCK_EDGE x BLOCK_EDGE block, stored row-major.
 *
 * The result is produced in 2x2 tiles.  Each of the four accumulators starts
 * from the current R value and receives its products in ascending k order.
 */
static void mult_add_block(block * A, block * B, block * R)
{
  double *lhs = (double *) A;
  double *rhs = (double *) B;
  double *out = (double *) R;

  for (int col = 0; col < BLOCK_EDGE; col += 2) {    /* 2 columns at a time */
    for (int row = 0; row < BLOCK_EDGE; row += 2) {  /* 2 rows at a time */
      const double *top = lhs + row * BLOCK_EDGE;    /* row `row` of A */
      const double *bot = top + BLOCK_EDGE;          /* row `row + 1` of A */
      double *dst = out + col + row * BLOCK_EDGE;

      double acc00 = dst[0];
      double acc01 = dst[1];
      double acc10 = dst[BLOCK_EDGE];
      double acc11 = dst[BLOCK_EDGE + 1];

      for (int k = 0; k < BLOCK_EDGE; ++k) {
        const double *bk = rhs + col + k * BLOCK_EDGE;  /* B[k][col..col+1] */
        acc00 += top[k] * bk[0];
        acc01 += top[k] * bk[1];
        acc10 += bot[k] * bk[0];
        acc11 += bot[k] * bk[1];
      }

      dst[0] = acc00;
      dst[1] = acc01;
      dst[BLOCK_EDGE] = acc10;
      dst[BLOCK_EDGE + 1] = acc11;
    }
  }
}


/*
 * R = A * B for one BLOCK_EDGE x BLOCK_EDGE block, stored row-major.
 *
 * Same tiling as mult_add_block(), except that the accumulators are seeded
 * with the k == 0 products and the previous contents of R are not read.
 */
static void multiply_block(block * A, block * B, block * R)
{
  double *lhs = (double *) A;
  double *rhs = (double *) B;
  double *out = (double *) R;

  for (int col = 0; col < BLOCK_EDGE; col += 2) {    /* 2 columns at a time */
    for (int row = 0; row < BLOCK_EDGE; row += 2) {  /* 2 rows at a time */
      const double *top = lhs + row * BLOCK_EDGE;
      const double *bot = top + BLOCK_EDGE;
      const double *b0 = rhs + col;                  /* B[0][col..col+1] */
      double *dst = out + col + row * BLOCK_EDGE;

      double acc00 = top[0] * b0[0];
      double acc01 = top[0] * b0[1];
      double acc10 = bot[0] * b0[0];
      double acc11 = bot[0] * b0[1];

      for (int k = 1; k < BLOCK_EDGE; ++k) {
        const double *bk = b0 + k * BLOCK_EDGE;
        acc00 += top[k] * bk[0];
        acc01 += top[k] * bk[1];
        acc10 += bot[k] * bk[0];
        acc11 += bot[k] * bk[1];
      }

      dst[0] = acc00;
      dst[1] = acc01;
      dst[BLOCK_EDGE] = acc10;
      dst[BLOCK_EDGE + 1] = acc11;
    }
  }
}
+ */ + int i; + for (i = 0; i < BLOCK_SIZE; i++) + if (((double *) R)[i] != v) + return 1; + + return 0; + } + + if (x>y) { + a = check_matrix(R, x / 2, y, o, v); + b = check_matrix(R + (x / 2) * o,(x + 1) / 2, y, o, v); + } else { + a = check_matrix(R, x, y / 2, o, v); + b = check_matrix(R + (y / 2), x, (y + 1) / 2, o, v); + } + + return a + b; +} + +/* Add matrix T into matrix R, where T and R are bl blocks in size + * + */ +fibril void add_matrix(block * T, long ot, block * R, long oR, long x, long y) +{ + if (x + y == 2) { + long i; + for (i = 0; i < BLOCK_SIZE; i += 4) { + ((double *) R)[i + 0] += ((double *) T)[i + 0]; + ((double *) R)[i + 1] += ((double *) T)[i + 1]; + ((double *) R)[i + 2] += ((double *) T)[i + 2]; + ((double *) R)[i + 3] += ((double *) T)[i + 3]; + } + return; + } + + fibril_t fr; + fibril_init(&fr); + + if (x > y) { + fibril_fork(&fr, add_matrix, (T, ot, R, oR, x/2, y)); + add_matrix(T+(x/2)*ot, ot, R+(x/2)*oR, oR, (x+1)/2, y); + } else { + fibril_fork(&fr, add_matrix, (T, ot, R, oR, x, y/2)); + add_matrix(T+(y/2), ot, R+(y/2), oR, x, (y+1)/2); + } + + fibril_join(&fr); +} + +void init_matrix(block * R, long x, long y, long o, double v) +{ + if (x + y ==2) { + int i; + for (i = 0; i < BLOCK_SIZE; i++) + ((double *) R)[i] = v; + return; + } + + if (x > y) { + init_matrix(R, x/2, y, o, v); + init_matrix(R+(x/2) * o, (x+1)/2, y, o, v); + } else { + init_matrix(R, x, y/2, o, v); + init_matrix(R+(y/2), x, (y+1)/2, o, v); + } +} + +fibril static void multiply_matrix(block * A, long oa, block * B, long ob, + long x, long y, long z, block * R, long oR, int add) +{ + if (x + y + z == 3) { + if (add) + return mult_add_block(A, B, R); + else + return multiply_block(A, B, R); + } + + fibril_t fr; + fibril_init(&fr); + + if (x >= y && x >= z) { + fibril_fork(&fr, multiply_matrix, (A, oa, B, ob, x/2, y, z, R, oR, add)); + multiply_matrix(A+(x/2)*oa, oa, B, ob, (x+1)/2, y, z, R+(x/2)*oR, oR, add); + fibril_join(&fr); + } else if (y > x && y > z) { + 
fibril_fork(&fr, multiply_matrix, + (A+(y/2), oa, B+(y/2)*ob, ob, x, (y+1)/2, z, R, oR, add)); + + block * tmp = (block*) malloc(x * z * sizeof(block)); + multiply_matrix(A, oa, B, ob, x, y/2, z, tmp, z, 0); + fibril_join(&fr); + + add_matrix(tmp, z, R, oR, x, z); + free(tmp); + } else { + fibril_fork(&fr, multiply_matrix, (A, oa, B, ob, x, y, z/2, R, oR, add)); + multiply_matrix(A, oa, B+(z/2), ob, x, y, (z+1)/2, R+(z/2), oR, add); + fibril_join(&fr); + } +} + +void init() { + x = n / BLOCK_EDGE; + y = n / BLOCK_EDGE; + z = n / BLOCK_EDGE; + + A = (block*) malloc(x * y * sizeof(block)); + B = (block*) malloc(y * z * sizeof(block)); + R = (block*) malloc(x * z * sizeof(block)); + + init_matrix(A, x, y, y, 1.0); + init_matrix(B, y, z, z, 1.0); +} + +void prep() { + init_matrix(R, x, z, z, 0.0); +} + +void test() { + multiply_matrix(A, y, B, z, x, y, z, R, z, 0); +} + +int verify() { +#ifndef BENCHMARK + if (check_matrix(R, x, z, z, y * 16)) { + printf("WRONG RESULT!\n"); + return 1; + }; +#endif + + return 0; +} diff --git a/benchmarks/serial/CMakeLists.txt b/benchmarks/serial/CMakeLists.txt new file mode 100644 index 00000000..88ca85c3 --- /dev/null +++ b/benchmarks/serial/CMakeLists.txt @@ -0,0 +1,27 @@ + +add_definitions(-DFIBRIL_SERIAL) + + +add_executable(cholesky_serial ../cholesky.cpp) + +add_executable(fft_serial ../fft.cpp) + +add_executable(fib_serial ../fib.cpp) + +add_executable(heat_serial ../heat.cpp) + +add_executable(integrate_serial ../integrate.cpp) + +add_executable(knapsack_serial ../knapsack.cpp) + +add_executable(lu_serial ../lu.cpp) + +add_executable(matmul_serial ../matmul.cpp) + +add_executable(nqueens_serial ../nqueens.cpp) + +add_executable(quicksort_serial ../quicksort.cpp) + +add_executable(rectmul_serial ../rectmul.cpp) + +add_executable(strassen_serial ../strassen.cpp) diff --git a/benchmarks/serial/serial.h b/benchmarks/serial/serial.h new file mode 100644 index 00000000..482577e4 --- /dev/null +++ b/benchmarks/serial/serial.h @@ -0,0 
#ifndef SERIAL_H
#define SERIAL_H

/*
 * Serial backend for the fibril fork/join macros: a fork is an ordinary call
 * made in place, and init/join/exit expand to nothing.
 */

#define fibril
#define fibril_t __attribute__((unused)) int
#define fibril_init(fp)
#define fibril_join(fp)

/* `ag` is the parenthesised argument list, e.g. (a, b). */
#define fibril_fork_nrt(fp, fn, ag) (fn ag)
/* `rtp` is parenthesised so pointer expressions such as `p + 1` store through
 * the intended address. */
#define fibril_fork_wrt(fp, rtp, fn, ag) (*(rtp) = fn ag)

#define fibril_rt_init(n) ((void) (n))
#define fibril_rt_exit()
/* Variadic so the macro can be invoked with or without an argument. */
#define fibril_rt_nprocs(...) (1)

#endif /* end of include guard: SERIAL_H */
#define SizeAtWhichDivideAndConquerIsMoreEfficient 64
#define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16
#define CacheBlockSizeInBytes 32

/* The real numbers we are using --- either double or float */
typedef double REAL;
typedef unsigned long PTR;

/* maximum tolerable relative error (for the checking routine) */
#define EPSILON (1.0E-6)

/*
 * Matrices are stored in row-major order; A points to the first element and
 * `an` is the number of elements between the starts of two consecutive rows.
 * ELEM yields the element A[i,j] given A, an, i and j.
 */
#define ELEM(A, an, i, j) (A[(i) * (an) + (j)])

#ifndef BENCHMARK
int n = 512;
#else
int n = 4096;
#endif

static REAL * A, * B, * C;

/*
 * Reference n-by-n product C = A * B, computed with the plain triple loop.
 * an, bn and cn are the row strides (in elements) of A, B and C.
 */
void matrixmul(int n, REAL * A, int an, REAL * B, int bn, REAL * C, int cn)
{
  for (int row = 0; row < n; ++row) {
    for (int col = 0; col < n; ++col) {
      REAL dot = 0.0;

      for (int inner = 0; inner < n; ++inner) {
        dot += A[row * an + inner] * B[inner * bn + col];
      }

      C[row * cn + col] = dot;
    }
  }
}
/*****************************************************************************
 **
 ** FastNaiveMatrixMultiply
 **
 ** For small to medium sized matrices A, B, and C of size
 ** MatrixSize * MatrixSize this function performs the operation
 ** C = A x B, producing eight adjacent elements of a C row per pass.
 **
 ** Note MatrixSize must be divisible by 8 (the column loop advances by 8
 ** and stores C[0..7] unconditionally).
 **
 ** INPUT:
 **    C = (*C WRITE) Address of top left element of matrix C.
 **    A = (*A IS READ ONLY) Address of top left element of matrix A.
 **    B = (*B IS READ ONLY) Address of top left element of matrix B.
 **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
 **    RowWidthA = Number of elements between the starts of two rows of A
 **    RowWidthB = Number of elements between the starts of two rows of B
 **    RowWidthC = Number of elements between the starts of two rows of C
 **
 ** OUTPUT:
 **    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
 **
 *****************************************************************************/
static void FastNaiveMatrixMultiply(
    REAL * C, REAL * A, REAL * B, unsigned MatrixSize,
    unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
{
  /* Assumes size of real is 8 bytes: "<< 3" turns element counts into bytes */
  PTR RowWidthBInBytes = RowWidthB << 3;
  PTR RowWidthAInBytes = RowWidthA << 3;
  PTR MatrixWidthInBytes = MatrixSize << 3;
  /* Gap between the end of one C row and the start of the next */
  PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3;
  unsigned Horizontal, Vertical;

  REAL *ARowStart = A;
  for (Vertical = 0; Vertical < MatrixSize; Vertical++) {
    for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) {
      REAL *BColumnStart = B + Horizontal;
      REAL FirstARowValue = *ARowStart++;

      /* Seed the eight sums with the first product instead of zero */
      REAL Sum0 = FirstARowValue * (*BColumnStart);
      REAL Sum1 = FirstARowValue * (*(BColumnStart+1));
      REAL Sum2 = FirstARowValue * (*(BColumnStart+2));
      REAL Sum3 = FirstARowValue * (*(BColumnStart+3));
      REAL Sum4 = FirstARowValue * (*(BColumnStart+4));
      REAL Sum5 = FirstARowValue * (*(BColumnStart+5));
      REAL Sum6 = FirstARowValue * (*(BColumnStart+6));
      REAL Sum7 = FirstARowValue * (*(BColumnStart+7));

      unsigned Products;
      for (Products = 1; Products < MatrixSize; Products++) {
        REAL ARowValue = *ARowStart++;
        /* Step down one row of B */
        BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes);

        Sum0 += ARowValue * (*BColumnStart);
        Sum1 += ARowValue * (*(BColumnStart+1));
        Sum2 += ARowValue * (*(BColumnStart+2));
        Sum3 += ARowValue * (*(BColumnStart+3));
        Sum4 += ARowValue * (*(BColumnStart+4));
        Sum5 += ARowValue * (*(BColumnStart+5));
        Sum6 += ARowValue * (*(BColumnStart+6));
        Sum7 += ARowValue * (*(BColumnStart+7));
      }
      /* Rewind to the start of the current A row for the next 8 columns */
      ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes);

      *(C) = Sum0;
      *(C+1) = Sum1;
      *(C+2) = Sum2;
      *(C+3) = Sum3;
      *(C+4) = Sum4;
      *(C+5) = Sum5;
      *(C+6) = Sum6;
      *(C+7) = Sum7;
      C+=8;
    }

    /* Advance A and C to their next rows */
    ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes );
    C = (REAL*) ( ((PTR) C) + RowIncrementC );
  }
}
/*****************************************************************************
 **
 ** FastAdditiveNaiveMatrixMultiply
 **
 ** For small to medium sized matrices A, B, and C of size
 ** MatrixSize * MatrixSize this function performs the operation
 ** C += A x B, updating eight adjacent elements of a C row per pass.
 **
 ** Note MatrixSize must be divisible by 8 (the column loop advances by 8
 ** and loads/stores C[0..7] unconditionally).
 **
 ** INPUT:
 **    C = (*C READ/WRITE) Address of top left element of matrix C.
 **    A = (*A IS READ ONLY) Address of top left element of matrix A.
 **    B = (*B IS READ ONLY) Address of top left element of matrix B.
 **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
 **    RowWidthA = Number of elements between the starts of two rows of A
 **    RowWidthB = Number of elements between the starts of two rows of B
 **    RowWidthC = Number of elements between the starts of two rows of C
 **
 ** OUTPUT:
 **    C = (*C READ/WRITE) Matrix C contains C + A x B.
 **
 *****************************************************************************/
static void FastAdditiveNaiveMatrixMultiply(
    REAL * C, REAL * A, REAL * B, unsigned MatrixSize,
    unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
{
  /* Assumes size of real is 8 bytes: "<< 3" turns element counts into bytes */
  PTR RowWidthBInBytes = RowWidthB << 3;
  PTR RowWidthAInBytes = RowWidthA << 3;
  PTR MatrixWidthInBytes = MatrixSize << 3;
  /* Gap between the end of one C row and the start of the next */
  PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3;
  unsigned Horizontal, Vertical;

  REAL *ARowStart = A;
  for (Vertical = 0; Vertical < MatrixSize; Vertical++) {
    for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) {
      REAL *BColumnStart = B + Horizontal;

      /* Start from the values already stored in C */
      REAL Sum0 = *C;
      REAL Sum1 = *(C+1);
      REAL Sum2 = *(C+2);
      REAL Sum3 = *(C+3);
      REAL Sum4 = *(C+4);
      REAL Sum5 = *(C+5);
      REAL Sum6 = *(C+6);
      REAL Sum7 = *(C+7);

      unsigned Products;
      for (Products = 0; Products < MatrixSize; Products++) {
        REAL ARowValue = *ARowStart++;

        Sum0 += ARowValue * (*BColumnStart);
        Sum1 += ARowValue * (*(BColumnStart+1));
        Sum2 += ARowValue * (*(BColumnStart+2));
        Sum3 += ARowValue * (*(BColumnStart+3));
        Sum4 += ARowValue * (*(BColumnStart+4));
        Sum5 += ARowValue * (*(BColumnStart+5));
        Sum6 += ARowValue * (*(BColumnStart+6));
        Sum7 += ARowValue * (*(BColumnStart+7));

        /* Step down one row of B */
        BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes);

      }
      /* Rewind to the start of the current A row for the next 8 columns */
      ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes);

      *(C) = Sum0;
      *(C+1) = Sum1;
      *(C+2) = Sum2;
      *(C+3) = Sum3;
      *(C+4) = Sum4;
      *(C+5) = Sum5;
      *(C+6) = Sum6;
      *(C+7) = Sum7;
      C+=8;
    }

    /* Advance A and C to their next rows */
    ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes );
    C = (REAL*) ( ((PTR) C) + RowIncrementC );
  }
}
** efficiently performs the operation + ** C = A x B (if AdditiveMode == 0) + ** C += A x B (if AdditiveMode != 0) + ** + ** Note MatrixSize must be divisible by 16. + ** + ** INPUT: + ** C = (*C READ/WRITE) Address of top left element of matrix C. + ** A = (*A IS READ ONLY) Address of top left element of matrix A. + ** B = (*B IS READ ONLY) Address of top left element of matrix B. + ** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) + ** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] + ** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] + ** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] + ** AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B + ** + ** OUTPUT: + ** C (+)= A x B. (+ if AdditiveMode != 0) + ** + *****************************************************************************/ +void MultiplyByDivideAndConquer( + REAL * C, REAL * A, REAL * B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, + int AdditiveMode) +{ +#define A00 A +#define B00 B +#define C00 C + + REAL *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11; + unsigned QuadrantSize = MatrixSize >> 1; + + /* partition the matrix */ + A01 = A00 + QuadrantSize; + A10 = A00 + RowWidthA * QuadrantSize; + A11 = A10 + QuadrantSize; + + B01 = B00 + QuadrantSize; + B10 = B00 + RowWidthB * QuadrantSize; + B11 = B10 + QuadrantSize; + + C01 = C00 + QuadrantSize; + C10 = C00 + RowWidthC * QuadrantSize; + C11 = C10 + QuadrantSize; + + if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) { + MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, AdditiveMode); + MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, AdditiveMode); + MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, AdditiveMode); + MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize, + 
RowWidthC, RowWidthA, RowWidthB, AdditiveMode); + MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, 1); + MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, 1); + MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, 1); + MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, 1); + } else { + if (AdditiveMode) { + FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + } else { + FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + } + + FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + } + + return; +} + + +/***************************************************************************** + ** + ** OptimizedStrassenMultiply + ** + ** For large matrices A, B, and C of size MatrixSize * MatrixSize this + ** function performs the operation C = A x B efficiently. 
+ ** + ** INPUT: + ** C = (*C WRITE) Address of top left element of matrix C. + ** A = (*A IS READ ONLY) Address of top left element of matrix A. + ** B = (*B IS READ ONLY) Address of top left element of matrix B. + ** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) + ** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] + ** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] + ** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] + ** OUTPUT: + ** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.) + ** + *****************************************************************************/ +fibril static void OptimizedStrassenMultiply( + REAL * C, REAL * A, REAL * B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * + QuadrantSize + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. 
These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /**A11, *B11, *C11,*/ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; +#define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= SizeAtWhichDivideAndConquerIsMoreEfficient) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, + RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ +#define A11 A +#define B11 B +#define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = (char*) malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; 
Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ +#define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) +#define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) +#define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + 
MatrixOffsetB += RowIncrementB; + } /* end column loop */ + + fibril_t fr; + fibril_init(&fr); + + /* M2 = A11 x B11 */ + fibril_fork(&fr, OptimizedStrassenMultiply, + (M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB)); + + /* M5 = S1 * S5 */ + fibril_fork(&fr, OptimizedStrassenMultiply, + (M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize)); + + /* Step 1 of T1 = S2 x S6 + M2 */ + fibril_fork(&fr, OptimizedStrassenMultiply, + (T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize)); + + /* Step 1 of T2 = T1 + S3 x S7 */ + fibril_fork(&fr, OptimizedStrassenMultiply, + (C22, S3, S7, QuadrantSize, RowWidthC, QuadrantSize, QuadrantSize)); + + /* Step 1 of C11 = M2 + A12 * B21 */ + fibril_fork(&fr, OptimizedStrassenMultiply, + (C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB)); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + fibril_fork(&fr, OptimizedStrassenMultiply, + (C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB)); + + /* Step 1 of C21 = T2 - A22 * S8 */ + OptimizedStrassenMultiply(C21, A22, S8, QuadrantSize, RowWidthC, + RowWidthA, QuadrantSize); + + fibril_join(&fr); + + for (Row = 0; Row < QuadrantSize; Row++) { + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += 
LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + + free(StartHeap); +} + +static void strassen(int n, REAL * A, int an, REAL * B, int bn, + REAL * C, int cn) { + OptimizedStrassenMultiply(C, A, B, n, cn, bn, an); +} + +/* + * Set an n by n matrix A to random values. The distance between + * rows is an + */ +void init_matrix(int n, REAL *A, int an) +{ + int i, j; + + for (i = 0; i < n; ++i) + for (j = 0; j < n; ++j) + ELEM(A, an, i, j) = ((double) rand()) / (double) RAND_MAX; +} + +/* + * Compare two matrices. Print an error message if they differ by + * more than EPSILON. 
+ */ +int compare_matrix(int n, REAL *A, int an, REAL *B, int bn) +{ + int i, j; + REAL c; + + for (i = 0; i < n; ++i) + for (j = 0; j < n; ++j) { + /* compute the relative error c */ + c = ELEM(A, an, i, j) - ELEM(B, bn, i, j); + if (c < 0.0) + c = -c; + + c = c / ELEM(A, an, i, j); + if (c > EPSILON) { + return 1; + } + } + + return 0; +} + +void init() { + A = (REAL*) malloc(n * n * sizeof(REAL)); + B = (REAL*) malloc(n * n * sizeof(REAL)); + C = (REAL*) malloc(n * n * sizeof(REAL)); + + init_matrix(n, A, n); + init_matrix(n, B, n); +} + +void prep() { +} + +void test() { + strassen(n, A, n, B, n, C, n); +} + +int verify() { + int fail = 0; + +#ifndef BENCHMARK + REAL * E = (REAL*) malloc(n * n * sizeof(REAL)); + matrixmul(n, A, n, B, n, E, n); + fail = compare_matrix(n, E, n, C, n); + if (fail > 0) printf("WRONG RESULT!\n"); +#endif + + return fail; +} diff --git a/benchmarks/tbb/CMakeLists.txt b/benchmarks/tbb/CMakeLists.txt new file mode 100644 index 00000000..2ad0b5b8 --- /dev/null +++ b/benchmarks/tbb/CMakeLists.txt @@ -0,0 +1,43 @@ + +add_definitions(-DFIBRIL_TBB) + +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ltbb") + +find_library(TBB_LIB tbb) + + +add_executable(cholesky_tbb ../cholesky.cpp) +target_link_libraries(cholesky_tbb "${TBB_LIB}") + +add_executable(fft_tbb ../fft.cpp) +target_link_libraries(fft_tbb "${TBB_LIB}") + +add_executable(fib_tbb ../fib.cpp) +target_link_libraries(fib_tbb "${TBB_LIB}") + +add_executable(heat_tbb ../heat.cpp) +target_link_libraries(heat_tbb "${TBB_LIB}") + +add_executable(integrate_tbb ../integrate.cpp) +target_link_libraries(integrate_tbb "${TBB_LIB}") + +add_executable(knapsack_tbb ../knapsack.cpp) +target_link_libraries(knapsack_tbb "${TBB_LIB}") + +add_executable(lu_tbb ../lu.cpp) +target_link_libraries(lu_tbb "${TBB_LIB}") + +add_executable(matmul_tbb ../matmul.cpp) +target_link_libraries(matmul_tbb "${TBB_LIB}") + +add_executable(nqueens_tbb ../nqueens.cpp) +target_link_libraries(nqueens_tbb "${TBB_LIB}") + 
/*
 * Sort the first n floats of a into ascending order, in place.
 *
 * Plain bubble sort: sweep the array swapping adjacent out-of-order
 * pairs and stop after the first sweep that swaps nothing.
 */
static void sort(float * a, int n)
{
  for (;;) {
    bool swapped = false;

    for (int hi = 1; hi < n; ++hi) {
      float * const lo = &a[hi - 1];

      if (!(a[hi] < *lo))
        continue;

      const float carried = a[hi];
      a[hi] = *lo;
      *lo = carried;
      swapped = true;
    }

    if (!swapped)
      return;
  }
}
a[i]; + a[i] = a[i - 1]; + a[i - 1] = t; + sorted = 0; + } + } + } +} + +size_t static inline time_elapsed(size_t val) +{ + struct timeval t; + gettimeofday(&t, NULL); + return t.tv_sec * 1000000 + t.tv_usec - val; +} + +static void bench(const char * name, int nprocs) +{ + const static int iter = 10; + float times[iter]; + + printf("===========================================\n"); + printf(" Benchmark: %s\n", strrchr(name, '/') + 1); + printf(" Input size: %d\n", n); + printf(" Number of iterations: %d\n", iter); + printf(" Number of processors: %d\n", nprocs); + + struct rusage ru; + getrusage(RUSAGE_SELF, &ru); + long rss = ru.ru_maxrss; + long flt = ru.ru_minflt; + + /* warm up */ + prep(); + test(); + + /* benchmark */ + int i; + for (i = 0; i < iter; ++i) { + prep(); + size_t usecs = time_elapsed(0); + test(); + usecs = time_elapsed(usecs); + times[i] = usecs / 1000000.0; + printf(" #%d execution time: %f s\n", i, times[i]); + } + + sort(times, iter); + + float p10 = times[1]; + float p90 = times[8]; + float med = times[5]; + + getrusage(RUSAGE_SELF, &ru); + rss = ru.ru_maxrss - rss; + flt = ru.ru_minflt - flt; + + printf(" Execution time summary:\n"); + printf(" Median: %f s\n", med); + printf(" 10th %%: %f s\n", p10); + printf(" 90th %%: %f s\n", p90); + printf(" Resources summary: \n"); + printf(" Max RSS: %ld (KB)\n", ru.ru_maxrss); + printf(" Runtime RSS: %ld (KB)\n", rss); + printf(" # of page faults: %ld\n", flt); +} + +#endif + +#include <stdlib.h> + +int main(int argc, const char * argv[]) +{ + if (argc > 1 && (argc = atoi(argv[1])) > 0) { + n = argc; + } + + init(); + int result; + + int nthreads = 0; + char *env = getenv("EMPER_BENCH_NPROCS"); + if (env) nthreads = atoi(env); + + fibril_rt_init(nthreads); + int nprocs = fibril_rt_nprocs(); + +#ifdef BENCHMARK + bench(argv[0], nprocs); +#else + (void) nprocs; + prep(); + test(); +#endif + + result = verify(); + fibril_rt_exit(); + +#ifdef BENCHMARK +#ifdef FIBRIL_STATS + printf(" Statistics 
summary:\n"); + printf(" # of steals: %s\n", getenv("FIBRIL_N_STEALS")); + printf(" # of suspensions: %s\n", getenv("FIBRIL_N_SUSPENSIONS")); + printf(" # of stacks used: %s\n", getenv("FIBRIL_N_STACKS")); + printf(" # of pages used: %s\n", getenv("FIBRIL_N_PAGES")); +#endif + printf("===========================================\n"); +#endif + + //return verify(); + return result; +} + +#endif /* end of include guard: TEST_H */ diff --git a/emper/CMakeLists.txt b/emper/CMakeLists.txt index e9a18674..62472c71 100644 --- a/emper/CMakeLists.txt +++ b/emper/CMakeLists.txt @@ -11,6 +11,7 @@ add_files(EMPER_SOURCE Debug.cpp) add_files(EMPER_SOURCE ContextManager.cpp) add_files(EMPER_SOURCE BinaryPrivateSemaphore.cpp) add_files(EMPER_SOURCE CountingPrivateSemaphore.cpp) +add_files(EMPER_SOURCE Fibril.cpp) add_files(EMPER_SOURCE Semaphore.cpp) add_files(EMPER_INCLUDE ".") diff --git a/emper/Context.hpp b/emper/Context.hpp index 582f5e05..fecb69ac 100644 --- a/emper/Context.hpp +++ b/emper/Context.hpp @@ -3,6 +3,7 @@ #include <cassert> #include <functional> #include <cstring> +#include <sys/mman.h> #include <valgrind/valgrind.h> @@ -10,6 +11,20 @@ #include "Debug.hpp" class Context; +class Dispatcher; +class Fiber; + +#define PAGE_SIZE (4 * 1024) +#ifdef EMPER_BENCH_STACK_SIZE +#define STACK_SIZE EMPER_BENCH_STACK_SIZE +#else +#define STACK_SIZE (0x10000) +#endif +#ifdef EMPER_FIBRIL_STATS +#include <atomic> +extern std::atomic<uint64_t> statsUnmapp; +#endif + extern "C" [[noreturn]] void switch_and_load_context(void** toTos); // *Not* marked as 'noreturn' because save_and_switch_context does @@ -20,10 +35,12 @@ extern "C" [[noreturn]] void switch_context(void** toTos); class ALIGN_TO_CACHE_LINE Context : Logger<LogSubsystem::C> { private: - static const unsigned int CONTEXT_SIZE = 0xffff; // 1024 * 1024 * 4; + static const unsigned int CONTEXT_SIZE = STACK_SIZE; // 0xffff; static thread_local Context* currentContext; + const Fiber* currentFiber; + void* const tos; // 
unsigned valgrindStackId; @@ -52,6 +69,21 @@ private: context->mainFunction(); } + friend Dispatcher; + + static void setCurrentFiber(const Fiber* fiber) { + assert(currentContext); + + currentContext->currentFiber = fiber; + } + + static const Fiber* getCurrentFiber() { + assert(currentContext); + + if (!currentContext) return nullptr; + return currentContext->currentFiber; + } + public: // cppcheck-suppress noExplicitConstructor selfInitialization Context(func_t mainFunction) @@ -106,13 +138,35 @@ public: return tos; } +#ifdef EMPER_MADVISE + inline void unmap(void *from) const { + const size_t PAGE_SIZE_MASK = 4 * 1024 - 1; + const uintptr_t start = ((uintptr_t) context + PAGE_SIZE_MASK) & ~PAGE_SIZE_MASK; + const uintptr_t end = (uintptr_t) from & ~PAGE_SIZE_MASK; + //if (madvise((void*) start, (end - start), MADV_DONTNEED)) { + if (madvise((void*) start, (end - start), MADV_FREE)) { + perror("madvise"); + // die()? + } +#ifdef EMPER_FIBRIL_STATS + statsUnmapp++; +#endif + } +#endif + /** * Start this context. 
	/**
	 * Start this context.
	 *
	 * Makes this context the calling thread's current context, then moves
	 * the stack pointer to one pointer-sized slot below `tos` and jumps to
	 * `kickoff`. Control never comes back to the caller.
	 */
	[[noreturn]] inline void start() {
		LOGD("starting");
		currentContext = this;
		// x86-64 only: load the new stack pointer, then jump (not call)
		// to kickoff, so no return address is pushed onto the new stack.
		asm(
			"mov %0, %%rsp\n\t"
			"jmp *%1\n\t"
			:: "r" ((void**) tos - 1), "r" (kickoff) : "memory"
		);
		// The jmp above never falls through.
		__builtin_unreachable();
		//switch_context(&savedStackpointer);
	}
// Per-thread pointer to the Continuation captured in start(); resume()
// jumps back through it.
thread_local static Continuation *cont;

/**
 * Entry point of a worker thread into the context machinery.
 *
 * Captures a continuation at the setJmp() call. Code below that call
 * runs once on the initial pass (val == 0, the value the setJmp helper
 * returns) and again every time resume(val) is called on this thread.
 * On each pass it
 *  - terminates the thread if the runtime is shutting down,
 *  - gives a pending Fibril the chance to resume (tryResumeFiber),
 *  - otherwise continues dispatching on the current context if there is
 *    one, or on a free context.
 * Context::start() does not return, so neither does this function.
 */
void ContextManager::start() {
	uintptr_t val;
	// The continuation object lives in this stack frame.
	// NOTE(review): this assumes the frame stays intact for as long as the
	// thread may call resume() — confirm against Context::start().
	Continuation c;
	cont = &c;

	val = cont->setJmp();
	if (Runtime::getRuntime()->isShuttingDown())
		pthread_exit(nullptr);

	// Only acts if a Fibril registered itself in Fibril::toResume.
	Fibril::tryResumeFiber(val);

	Context* currentContext = Context::getCurrentContext();
	if (currentContext) {
		/* use currentContext to execute Fibers */
		currentContext->start();
	}

	Context* freeContext = getFreeContext();
	freeContext->start();
}

/**
 * Jump back to the continuation captured by start() on this thread and
 * make its setJmp() yield `val`. start() must have run on this thread
 * before; `cont` is not checked for null.
 */
void ContextManager::resume(uintptr_t val) {
	cont->longJmp(val);
}
#pragma once

#include <cstdint>

/*
 * membar(call): execute `call`, then an asm statement that declares
 * rbx, r12-r15 and memory as clobbered. The compiler therefore cannot
 * keep values cached in those registers (or in memory it assumes
 * unchanged) across the call, which matters because control may
 * re-enter right after `call` through Continuation::execute()/longJmp().
 *
 * NOTE(review): the expansion ends in `while (0);`, i.e. it already
 * contains a semicolon. All uses visible alongside this header are
 * written as statements `membar(...);`, where the extra empty statement
 * is harmless, but the macro must not be used as the sole body of an
 * if/else branch.
 */
#define membar(call) do { \
		call; \
		asm ( "nop" ::: "rbx", "r12", "r13", "r14", "r15", "memory" ); \
	} while (0);


/**
 * A minimal, x86-64 specific saved execution point: frame pointer,
 * stack pointer and instruction pointer.
 *
 * The constructor samples rbp and rsp of the constructing function (it
 * is forced inline for that purpose); `ip` is filled in later, either
 * by setJmp() or directly by users that store a return address in it.
 */
class Continuation {
public:
	void *bp;
	void *sp;
	void *ip;

	/** Capture the current rbp/rsp; `ip` starts out as nullptr. */
	inline __attribute__((always_inline))
	Continuation() : ip(nullptr) {
		register void *rbp asm("rbp");
		register void *rsp asm("rsp");

		bp = rbp;
		sp = rsp;
	};

	/**
	 * Continue at `ip` with the saved frame pointer, but on the stack
	 * pointer `_sp` supplied by the caller instead of the saved `sp`.
	 * Does not return.
	 */
	inline __attribute__((always_inline, noreturn))
	void execute(const void* _sp) {
		asm (
			"mov %0, %%rsp\n\t"
			"mov %1, %%rbp\n\t"
			"jmp *%2\n\t"
			:: "r" (_sp), "r" (bp), "r" (ip) : "memory"
		);
		__builtin_unreachable();
	};

	/**
	 * Record the instruction following this call as `ip`.
	 *
	 * Returns 0 when called directly. When longJmp(ret) is used later,
	 * execution continues as if this call had returned `ret`.
	 */
	inline __attribute__((always_inline, returns_twice))
	uintptr_t setJmp() {
		// Must stay a real (non-inlined) call: its return address is the
		// resume point stored in ip.
		auto set_rip = [] (Continuation* c) __attribute__((noinline, hot, optimize(3))) {
			c->ip = __builtin_return_address(0);
			return 0;
		};

		uintptr_t res;
		membar(res = set_rip(this));
		return res;
	};

	/**
	 * Restore the saved rsp/rbp and jump to `ip`, with `ret` placed in
	 * rax so that it appears as the return value at the resume point.
	 * Does not return.
	 */
	inline __attribute__((always_inline, noreturn))
	void longJmp(uintptr_t ret) {
		asm (
			"mov %0, %%rsp\n\t"
			"mov %1, %%rbp\n\t"
			"jmp *%2\n\t"
			:: "r" (sp), "r" (bp), "r" (ip), "a" (ret) : "memory"
		);
		__builtin_unreachable();
	};

};
/**
 * A Fibril is the join point of a fork/join region, in the style of the
 * "fibril" benchmarks' fibril_t.
 *
 * fork() runs the child function directly on the caller's stack and
 * pushes the Fibril itself onto the runtime's work queue, so that what
 * can be stolen is the continuation after the fork. join() waits until
 * all children whose continuation was stolen have finished and then
 * continues behind the join.
 *
 * Two implementations are selected at compile time:
 *  - EMPER_LOCKED_FIBRIL: child counter protected by the lock `m`,
 *  - otherwise: lock-free, using an atomic counter.
 *
 * The code relies on x86-64 inline assembly (see Continuation).
 */
class Fibril : public Fiber {
#ifdef EMPER_LOCKED_FIBRIL

#ifndef EMPER_LOCKED_WS_QUEUE
#error "EMPER_LOCKED_FIBRIL only works in combination with EMPER_LOCKED_WS_QUEUE!"
#endif

public:
	// Protects activeChildrenCount. run() unlocks it without locking it
	// first. NOTE(review): presumably the stealing side acquires it
	// before calling run() — confirm in the locked work-stealing queue.
#ifdef EMPER_FIBRIL_SYNC
	FibrilLock m;
#else
	std::mutex m;
#endif
private:
	// Number of control flows that still have to arrive at the join.
	int activeChildrenCount = 0;

#else /* ! EMPER_LOCKED_FIBRIL */

private:
	// Decremented by the value passed to tryResume(); the flow that
	// brings it to exactly 0 resumes the continuation.
	std::atomic<uint32_t> activeChildrenCount = 0;
	// Starts at 0 and is decremented once per run(), i.e. it holds the
	// negated number of steals in unsigned wrap-around arithmetic.
	// Subtracting it from activeChildrenCount in join therefore adds the
	// number of steals.
	uint32_t reserveStealCount = 0;

#ifdef EMPER_MADVISE
	// Handshake between the stack owner (which unmaps stack pages) and
	// the last arriving flow; whoever sets it second resumes.
	std::atomic<bool> resumable = false;
#endif

#endif /* EMPER_LOCKED_FIBRIL */

public:
	// Resume point: captured at construction, `ip` is overwritten by
	// fork() and join().
	Continuation cont;

private:
	// Context that was current when the Fibril was constructed, i.e. the
	// stack the continuation lives on.
	Context *stack;

	// Per-thread hand-over slot: the Fibril that
	// ContextManager::start() should try to resume next.
	static thread_local Fibril *toResume;


#ifdef EMPER_LOCKED_FIBRIL
	/**
	 * Locked variant: count one arrival. If others are still
	 * outstanding, return (the caller continues with work stealing);
	 * otherwise switch back to the Fibril's stack and continue at
	 * `cont`. `val` is unused here.
	 */
	inline void tryResume(__attribute__((unused)) uint32_t val) {
		int c;

		m.lock();
		c = --activeChildrenCount;

		if (c > 0) {
			if (stack == Context::currentContext) {
#ifdef EMPER_MADVISE
				/* unmap unused stack pages */
				Context::currentContext->unmap(cont.sp);
#endif
				// We are leaving the Fibril's own stack; it stays reserved
				// for the eventual resume.
				Context::currentContext = nullptr;
			}
			m.unlock();
			/* random steal */
			return;
		} else {
			m.unlock();
			if (stack != Context::currentContext) {
				// Give back the context we were running on before moving to
				// the Fibril's stack.
				Runtime::getRuntime()->getContextManager().putFreeContext(Context::currentContext);
				// XXX has to check for hook?
				Context::currentContext = stack;
			}
			/* resume, no return */
			cont.execute(cont.sp);
		}
	}

#else /* ! EMPER_LOCKED_FIBRIL */

	/**
	 * Lock-free variant: subtract `val` from the counter. A finishing
	 * child passes 1 (see resume()); the joining flow passes
	 * reserveStealCount (see join_func()). The flow that brings the
	 * counter to 0 resumes the continuation, every other flow returns
	 * and continues with work stealing.
	 *
	 * NOTE(review): the counter update uses memory_order_relaxed (an
	 * acq_rel version is left commented out). Confirm that results
	 * written by children on other workers are guaranteed to be visible
	 * to the flow that resumes.
	 */
	inline void tryResume(uint32_t val) {
		uint32_t c;
		Context *s;

		s = stack;

		//c = activeChildrenCount.fetch_sub(1, std::memory_order_acq_rel) - 1;
		c = activeChildrenCount.fetch_sub(val, std::memory_order_relaxed) - val;

		// c is unsigned: any non-zero value, including a wrapped one,
		// means the join is not complete yet.
		if (c > 0) {
			if (s == Context::currentContext) {
#ifdef EMPER_MADVISE
				/* unmap unused stack pages */
				Context::currentContext->unmap(cont.sp);
				/* set resumable to 'true' */
				if (true == resumable.exchange(true, std::memory_order_acq_rel)) {
					/* last one joined, but could not resume
					 * because of us, so we can resume.
					 * resume, no return */
					cont.execute(cont.sp);
				}
#endif
				Context::currentContext = nullptr;
			}
			/* random steal */
			return;
		} else {
			if (stack != Context::currentContext) {
#ifdef EMPER_MADVISE
				/* get resumable, signal we tried to resume */
				if (false == resumable.exchange(true, std::memory_order_acq_rel)) {
					/* stack owner is unmapping pages, we don't
					 * wait, stack owner sees we were here.
					 * random steal */
					return;
				}
				/* we can proceed resume */
#endif
				Runtime::getRuntime()->getContextManager().putFreeContext(Context::currentContext);
				// XXX has to check for hook?
				Context::currentContext = stack;
			}
			/* resume, no return */
			cont.execute(cont.sp);
		}
	}
#endif /* ! EMPER_LOCKED_FIBRIL */

public:

	/**
	 * Forced inline so that `cont` captures rbp/rsp of the function that
	 * declares the Fibril. Remembers the current context as the
	 * Fibril's stack and marks the Fiber as FibrilType.
	 */
	inline __attribute__((always_inline))
	Fibril() : Fiber(nullptr, nullptr, nullptr), cont() {
		stack = Context::currentContext; // TODO check if this is correct
		type = FibrilType;
	}

	~Fibril() = default;


	/**
	 * Fiber entry point, reached when the Fibril was taken from a work
	 * queue (counted as a steal when EMPER_FIBRIL_STATS is set). Accounts
	 * for the steal and continues at the stored continuation on the
	 * current context's stack. Does not return.
	 */
	void run() override {
#ifdef EMPER_LOCKED_FIBRIL
		// First steal: two flows (child and continuation) now have to
		// arrive at the join; every further steal adds one more.
		if (!activeChildrenCount) activeChildrenCount = 2;
		else activeChildrenCount++;
		m.unlock();
#else
		reserveStealCount -= 1;
#endif

#ifdef EMPER_FIBRIL_STATS
		statsSteals++;
#endif

		/* Reserve 128 byte at the bottom. */
		/* FIXME clean up, make nice looking */
		// 16 pointer-sized slots below the top of stack (128 bytes with
		// 8-byte pointers).
		cont.execute((void**)Context::currentContext->getTos() - 16);
		/* This seems to be necessary, because the compiler will pop args off the stack.
		 * On the new stack, if we don't reserve space for this, it will lead to access
		 * outside the stack area.
		 */
	}

	/**
	 * Called by a child that finished after its continuation was taken
	 * from the queue: registers this Fibril for resumption and jumps
	 * back into ContextManager::start() with the value 1. Does not
	 * return.
	 */
	__attribute__((noreturn))
	inline void resume() {
		toResume = this;
		Runtime::getRuntime()->getContextManager().resume(1);
		__builtin_unreachable();
	}

	/**
	 * Called from ContextManager::start(): forwards `val` to the Fibril
	 * registered in toResume, if any.
	 */
	inline static void tryResumeFiber(uint32_t val) {
		if (toResume != nullptr) {
			toResume->tryResume(val);
			// XXX has to set toResume to nullptr???
		}
	}

private:
	/**
	 * Out-of-line part of join(): stores its own return address as the
	 * resume point, registers this Fibril and jumps back into
	 * ContextManager::start(). Must not be inlined, otherwise
	 * __builtin_return_address(0) would not denote the point right
	 * after the join.
	 */
	__attribute__((noinline, hot, optimize(3)))
	void join_func() {
		cont.ip = __builtin_return_address(0);
		toResume = this;
#ifdef EMPER_LOCKED_FIBRIL
		Runtime::getRuntime()->getContextManager().resume(1);
#else
		Runtime::getRuntime()->getContextManager().resume(reserveStealCount);
#endif
	}

public:
	/**
	 * Fork `fun(args...)` and store its result in `*ret`.
	 *
	 * The child runs immediately on the calling stack. Before that, the
	 * Fibril is pushed to the bottom of the work queue with `cont.ip`
	 * set to the return address of the helper, i.e. to the code after
	 * the fork. If the Fibril is gone from the queue when the child
	 * returns, the continuation was taken by someone else and the child
	 * ends in resume() instead of returning.
	 */
	template<class RET, class... PARs, class... ARGs>
	inline __attribute__((always_inline))
	void fork(RET *ret, RET(*fun)(PARs...), ARGs ...args) {
		auto fork_func = [](ARGs ...args, Fibril *fr, RET *ret, RET(*fun)(PARs...)) __attribute__((noinline, hot, optimize(3))) {
			fr->cont.ip = __builtin_return_address(0);
			Runtime* runtime = Runtime::getRuntime();
			runtime->pushBottom(*fr);
			*ret = fun(args...);
			if (!runtime->popBottom()) { /* TODO laws Scheduler pushes to queues of other threads, handle that */
				fr->resume();
			}
		};
		membar(fork_func(args..., this, ret, fun));
	}

	/** Same as above for functions without a return value. */
	template<class... PARs, class... ARGs>
	inline __attribute__((always_inline))
	void fork(void(*fun)(PARs...), ARGs ...args) {
		auto fork_func = [](ARGs ...args, Fibril *fr, void(*fun)(PARs...)) __attribute__((noinline, hot, optimize(3))) {
			fr->cont.ip = __builtin_return_address(0);
			Runtime* runtime = Runtime::getRuntime();
			runtime->pushBottom(*fr);
			fun(args...);
			if (!runtime->popBottom()) { /* TODO laws Scheduler pushes to queues of other threads, handle that */
				fr->resume();
			}
		};
		membar(fork_func(args..., this, fun));
	}

//	template<class RET, class T, class... ARGs>
//	inline __attribute__((always_inline))
//	void fork(RET *ret, std::function<T> fun, ARGs ...args) {
//		auto fork_func = [](Fibril *fr, RET *ret, std::function<T> fun, ARGs ...args) __attribute__((noinline, hot, optimize(3))) {
//			fr->cont.ip = __builtin_return_address(0);
//			Runtime* runtime = Runtime::getRuntime();
//			runtime->pushBottom(*fr);
//			*ret = fun(args...);
//			if (!runtime->popBottom()) { /* TODO laws Scheduler pushes to queues of other threads, handle that */
//				fr->resume();
//			}
//		};
//		membar(fork_func(this, ret, fun, args...));
//	}

	/** Same as above for a std::function without a stored result. */
	template<class T, class... ARGs>
	inline __attribute__((always_inline))
	void fork(std::function<T> fun, ARGs ...args) {
		auto fork_func = [](Fibril *fr, std::function<T> fun, ARGs ...args) __attribute__((noinline, hot, optimize(3))) {
			fr->cont.ip = __builtin_return_address(0);
			Runtime* runtime = Runtime::getRuntime();
			runtime->pushBottom(*fr);
			fun(args...);
			if (!runtime->popBottom()) { /* TODO laws Scheduler pushes to queues of other threads, handle that */
				fr->resume();
			}
		};
		membar(fork_func(this, fun, args...));
	}

#ifdef EMPER_LOCKED_FIBRIL
	/**
	 * Wait for all forked children. Returns immediately if no steal was
	 * recorded (the counter is only raised in run()).
	 */
	inline __attribute__((always_inline))
	void join() {
		if (activeChildrenCount == 0)
			return;

		membar(join_func());
	}

#else

	/**
	 * Wait for all forked children. Returns immediately if run() was
	 * never entered since the last join; otherwise suspends via
	 * join_func() and, once resumed, resets the per-join state so the
	 * Fibril can be reused.
	 */
	inline __attribute__((always_inline))
	void join() {
		if (reserveStealCount == 0) {
			return;
		}

		membar(join_func());

		reserveStealCount = 0;
#ifdef EMPER_MADVISE
		resumable.store(false, std::memory_order_relaxed);
#endif /* EMPER_MADVISE */
	}
#endif /* EMPER_LOCKED_FIBRIL */

};
WORKER_EXCLUSIVE_QUEUE_SIZE> workerExclusiveQueue; - static thread_local adt::WsClQueue<void*, WS_QUEUE_SIZE> queue; + //static thread_local adt::WsClQueue<void*, WS_QUEUE_SIZE> queue; static void* mallocMemory() { void* memory; @@ -35,6 +35,7 @@ public: if (memory) return memory; +#if 0 bool poped = queue.popTop(&memory); if (likely(poped)) return memory; @@ -50,6 +51,7 @@ public: poped = queues[victim]->popTop(&memory); if (poped) return memory; } +#endif *malloced = true; // If everything fails, allocate the memory. @@ -62,9 +64,11 @@ public: if (pushed) return; +#if 0 pushed = queue.pushBottom(memory); if (pushed) return; +#endif free(memory); } @@ -73,15 +77,17 @@ public: template<typename T, intptr_t WS_QUEUE_SIZE, size_t WORKER_EXCLUSIVE_QUEUE_SIZE> thread_local adt::BoundedBumpArray<void, WORKER_EXCLUSIVE_QUEUE_SIZE> MemoryManager<T, WS_QUEUE_SIZE, WORKER_EXCLUSIVE_QUEUE_SIZE>::workerExclusiveQueue; -template<typename T, intptr_t WS_QUEUE_SIZE, size_t WORKER_EXCLUSIVE_QUEUE_SIZE> -thread_local adt::WsClQueue<void*, WS_QUEUE_SIZE> MemoryManager<T, WS_QUEUE_SIZE, WORKER_EXCLUSIVE_QUEUE_SIZE>::queue; +//template<typename T, intptr_t WS_QUEUE_SIZE, size_t WORKER_EXCLUSIVE_QUEUE_SIZE> +//thread_local adt::WsClQueue<void*, WS_QUEUE_SIZE> MemoryManager<T, WS_QUEUE_SIZE, WORKER_EXCLUSIVE_QUEUE_SIZE>::queue; template<typename T, intptr_t WS_QUEUE_SIZE, size_t WORKER_EXCLUSIVE_QUEUE_SIZE> MemoryManager<T, WS_QUEUE_SIZE, WORKER_EXCLUSIVE_QUEUE_SIZE>::MemoryManager(Runtime& runtime) : workerCount(runtime.getWorkerCount()) { +#if 0 queues = new adt::WsClQueue<void*, WS_QUEUE_SIZE>*[workerCount]; auto newWorkerHook = [this]() { queues[Runtime::getWorkerId()] = &queue; }; runtime.addNewWorkerHook(newWorkerHook); +#endif } diff --git a/emper/Runtime.cpp b/emper/Runtime.cpp index 693f2596..2c812bfd 100644 --- a/emper/Runtime.cpp +++ b/emper/Runtime.cpp @@ -20,6 +20,11 @@ thread_local unsigned int Runtime::seed; thread_local workerid_t Runtime::workerId; RuntimeStrategy& 
Runtime::DEFAULT_STRATEGY = WsStrategy::INSTANCE; +#ifdef EMPER_FIBRIL_STATS +std::atomic<uint64_t> statsSteals = 0; +std::atomic<uint64_t> statsUnmapp = 0; +#endif + Runtime::Runtime(workerid_t workerCount, RuntimeStrategy& strategy) : workerCount(workerCount) , workerLatch(workerCount) , strategy(strategy) @@ -74,9 +79,11 @@ Runtime::Runtime(workerid_t workerCount, RuntimeStrategy& strategy) : workerCoun Runtime::~Runtime() { DBG("Runtime " << this << " is terminating"); + shutdown = true; + notifyAboutNewWork(); for (workerid_t i = 0; i < workerCount; ++i) { DBG("Runtime " << this << " is cancelling worker " << unsigned(i)); - errno = pthread_cancel(threads[i]); + errno = pthread_join(threads[i], nullptr); if (errno) { DIE_MSG_ERRNO("pthread_cancel() failed"); } @@ -85,6 +92,12 @@ Runtime::~Runtime() { std::lock_guard<std::mutex> lock(currentRuntimeMutex); currentRuntime = nullptr; } +#ifdef EMPER_FIBRIL_STATS + printf(" Statistics summary:\n"); + printf(" # of steals: %lu K\n", (statsSteals.load() + 500) / 1000); + printf(" # of unmapps: %lu K\n", (statsUnmapp.load() + 500) / 1000); + printf("===========================================\n"); +#endif DBG("Runtime " << this << " terminated"); } @@ -145,16 +158,41 @@ void Runtime::executeAndWait(std::function<void()> f) { ABORT("Ca not use executeAndWait() from within the Runtime"); } - std::mutex fiberFinished; - fiberFinished.lock(); + pthread_mutex_t m; + pthread_cond_t c; + bool fiberFinished = false; + + if (int err = pthread_mutex_init(&m, NULL); err) { + errno = err; + DIE_MSG_ERRNO("pthread_mutex_init"); + } + if (int err = pthread_cond_init(&c, NULL); err) { + errno = err; + DIE_MSG_ERRNO("pthread_cond_init"); + } Fiber* fiber = Fiber::from([&] { f(); - fiberFinished.unlock(); + pthread_mutex_lock(&m); + fiberFinished = true; + pthread_cond_signal(&c); + pthread_mutex_unlock(&m); }); schedule(*fiber); - fiberFinished.lock(); + pthread_mutex_lock(&m); + while (!fiberFinished) { + pthread_cond_wait(&c, &m); 
+ } + pthread_mutex_unlock(&m); + if (int err = pthread_mutex_destroy(&m); err) { + errno = err; + DIE_MSG_ERRNO("pthread_mutex_destroy"); + } + if (int err = pthread_cond_destroy(&c); err) { + errno = err; + DIE_MSG_ERRNO("pthread_cond_destroy"); + } } diff --git a/emper/Runtime.hpp b/emper/Runtime.hpp index 69b6ff3b..0f6df3eb 100644 --- a/emper/Runtime.hpp +++ b/emper/Runtime.hpp @@ -13,6 +13,11 @@ class ContextManager; +#ifdef EMPER_FIBRIL_STATS +extern std::atomic<uint64_t> statsSteals; +extern std::atomic<uint64_t> statsUnmapp; +#endif + class Runtime : public Logger<LogSubsystem::RUNTI> { private: static std::mutex currentRuntimeMutex; @@ -44,6 +49,8 @@ private: static void printLastRuntimeStats(); + volatile bool shutdown = false; + protected: void addNewWorkerHook(std::function<void(void)> hook) { newWorkerHooks.push_back(hook); @@ -78,12 +85,24 @@ public: ~Runtime(); + inline bool isShuttingDown() { + return shutdown; + } + inline void schedule(Fiber& fiber) { scheduler.schedule(fiber); } Fiber* nextFiber(); + inline void pushBottom(Fiber& fiber) { + scheduler.pushBottom(fiber); + } + + inline Fiber* popBottom() { + return scheduler.popBottom(); + } + // https://stackoverflow.com/a/3747462/194894 static inline int rand() { seed = 214013 * seed + 2531011; diff --git a/emper/Scheduler.hpp b/emper/Scheduler.hpp index 043165b1..e3427ba6 100644 --- a/emper/Scheduler.hpp +++ b/emper/Scheduler.hpp @@ -31,4 +31,7 @@ public: virtual Fiber* nextFiber() = 0; + virtual void pushBottom(Fiber& fiber) = 0; + virtual Fiber* popBottom() = 0; + }; diff --git a/emper/SynchronizedFiber.hpp b/emper/SynchronizedFiber.hpp index cbdf6b88..785fdc33 100644 --- a/emper/SynchronizedFiber.hpp +++ b/emper/SynchronizedFiber.hpp @@ -31,7 +31,7 @@ private: explicit SynchronizedFiber(fiber_fun0_t function, PrivateSemaphore& semaphore) : SynchronizedFiber(function, nullptr, semaphore) { } - void run() const override { + void run() override { Fiber::run(); semaphore.signalAndExit(); } diff 
#pragma once

#include <atomic>
#include <cstdint>

namespace adt {

	/**
	 * Bounded multi-producer/multi-consumer container built from two
	 * intrusive singly linked lists that share one node array.
	 *
	 * Slot 0 is the head of the list of stored items, slot CAPACITY - 1 is
	 * the head of the list of unused nodes; the slots in between carry the
	 * payload, so at most CAPACITY - 2 items can be stored. Both lists
	 * are only modified at their head with compare-and-exchange, which
	 * means get() returns the most recently put() item first.
	 *
	 * A head word holds the index of the first node in its low bits and a
	 * counter in the remaining bits. The counter advances on every
	 * successful head update, so a compare-and-exchange with a stale head
	 * value fails even if the same node is at the front again.
	 *
	 * @tparam T        item type; must be default constructible and copy
	 *                  assignable
	 * @tparam CAPACITY number of slots including the two head slots; must
	 *                  be a power of two
	 */
	template<typename T, const uintptr_t CAPACITY>
	class BoundedMpmcQueue {
	private:
		static_assert(CAPACITY >= 2 && (CAPACITY & (CAPACITY - 1)) == 0,
				"CAPACITY must be a power of two and at least 2");

		struct alignas(128) Node {
			std::atomic<uint64_t> next;
			T value;
		};

		Node buf[CAPACITY];

		static constexpr uint64_t IndexMask = static_cast<uint64_t>(CAPACITY) - 1;
		static constexpr uint64_t QueueHead = 0;
		static constexpr uint64_t FreeListHead = IndexMask;

		/** New head word: counter of `next` advanced by one step, index `value`. */
		static uint64_t updateNext(uint64_t next, uint64_t value) {
			return ((next + IndexMask + 1) & ~IndexMask) | value;
		}

		/** Index part of a head word. */
		static uint64_t getIdx(uint64_t value) {
			return value & IndexMask;
		}

		/**
		 * Unlink the first node of the list whose head is slot `listHead`.
		 * A list is empty when its head refers to its own head slot.
		 *
		 * @return false if the list is empty, otherwise true with the node
		 *         index stored in `*indexOut`
		 */
		bool detachFirst(uint64_t listHead, uint64_t *indexOut) {
			uint64_t head = buf[listHead].next.load(std::memory_order_acquire);
			uint64_t index;
			uint64_t next;

			do {
				index = getIdx(head);
				if (index == listHead)
					return false;
				next = updateNext(head, buf[index].next.load(std::memory_order_acquire));
				// on failure compare_exchange_weak reloads `head`
			} while (!buf[listHead].next.compare_exchange_weak(head, next,
					std::memory_order_acq_rel, std::memory_order_acquire));

			*indexOut = index;
			return true;
		}

		/** Link node `index` in front of the list whose head is slot `listHead`. */
		void attachFirst(uint64_t listHead, uint64_t index) {
			uint64_t head = buf[listHead].next.load(std::memory_order_acquire);
			uint64_t next;

			do {
				buf[index].next.store(getIdx(head), std::memory_order_relaxed);
				next = updateNext(head, index);
			} while (!buf[listHead].next.compare_exchange_weak(head, next,
					std::memory_order_acq_rel, std::memory_order_acquire));
		}

	public:

		/** Creates an empty container; all payload nodes start on the free list. */
		BoundedMpmcQueue() {
			// the item list is empty: its head refers to itself
			buf[QueueHead].next.store(QueueHead, std::memory_order_relaxed);

			// chain the payload nodes 1 -> 2 -> ... -> FreeListHead
			for (uint64_t i = 1; i < FreeListHead; ++i) {
				buf[i].next.store(i + 1, std::memory_order_relaxed);
			}

			buf[FreeListHead].next.store(1, std::memory_order_release);
		}

		/**
		 * Store a copy of `item`.
		 *
		 * @return false if no unused node is available, true otherwise
		 */
		inline bool put(T item) {
			uint64_t index;

			if (!detachFirst(FreeListHead, &index))
				return false;

			// the node is exclusively ours until it is linked in again
			buf[index].value = item;
			attachFirst(QueueHead, index);

			return true;
		}

		/**
		 * Remove an item and copy it to `*itemPtr`.
		 *
		 * @return false if the container is empty (`*itemPtr` is left
		 *         untouched), true otherwise
		 */
		inline bool get(T *itemPtr) {
			uint64_t index;

			if (!detachFirst(QueueHead, &index))
				return false;

			*itemPtr = buf[index].value;
			attachFirst(FreeListHead, index);

			return true;
		}

	};

}
return false;
+
+		buff[tail++] = item;
+		//atomic_thread_fence(std::memory_order_seq_cst);
+		return true;
+	}
+
+
+	bool popBottom(T* itemPtr) { // NOTE(review): the fast path runs without the lock and head/tail are plain ints - confirm this is safe against a concurrent popTop()
+		int t = tail;
+
+		if (t == 0)
+			return false;
+
+		tail = --t; // tentatively take the bottom-most item
+		atomic_thread_fence(std::memory_order_seq_cst);
+		*itemPtr = buff[tail];
+
+		if (head > t) { // head has moved past the slot just taken: settle the outcome under the lock
+			tail = t + 1; // put tail back while waiting for the lock
+			lock.lock();
+
+			if (head > t) { // still overtaken: the item is gone, reset to the empty state
+				head = 0;
+				tail = 0;
+
+				lock.unlock();
+				return false;
+			}
+
+			tail = t; // head moved back (a popTop() backed out), so the item is taken here
+			lock.unlock();
+		}
+
+		return true;
+	}
+
+	bool popTop(T* itemPtr) {
+		if (head >= tail) // unlocked early-out for an apparently empty deque
+			return false;
+
+		lock.lock();
+
+		int h = head++; // claim the top item; backed out below if the deque turns out to be empty
+		atomic_thread_fence(std::memory_order_seq_cst);
+
+		if (h >= tail) {
+			head--;
+
+			lock.unlock();
+			return false;
+		}
+
+		*itemPtr = buff[h];
+#ifdef EMPER_LOCKED_FIBRIL
+		if constexpr (std::is_same<Fiber*, T>::value) {
+			if ((*itemPtr)->type == Fiber::Type::FibrilType) {
+				static_cast<Fibril*>(*itemPtr)->m.lock(); // taken while the deque lock is still held
+			}
+		}
+#endif
+
+		lock.unlock();
+		return true;
+	}
+
+};
+
diff --git a/emper/lib/adt/FibrilLock.hpp b/emper/lib/adt/FibrilLock.hpp
new file mode 100644
index 00000000..b28b7adf
--- /dev/null
+++ b/emper/lib/adt/FibrilLock.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <atomic>
+
+
+
+class FibrilLock {
+private:
+	std::atomic_flag value = ATOMIC_FLAG_INIT; // clear means unlocked
+
+public:
+
+	__attribute__((always_inline))
+	inline void lock() {
+		do { // spin until test_and_set() reports the flag was clear; one pause runs before the first attempt
+			asm( "pause" ::: "memory" ); // x86 "pause" spin-loop hint
+		} while (value.test_and_set(std::memory_order_acquire));
+	}
+
+	__attribute__((always_inline))
+	inline void unlock() {
+		value.clear(std::memory_order_release);
+	}
+
+};
diff --git a/emper/lib/adt/LockedQueue.hpp b/emper/lib/adt/LockedQueue.hpp
index fa24f875..c967b367 100644
--- a/emper/lib/adt/LockedQueue.hpp
+++ b/emper/lib/adt/LockedQueue.hpp
@@ -5,6 +5,10 @@
 #include <mutex>
 #include <deque>
+#include <type_traits>
+
+#include "Fibril.hpp"
+
 namespace adt {
 template<typename I, const uintptr_t SIZE>
@@ -37,6 +41,13 @@ namespace adt {
 if (deque.empty()) return false;
 *itemPtr = deque.front();
+#ifdef
EMPER_LOCKED_FIBRIL
+		if constexpr (std::is_same<Fiber*, I>::value) {
+			if ((*itemPtr)->type == Fiber::Type::FibrilType) {
+				static_cast<Fibril*>(*itemPtr)->m.lock(); // locked before the item is removed from the deque
+			}
+		}
+#endif
 deque.pop_front();
diff --git a/emper/lib/adt/WsClV3Queue.hpp b/emper/lib/adt/WsClV3Queue.hpp
new file mode 100644
index 00000000..bc14299a
--- /dev/null
+++ b/emper/lib/adt/WsClV3Queue.hpp
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <atomic>
+
+
+
+namespace adt {
+
+	template<typename T, const uintptr_t CAPACITY>
+	class WsClV3Queue {
+	protected:
+
+		alignas(64) std::atomic<uint64_t> top; // index of the next item popTop() takes
+		alignas(64) std::atomic<uint64_t> bottom; // index of the next free slot for pushBottom()
+
+		alignas(64) T queue[CAPACITY]; // ring buffer, indexed modulo CAPACITY
+
+
+	public:
+		WsClV3Queue() : top(1), bottom(1) { } // bottom == top means empty
+
+
+		bool pushBottom(const T item) {
+			uint64_t localTop, localBottom;
+
+			localBottom = bottom.load(std::memory_order_relaxed);
+			localTop = top.load(std::memory_order_acquire);
+			if ((localBottom - localTop) == CAPACITY) // ring buffer is full
+				return false;
+
+			queue[localBottom % CAPACITY] = item;
+			bottom.store(localBottom + 1, std::memory_order_release); // publish the item after it has been written
+
+			return true;
+		}
+
+
+		bool popBottom(T* itemPtr) {
+			bool ret;
+			uint64_t localTop, localBottom;
+
+			localBottom = bottom.fetch_sub(1, std::memory_order_acq_rel) - 1; // reserve the bottom-most slot
+			localTop = top.load(std::memory_order_acquire); // NOTE(review): published Chase-Lev variants place a seq_cst fence between these two accesses - confirm acq_rel/acquire is enough on non-x86 targets
+
+			*itemPtr = queue[localBottom % CAPACITY];
+
+			if (localBottom < localTop) { // queue was empty: move bottom back up to top
+				bottom.store(localTop, std::memory_order_relaxed);
+				return false;
+			} else if (localBottom > localTop) // the reserved slot is not the one popTop() would take
+				return true;
+
+			ret = top.compare_exchange_strong(localTop, localTop + 1, std::memory_order_release, std::memory_order_relaxed); // last item: compete with popTop() for it
+			bottom.store(localBottom + 1, std::memory_order_relaxed); // undo the reservation; won or lost, the last item is gone
+
+			return ret;
+		}
+
+
+		bool popTop(T* itemPtr) {
+			uint64_t localTop, localBottom;
+
+			localTop = top.load(std::memory_order_relaxed);
+again:
+			localBottom = bottom.load(std::memory_order_acquire);
+			if (localBottom <= localTop)
+				return false;
+
+			*itemPtr = queue[localTop % CAPACITY]; // copied before the CAS; only valid if the CAS below succeeds
+
+			if (!top.compare_exchange_weak(localTop, localTop + 1, std::memory_order_release, std::memory_order_acquire)) // a failed CAS reloads localTop with the current top
+				goto again;
+
+			return true;
+		}
+
+	};
+
+}
diff --git a/emper/lib/adt/WsClV4Queue.hpp b/emper/lib/adt/WsClV4Queue.hpp
new file mode 100644
index 00000000..9d58b2d5
--- /dev/null
+++ b/emper/lib/adt/WsClV4Queue.hpp
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <atomic>
+
+
+
+namespace adt {
+
+	template<typename T, const uintptr_t CAPACITY>
+	class WsClV4Queue {
+	protected:
+
+		alignas(64) std::atomic<uint64_t> top; // index of the next item popTop() takes
+		alignas(64) std::atomic<uint64_t> bottom; // index of the next free slot for pushBottom()
+		alignas(64) uint64_t top_private; // non-atomic copy of top, read and refreshed only in pushBottom()
+
+		alignas(64) T queue[CAPACITY]; // ring buffer, indexed modulo CAPACITY
+
+
+	public:
+		WsClV4Queue() : top(1), bottom(1), top_private(1) { } // bottom == top means empty
+
+
+		bool pushBottom(const T item) {
+			uint64_t localTop, localBottom;
+
+			localBottom = bottom.load(std::memory_order_relaxed);
+			localTop = top_private;
+			if ((localBottom - localTop) == CAPACITY) { // looks full according to the cached top
+				localTop = top.load(std::memory_order_acquire); // re-check against the shared top
+				if ((localBottom - localTop) == CAPACITY)
+					return false;
+				top_private = localTop;
+			}
+
+			queue[localBottom % CAPACITY] = item;
+			bottom.store(localBottom + 1, std::memory_order_release); // publish the item after it has been written
+
+			return true;
+		}
+
+
+		bool popBottom(T* itemPtr) {
+			bool ret;
+			uint64_t localTop, localBottom;
+
+			localBottom = bottom.fetch_sub(1, std::memory_order_acq_rel) - 1; // reserve the bottom-most slot
+			localTop = top.load(std::memory_order_acquire); // NOTE(review): published Chase-Lev variants place a seq_cst fence between these two accesses - confirm acq_rel/acquire is enough on non-x86 targets
+
+			*itemPtr = queue[localBottom % CAPACITY];
+
+			if (localBottom < localTop) { // queue was empty: move bottom back up to top
+				bottom.store(localTop, std::memory_order_relaxed);
+				return false;
+			} else if (localBottom > localTop) // the reserved slot is not the one popTop() would take
+				return true;
+
+			ret = top.compare_exchange_strong(localTop, localTop + 1, std::memory_order_release, std::memory_order_relaxed); // last item: compete with popTop() for it
+			bottom.store(localBottom + 1, std::memory_order_relaxed); // undo the reservation; won or lost, the last item is gone
+
+			return ret;
+		}
+
+
+		bool popTop(T* itemPtr) {
+			uint64_t localTop, localBottom;
+
+			localTop = top.load(std::memory_order_relaxed);
+again:
+			localBottom = bottom.load(std::memory_order_acquire);
+			if (localBottom <= localTop)
+				return false;
+
+			*itemPtr = queue[localTop %
CAPACITY];
+
+			if (!top.compare_exchange_weak(localTop, localTop + 1, std::memory_order_release, std::memory_order_acquire)) // a failed CAS reloads localTop with the current top
+				goto again;
+
+			return true;
+		}
+
+	};
+
+}
diff --git a/emper/strategies/laws/LawsDispatcher.cpp b/emper/strategies/laws/LawsDispatcher.cpp
index 5e354a35..b135e73a 100644
--- a/emper/strategies/laws/LawsDispatcher.cpp
+++ b/emper/strategies/laws/LawsDispatcher.cpp
@@ -2,11 +2,15 @@
 #include "Runtime.hpp"
 #include "LawsStrategy.hpp"
+#include "ContextManager.hpp"
 void LawsDispatcher::dispatchLoop() {
 	while (true) {
-		Fiber* const fiber = runtime.nextFiber();
+		Fiber* fiber = runtime.nextFiber();
 		if (!fiber) {
+			Runtime *runtime = Runtime::getRuntime(); // NOTE(review): this local pointer shadows the 'runtime' used for nextFiber() above - confirm intended
+			if (runtime->isShuttingDown())
+				runtime->getContextManager().resume(0);
 #ifdef EMPER_WORKER_SLEEP
 			putRuntimeWorkerToSleep();
 #else
diff --git a/emper/strategies/laws/LawsScheduler.cpp b/emper/strategies/laws/LawsScheduler.cpp
index 26d1684d..6a5c832e 100644
--- a/emper/strategies/laws/LawsScheduler.cpp
+++ b/emper/strategies/laws/LawsScheduler.cpp
@@ -26,6 +26,34 @@ LawsScheduler::LawsScheduler(Runtime& runtime, LawsStrategy& lawsStrategy) : Sch
 	addNewWorkerHook(newWorkerHook);
 }
+void LawsScheduler::pushBottom(Fiber& fiber) {
+	fiber.runnable = true;
+	bool pushed = queue.pushBottom(&fiber);
+	if (unlikely(!pushed)) {
+		// Work-stealing should not use an overflow queue
+		// (EMPER_OVERFLOW_QUEUE), because of the extra overhead
+		// required to check that queue for work, so we have to abort
+		// here.
+		ABORT("Could not push fiber " << &fiber << " into queue");
+	}
+}
+
+Fiber* LawsScheduler::popBottom() {
+	Fiber* fiber;
+
+	bool poped = queue.popBottom(&fiber);
+	if (unlikely(!poped)) {
+		// Nothing could be popped from the queue. Unlike a failed
+		// pushBottom() this is not treated as an error: the abort
+		// below is disabled and nullptr is handed back to the
+		// caller instead.
+		//ABORT("Could not pop fiber from queue");
+		fiber = nullptr; // a failed pop is reported to the caller as nullptr
+	}
+
+	return fiber;
+}
+
 void LawsScheduler::schedule(Fiber& fiber) {
 	LOGD("Scheduling fiber " << &fiber);
diff --git a/emper/strategies/laws/LawsScheduler.hpp b/emper/strategies/laws/LawsScheduler.hpp
index f3ce4269..b556568d 100644
--- a/emper/strategies/laws/LawsScheduler.hpp
+++ b/emper/strategies/laws/LawsScheduler.hpp
@@ -57,4 +57,7 @@ public:
 	Fiber* nextFiber() override;
+	void pushBottom(Fiber& fiber) override; // marks the fiber runnable and pushes it onto this scheduler's queue; aborts if the push fails
+	Fiber* popBottom() override; // pops from this scheduler's queue; nullptr if nothing could be popped
+
 };
diff --git a/emper/strategies/ws/WsDispatcher.cpp b/emper/strategies/ws/WsDispatcher.cpp
index 20338201..d6864c46 100644
--- a/emper/strategies/ws/WsDispatcher.cpp
+++ b/emper/strategies/ws/WsDispatcher.cpp
@@ -2,11 +2,15 @@
 #include "Runtime.hpp"
 #include "Debug.hpp"
+#include "ContextManager.hpp"
 void WsDispatcher::dispatchLoop() {
 	while (true) {
-		const Fiber* fiber = runtime.nextFiber();
+		Fiber* fiber = runtime.nextFiber();
 		if (!fiber) {
+			Runtime *runtime = Runtime::getRuntime(); // NOTE(review): this local pointer shadows the 'runtime' used for nextFiber() above - confirm intended
+			if (runtime->isShuttingDown())
+				runtime->getContextManager().resume(0);
 #ifdef EMPER_WORKER_SLEEP
 			putRuntimeWorkerToSleep();
 #else
diff --git a/emper/strategies/ws/WsScheduler.cpp b/emper/strategies/ws/WsScheduler.cpp
index 7766bf57..65718d11 100644
--- a/emper/strategies/ws/WsScheduler.cpp
+++ b/emper/strategies/ws/WsScheduler.cpp
@@ -16,6 +16,34 @@ WsScheduler::WsScheduler(Runtime& runtime, WsStrategy& wsStrategy) : Scheduler(r
 	addNewWorkerHook(newWorkerHook);
 }
+void WsScheduler::pushBottom(Fiber& fiber) {
+	bool pushed = queue.pushBottom(&fiber); // NOTE(review): unlike LawsScheduler::pushBottom(), fiber.runnable is not set here - confirm intended
+	if (unlikely(!pushed)) {
+		// Work-stealing should not use an overflow queue
+		// (EMPER_OVERFLOW_QUEUE), because of the extra overhead
+		// required to check that queue for work, so we have to abort
+		// here.
+		ABORT("Could not push fiber " << &fiber << " into queue");
+	}
+	//schedule(fiber);
+}
+
+Fiber* WsScheduler::popBottom() {
+	Fiber* fiber;
+
+	bool poped = queue.popBottom(&fiber);
+	if (unlikely(!poped)) {
+		// Nothing could be popped from the queue. Unlike a failed
+		// pushBottom() this is not treated as an error: the abort
+		// below is disabled and nullptr is handed back to the
+		// caller instead.
+		//ABORT("Could not pop fiber from queue");
+		fiber = nullptr;
+	}
+
+	return fiber;
+}
+
 void WsScheduler::schedule(Fiber& fiber) {
 	LOGD("Scheduling fiber " << &fiber);
@@ -62,6 +90,7 @@ Fiber* WsScheduler::nextFiber() {
 #ifdef EMPER_STATS
 	wsStrategy.nextFiberStolen.fetch_add(1, std::memory_order_relaxed);
 #endif
+	/* fibril->stack.ptr = victim->stack TODO */
 	return fiber;
 }
 }
diff --git a/emper/strategies/ws/WsScheduler.hpp b/emper/strategies/ws/WsScheduler.hpp
index 382caff2..30e32ee9 100644
--- a/emper/strategies/ws/WsScheduler.hpp
+++ b/emper/strategies/ws/WsScheduler.hpp
@@ -5,12 +5,18 @@
 #include "LockedQueue.hpp"
 #include "emper-common.h"
+#include "FibrilDeque.hpp"
+
 class WsStrategy;
 class WsScheduler: public Scheduler {
 	template <size_t SIZE>
 #ifdef EMPER_LOCKED_WS_QUEUE
+#ifdef EMPER_FIBRIL_SYNC
+	using WsQueue = FibrilDeque<Fiber*, SIZE>;
+#else
 	using WsQueue = adt::LockedQueue<Fiber*, SIZE>;
+#endif
 #else
 	using WsQueue = adt::WsClQueue<Fiber*, SIZE>;
 #endif
@@ -41,4 +47,7 @@ public:
 	Fiber* nextFiber() override;
+	void pushBottom(Fiber& fiber) override;
+	Fiber* popBottom() override;
+
 };
diff --git a/run_benchmarks.sh b/run_benchmarks.sh
new file mode 100755
index 00000000..f0d9ce6e
--- /dev/null
+++ b/run_benchmarks.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+
+
+
+benchmark_dir=build/benchmarks
+result_dir=results
+
+benchmarks=( "serial" "emper_fiber" "tbb" "fibril" "cilkplus" )
+benchmark_serial="serial"
+
+max_cores=96
+step_size=12
+
+
+
+
+run_target() { # $1 target, $2 cores
+	(
+		EMPER_BENCH_NPROCS=$2 "./$1" > tmp
+		while [[ $? -ne 0 ]] && [[ $(cat tmp | wc -l) -ne 24 ]]; do # retry while the run exited non-zero and did not print exactly 24 lines
+			EMPER_BENCH_NPROCS=$2 "./$1" > tmp
+		done
+		cat tmp
+	) 2> /dev/null
+}
+
+
+run_benchmarks() { # $1 target_dir, $2 output_parent_dir
+	for target in "$1"/*; do
+		target_name=${target##*/}
+		if [[ -f $target && -x $target ]]; then
+			output_dir="$2/$target_name"
+
+			[[ -d $output_dir ]] || mkdir -p "$output_dir"
+
+			run_target "$target" 1 > "$output_dir/001.txt"
+
+			if [[ $(basename "$1") != "$benchmark_serial" ]]; then # decide from the directory being run, not from the caller's loop variable
+				for cores in $(seq $step_size $step_size $max_cores); do
+					run_target "$target" "$cores" > "$output_dir/$(printf "%03d" "$cores").txt"
+				done
+			fi
+		fi
+	done
+}
+
+
+
+
+
+if [[ $# -ge 1 ]]; then
+	target_dir=$benchmark_dir/$1
+	output_parent_dir=$result_dir/$1
+	if [[ $# -eq 2 ]]; then
+		output_parent_dir=$result_dir/$2
+	fi
+
+	run_benchmarks "$target_dir" "$output_parent_dir"
+
+	exit 0
+fi
+
+
+
+cp Makefile Makefile.$$
+# Put the original Makefile back and drop the temporary files however the script ends.
+trap 'mv -f Makefile.$$ Makefile 2> /dev/null; rm -f -- *.$$' EXIT
+
+cat Makefile.$$ | sed -e "s/-DEMPER_CM_WITH_MEMORY_MANAGER=..*\>/-DEMPER_CM_WITH_MEMORY_MANAGER=OFF/" > tmp.$$
+mv tmp.$$ Makefile
+
+make clean &> /dev/null || exit 1
+make release &> /dev/null || exit 1
+
+for benchmark_name in "${benchmarks[@]}"; do
+	target_dir="$benchmark_dir/$benchmark_name"
+	output_parent_dir="$result_dir/$benchmark_name"
+
+	run_benchmarks "$target_dir" "$output_parent_dir"
+done
+
+
+
+target_dir=$benchmark_dir/emper_continuation
+output_parent_dir=$result_dir/emper_continuation
+
+cat Makefile.$$ | sed -e "s/-DEMPER_CM_WITH_MEMORY_MANAGER=..*\>/-DEMPER_CM_WITH_MEMORY_MANAGER=ON/" > tmp.$$
+
+for madv in "ON" "OFF" ; do
+	cat tmp.$$ | sed -e "s/-DEMPER_MADVISE=..*\>/-DEMPER_MADVISE=$madv/" > madv.$$
+
+	output_madv=$output_parent_dir
+	if [[ $madv == "ON" ]]; then
+		output_madv=${output_madv}_madv
+	fi
+
+	for lq in "ON" "OFF" ; do
+		cat madv.$$ | sed -e "s/-DEMPER_LOCKED_WS_QUEUE=..*\>/-DEMPER_LOCKED_WS_QUEUE=$lq/" > lq.$$
+
+		output_lq=${output_madv}
+		if [[ $lq == "ON" ]]; then
+			output_lq=${output_lq}_lq
+			for lf in "ON" "OFF" ; do
+				cat lq.$$ | sed -e "s/-DEMPER_LOCKED_FIBRIL=..*\>/-DEMPER_LOCKED_FIBRIL=$lf/" > lf.$$
+
+				output_lf=${output_lq}
+				if [[ $lf == "ON" ]]; then
+					output_lf=${output_lf}_lf
+				fi
+
+				mv lf.$$ Makefile
+				make clean &> /dev/null || exit 1
+				make release &> /dev/null || exit 1
+				run_benchmarks "$target_dir" "$output_lf"
+			done
+		else
+			mv lq.$$ Makefile
+			make clean &> /dev/null || exit 1
+			make release &> /dev/null || exit 1
+			run_benchmarks "$target_dir" "$output_lq"
+		fi
+	done
+done
+
+mv Makefile.$$ Makefile
+rm -f *.$$
+
+
+
+exit 0
diff --git a/test.sh b/test.sh
new file mode 100755
index 00000000..863a382d
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+
+
+num_tests=100
+
+
+benchmark_dir=build/benchmarks
+
+
+if [[ $# -eq 1 ]]; then
+	target_dir=$benchmark_dir/$1
+else
+	target_dir=$benchmark_dir/emper_continuation
+fi
+
+
+cd "$target_dir" || exit 1 # without this a missing directory would run every executable in the current directory
+
+
+for i in *; do
+	if [[ -f $i && -x $i ]]; then
+		passed=0
+		for j in $(seq $num_tests); do
+			printf "\r[ %5d / %5d ] %30s |\t %5d / %5d PASSED" $j $num_tests "$i" $passed $num_tests
+			(
+				"./$i"
+				exit $?
+			) &> /dev/null
+			if [[ $?
-eq 0 ]]; then
+				passed=$((passed + 1))
+			fi
+		done
+		printf "\r %30s |\t %5d / %5d PASSED\n" "$i" $passed $num_tests
+	fi
+done
+
+
+exit 0
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7cdd4e82..d3da0b0a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -14,6 +14,14 @@ add_executable(cpp_api_test CppApiTest.cpp)
 target_link_libraries(cpp_api_test emper)
 add_test(CppApiTest cpp_api_test)
+add_executable(cpp_continuation_api_test CppContinuationApiTest.cpp)
+target_link_libraries(cpp_continuation_api_test emper)
+add_test(CppContinuationApiTest cpp_continuation_api_test)
+
+add_executable(simple_continuation_fib_test SimpleContinuationFibTest.cpp)
+target_link_libraries(simple_continuation_fib_test Threads::Threads emper)
+add_test(SimpleContinuationFibTest simple_continuation_fib_test)
+
 add_executable(simple_actor_test SimpleActorTest.cpp)
 target_link_libraries(simple_actor_test emper)
 add_test(SimpleActorTest simple_actor_test)
@@ -21,3 +29,16 @@ add_test(SimpleActorTest simple_actor_test)
 add_executable(simple_laws_test SimpleLawsTest.cpp)
 target_link_libraries(simple_laws_test emper)
 add_test(SimpleLawsTest simple_laws_test)
+
+add_executable(continuation_sync_test ContinuationSyncTest.cpp)
+target_link_libraries(continuation_sync_test Threads::Threads emper)
+add_test(ContinuationSyncTest continuation_sync_test)
+
+add_executable(continuation_variable_parameter_test ContinuationVariableParameterTest.cpp)
+target_link_libraries(continuation_variable_parameter_test Threads::Threads emper)
+add_test(ContinuationVariableParameterTest continuation_variable_parameter_test)
+
+add_executable(simple_continuation_laws_test SimpleContinuationLawsTest.cpp)
+target_link_libraries(simple_continuation_laws_test emper)
+add_test(SimpleContinuationLawsTest simple_continuation_laws_test)
+
diff --git a/tests/ContinuationSyncTest.cpp b/tests/ContinuationSyncTest.cpp
new file mode 100644
index 00000000..7cb57596
--- /dev/null
+++
b/tests/ContinuationSyncTest.cpp
@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <list>
+#include <string>
+
+#include "emper.hpp"
+#include "PrivateSemaphore.hpp"
+#include "BinaryPrivateSemaphore.hpp"
+#include "CountingPrivateSemaphore.hpp"
+
+
+
+struct param {
+	BPS *sem1;
+	BPS *sem2;
+};
+
+
+fibril static void childFiber(void *p) { // forked child: spins briefly, then signals sem2
+	param *params = static_cast<param*>(p);
+	//params->sem1->wait();
+	for (volatile uint64_t i = 0; i < (1UL<<16); i++) { }
+	params->sem2->signal();
+}
+
+
+fibril static void mainFiber(void) { // forks childFiber and waits on sem2 before joining
+	BPS sem1, sem2;
+	param params;
+	params.sem1 = &sem1;
+	params.sem2 = &sem2;
+	Fibril *fr = new Fibril();
+
+	fr->fork(childFiber, &params);
+
+	//sem1.signal();
+	sem2.wait();
+
+	fr->join();
+
+	delete fr;
+}
+
+
+int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
+	Runtime runtime;
+	//Runtime runtime(2);
+	exit(0); // FIXME(review): exits before anything is scheduled, so this test currently checks nothing
+
+	Fiber* fibFiber = Fiber::from([] (UNUSED_ARG void* arg) fibril {
+		Fibril *fr = new Fibril();
+
+		fr->fork(mainFiber);
+
+		fr->join();
+
+		delete fr;
+
+		exit(EXIT_SUCCESS);
+	}, nullptr);
+
+	runtime.schedule(*fibFiber);
+
+	runtime.waitUntilFinished();
+
+	return EXIT_FAILURE;
+}
diff --git a/tests/ContinuationVariableParameterTest.cpp b/tests/ContinuationVariableParameterTest.cpp
new file mode 100644
index 00000000..1e2df933
--- /dev/null
+++ b/tests/ContinuationVariableParameterTest.cpp
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <list>
+#include <string>
+
+#include "emper.hpp"
+
+
+
+int64_t fib_fast(int n) { // iterative Fibonacci, used as the reference value
+	int64_t val0 = 0;
+	int64_t val1 = 1;
+	int64_t fib = n;
+
+	for (int i = 2; i < n + 1; i++) {
+		fib = val0 + val1;
+		val0 = val1;
+		val1 = fib;
+	}
+
+	return fib;
+}
+
+
+fibril static void fib(int64_t *r, int64_t n) { // variant that returns the result through *r
+	if (n < 2) {
+		*r = n;
+	} else {
+		int64_t a, b;
+		a = b = -1337;
+
+		alignas(Fibril) char buffer[sizeof(Fibril)]; // placement new needs storage aligned for Fibril
+		Fibril *fr = new (buffer) Fibril();
+
+		fr->fork(fib, &a, n - 1);
+		//fib(&a, n - 1);
+
//fr->fork(fib, &b, n - 2);
+		fib(&b, n - 2);
+
+		fr->join();
+
+		fr->~Fibril();
+
+		*r = a + b;
+	}
+}
+
+fibril static int64_t fib(int64_t n) { // variant that returns the result by value
+	if (n < 2) {
+		return n;
+	} else {
+		int64_t a, b;
+		a = b = -1337;
+
+		alignas(Fibril) char buffer[sizeof(Fibril)]; // placement new needs storage aligned for Fibril
+		Fibril *fr = new (buffer) Fibril();
+
+		fr->fork(&a, fib, n - 1);
+		//a = fib(n - 1);
+		//fr->fork(&b, fib, n - 2);
+		b = fib(n - 2);
+
+		fr->join();
+
+		fr->~Fibril();
+
+		return a + b;
+	}
+}
+
+
+int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
+	Runtime runtime;
+
+	Fiber* fibFiber = Fiber::from([] (UNUSED_ARG void* arg) {
+		//const int fibNum = 35;
+		//const int fibNum = 13;
+		const int fibNum = 42;
+		const int64_t expected = fib_fast(fibNum); // keep the full 64-bit reference value
+
+		int64_t result;
+
+		result = fib(fibNum);
+		//fib(&result, fibNum);
+
+		if (result != expected) {
+			exit(EXIT_FAILURE);
+		}
+
+		exit(EXIT_SUCCESS);
+	}, nullptr);
+
+	runtime.schedule(*fibFiber);
+
+	runtime.waitUntilFinished();
+
+	return EXIT_FAILURE;
+}
diff --git a/tests/CppApiTest.cpp b/tests/CppApiTest.cpp
index 71077710..03b8be1c 100644
--- a/tests/CppApiTest.cpp
+++ b/tests/CppApiTest.cpp
@@ -11,13 +11,13 @@ static void increaseCounterByOne() {
 }
 static void mainFiber(void) {
-	const unsigned int FIBER_COUNT = 100;
+	const unsigned int FIBER_COUNT = 10000;
 	CountingPrivateSemaphore cps;
 	for (unsigned int i = 0; i < FIBER_COUNT; ++i) {
 		spawn(&increaseCounterByOne, cps);
-	}
+	}
 	cps.wait();
diff --git a/tests/CppContinuationApiTest.cpp b/tests/CppContinuationApiTest.cpp
new file mode 100644
index 00000000..971a778d
--- /dev/null
+++ b/tests/CppContinuationApiTest.cpp
@@ -0,0 +1,41 @@
+#include <atomic>
+
+#include "emper.hpp"
+
+
+
+static std::atomic_uint counter;
+
+fibril static void increaseCounterByOne() {
+	counter++;
+}
+
+fibril static void mainFiber(void) { // forks FIBER_COUNT increments, joins, then checks the counter
+	const unsigned int FIBER_COUNT = 10000;
+
+	Fibril *fr = new Fibril();
+
+	for (unsigned int i = 0; i < FIBER_COUNT; ++i) {
+		fr->fork(increaseCounterByOne);
+	}
+
+	fr->join();
+
+	delete fr;
+
+
if (counter != FIBER_COUNT) {
+		exit(EXIT_FAILURE);
+	}
+
+	exit(EXIT_SUCCESS);
+}
+
+int main(UNUSED_ARG int arg, UNUSED_ARG char *argv[]) {
+	Runtime runtime;
+
+	async(&mainFiber);
+
+	runtime.waitUntilFinished();
+
+	return EXIT_FAILURE;
+}
diff --git a/tests/SimpleContinuationFibTest.cpp b/tests/SimpleContinuationFibTest.cpp
new file mode 100644
index 00000000..41c894d9
--- /dev/null
+++ b/tests/SimpleContinuationFibTest.cpp
@@ -0,0 +1,94 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <list>
+#include <string>
+
+#include "emper.hpp"
+
+
+
+int64_t fib_fast(int n) { // iterative Fibonacci, used as the reference value
+	int64_t val0 = 0;
+	int64_t val1 = 1;
+	int64_t fib = n;
+
+	for (int i = 2; i < n + 1; i++) {
+		fib = val0 + val1;
+		val0 = val1;
+		val1 = fib;
+	}
+
+	return fib;
+}
+
+
+typedef struct {
+	int n;
+	int64_t* result;
+} fibParams;
+
+fibril static void fib(void *voidParams) { // recursive Fibonacci; both sub-problems are forked
+	fibParams* params = static_cast<fibParams*>(voidParams);
+	int n = params->n;
+	int64_t *result = params->result;
+
+	if (n < 2) {
+		*result = n;
+	} else {
+		int64_t a, b;
+		a = b = -1337;
+
+		//Fibril *fr = new Fibril();
+		alignas(Fibril) char buffer[sizeof(Fibril)]; // placement new needs storage aligned for Fibril
+		Fibril *fr = new (buffer) Fibril();
+
+		fibParams newParams1;
+		newParams1.n = n - 1;
+		newParams1.result = &a;
+		fibParams newParams2;
+		newParams2.n = n - 2;
+		newParams2.result = &b;
+
+		fr->fork(fib, &newParams1);
+		fr->fork(fib, &newParams2);
+		//fib(&newParams1);
+		//fib(&newParams2);
+
+		fr->join();
+
+		//delete fr;
+		fr->~Fibril();
+
+		*result = a + b;
+
+	}
+}
+
+
+int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
+	Runtime runtime;
+	//Runtime runtime(2);
+
+	Fiber* fibFiber = Fiber::from([] (UNUSED_ARG void* arg) {
+		const int fibNum = 42;
+		const int64_t expected = fib_fast(fibNum); // keep the full 64-bit reference value
+
+		int64_t result;
+		fibParams params = { fibNum, &result };
+
+		fib(&params);
+
+		if (result != expected) {
+			exit(EXIT_FAILURE);
+		}
+
+		exit(EXIT_SUCCESS);
+	}, nullptr);
+
+	runtime.schedule(*fibFiber);
+
+	runtime.waitUntilFinished();
+
+	return
EXIT_FAILURE;
+}
diff --git a/tests/SimpleContinuationLawsTest.cpp b/tests/SimpleContinuationLawsTest.cpp
new file mode 100644
index 00000000..78e560a7
--- /dev/null
+++ b/tests/SimpleContinuationLawsTest.cpp
@@ -0,0 +1,111 @@
+#include "emper.hpp"
+
+#include "LawsStrategy.hpp"
+#include "Fiber.hpp"
+
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <random>
+
+
+static const unsigned int ROUND_COUNT = 10;
+static const unsigned int FIBER_LOOPS = 10;
+static const unsigned int PAYLOAD_COUNT = 4096;
+
+
+typedef struct ALIGN_TO_CACHE_LINE {
+	// 4096 * 8 byte (64 bit) = 32 KiB = L1 cache size of most systems
+	uint64_t payload[PAYLOAD_COUNT];
+} FiberData;
+
+typedef struct ALIGN_TO_CACHE_LINE {
+	workeraffinity_t affinity;
+} AlignedWorkerAffinity;
+
+
+static void fiberFun(FiberData* fiberData) { // adds FIBER_LOOPS rounds of random values to every payload entry
+	std::random_device randomDevice;
+	std::mt19937_64 randomGenerator(randomDevice());
+	std::uniform_int_distribution<unsigned long long> randomDistribution(0, UINT64_MAX);
+
+	for (unsigned int i = 0; i < FIBER_LOOPS; ++i) {
+		for (unsigned int j = 0; j < PAYLOAD_COUNT; ++j) {
+			unsigned long long r = randomDistribution(randomGenerator);
+			fiberData->payload[j] += r;
+		}
+	}
+}
+
+fibril static void alphaFun() { // forks fiberFun over all payloads for ROUND_COUNT rounds, then sums them up in forked lambdas
+	Runtime* runtime = Runtime::getRuntime();
+	const unsigned int FIBER_COUNT = runtime->getWorkerCount() + 3;
+
+	AlignedWorkerAffinity *affinities = new AlignedWorkerAffinity[FIBER_COUNT];
+	FiberData* fiberData = new FiberData[FIBER_COUNT];
+
+	for (unsigned int i = 0; i < FIBER_COUNT; ++i) {
+		FiberData& currentFiberData = fiberData[i];
+		memset(currentFiberData.payload, 0, sizeof(uint64_t) * PAYLOAD_COUNT);
+
+		affinities[i].affinity = Fiber::NOT_AFFINE;
+	}
+
+	alignas(Fibril) char buffer[sizeof(Fibril)]; // placement new needs storage aligned for Fibril
+	Fibril *fr = new (buffer) Fibril();
+
+	for (unsigned int round = 0; round < ROUND_COUNT; ++round) {
+		for (unsigned int i = 0; i < FIBER_COUNT; ++i) {
+			FiberData* myFiberData = &fiberData[i];
+			fr->fork(fiberFun, myFiberData);
+			//Fiber* fiber = Fiber::from(&fiberFun,
+			//		myFiberData,
+			//		&affinities[i].affinity);
+		}
+		fr->join();
+	}
+
+	std::atomic<uint64_t> finalResult(0);
+	for (unsigned int i = 0; i < FIBER_COUNT; ++i) {
+		FiberData* myFiberData = &fiberData[i];
+		fr->fork<void(void)>([myFiberData, &finalResult]() {
+			uint64_t mySum = 0;
+			for (unsigned int i = 0; i < PAYLOAD_COUNT; ++i) {
+				mySum += myFiberData->payload[i];
+			}
+			finalResult += mySum;
+		});
+		//Fiber* fiber = Fiber::from([myFiberData, &finalResult]() {
+		//		uint64_t mySum = 0;
+		//		for (unsigned int i = 0; i < PAYLOAD_COUNT; ++i) {
+		//			mySum += myFiberData->payload[i];
+		//		}
+		//		finalResult += mySum;
+		//	},
+		//	&affinities[i].affinity);
+	}
+	fr->join();
+
+	fr->~Fibril();
+	delete[] fiberData; // allocated with new[], so it must not go through free()
+	delete[] affinities;
+
+	std::cerr << "Result: " << finalResult << std::endl;
+
+	exit(EXIT_SUCCESS);
+}
+
+int main(UNUSED_ARG int args, UNUSED_ARG char *argv[]) {
+	RuntimeStrategy& lawsStrategy = LawsStrategy::INSTANCE;
+	Runtime runtime(lawsStrategy);
+
+	Fiber* alphaFiber = Fiber::from(&alphaFun);
+
+	runtime.schedule(*alphaFiber);
+
+	runtime.waitUntilFinished();
+
+	return EXIT_FAILURE;
+}
diff --git a/tests/SimpleFibTest.cpp b/tests/SimpleFibTest.cpp
index d31da0d7..35e9918e 100644
--- a/tests/SimpleFibTest.cpp
+++ b/tests/SimpleFibTest.cpp
@@ -11,6 +11,23 @@
 #include "CountingPrivateSemaphore.hpp"
 #include "Debug.hpp"
+
+
+int64_t fib_fast(int n) {
+	int64_t val0 = 0;
+	int64_t val1 = 1;
+	int64_t fib = n;
+
+	for (int i = 2; i < n + 1; i++) {
+		fib = val0 + val1;
+		val0 = val1;
+		val1 = fib;
+	}
+
+	return fib;
+}
+
+
 typedef struct {
 	int n;
 	int* result;
@@ -59,7 +76,9 @@ int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
 	Runtime runtime;
 	Fiber* fibFiber = Fiber::from([] (UNUSED_ARG void* arg) {
-		const int fibNum = 13;
+		//const int fibNum = 13;
+		const int fibNum = 30;
+
 		int result;
 		BPS sem;
 		fibParams params = { fibNum, &result, &sem };
@@ -68,7 +87,7 @@ int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
 		sem.wait();
-		if (result != 233) {
+		if (result !=
fib_fast(fibNum)) {
 			exit(EXIT_FAILURE);
 		}
@@ -78,6 +97,6 @@ int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
 	runtime.schedule(*fibFiber);
 	runtime.waitUntilFinished();
-
+
 	return EXIT_FAILURE;
 }
diff --git a/time.sh b/time.sh
new file mode 100755
index 00000000..8a66a882
--- /dev/null
+++ b/time.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+
+
+benchmark_dir=build/benchmarks
+
+
+if [[ $# -eq 1 ]]; then
+	target_dir=$benchmark_dir/$1
+else
+	target_dir=$benchmark_dir/emper_continuation
+fi
+
+
+cd "$target_dir" || exit 1 # without this a missing directory would time every executable in the current directory
+
+
+for i in *; do
+	if [[ -f $i && -x $i ]]; then
+		(
+			false || while [[ $? -ne 0 ]]; do # "false ||" makes the first check fail, so the body runs at least once and repeats until it succeeds
+				/usr/bin/time -f "%e" "./$i" 2> tmp
+			done
+			printf "%30s |\t %.2f\n" "$i" $(cat tmp)
+		)
+	fi
+done
+
+
+exit 0
--
GitLab