From 5745322bb5147f7a4c7bdf7122bd77c6fd15ef8f Mon Sep 17 00:00:00 2001
From: Florian Schmaus <flow@cs.fau.de>
Date: Fri, 21 May 2021 17:04:35 +0200
Subject: [PATCH] EMPER Cactus Stack Devel

---
 CMakeLists.txt                                |    7 +-
 Makefile                                      |    6 +-
 benchmarks/CMakeLists.txt                     |   17 +
 benchmarks/cholesky.cpp                       |  665 ++++
 benchmarks/cilkplus/CMakeLists.txt            |   44 +
 benchmarks/cilkplus/cilkplus.h                |   34 +
 benchmarks/emper_continuation/CMakeLists.txt  |   53 +
 .../emper_continuation/emper_continuation.h   |  153 +
 benchmarks/emper_continuation/fork.h          |   70 +
 benchmarks/emper_fiber/CMakeLists.txt         |   39 +
 benchmarks/emper_fiber/emper_fiber.h          |   39 +
 benchmarks/fft.cpp                            |  354 ++
 benchmarks/fft.h                              | 2877 +++++++++++++++++
 benchmarks/fib.cpp                            |   56 +
 benchmarks/fibril.h                           |   69 +
 benchmarks/fibril/CMakeLists.txt              |   41 +
 benchmarks/fibril/fibrile.h                   |   97 +
 benchmarks/fibril/fibrili.h                   |   90 +
 benchmarks/fibril/fork.h                      |   70 +
 benchmarks/fibril_lf/CMakeLists.txt           |   41 +
 benchmarks/fibril_lf/fork.h                   |   70 +
 benchmarks/heat.cpp                           |  205 ++
 benchmarks/integrate.cpp                      |   79 +
 benchmarks/knapsack.cpp                       |  165 +
 benchmarks/lu.cpp                             |  458 +++
 benchmarks/matmul.cpp                         |  142 +
 benchmarks/nqueens.cpp                        |   70 +
 benchmarks/openmp/CMakeLists.txt              |   28 +
 benchmarks/openmp/openmp.h                    |  101 +
 benchmarks/quicksort.cpp                      |   84 +
 benchmarks/rectmul.cpp                        |  365 +++
 benchmarks/serial/CMakeLists.txt              |   27 +
 benchmarks/serial/serial.h                    |   18 +
 benchmarks/strassen.cpp                       |  644 ++++
 benchmarks/tbb/CMakeLists.txt                 |   43 +
 benchmarks/tbb/tbb.h                          |   36 +
 benchmarks/test.h                             |  148 +
 emper/CMakeLists.txt                          |    1 +
 emper/Context.hpp                             |   59 +-
 emper/ContextManager.cpp                      |   40 +-
 emper/ContextManager.hpp                      |    5 +-
 emper/Continuation.hpp                        |   61 +
 emper/Dispatcher.cpp                          |    2 -
 emper/Dispatcher.hpp                          |   14 +-
 emper/Fiber.cpp                               |    2 +-
 emper/Fiber.hpp                               |   13 +-
 emper/Fibril.cpp                              |    5 +
 emper/Fibril.hpp                              |  283 ++
 emper/MemoryManager.hpp                       |   14 +-
 emper/Runtime.cpp                             |   48 +-
 emper/Runtime.hpp                             |   19 +
 emper/Scheduler.hpp                           |    3 +
 emper/SynchronizedFiber.hpp                   |    2 +-
 emper/include/emper-common.h                  |    2 +-
 emper/include/emper.hpp                       |   11 +
 emper/lib/adt/BoundedMpmcQueue.hpp            |   94 +
 emper/lib/adt/FibrilDeque.hpp                 |   85 +
 emper/lib/adt/FibrilLock.hpp                  |   25 +
 emper/lib/adt/LockedQueue.hpp                 |   11 +
 emper/lib/adt/WsClV3Queue.hpp                 |   79 +
 emper/lib/adt/WsClV4Queue.hpp                 |   84 +
 emper/strategies/laws/LawsDispatcher.cpp      |    6 +-
 emper/strategies/laws/LawsScheduler.cpp       |   28 +
 emper/strategies/laws/LawsScheduler.hpp       |    3 +
 emper/strategies/ws/WsDispatcher.cpp          |    6 +-
 emper/strategies/ws/WsScheduler.cpp           |   29 +
 emper/strategies/ws/WsScheduler.hpp           |    9 +
 run_benchmarks.sh                             |  128 +
 test.sh                                       |   39 +
 tests/CMakeLists.txt                          |   21 +
 tests/ContinuationSyncTest.cpp                |   68 +
 tests/ContinuationVariableParameterTest.cpp   |   99 +
 tests/CppApiTest.cpp                          |    4 +-
 tests/CppContinuationApiTest.cpp              |   41 +
 tests/SimpleContinuationFibTest.cpp           |   94 +
 tests/SimpleContinuationLawsTest.cpp          |  106 +
 tests/SimpleFibTest.cpp                       |   25 +-
 time.sh                                       |   30 +
 78 files changed, 9164 insertions(+), 39 deletions(-)
 create mode 100644 benchmarks/CMakeLists.txt
 create mode 100644 benchmarks/cholesky.cpp
 create mode 100644 benchmarks/cilkplus/CMakeLists.txt
 create mode 100644 benchmarks/cilkplus/cilkplus.h
 create mode 100644 benchmarks/emper_continuation/CMakeLists.txt
 create mode 100644 benchmarks/emper_continuation/emper_continuation.h
 create mode 100644 benchmarks/emper_continuation/fork.h
 create mode 100644 benchmarks/emper_fiber/CMakeLists.txt
 create mode 100644 benchmarks/emper_fiber/emper_fiber.h
 create mode 100644 benchmarks/fft.cpp
 create mode 100644 benchmarks/fft.h
 create mode 100644 benchmarks/fib.cpp
 create mode 100644 benchmarks/fibril.h
 create mode 100644 benchmarks/fibril/CMakeLists.txt
 create mode 100644 benchmarks/fibril/fibrile.h
 create mode 100644 benchmarks/fibril/fibrili.h
 create mode 100644 benchmarks/fibril/fork.h
 create mode 100644 benchmarks/fibril_lf/CMakeLists.txt
 create mode 100644 benchmarks/fibril_lf/fork.h
 create mode 100644 benchmarks/heat.cpp
 create mode 100644 benchmarks/integrate.cpp
 create mode 100644 benchmarks/knapsack.cpp
 create mode 100644 benchmarks/lu.cpp
 create mode 100644 benchmarks/matmul.cpp
 create mode 100644 benchmarks/nqueens.cpp
 create mode 100644 benchmarks/openmp/CMakeLists.txt
 create mode 100644 benchmarks/openmp/openmp.h
 create mode 100644 benchmarks/quicksort.cpp
 create mode 100644 benchmarks/rectmul.cpp
 create mode 100644 benchmarks/serial/CMakeLists.txt
 create mode 100644 benchmarks/serial/serial.h
 create mode 100644 benchmarks/strassen.cpp
 create mode 100644 benchmarks/tbb/CMakeLists.txt
 create mode 100644 benchmarks/tbb/tbb.h
 create mode 100644 benchmarks/test.h
 create mode 100644 emper/Continuation.hpp
 create mode 100644 emper/Fibril.cpp
 create mode 100644 emper/Fibril.hpp
 create mode 100644 emper/lib/adt/BoundedMpmcQueue.hpp
 create mode 100644 emper/lib/adt/FibrilDeque.hpp
 create mode 100644 emper/lib/adt/FibrilLock.hpp
 create mode 100644 emper/lib/adt/WsClV3Queue.hpp
 create mode 100644 emper/lib/adt/WsClV4Queue.hpp
 create mode 100755 run_benchmarks.sh
 create mode 100755 test.sh
 create mode 100644 tests/ContinuationSyncTest.cpp
 create mode 100644 tests/ContinuationVariableParameterTest.cpp
 create mode 100644 tests/CppContinuationApiTest.cpp
 create mode 100644 tests/SimpleContinuationFibTest.cpp
 create mode 100644 tests/SimpleContinuationLawsTest.cpp
 create mode 100755 time.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 48af4ac6..0d1951ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,9 +56,12 @@ endmacro()
 
 emper_option(WORKER_SLEEP "Enable sleeping worker support")
 emper_option(LOCKED_WS_QUEUE "Use a fully locked queue for work-stealing")
+emper_option(LOCKED_FIBRIL "Use a fully locked Fibril. Only works with locked work-stealing queues")
 emper_option(OVERFLOW_QUEUE "Use a overflow queue in case the primary queue is full")
 emper_option(LOCKED_MPSC_QUEUE "Use the locked variant for the MPSC queue")
 emper_option(STATS "Collect stats and print them at the end of the execution")
+emper_option(MADVISE "Use madvise(MADV_DONTNEED) to unmap unused stack pages. Bound memory consumption")
+emper_option(CM_WITH_MEMORY_MANAGER "Use context manager with a memory manager")
 
 # Macro to add files to a var. Can even be used in subdirectories.
 # Source: http://stackoverflow.com/a/7049380/194894
@@ -108,12 +111,14 @@ add_library(c_emper STATIC ${C_EMPER_SOURCE})
 # set_property(TARGET c_emper PROPERTY INTERPROCEDURAL_OPTIMIZATION True)
 target_link_libraries(c_emper emper)
 
-add_subdirectory("lib")
+#add_subdirectory("lib")
 
 add_subdirectory("apps")
 
 add_subdirectory("tests")
 
+add_subdirectory("benchmarks")
+
 add_subdirectory("eval")
 
 file(GLOB ALL_SOURCE_FILES *.cpp)
diff --git a/Makefile b/Makefile
index 0af760e4..1c4c50d5 100644
--- a/Makefile
+++ b/Makefile
@@ -23,7 +23,11 @@ debug release relwithdebug:
 	rm -f build
 	ln -rs build-$@ build
 	cd build-$@; \
-		[[ -f CMakeCache.txt ]] || cmake -DCMAKE_BUILD_TYPE=$@ .. \
+		[[ -f CMakeCache.txt ]] || cmake -DCMAKE_BUILD_TYPE=$@ \
+		-DEMPER_CM_WITH_MEMORY_MANAGER=OFF \
+		-DEMPER_LOCKED_WS_QUEUE=OFF \
+		-DEMPER_LOCKED_FIBRIL=OFF \
+		-DEMPER_MADVISE=OFF .. \
 		&& make $(COMMON_MAKE_ARGS)
 
 reldebug: relwithdebug
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100644
index 00000000..715f4ed1
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,17 @@
+# One subdirectory per runtime the benchmark sources are built against.
+add_subdirectory(tbb)
+
+add_subdirectory(openmp)
+
+add_subdirectory(serial)
+
+add_subdirectory(emper_continuation)
+
+add_subdirectory(emper_fiber)
+
+add_subdirectory(fibril)
+
+add_subdirectory(fibril_lf)
+# Disabled; presumably because cilkplus/ needs -fcilkplus and a local cilkrts.
+#add_subdirectory(cilkplus)
+
diff --git a/benchmarks/cholesky.cpp b/benchmarks/cholesky.cpp
new file mode 100644
index 00000000..701f8b3f
--- /dev/null
+++ b/benchmarks/cholesky.cpp
@@ -0,0 +1,665 @@
+/*
+ * Sparse Cholesky code with little blocks at the leaves of the Quad tree
+ * Keith Randall -- Aske Plaat
+ *
+ * This code should run with any square sparse real symmetric matrix
+ * from MatrixMarket (http://math.nist.gov/MatrixMarket)
+ *
+ * run with `cholesky -f george-liu.mtx' for a given matrix, or
+ * `cholesky -n 1000 -z 10000' for a 1000x1000 random matrix with 10000
+ * nonzeros (caution: random matrices produce lots of fill).
+ */
+/*
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ * Copyright (c) 2000 Matteo Frigo
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <math.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "test.h"
+
+/*************************************************************\
+ * Basic types
+ \*************************************************************/
+
+typedef double Real;
+
+#define BLOCK_DEPTH 2		/* logarithm base 2 of BLOCK_SIZE */
+#define BLOCK_SIZE  (1<<BLOCK_DEPTH)	/* 4 seems to be the optimum */
+
+typedef Real Block[BLOCK_SIZE][BLOCK_SIZE];
+
+#define BLOCK(B,I,J) (B[I][J])
+
+#define _00 0
+#define _01 1
+#define _10 2
+#define _11 3
+
+#define TR_00 _00
+#define TR_01 _10
+#define TR_10 _01
+#define TR_11 _11
+
+typedef struct InternalNode {
+  struct InternalNode *child[4];
+} InternalNode;
+/* Leaves hold one dense block; they are passed around cast to InternalNode*. */
+typedef struct {
+  Block block;
+} LeafNode;
+/* A matrix is the root of a quadtree; a NULL subtree reads as all zeros. */
+typedef InternalNode *Matrix;
+
+static Matrix A, R;
+static int depth;
+
+#ifndef BENCHMARK
+int n = 2000;
+static int nonzeros = 10000;
+#else
+int n = 4000;
+static int nonzeros = 40000;
+#endif
+
+/*************************************************************\
+ * Linear algebra on blocks
+ \*************************************************************/
+
+/*
+ * elem_daxmy - In-place update y[i] -= a * x[i] for the first n entries
+ * of the Real vectors x and y, walking from index n-1 down to 0.
+ */
+static void elem_daxmy(Real a, Real * x, Real * y, int n)
+{
+  while (n-- > 0)
+    y[n] -= a * x[n];
+}
+
+/*
+ * block_schur_full - Schur update B' = B - A * Transpose(C) on every entry.
+ */
+static void block_schur_full(Block B, Block A, Block C)
+{
+  for (int row = 0; row < BLOCK_SIZE; row++) {
+    for (int col = 0; col < BLOCK_SIZE; col++) {
+      Real *cell = &BLOCK(B, row, col);
+      for (int t = 0; t < BLOCK_SIZE; t++) {
+        *cell -= BLOCK(A, row, t) * BLOCK(C, col, t);
+      }
+    }
+  }
+}
+
+/*
+ * block_schur_half - Schur update B' = B - A * Transpose(C), col <= row only.
+ */
+static void block_schur_half(Block B, Block A, Block C)
+{
+  /*
+   * Only entries on or below the diagonal are updated; the strict
+   * upper triangle of B is left untouched.  Each entry is reduced
+   * one product at a time, in increasing t.
+   */
+  for (int row = 0; row < BLOCK_SIZE; row++) {
+    for (int col = 0; col <= row; col++) {
+      Real *cell = &BLOCK(B, row, col);
+      for (int t = 0; t < BLOCK_SIZE; t++) {
+        *cell -= BLOCK(A, row, t) * BLOCK(C, col, t);
+      }
+    }
+  }
+}
+
+/*
+ * block_backsub - Solve B' * Transpose(U) = B for B' in place, reading
+ * only the entries U[i][k] with k <= i.
+ */
+static void block_backsub(Block B, Block U)
+{
+  /* Columns of B are resolved left to right (col t feeds col > t);
+   * the rows of B never interact. */
+  for (int col = 0; col < BLOCK_SIZE; col++) {
+    for (int row = 0; row < BLOCK_SIZE; row++) {
+      Real *cell = &BLOCK(B, row, col);
+      for (int t = 0; t < col; t++) {
+        *cell -= BLOCK(U, col, t) * BLOCK(B, row, t);
+      }
+      *cell /= BLOCK(U, col, col);
+    }
+  }
+}
+
+/*
+ * xblock_backsub (headed "block_lower_solve" originally) - Intended to do
+ * forward substitution for B' in LB' = B (unverified).  Not called here.
+ */
+static void xblock_backsub(Block B, Block L)
+{
+  int i, k;
+  (void) xblock_backsub;
+  /* The self-reference above presumably silences an unused-function warning. */
+  /* Perform forward substitution. */
+  for (i = 0; i < BLOCK_SIZE; i++)
+    for (k = 0; k <= i; k++) {
+      BLOCK(B, i, k) /= BLOCK(L, k, k);
+      elem_daxmy(BLOCK(L, i, k), &BLOCK(B, k, 0),
+          &BLOCK(B, i, 0), BLOCK_SIZE - k);
+    }
+}
+
+/*
+ * block_cholesky - Factor block B in place; exits(9) on a negative pivot.
+ */
+static void block_cholesky(Block B)
+{
+  int i, j, k;
+
+  for (k = 0; k < BLOCK_SIZE; k++) {
+    Real x;
+    if (BLOCK(B, k, k) < 0.0) {
+      printf("sqrt error: %f\n", BLOCK(B, k, k));
+      printf("matrix is probably not numerically stable\n");
+      exit(9);
+    }
+    x = sqrt(BLOCK(B, k, k));
+    for (i = k; i < BLOCK_SIZE; i++) {
+      BLOCK(B, i, k) /= x;
+    }
+    /* Update the trailing lower triangle.  i starts at j, so entries
+     * above the diagonal are never written and need no checking; the
+     * upper triangle of B is simply left as it was. */
+    for (j = k + 1; j < BLOCK_SIZE; j++) {
+      for (i = j; i < BLOCK_SIZE; i++) {
+        BLOCK(B, i, j) -= BLOCK(B, i, k) * BLOCK(B, j, k);
+      }
+    }
+  }
+}
+
+/*
+ * block_zero - Set every entry of block B to 0.0.
+ */
+static void block_zero(Block B)
+{
+  for (int row = 0; row < BLOCK_SIZE; row++) {
+    Real *cells = B[row];
+
+    for (int col = 0; col < BLOCK_SIZE; col++) {
+      cells[col] = 0.0;
+    }
+  }
+}
+
+/*************************************************************\
+ * Allocation and initialization
+ \*************************************************************/
+
+/*
+ * Allocate one uninitialized leaf block, returned as a tree node pointer.
+ */
+static inline InternalNode *new_block_leaf(void)
+{
+  void *raw = malloc(sizeof(LeafNode));
+  if (!raw) {
+    printf("out of memory!\n");
+    exit(1);
+  }
+  return (InternalNode *) raw;
+}
+
+/*
+ * Allocate an internal quadtree node that holds the four given quadrants.
+ */
+static inline InternalNode *new_internal(InternalNode * a00, InternalNode * a01,
+    InternalNode * a10, InternalNode * a11)
+{
+  InternalNode *fresh = (InternalNode *) malloc(sizeof *fresh);
+  if (!fresh) {
+    printf("out of memory!\n");
+    exit(1);
+  }
+  fresh->child[_00] = a00;
+  fresh->child[_01] = a01;
+  fresh->child[_10] = a10;
+  fresh->child[_11] = a11;
+  return fresh;
+}
+
+/*
+ * Duplicate matrix (deep copy of the quadtree).  Resulting matrix may be
+ * laid out in memory better than source matrix.
+ */
+fibril static Matrix copy_matrix(int depth, Matrix a)
+{
+  Matrix r;
+  /* An absent (NULL) subtree copies to NULL. */
+  if (!a)
+    return a;
+  /* depth == BLOCK_DEPTH marks a leaf: clone its dense block. */
+  if (depth == BLOCK_DEPTH) {
+    LeafNode *A = (LeafNode *) a;
+    LeafNode *R;
+    r = new_block_leaf();
+    R = (LeafNode *) r;
+    memcpy(R->block, A->block, sizeof(Block));
+  } else {
+    Matrix r00, r01, r10, r11;
+
+    depth--;
+
+    fibril_t fr;
+    fibril_init(&fr);
+    /* Three quadrants go through fibril_fork, the fourth is a plain call. */
+    fibril_fork(&fr, &r00, copy_matrix, (depth, a->child[_00]));
+    fibril_fork(&fr, &r01, copy_matrix, (depth, a->child[_01]));
+    fibril_fork(&fr, &r10, copy_matrix, (depth, a->child[_10]));
+    r11 = copy_matrix(depth, a->child[_11]);
+    fibril_join(&fr);
+    /* All four results are consumed here, after the join above. */
+    r = new_internal(r00, r01, r10, r11);
+  }
+  return r;
+}
+
+/*
+ * Recursively release the quadtree rooted at a; NULL subtrees are skipped.
+ */
+void free_matrix(int depth, Matrix a)
+{
+  if (!a)
+    return;
+
+  /* Internal nodes release their quadrants (in _00, _01, _10, _11 order)
+   * before the node itself; leaves have no children to visit. */
+  if (depth != BLOCK_DEPTH) {
+    const int below = depth - 1;
+    for (int q = _00; q <= _11; q++)
+      free_matrix(below, a->child[q]);
+  }
+
+  free(a);
+}
+
+/*************************************************************\
+ * Simple matrix operations
+ \*************************************************************/
+
+/*
+ * Read element (r, c); positions not stored in the tree read as 0.0.
+ */
+static Real get_matrix(int depth, Matrix a, int r, int c)
+{
+  /*
+   * Walk down from the root.  Each step halves the side length and
+   * picks the quadrant containing (r, c), rebasing the coordinates
+   * so that they are relative to that quadrant.
+   */
+  while (a != NULL && depth != BLOCK_DEPTH) {
+    depth--;
+    const int mid = 1 << depth;
+    const bool lowerHalf = (r >= mid);
+    const bool rightHalf = (c >= mid);
+    if (lowerHalf)
+      r -= mid;
+    if (rightHalf)
+      c -= mid;
+    if (lowerHalf)
+      a = a->child[rightHalf ? _11 : _10];
+    else
+      a = a->child[rightHalf ? _01 : _00];
+  }
+
+  /* Either the path ended in a missing subtree or we reached a leaf. */
+  if (a == NULL)
+    return 0.0;
+  return BLOCK(((LeafNode *) a)->block, r, c);
+}
+
+/*
+ * Store value at (r, c), allocating any missing nodes; returns the root.
+ */
+static Matrix set_matrix(int depth, Matrix a, int r, int c, Real value)
+{
+  if (depth == BLOCK_DEPTH) {
+    /* Leaf level: a block created here starts out all zero. */
+    if (a == NULL) {
+      a = new_block_leaf();
+      block_zero(((LeafNode *) a)->block);
+    }
+    LeafNode *leaf = (LeafNode *) a;
+    BLOCK(leaf->block, r, c) = value;
+    return a;
+  }
+
+  /* Internal level: create the node first if this subtree was empty. */
+  if (a == NULL)
+    a = new_internal(NULL, NULL, NULL, NULL);
+
+  depth--;
+  const int mid = 1 << depth;
+
+  /*
+   * Pick the quadrant that contains (r, c) and rebase the coordinates
+   * so that they are relative to that quadrant's top-left corner.
+   */
+  InternalNode **slot;
+  if (r < mid) {
+    slot = (c < mid) ? &a->child[_00] : &a->child[_01];
+  } else {
+    slot = (c < mid) ? &a->child[_10] : &a->child[_11];
+    r -= mid;
+  }
+  if (c >= mid)
+    c -= mid;
+
+  /* The recursive call may have to allocate the child, so its return
+   * value is written back into the selected slot. */
+  *slot = set_matrix(depth, *slot, r, c, value);
+  return a;
+}
+
+/*
+ * Sum of the squares of all stored elements (0.0 for a NULL subtree).
+ */
+static Real mag(int depth, Matrix a)
+{
+  if (a == NULL)
+    return 0.0;
+
+  Real total = 0.0;
+
+  if (depth != BLOCK_DEPTH) {
+    /* Internal node: add up the quadrants in _00, _01, _10, _11 order. */
+    for (int q = _00; q <= _11; q++)
+      total += mag(depth - 1, a->child[q]);
+    return total;
+  }
+
+  /* Leaf: accumulate the squares in row-major order. */
+  LeafNode *leaf = (LeafNode *) a;
+  for (int row = 0; row < BLOCK_SIZE; row++)
+    for (int col = 0; col < BLOCK_SIZE; col++)
+      total += BLOCK(leaf->block, row, col) * BLOCK(leaf->block, row, col);
+  return total;
+}
+
+/*************************************************************\
+ * Cholesky algorithm
+ \*************************************************************/
+
+/*
+ * Perform R -= A * Transpose(B) and return R (allocated on demand if NULL).
+ * If lower==1, update only the lower-triangular part of R.
+ */
+fibril static
+Matrix mul_and_subT(int depth, int lower, Matrix a, Matrix b, Matrix r)
+{
+  if (depth == BLOCK_DEPTH) {
+    LeafNode *A = (LeafNode *) a;
+    LeafNode *B = (LeafNode *) b;
+    LeafNode *R;
+
+    if (r == NULL) {
+      r = new_block_leaf();
+      R = (LeafNode *) r;
+      block_zero(R->block);
+    } else
+      R = (LeafNode *) r;
+
+    if (lower)
+      block_schur_half(R->block, A->block, B->block);
+    else
+      block_schur_full(R->block, A->block, B->block);
+  } else {
+    Matrix r00, r01, r10, r11;
+
+    depth--;
+    /* Start from the existing quadrants of r, or from four empty ones. */
+    if (r != NULL) {
+      r00 = r->child[_00];
+      r01 = r->child[_01];
+      r10 = r->child[_10];
+      r11 = r->child[_11];
+    } else {
+      r00 = NULL;
+      r01 = NULL;
+      r10 = NULL;
+      r11 = NULL;
+    }
+
+    fibril_t fr;
+    fibril_init(&fr);
+    /* Phase 1: products that use the left column of a (_00 and _10). */
+    if (a->child[_00] && b->child[TR_00])
+      fibril_fork(&fr, &r00, mul_and_subT, (depth, lower,
+          a->child[_00], b->child[TR_00],
+          r00));
+    /* With lower set, the upper-right quadrant r01 is never updated. */
+    if (!lower && a->child[_00] && b->child[TR_01])
+      fibril_fork(&fr, &r01, mul_and_subT, (depth, 0,
+          a->child[_00], b->child[TR_01],
+          r01));
+
+    if (a->child[_10] && b->child[TR_00])
+      fibril_fork(&fr, &r10, mul_and_subT, (depth, 0,
+          a->child[_10], b->child[TR_00],
+          r10));
+
+    if (a->child[_10] && b->child[TR_01])
+      fibril_fork(&fr, &r11, mul_and_subT, (depth, lower,
+          a->child[_10], b->child[TR_01],
+          r11));
+
+    fibril_join(&fr);
+    /* Phase 2 writes the same r quadrants, now from a's right column. */
+    if (a->child[_01] && b->child[TR_10])
+      fibril_fork(&fr, &r00, mul_and_subT, (depth, lower,
+          a->child[_01], b->child[TR_10],
+          r00));
+
+    if (!lower && a->child[_01] && b->child[TR_11])
+      fibril_fork(&fr, &r01, mul_and_subT, (depth, 0,
+          a->child[_01], b->child[TR_11],
+          r01));
+
+    if (a->child[_11] && b->child[TR_10])
+      fibril_fork(&fr, &r10, mul_and_subT, (depth, 0,
+          a->child[_11], b->child[TR_10],
+          r10));
+
+    if (a->child[_11] && b->child[TR_11])
+      fibril_fork(&fr, &r11, mul_and_subT, (depth, lower,
+          a->child[_11], b->child[TR_11],
+          r11));
+
+    fibril_join(&fr);
+    /* A NULL r only becomes a node if at least one quadrant now exists. */
+    if (r == NULL) {
+      if (r00 || r01 || r10 || r11)
+        r = new_internal(r00, r01, r10, r11);
+    } else {
+      r->child[_00] = r00;
+      r->child[_01] = r01;
+      r->child[_10] = r10;
+      r->child[_11] = r11;
+    }
+  }
+  return r;
+}
+
+/*
+ * Perform substitution to solve for B in B * Transpose(L) = A (the leaf
+ * kernel reads only the lower triangle of L).  Returns B in place of A.
+ */
+fibril static Matrix backsub(int depth, Matrix a, Matrix l)
+{
+  if (depth == BLOCK_DEPTH) {
+    LeafNode *A = (LeafNode *) a;
+    LeafNode *L = (LeafNode *) l;
+    block_backsub(A->block, L->block);
+  } else {
+    Matrix a00, a01, a10, a11;
+    Matrix l00, l10, l11;
+
+    depth--;
+
+    a00 = a->child[_00];
+    a01 = a->child[_01];
+    a10 = a->child[_10];
+    a11 = a->child[_11];
+    /* l->child[_01] is never read. */
+    l00 = l->child[_00];
+    l10 = l->child[_10];
+    l11 = l->child[_11];
+
+    fibril_t fr;
+    fibril_init(&fr);
+    /* Step 1: solve the left column (a00, a10) against l00. */
+    if (a00)
+      fibril_fork(&fr, &a00, backsub, (depth, a00, l00));
+    if (a10)
+      fibril_fork(&fr, &a10, backsub, (depth, a10, l00));
+
+    fibril_join(&fr);
+    /* Step 2: right column -= left column * Transpose(l10). */
+    if (a00 && l10)
+      fibril_fork(&fr, &a01, mul_and_subT, (depth, 0, a00, l10, a01));
+    if (a10 && l10)
+      fibril_fork(&fr, &a11, mul_and_subT, (depth, 0, a10, l10, a11));
+
+    fibril_join(&fr);
+    /* Step 3: solve the updated right column (a01, a11) against l11. */
+    if (a01)
+      fibril_fork(&fr, &a01, backsub, (depth, a01, l11));
+    if (a11)
+      fibril_fork(&fr, &a11, backsub, (depth, a11, l11));
+
+    fibril_join(&fr);
+
+    a->child[_00] = a00;
+    a->child[_01] = a01;
+    a->child[_10] = a10;
+    a->child[_11] = a11;
+  }
+
+  return a;
+}
+
+/*
+ * Compute Cholesky factorization of A in place and return it.
+ */
+fibril static Matrix cholesky(int depth, Matrix a)
+{
+  if (depth == BLOCK_DEPTH) {
+    LeafNode *A = (LeafNode *) a;
+    block_cholesky(A->block);
+  } else {
+    Matrix a00, a10, a11;
+
+    depth--;
+    /* Only the lower triangle is used: child[_01] is neither read nor written. */
+    a00 = a->child[_00];
+    a10 = a->child[_10];
+    a11 = a->child[_11];
+    /* With no off-diagonal block the two diagonal blocks factor independently. */
+    if (!a10) {
+      fibril_t fr;
+      fibril_init(&fr);
+      fibril_fork(&fr, &a00, cholesky, (depth, a00));
+      a11 = cholesky(depth, a11);
+      fibril_join(&fr);
+    } else {
+      a00 = cholesky(depth, a00);
+      a10 = backsub(depth, a10, a00);
+      a11 = mul_and_subT(depth, 1, a10, a10, a11);
+      a11 = cholesky(depth, a11);
+    }
+    a->child[_00] = a00;
+    a->child[_10] = a10;
+    a->child[_11] = a11;
+  }
+  return a;
+}
+
+/* Smallest k >= 0 with 2^k >= size (0 for size <= 1). */
+static int logarithm(int size)
+{
+  int k = 0;
+  while ((1LL << k) < size)   /* 64-bit shift: no overflow near INT_MAX */
+    k++;
+  return k;
+}
+
+void init()
+{
+  /* Tree height needed to hold an n x n matrix. */
+  depth = logarithm(n);
+  const int padded = 1 << depth;
+
+  /* Unit diagonal for the n real rows. */
+  for (int d = 0; d < n; d++)
+    A = set_matrix(depth, A, d, d, 1.0);
+
+  /* Place nonzeros - n entries of 0.1 strictly below the diagonal,
+   * redrawing until an unoccupied position with r > c comes up. */
+  for (int placed = 0; placed < nonzeros - n; placed++) {
+    int r, c;
+    for (;;) {
+      r = rand() % n;
+      c = rand() % n;
+      if (r > c && get_matrix(depth, A, r, c) == 0.0)
+        break;
+    }
+    A = set_matrix(depth, A, r, c, 0.1);
+  }
+
+  /* Pad up to the next power of two with identity rows. */
+  for (int d = n; d < padded; d++)
+    A = set_matrix(depth, A, d, d, 1.0);
+}
+/* Drop the previous working copy R and rebuild it as a copy of A. */
+void prep()
+{
+  free_matrix(depth, R);
+  R = copy_matrix(depth, A);
+}
+/* Factor the working copy R in place. */
+void test()
+{
+  R = cholesky(depth, R);
+}
+/* Returns nonzero if the check fails (always 0 with BENCHMARK); frees A and R. */
+int verify()
+{
+  int fail = 0;
+
+#ifndef BENCHMARK
+  /* Check R * Transpose(R) == A: overwrite A with A - R * Transpose(R) */
+  /* (lower triangle only) and require its sum of squares to be tiny.  */
+  A = mul_and_subT(depth, 1, R, R, A);
+  Real error = mag(depth, A);
+  fail = (error > 0.00001);
+#endif
+  /* A and R are left dangling below; prep() must not run again afterwards. */
+  free_matrix(depth, A);
+  free_matrix(depth, R);
+  return fail;
+}
diff --git a/benchmarks/cilkplus/CMakeLists.txt b/benchmarks/cilkplus/CMakeLists.txt
new file mode 100644
index 00000000..18d21761
--- /dev/null
+++ b/benchmarks/cilkplus/CMakeLists.txt
@@ -0,0 +1,44 @@
+# Cilk Plus backend: benchmarks are compiled with -fcilkplus and linked to cilkrts.
+add_definitions(-DFIBRIL_CILKPLUS)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcilkplus")
+# NOTE(review): the cilkrts search path below is specific to one machine.
+find_library(CILKRTS_LIB cilkrts /srv/scratch/uh15efil/intel-cilk-runtime/build/lib)
+find_library(DL_LIB NAMES dl)
+
+# One executable per benchmark source in the parent directory.
+add_executable(cholesky_cilkplus ../cholesky.cpp)
+target_link_libraries(cholesky_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
+
+add_executable(fft_cilkplus ../fft.cpp)
+target_link_libraries(fft_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
+
+add_executable(fib_cilkplus ../fib.cpp)
+target_link_libraries(fib_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
+
+add_executable(heat_cilkplus ../heat.cpp)
+target_link_libraries(heat_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
+
+add_executable(integrate_cilkplus ../integrate.cpp)
+target_link_libraries(integrate_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
+
+add_executable(knapsack_cilkplus ../knapsack.cpp)
+target_link_libraries(knapsack_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
+
+add_executable(lu_cilkplus ../lu.cpp)
+target_link_libraries(lu_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
+
+add_executable(matmul_cilkplus ../matmul.cpp)
+target_link_libraries(matmul_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
+
+add_executable(nqueens_cilkplus ../nqueens.cpp)
+target_link_libraries(nqueens_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
+
+add_executable(quicksort_cilkplus ../quicksort.cpp)
+target_link_libraries(quicksort_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
+
+add_executable(rectmul_cilkplus ../rectmul.cpp)
+target_link_libraries(rectmul_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
+
+add_executable(strassen_cilkplus ../strassen.cpp)
+target_link_libraries(strassen_cilkplus "${CILKRTS_LIB}" "${DL_LIB}")
diff --git a/benchmarks/cilkplus/cilkplus.h b/benchmarks/cilkplus/cilkplus.h
new file mode 100644
index 00000000..722d72c1
--- /dev/null
+++ b/benchmarks/cilkplus/cilkplus.h
@@ -0,0 +1,34 @@
+#ifndef CILKPLUS_H
+#define CILKPLUS_H
+
+#include <thread>
+#include <stdio.h>
+#include <cilk/cilk.h>
+#include <cilk/cilk_api.h>
+/* Map the fibril API onto Cilk Plus keywords; the frame handle is a dummy int. */
+#define fibril
+#define fibril_t __attribute__((unused)) int
+#define fibril_init(fp)
+#define fibril_join(fp) cilk_sync
+/* ag is the parenthesised argument list; rt points at the result slot. */
+#define fibril_fork_nrt(fp, fn, ag)     cilk_spawn fn ag
+#define fibril_fork_wrt(fp, rt, fn, ag) *(rt) = cilk_spawn fn ag
+/* Worker count: the request _n if 0 < _n < hardware threads, else all of them. */
+/* NOTE(review): hardware_concurrency() may return 0 when the count is unknown. */
+#define _nthreads(_n) [](int n) -> int { \
+	int nprocs = std::thread::hardware_concurrency(); \
+	if (n > 0 && n < nprocs) \
+		return n; \
+	return nprocs; \
+}(_n)
+/* Statement-like: invoke as fibril_rt_init(n); (caller supplies the semicolon). */
+#define fibril_rt_init(n) do { \
+	char nprocs[32]; \
+	snprintf(nprocs, sizeof(nprocs), "%d", _nthreads(n)); \
+	__cilkrts_set_param("nworkers", nprocs); \
+	__cilkrts_set_param("stack size", "0x800000"); \
+} while (0)
+#define fibril_rt_exit() (__cilkrts_end_cilk())
+#define fibril_rt_nprocs() (__cilkrts_get_nworkers())
+
+#endif /* end of include guard: CILKPLUS_H */
diff --git a/benchmarks/emper_continuation/CMakeLists.txt b/benchmarks/emper_continuation/CMakeLists.txt
new file mode 100644
index 00000000..d6473281
--- /dev/null
+++ b/benchmarks/emper_continuation/CMakeLists.txt
@@ -0,0 +1,53 @@
+# EMPER continuation backend: sources are built with FIBRIL_EMPER_CONTINUATION defined.
+add_definitions(-DFIBRIL_EMPER_CONTINUATION)
+
+# One executable per benchmark source in the parent directory, linked against emper.
+add_executable(cholesky_emper_continuation ../cholesky.cpp)
+target_link_libraries(cholesky_emper_continuation Threads::Threads emper)
+
+add_executable(fft_emper_continuation ../fft.cpp)
+target_link_libraries(fft_emper_continuation Threads::Threads emper)
+
+add_executable(fib_emper_continuation ../fib.cpp)
+target_link_libraries(fib_emper_continuation Threads::Threads emper)
+
+add_executable(heat_emper_continuation ../heat.cpp)
+target_link_libraries(heat_emper_continuation Threads::Threads emper)
+
+add_executable(integrate_emper_continuation ../integrate.cpp)
+target_link_libraries(integrate_emper_continuation Threads::Threads emper)
+
+add_executable(knapsack_emper_continuation ../knapsack.cpp)
+target_link_libraries(knapsack_emper_continuation Threads::Threads emper)
+
+add_executable(lu_emper_continuation ../lu.cpp)
+target_link_libraries(lu_emper_continuation Threads::Threads emper)
+
+add_executable(matmul_emper_continuation ../matmul.cpp)
+target_link_libraries(matmul_emper_continuation Threads::Threads emper)
+
+add_executable(nqueens_emper_continuation ../nqueens.cpp)
+target_link_libraries(nqueens_emper_continuation Threads::Threads emper)
+
+add_executable(quicksort_emper_continuation ../quicksort.cpp)
+target_link_libraries(quicksort_emper_continuation Threads::Threads emper)
+
+add_executable(rectmul_emper_continuation ../rectmul.cpp)
+target_link_libraries(rectmul_emper_continuation Threads::Threads emper)
+
+add_executable(strassen_emper_continuation ../strassen.cpp)
+target_link_libraries(strassen_emper_continuation Threads::Threads emper)
+
+# Register each benchmark binary as a test; it is run without arguments.
+add_test(cholesky cholesky_emper_continuation)
+add_test(fft fft_emper_continuation)
+add_test(fib fib_emper_continuation)
+add_test(heat heat_emper_continuation)
+add_test(integrate integrate_emper_continuation)
+add_test(knapsack knapsack_emper_continuation)
+add_test(lu lu_emper_continuation)
+add_test(matmul matmul_emper_continuation)
+add_test(nqueens nqueens_emper_continuation)
+add_test(quicksort quicksort_emper_continuation)
+add_test(rectmul rectmul_emper_continuation)
+add_test(strassen strassen_emper_continuation)
diff --git a/benchmarks/emper_continuation/emper_continuation.h b/benchmarks/emper_continuation/emper_continuation.h
new file mode 100644
index 00000000..495fb836
--- /dev/null
+++ b/benchmarks/emper_continuation/emper_continuation.h
@@ -0,0 +1,153 @@
+#ifndef EMPER_CONTINUATION_H
+#define EMPER_CONTINUATION_H
+
+
+
+#include <thread>
+
+//#include "fork.h"
+#include "emper.hpp"
+
+
+
+#if 0
+class StackFibril { /* Compiled out by the enclosing #if 0. Wraps a Fibril that lives in an in-object byte buffer. */
+private:
+	//Fibril *f;
+	//char memory[sizeof(Fibril) + alignof(Fibril)];
+	char memory[sizeof(Fibril)]; /* raw storage for the Fibril; NOTE(review): no alignas(Fibril) on this buffer — confirm the alignment is sufficient */
+
+public:
+	__attribute__((always_inline))
+	inline StackFibril() {
+		//char *addr = (char*) ((uintptr_t) (memory + alignof(Fibril) - 1) & ~(alignof(Fibril) - 1));
+		//f = new (addr) Fibril();
+		new (memory) Fibril(); /* placement-new the Fibril at the start of the buffer */
+	}
+
+	__attribute__((always_inline))
+	inline ~StackFibril() {
+		//f->~Fibril();
+		((Fibril*) memory)->~Fibril(); /* explicit destructor call matching the placement new */
+	}
+
+	__attribute__((always_inline))
+	inline Fibril* operator->() const noexcept {
+		//return f;
+		return (Fibril*) memory; /* the cast also drops the const added by this const member function */
+	}
+
+	__attribute__((always_inline))
+	inline Fibril& operator*() const {
+		//return *f;
+		return *((Fibril*) memory);
+	}
+
+};
+
+/* Still inside #if 0: macro layer for StackFibril. fp points at a StackFibril, hence the extra dereference in (*fp)-> and **f. */
+#define fibril_t StackFibril
+#define fibril_init(fp)
+#define fibril_join(fp) (*fp)->join();
+
+
+#if 1 /* variant 1: fork through a noinline lambda built from the fork.h helper macros */
+#include "fork.h"
+#define fibril_fork_nrt(fp, fn, ag) do { \
+	auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f) __attribute__((noinline, hot, optimize(3))) { \
+		(*f)->cont.ip = __builtin_return_address(0); \
+		Runtime* runtime = Runtime::getRuntime(); \
+		runtime->pushBottom(**f); \
+		fn(_fibril_args ag); \
+		if (!runtime->popBottom()) { \
+			(*f)->resume(); \
+		} \
+	}; \
+	membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \
+} while (0);
+
+#define fibril_fork_wrt(fp, rt, fn, ag) do { \
+	auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f, __typeof__(rt) p) __attribute__((noinline, hot, optimize(3))) { \
+		(*f)->cont.ip = __builtin_return_address(0); \
+		Runtime* runtime = Runtime::getRuntime(); \
+		runtime->pushBottom(**f); \
+		*p = fn(_fibril_args ag); \
+		if (!runtime->popBottom()) { \
+			(*f)->resume(); \
+		} \
+	}; \
+	membar(_fibril_##fn##_fork(_fibril_expand ag fp, rt)); \
+} while (0);
+#else /* variant 2: hand the call to the fork() member reached through (*fp)-> */
+#define _fibril_expand(...) \
+  _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_expand_(n, ...) \
+  _fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
+#define _fibril_expand_16(...) __VA_ARGS__
+#define _fibril_expand_15(...) __VA_ARGS__
+#define _fibril_expand_14(...) __VA_ARGS__
+#define _fibril_expand_13(...) __VA_ARGS__
+#define _fibril_expand_12(...) __VA_ARGS__
+#define _fibril_expand_11(...) __VA_ARGS__
+#define _fibril_expand_10(...) __VA_ARGS__
+#define _fibril_expand_9( ...) __VA_ARGS__
+#define _fibril_expand_8( ...) __VA_ARGS__
+#define _fibril_expand_7( ...) __VA_ARGS__
+#define _fibril_expand_6( ...) __VA_ARGS__
+#define _fibril_expand_5( ...) __VA_ARGS__
+#define _fibril_expand_4( ...) __VA_ARGS__
+#define _fibril_expand_3( ...) __VA_ARGS__
+#define _fibril_expand_2( ...) __VA_ARGS__
+#define _fibril_expand_1( ...) __VA_ARGS__
+#define _fibril_expand_0()
+
+#define fibril_fork_nrt(fp, fn, ag) (*fp)->fork(fn, _fibril_expand ag)
+#define fibril_fork_wrt(fp, rt, fn, ag) (*fp)->fork(rt, fn, _fibril_expand ag)
+#endif /* #if 1 */
+#endif
+/* Map the fibril_* benchmark API onto Fibril: fp is a Fibril*, init expands to nothing, join calls fp->join() and already ends in ';'. */
+#define fibril_t Fibril
+#define fibril_init(fp)
+#define fibril_join(fp) (fp)->join();
+
+
+#include "fork.h"
+/* Fork fn ag without a result: a noinline lambda stores its own return address in f->cont.ip, passes *f to Runtime::pushBottom(), calls fn, and calls f->resume() when Runtime::popBottom() returns false (presumably: the continuation was taken elsewhere — confirm). membar is defined outside this header. */
+#define fibril_fork_nrt(fp, fn, ag) do { \
+	auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f) __attribute__((noinline, hot, optimize(3))) { \
+		(f)->cont.ip = __builtin_return_address(0); \
+		Runtime* runtime = Runtime::getRuntime(); \
+		runtime->pushBottom(*f); \
+		fn(_fibril_args ag); \
+		if (!runtime->popBottom()) { \
+			(f)->resume(); \
+		} \
+	}; \
+	membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \
+} while (0);
+/* Fork fn ag and store its result through the pointer rt: same sequence as the no-result fork (record return address, pushBottom, call, popBottom, resume on false), with rt passed to the lambda as p and the call written as *p = fn(...). */
+#define fibril_fork_wrt(fp, rt, fn, ag) do { \
+	auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f, __typeof__(rt) p) __attribute__((noinline, hot, optimize(3))) { \
+		(f)->cont.ip = __builtin_return_address(0); \
+		Runtime* runtime = Runtime::getRuntime(); \
+		runtime->pushBottom(*f); \
+		*p = fn(_fibril_args ag); \
+		if (!runtime->popBottom()) { \
+			(f)->resume(); \
+		} \
+	}; \
+	membar(_fibril_##fn##_fork(_fibril_expand ag fp, rt)); \
+} while (0);
+/* Worker count for a request of _n: _n itself when 0 < _n < hardware threads, otherwise the hardware thread count; never less than 1. */
+#define _nthreads(_n) [](int n) -> int { \
+	int nprocs = std::thread::hardware_concurrency(); /* 0 when the value is not computable */ \
+	if (n > 0 && (nprocs <= 0 || n < nprocs)) \
+		return n; \
+	return nprocs > 0 ? nprocs : 1; \
+}(_n)
+/* fibril_rt_init(n) declares a local Runtime named 'runtime' and opens a [&] lambda passed to executeAndWait(); fibril_rt_exit() closes that lambda and call, so the two must be used as a pair in one scope. */
+#define fibril_rt_init(n) Runtime runtime(_nthreads(n)); runtime.executeAndWait([&] () {
+#define fibril_rt_exit() });
+#define fibril_rt_nprocs() runtime.getWorkerCount()
+
+#endif /* end of include guard: EMPER_CONTINUATION_H */
diff --git a/benchmarks/emper_continuation/fork.h b/benchmarks/emper_continuation/fork.h
new file mode 100644
index 00000000..8ab080b0
--- /dev/null
+++ b/benchmarks/emper_continuation/fork.h
@@ -0,0 +1,70 @@
+#ifndef FIBRIL_FORK_H
+#define FIBRIL_FORK_H
+/* _fibril_defs(x, y, ...) -> "__typeof__(x) aN, ..., __typeof__(last) a1," : one parameter declaration per argument, with a trailing comma. _fibril_nth and _fibril_concat are not defined in this header. */
+#define _fibril_defs(...) \
+  _fibril_defs_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_defs_(n, ...) \
+  _fibril_concat(_fibril_defs_, n)(__VA_ARGS__)
+#define _fibril_defs_16(a,...) __typeof__(a) a16,_fibril_defs_15(__VA_ARGS__)
+#define _fibril_defs_15(a,...) __typeof__(a) a15,_fibril_defs_14(__VA_ARGS__)
+#define _fibril_defs_14(a,...) __typeof__(a) a14,_fibril_defs_13(__VA_ARGS__)
+#define _fibril_defs_13(a,...) __typeof__(a) a13,_fibril_defs_12(__VA_ARGS__)
+#define _fibril_defs_12(a,...) __typeof__(a) a12,_fibril_defs_11(__VA_ARGS__)
+#define _fibril_defs_11(a,...) __typeof__(a) a11,_fibril_defs_10(__VA_ARGS__)
+#define _fibril_defs_10(a,...) __typeof__(a) a10,_fibril_defs_9 (__VA_ARGS__)
+#define _fibril_defs_9(a, ...) __typeof__(a) a9, _fibril_defs_8 (__VA_ARGS__)
+#define _fibril_defs_8(a, ...) __typeof__(a) a8, _fibril_defs_7 (__VA_ARGS__)
+#define _fibril_defs_7(a, ...) __typeof__(a) a7, _fibril_defs_6 (__VA_ARGS__)
+#define _fibril_defs_6(a, ...) __typeof__(a) a6, _fibril_defs_5 (__VA_ARGS__)
+#define _fibril_defs_5(a, ...) __typeof__(a) a5, _fibril_defs_4 (__VA_ARGS__)
+#define _fibril_defs_4(a, ...) __typeof__(a) a4, _fibril_defs_3 (__VA_ARGS__)
+#define _fibril_defs_3(a, ...) __typeof__(a) a3, _fibril_defs_2 (__VA_ARGS__)
+#define _fibril_defs_2(a, ...) __typeof__(a) a2, _fibril_defs_1 (__VA_ARGS__)
+#define _fibril_defs_1(a)      __typeof__(a) a1,
+#define _fibril_defs_0()
+/* _fibril_args(x, y, ...) -> "aN, ..., a1" : the parameter names declared by _fibril_defs, in the same order, without a trailing comma. */
+#define _fibril_args(...) \
+  _fibril_args_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_args_(n, ...) \
+  _fibril_concat(_fibril_args_, n)(__VA_ARGS__)
+#define _fibril_args_16(a,...) a16,_fibril_args_15(__VA_ARGS__)
+#define _fibril_args_15(a,...) a15,_fibril_args_14(__VA_ARGS__)
+#define _fibril_args_14(a,...) a14,_fibril_args_13(__VA_ARGS__)
+#define _fibril_args_13(a,...) a13,_fibril_args_12(__VA_ARGS__)
+#define _fibril_args_12(a,...) a12,_fibril_args_11(__VA_ARGS__)
+#define _fibril_args_11(a,...) a11,_fibril_args_10(__VA_ARGS__)
+#define _fibril_args_10(a,...) a10,_fibril_args_9 (__VA_ARGS__)
+#define _fibril_args_9(a, ...) a9, _fibril_args_8 (__VA_ARGS__)
+#define _fibril_args_8(a, ...) a8, _fibril_args_7 (__VA_ARGS__)
+#define _fibril_args_7(a, ...) a7, _fibril_args_6 (__VA_ARGS__)
+#define _fibril_args_6(a, ...) a6, _fibril_args_5 (__VA_ARGS__)
+#define _fibril_args_5(a, ...) a5, _fibril_args_4 (__VA_ARGS__)
+#define _fibril_args_4(a, ...) a4, _fibril_args_3 (__VA_ARGS__)
+#define _fibril_args_3(a, ...) a3, _fibril_args_2 (__VA_ARGS__)
+#define _fibril_args_2(a, ...) a2, _fibril_args_1 (__VA_ARGS__)
+#define _fibril_args_1(a)      a1
+#define _fibril_args_0()
+/* _fibril_expand(x, y, ...) -> "x, y, ...," : the arguments unchanged plus a trailing comma, so another argument can follow directly; the zero-argument form expands to nothing. */
+#define _fibril_expand(...) \
+  _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_expand_(n, ...) \
+  _fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
+#define _fibril_expand_16(...) __VA_ARGS__,
+#define _fibril_expand_15(...) __VA_ARGS__,
+#define _fibril_expand_14(...) __VA_ARGS__,
+#define _fibril_expand_13(...) __VA_ARGS__,
+#define _fibril_expand_12(...) __VA_ARGS__,
+#define _fibril_expand_11(...) __VA_ARGS__,
+#define _fibril_expand_10(...) __VA_ARGS__,
+#define _fibril_expand_9( ...) __VA_ARGS__,
+#define _fibril_expand_8( ...) __VA_ARGS__,
+#define _fibril_expand_7( ...) __VA_ARGS__,
+#define _fibril_expand_6( ...) __VA_ARGS__,
+#define _fibril_expand_5( ...) __VA_ARGS__,
+#define _fibril_expand_4( ...) __VA_ARGS__,
+#define _fibril_expand_3( ...) __VA_ARGS__,
+#define _fibril_expand_2( ...) __VA_ARGS__,
+#define _fibril_expand_1( ...) __VA_ARGS__,
+#define _fibril_expand_0()
+
+#endif /* end of include guard: FIBRIL_FORK_H */
diff --git a/benchmarks/emper_fiber/CMakeLists.txt b/benchmarks/emper_fiber/CMakeLists.txt
new file mode 100644
index 00000000..2acf2e69
--- /dev/null
+++ b/benchmarks/emper_fiber/CMakeLists.txt
@@ -0,0 +1,39 @@
+# Defines FIBRIL_EMPER_FIBER for the sources compiled in this directory (presumably selects the emper_fiber.h backend in the shared benchmark header - confirm).
+add_definitions(-DFIBRIL_EMPER_FIBER)
+
+# One executable per benchmark source, each linked against Threads::Threads and emper.
+add_executable(cholesky_emper_fiber ../cholesky.cpp)
+target_link_libraries(cholesky_emper_fiber Threads::Threads emper)
+
+add_executable(fft_emper_fiber ../fft.cpp)
+target_link_libraries(fft_emper_fiber Threads::Threads emper)
+
+add_executable(fib_emper_fiber ../fib.cpp)
+target_link_libraries(fib_emper_fiber Threads::Threads emper)
+
+add_executable(heat_emper_fiber ../heat.cpp)
+target_link_libraries(heat_emper_fiber Threads::Threads emper)
+
+add_executable(integrate_emper_fiber ../integrate.cpp)
+target_link_libraries(integrate_emper_fiber Threads::Threads emper)
+
+add_executable(knapsack_emper_fiber ../knapsack.cpp)
+target_link_libraries(knapsack_emper_fiber Threads::Threads emper)
+
+add_executable(lu_emper_fiber ../lu.cpp)
+target_link_libraries(lu_emper_fiber Threads::Threads emper)
+
+add_executable(matmul_emper_fiber ../matmul.cpp)
+target_link_libraries(matmul_emper_fiber Threads::Threads emper)
+
+add_executable(nqueens_emper_fiber ../nqueens.cpp)
+target_link_libraries(nqueens_emper_fiber Threads::Threads emper)
+
+add_executable(quicksort_emper_fiber ../quicksort.cpp)
+target_link_libraries(quicksort_emper_fiber Threads::Threads emper)
+
+add_executable(rectmul_emper_fiber ../rectmul.cpp)
+target_link_libraries(rectmul_emper_fiber Threads::Threads emper)
+
+add_executable(strassen_emper_fiber ../strassen.cpp)
+target_link_libraries(strassen_emper_fiber Threads::Threads emper)
diff --git a/benchmarks/emper_fiber/emper_fiber.h b/benchmarks/emper_fiber/emper_fiber.h
new file mode 100644
index 00000000..de549c49
--- /dev/null
+++ b/benchmarks/emper_fiber/emper_fiber.h
@@ -0,0 +1,39 @@
+#ifndef EMPER_FIBER_H
+#define EMPER_FIBER_H
+
+
+
+#include <thread>
+
+#include "emper.hpp"
+
+/* Map the fibril_* benchmark API onto a CPS object: fp is a CPS*, init expands to nothing, join calls wait() on it and already ends in ';'. */
+#define fibril_t CPS
+#define fibril_init(fp)
+#define fibril_join(fp) (*fp).wait();
+/* Fork fn ag as a new Fiber: incrementCounterByOne() on *fp, then schedule a by-copy ([=]) lambda that calls fn and finally signalAndExit() on the same object. The locals fpp/rtp are visible inside the lambda, so arguments in ag must not use those names. */
+#define fibril_fork_nrt(fp, fn, ag) do { \
+	(*fp).incrementCounterByOne(); \
+	__typeof__(fp) fpp = fp; \
+	Runtime::getRuntime()->schedule(*Fiber::from([=] () {fn ag; (*fpp).signalAndExit(); })); \
+} while (0);
+#define fibril_fork_wrt(fp, rt, fn, ag) do { /* same as above, but fn's result is stored through the pointer rt */ \
+	(*fp).incrementCounterByOne(); \
+	__typeof__(fp) fpp = fp; \
+	__typeof__(rt) rtp = rt; \
+	Runtime::getRuntime()->schedule(*Fiber::from([=] () { *rtp = fn ag; (*fpp).signalAndExit(); })); \
+} while (0);
+
+/* Worker count for a request of _n: _n itself when 0 < _n < hardware threads, otherwise the hardware thread count; never less than 1. */
+#define _nthreads(_n) [](int n) -> int { \
+	int nprocs = std::thread::hardware_concurrency(); /* 0 when the value is not computable */ \
+	if (n > 0 && (nprocs <= 0 || n < nprocs)) \
+		return n; \
+	return nprocs > 0 ? nprocs : 1; \
+}(_n)
+/* fibril_rt_init(n) declares a local Runtime named 'runtime' and opens a [&] lambda passed to executeAndWait(); fibril_rt_exit() closes that lambda and call, so the two must be used as a pair in one scope. */
+#define fibril_rt_init(n) Runtime runtime(_nthreads(n)); runtime.executeAndWait([&] () {
+#define fibril_rt_exit() });
+#define fibril_rt_nprocs() runtime.getWorkerCount()
+
+#endif /* end of include guard: EMPER_FIBER_H */
diff --git a/benchmarks/fft.cpp b/benchmarks/fft.cpp
new file mode 100644
index 00000000..5f3ac2af
--- /dev/null
+++ b/benchmarks/fft.cpp
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ * Copyright (c) 2000 Matteo Frigo
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include "test.h"
+#include "fft.h"
+
+#ifdef BENCHMARK
+int n = 26;
+#else
+int n = 12;
+#endif
+
+static int size;
+static COMPLEX *in, *out, *cp, *W;
+static const REAL pi = 3.1415926535897932384626434;
+
+/*
+ * Fill W[k] = (cos(2*pi*k/n), -sin(2*pi*k/n)) and the mirrored entry
+ * W[n - k] = (cos, +sin) for k in [a, b]; wide ranges are halved and forked.
+ */
+fibril static void compute_w_coefficients(int n, int a, int b, COMPLEX * W)
+{
+  //register double twoPiOverN;
+  //register int k;
+  //register REAL s, c;
+  double twoPiOverN; /* angle step 2*pi/n */
+  int k;
+  REAL s, c; /* sine and cosine of the current angle, narrowed to REAL */
+
+  if (b - a < 128) { /* narrow range: compute serially */
+    twoPiOverN = 2.0 * pi / n;
+    for (k = a; k <= b; ++k) { /* b is inclusive */
+      c = cos(twoPiOverN * k);
+      c_re(W[k]) = c_re(W[n - k]) = c;
+      s = sin(twoPiOverN * k);
+      c_im(W[k]) = -s;
+      c_im(W[n - k]) = s; /* written last, so +s is what remains when k == n - k */
+    }
+  } else {
+    int ab = (a + b) / 2;
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, compute_w_coefficients, (n, a, ab, W)); /* forked: [a, ab] */
+    compute_w_coefficients(n, ab + 1, b, W); /* done by the caller: [ab + 1, b] */
+
+    fibril_join(&fr);
+  }
+}
+
+/*
+ * Pick the radix for the next FFT stage: a power of two (16, 8, 4 or 2)
+ * when one divides n, otherwise the smallest odd divisor of n.
+ */
+static int factor(int n)
+{
+  int divisor;
+
+  if (n < 2)
+    return 1;
+
+  switch (n) {
+  case 64: case 128: case 256: case 1024: case 2048: case 4096:
+    return 8;  /* these sizes take radix 8 although 16 divides them too */
+  }
+  if (n % 16 == 0)
+    return 16;
+  if (n % 8 == 0)
+    return 8;
+  if (n % 4 == 0)
+    return 4;
+  if (n % 2 == 0)
+    return 2;
+
+#if 0
+  /* radix-32 is too big --- wait for processors with more registers
+   * :-) */
+  if ((n & 31) == 0 && n > 256)
+    return 32;
+#endif
+
+  /* n is odd here: scan the odd candidates below n in ascending order */
+  for (divisor = 3; divisor < n; divisor += 2)
+    if (n % divisor == 0)
+      return divisor;
+
+  return n;  /* no divisor found, so n is prime */
+}
+/* Scatter in[i*r + j] to out[i + j*m] for i in [a, b), j in [0, r); ranges of 16 or more rows are halved and forked. */
+fibril static void unshuffle(int a, int b,
+    COMPLEX * in, COMPLEX * out, int r, int m)
+{
+  int i, j;
+  int r4 = r & (~0x3); /* r rounded down to a multiple of 4, for the unrolled loop */
+  const COMPLEX *ip;
+  COMPLEX *jp;
+
+  if (b - a < 16) {
+    ip = in + a * r; /* ip keeps advancing across rows; it is not reset per i */
+    for (i = a; i < b; ++i) {
+      jp = out + i;
+      for (j = 0; j < r4; j += 4) { /* four elements per iteration */
+        jp[0] = ip[0];
+        jp[m] = ip[1];
+        jp[2 * m] = ip[2];
+        jp[3 * m] = ip[3];
+        jp += 4 * m;
+        ip += 4;
+      }
+      for (; j < r; ++j) { /* remaining r - r4 elements */
+        *jp = *ip;
+        ip++;
+        jp += m;
+      }
+    }
+  } else {
+    int ab = (a + b) / 2;
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, unshuffle, (a, ab, in, out, r, m)); /* forked: [a, ab) */
+    unshuffle(ab, b, in, out, r, m); /* done by the caller: [ab, b) */
+
+    fibril_join(&fr);
+  }
+}
+
+/*
+ * Recursive complex FFT on the n complex components of the array in:
+ * basic Cooley-Tukey algorithm, with some improvements for
+ * n power of two. The result is placed in the array out. n is arbitrary.
+ * The algorithm runs in time O(n*(r1 + ... + rk)) where r1, ..., rk
+ * are prime numbers, and r1 * r2 * ... * rk = n.
+ *
+ * n: size of the input
+ * in: pointer to input
+ * out: pointer to output
+ * factors: list of factors of n, precomputed
+ * W: twiddle factors
+ * nW: size of W, that is, size of the original transform
+ *
+ */
+fibril static void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors,
+    COMPLEX * W, int nW)
+{
+  int r, m;
+
+  /* special cases */
+  if (n == 32) {
+    fft_base_32(in, out);
+    return;
+  }
+  if (n == 16) {
+    fft_base_16(in, out);
+    return;
+  }
+  if (n == 8) {
+    fft_base_8(in, out);
+    return;
+  }
+  if (n == 4) {
+    fft_base_4(in, out);
+    return;
+  }
+  if (n == 2) {
+    fft_base_2(in, out);
+    return;
+  }
+  /* the cases n == 3, n == 5, and maybe 7 should be implemented as well */
+
+  r = *factors; /* radix of this stage */
+  m = n / r; /* length of each of the r sub-transforms */
+
+  if (r < n) {
+    /* split the DFT of length n into r DFTs of length n/r,  and recurse */
+    if (r == 32)
+      fft_unshuffle_32(0, m, in, out, m);
+    else if (r == 16)
+      fft_unshuffle_16(0, m, in, out, m);
+    else if (r == 8)
+      fft_unshuffle_8(0, m, in, out, m);
+    else if (r == 4)
+      fft_unshuffle_4(0, m, in, out, m);
+    else if (r == 2)
+      fft_unshuffle_2(0, m, in, out, m);
+    else
+      unshuffle(0, m, in, out, r, m);
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    /* Fork the r sub-FFTs; the buffers swap roles: each reads out + k and writes in + k. */
+    int k;
+    for(k = 0; k < n; k += m) {
+      fibril_fork(&fr, fft_aux, (m, out + k, in + k, factors + 1, W, nW));
+    }
+
+    fibril_join(&fr); /* one join for all forks issued in the loop */
+  }
+
+  /* now multiply by the twiddle factors, and perform m FFTs of length r */
+  if (r == 2)
+    fft_twiddle_2(0, m, in, out, W, nW, nW / n, m);
+  else if (r == 4)
+    fft_twiddle_4(0, m, in, out, W, nW, nW / n, m);
+  else if (r == 8)
+    fft_twiddle_8(0, m, in, out, W, nW, nW / n, m);
+  else if (r == 16)
+    fft_twiddle_16(0, m, in, out, W, nW, nW / n, m);
+  else if (r == 32)
+    fft_twiddle_32(0, m, in, out, W, nW, nW / n, m);
+  else
+    fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m);
+
+  return;
+}
+
+/*
+ * Entry point: build the twiddle table W, factor n, then run fft_aux.
+ */
+static void fft(int n, COMPLEX * in, COMPLEX * out)
+{
+  int factors[40];		/* one radix per stage, outermost stage first */
+  int count = 0;
+  int remaining = n;
+  int radix;
+
+  compute_w_coefficients(n, 0, n / 2, W);
+
+  /*
+   * Peel one radix per iteration off what is left of n, in the order
+   * factor() chooses them, until only 1 remains. The body runs at least
+   * once, so factors[0] is always written.
+   */
+  do {
+    radix = factor(remaining);
+    factors[count++] = radix;
+    remaining /= radix;
+  } while (remaining > 1);
+
+  fft_aux(n, in, out, factors, W, n);
+}
+
+/****************************************************************
+ *                     END OF FFT ALGORITHM
+ ****************************************************************/
+
+/*                            tests                             */
+/* Direct O(n^2) DFT of in[0..n) into out[0..n); verify() uses it as the reference result. */
+static void fft_alt(int n, COMPLEX * in, COMPLEX * out)
+{
+  int i, j;
+  COMPLEX sum;
+  COMPLEX w;
+  (void) fft_alt;  /* self-reference; presumably silences the unused-function warning in BENCHMARK builds */
+
+  for (j = 0; j < n; ++j) {
+    c_re(sum) = c_im(sum) = 0.0;
+
+    for (i = 0; i < n; ++i) {
+      /* widen before multiplying: i * j no longer fits in int once n > 46341 */
+      const double angle = (2.0 * pi * ((long long) i * j % n)) / n;
+      c_re(w) = cos(angle);
+      c_im(w) = -sin(angle);
+      c_re(sum) += c_re(in[i]) * c_re(w) - c_im(in[i]) * c_im(w);
+      c_im(sum) += c_im(in[i]) * c_re(w) + c_re(in[i]) * c_im(w);
+    }
+
+    out[j] = sum;
+  }
+}
+/* Allocate the size = 2^n element buffers and fill in with drand48() values; aborts when an allocation fails. */
+void init()
+{
+  size = (1 << n);
+  out = (COMPLEX*) malloc(sizeof(COMPLEX) * size);
+  in  = (COMPLEX*) malloc(sizeof(COMPLEX) * size);
+  W   = (COMPLEX*) malloc(sizeof(COMPLEX) * (size + 1));  /* + 1: index size itself is written (W[n - k] with k = 0) */
+  if (out == NULL || in == NULL || W == NULL)
+    abort();  /* BENCHMARK builds (n = 26) ask for 2^26 elements per buffer */
+  for (int i = 0; i < size; ++i) {
+    c_re(in[i]) = drand48();
+    c_im(in[i]) = drand48();
+  }
+}
+/* Refresh the scratch buffer cp (allocated on first use) with a copy of in; aborts when the allocation fails. */
+void prep()
+{
+  if (cp == NULL)
+    cp = (COMPLEX*) malloc(sizeof(COMPLEX) * size);
+  if (cp == NULL) abort();  /* never memcpy into a failed allocation */
+  memcpy(cp, in, sizeof(COMPLEX) * size);
+}
+/* Run fft() on the size-element buffer cp, writing the result to out. NOTE(review): fft_aux also writes into its input buffer, which is presumably why prep() re-copies in — confirm. */
+void test()
+{
+  fft(size, cp, out);
+}
+/* Compare out with the direct DFT of in: returns 0 when the largest per-element error is at most 1e-3, else 1 (BENCHMARK builds skip the check). */
+#ifdef BENCHMARK
+int verify(void) { return 0; }
+#else
+int verify(void)
+{
+  COMPLEX * expect = (COMPLEX*) malloc(sizeof(COMPLEX) * size);
+  if (expect == NULL) return 1;  /* no reference buffer: report failure */
+  fft_alt(size, in, expect);
+
+  /* compute the relative error */
+  double error = 0.0;
+
+  int i;
+  for (i = 0; i < size; ++i) {
+    double a = sqrt(
+        (c_re(out[i]) - c_re(expect[i])) * (c_re(out[i]) - c_re(expect[i])) +
+        (c_im(out[i]) - c_im(expect[i])) * (c_im(out[i]) - c_im(expect[i])));
+    double d = sqrt(
+        c_re(expect[i]) * c_re(expect[i]) + c_im(expect[i]) * c_im(expect[i]));
+
+    if (d < -1.0e-10 || d > 1.0e-10) a /= d;  /* stays an absolute error when |expect[i]| is ~0 */
+    if (a > error) error = a;
+  }
+  free(expect);  /* the reference result is not needed past this point */
+  if (error > 1e-3) {
+    printf("size=%d error=%e\n", size, error);
+    return 1;
+  } else {
+    return 0;
+  }
+}
+#endif
+
diff --git a/benchmarks/fft.h b/benchmarks/fft.h
new file mode 100644
index 00000000..7d9debf8
--- /dev/null
+++ b/benchmarks/fft.h
@@ -0,0 +1,2877 @@
+#ifndef FFT_H
+#define FFT_H
+
+/* Scalar type used for all FFT data (single precision). */
+typedef float REAL;
+
+/* Complex number stored as a (re, im) pair of REALs. */
+typedef struct {
+  REAL re, im;
+} COMPLEX;
+/* Field accessors; they expand to plain member access, so they can also be assigned to. */
+#define c_re(c)  ((c).re)
+#define c_im(c)  ((c).im)
+/* For each k in [0, r): out[k*m] = sum over j in [0, r) of in[j*m] * W[idx], where idx advances by nWdnti + nWdntm*k per j and is reduced by nW whenever it exceeds nW. */
+static void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out,
+    COMPLEX * W, int r, int m,
+    int nW, int nWdnti, int nWdntm)
+{
+  int row;
+
+  for (row = 0; row < r; ++row) {
+    COMPLEX *dst = out + row * m;
+    const int step = nWdnti + nWdntm * row;  /* table stride for this output */
+    REAL acc_re = 0.0, acc_im = 0.0;
+    int widx = 0;
+    int col;
+
+    for (col = 0; col < r; ++col) {
+      const COMPLEX *src = in + col * m;
+      const REAL rw = c_re(W[widx]), iw = c_im(W[widx]);
+      const REAL rt = c_re(*src), it = c_im(*src);
+
+      acc_re += rt * rw - it * iw;
+      acc_im += rt * iw + it * rw;
+      widx += step;
+      if (widx > nW)
+        widx -= nW;  /* wrap; index nW itself is kept as is */
+    }
+    c_re(*dst) = acc_re;
+    c_im(*dst) = acc_im;
+  }
+}
+/* Run fft_twiddle_gen1 for every column in [i, i1): a single column is handled directly, a wider range is halved and the lower half forked. */
+fibril static void fft_twiddle_gen(int i, int i1,
+    COMPLEX * in, COMPLEX * out,
+    COMPLEX * W,
+    int nW, int nWdn, int r, int m)
+{
+  if (i == i1 - 1) { /* exactly one column left */
+    fft_twiddle_gen1(in + i, out + i, W,
+        r, m, nW, nWdn * i, nWdn * m);
+  } else {
+    int i2 = (i + i1) / 2;
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_twiddle_gen, (i, i2, in, out, W, nW, nWdn, r, m)); /* forked: [i, i2) */
+    fft_twiddle_gen(i2, i1, in, out, W, nW, nWdn, r, m); /* done by the caller: [i2, i1) */
+
+    fibril_join(&fr);
+  }
+}
+
+/* machine-generated code begins here */
+static void fft_base_2(COMPLEX * in, COMPLEX * out)
+{
+  /* 2-point butterfly: out[0] = in[0] + in[1], out[1] = in[0] - in[1]. */
+  const REAL re0 = c_re(in[0]);
+  const REAL im0 = c_im(in[0]);
+  const REAL re1 = c_re(in[1]);
+  const REAL im1 = c_im(in[1]);
+
+  c_re(out[0]) = re0 + re1;
+  c_im(out[0]) = im0 + im1;
+  c_re(out[1]) = re0 - re1;
+  c_im(out[1]) = im0 - im1;
+}
+/* Radix-2 twiddle pass over columns [a, b): jp[1*m] is multiplied by W[l1] (l1 = nWdn * i) and combined with jp[0*m] into kp[0*m] (sum) and kp[1*m] (difference). Ranges of 128 or more columns are halved and forked. */
+fibril static void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out,
+    COMPLEX * W, int nW, int nWdn, int m)
+{
+  int l1, i;
+  COMPLEX *jp, *kp;
+  REAL tmpr, tmpi, wr, wi;
+  if ((b - a) < 128) {
+    for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+        i++, l1 += nWdn, kp++) {
+      jp = in + i;
+      {
+        REAL r1_0, i1_0;
+        REAL r1_1, i1_1;
+        r1_0 = c_re(jp[0 * m]);
+        i1_0 = c_im(jp[0 * m]);
+        wr = c_re(W[1 * l1]);
+        wi = c_im(W[1 * l1]);
+        tmpr = c_re(jp[1 * m]);
+        tmpi = c_im(jp[1 * m]);
+        r1_1 = ((wr * tmpr) - (wi * tmpi)); /* complex product W[l1] * jp[m], real part */
+        i1_1 = ((wi * tmpr) + (wr * tmpi)); /* ... and imaginary part */
+        c_re(kp[0 * m]) = (r1_0 + r1_1);
+        c_im(kp[0 * m]) = (i1_0 + i1_1);
+        c_re(kp[1 * m]) = (r1_0 - r1_1);
+        c_im(kp[1 * m]) = (i1_0 - i1_1);
+      }
+    }
+  } else {
+    int ab = (a + b) / 2;
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_twiddle_2, (a, ab, in, out, W, nW, nWdn, m));
+    fft_twiddle_2(ab, b, in, out, W, nW, nWdn, m);
+
+    fibril_join(&fr);
+  }
+}
+/* out[i] = in[2*i] and out[i + m] = in[2*i + 1] for i in [a, b); ranges of 128 or more are halved and forked. */
+fibril static void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+  int i;
+  const COMPLEX *ip;
+  COMPLEX *jp;
+  if ((b - a) < 128) {
+    ip = in + a * 2;
+    for (i = a; i < b; ++i) {
+      jp = out + i;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+    }
+  } else {
+    int ab = (a + b) / 2;
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_unshuffle_2, (a, ab, in, out, m)); /* forked: [a, ab) */
+    fft_unshuffle_2(ab, b, in, out, m); /* done by the caller: [ab, b) */
+
+    fibril_join(&fr);
+  }
+}
+
+/*
+ * 4-point base case: reads in[0..3], writes out[0..3].
+ *
+ * All four inputs are read before any output is written. With x_k = in[k]:
+ *   out[0] = x0 + x1 + x2 + x3        out[2] = x0 - x1 + x2 - x3
+ *   out[1] = x0 - i*x1 - x2 + i*x3    out[3] = x0 + i*x1 - x2 - i*x3
+ */
+static void fft_base_4(COMPLEX * in, COMPLEX * out)
+{
+  /* First stage, even pair: sum and difference of in[0] and in[2]. */
+  const REAL in0_re = c_re(in[0]);
+  const REAL in0_im = c_im(in[0]);
+  const REAL in2_re = c_re(in[2]);
+  const REAL in2_im = c_im(in[2]);
+  const REAL esum_re = in0_re + in2_re;
+  const REAL esum_im = in0_im + in2_im;
+  const REAL edif_re = in0_re - in2_re;
+  const REAL edif_im = in0_im - in2_im;
+
+  /* First stage, odd pair: sum and difference of in[1] and in[3]. */
+  const REAL in1_re = c_re(in[1]);
+  const REAL in1_im = c_im(in[1]);
+  const REAL in3_re = c_re(in[3]);
+  const REAL in3_im = c_im(in[3]);
+  const REAL osum_re = in1_re + in3_re;
+  const REAL osum_im = in1_im + in3_im;
+  const REAL odif_re = in1_re - in3_re;
+  const REAL odif_im = in1_im - in3_im;
+
+  /* Second stage: the odd difference enters rotated by -i (out[1]) or +i (out[3]). */
+  c_re(out[0]) = esum_re + osum_re;
+  c_im(out[0]) = esum_im + osum_im;
+  c_re(out[2]) = esum_re - osum_re;
+  c_im(out[2]) = esum_im - osum_im;
+  c_re(out[1]) = edif_re + odif_im;
+  c_im(out[1]) = edif_im - odif_re;
+  c_re(out[3]) = edif_re - odif_im;
+  c_im(out[3]) = edif_im + odif_re;
+}
+/* Radix-4 twiddle pass over columns [a, b): jp[j*m] (j = 1..3) is multiplied by W[j * l1], l1 = nWdn * i, and a 4-point butterfly writes kp[0*m] .. kp[3*m]. Ranges of 128 or more columns are halved and forked. */
+fibril static void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out,
+    COMPLEX * W, int nW, int nWdn, int m)
+{
+  int l1, i;
+  COMPLEX *jp, *kp;
+  REAL tmpr, tmpi, wr, wi;
+  if ((b - a) < 128) {
+    for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+        i++, l1 += nWdn, kp++) {
+      jp = in + i;
+      {
+        REAL r1_0, i1_0;
+        REAL r1_1, i1_1;
+        REAL r1_2, i1_2;
+        REAL r1_3, i1_3;
+        {
+          REAL r2_0, i2_0;
+          REAL r2_2, i2_2;
+          r2_0 = c_re(jp[0 * m]);
+          i2_0 = c_im(jp[0 * m]);
+          wr = c_re(W[2 * l1]);
+          wi = c_im(W[2 * l1]);
+          tmpr = c_re(jp[2 * m]);
+          tmpi = c_im(jp[2 * m]);
+          r2_2 = ((wr * tmpr) - (wi * tmpi));
+          i2_2 = ((wi * tmpr) + (wr * tmpi));
+          r1_0 = (r2_0 + r2_2);
+          i1_0 = (i2_0 + i2_2);
+          r1_2 = (r2_0 - r2_2);
+          i1_2 = (i2_0 - i2_2);
+        }
+        {
+          REAL r2_1, i2_1;
+          REAL r2_3, i2_3;
+          wr = c_re(W[1 * l1]);
+          wi = c_im(W[1 * l1]);
+          tmpr = c_re(jp[1 * m]);
+          tmpi = c_im(jp[1 * m]);
+          r2_1 = ((wr * tmpr) - (wi * tmpi));
+          i2_1 = ((wi * tmpr) + (wr * tmpi));
+          wr = c_re(W[3 * l1]);
+          wi = c_im(W[3 * l1]);
+          tmpr = c_re(jp[3 * m]);
+          tmpi = c_im(jp[3 * m]);
+          r2_3 = ((wr * tmpr) - (wi * tmpi));
+          i2_3 = ((wi * tmpr) + (wr * tmpi));
+          r1_1 = (r2_1 + r2_3);
+          i1_1 = (i2_1 + i2_3);
+          r1_3 = (r2_1 - r2_3);
+          i1_3 = (i2_1 - i2_3);
+        }
+        c_re(kp[0 * m]) = (r1_0 + r1_1);
+        c_im(kp[0 * m]) = (i1_0 + i1_1);
+        c_re(kp[2 * m]) = (r1_0 - r1_1);
+        c_im(kp[2 * m]) = (i1_0 - i1_1);
+        c_re(kp[1 * m]) = (r1_2 + i1_3);
+        c_im(kp[1 * m]) = (i1_2 - r1_3);
+        c_re(kp[3 * m]) = (r1_2 - i1_3);
+        c_im(kp[3 * m]) = (i1_2 + r1_3);
+      }
+    }
+  } else {
+    int ab = (a + b) / 2;
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_twiddle_4, (a, ab, in, out, W, nW, nWdn, m));
+    fft_twiddle_4(ab, b, in, out, W, nW, nWdn, m);
+
+    fibril_join(&fr);
+  }
+}
+/* out[i + j*m] = in[4*i + j] for i in [a, b), j in [0, 4); ranges of 128 or more are halved and forked. */
+fibril static void fft_unshuffle_4(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+  int i;
+  const COMPLEX *ip;
+  COMPLEX *jp;
+  if ((b - a) < 128) {
+    ip = in + a * 4;
+    for (i = a; i < b; ++i) {
+      jp = out + i;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m; /* move on to rows 2 and 3 of the same column */
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+    }
+  } else {
+    int ab = (a + b) / 2;
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_unshuffle_4, (a, ab, in, out, m)); /* forked: [a, ab) */
+    fft_unshuffle_4(ab, b, in, out, m); /* done by the caller: [ab, b) */
+
+    fibril_join(&fr);
+  }
+}
+/* Machine-generated 8-point base case: reads in[0..7], writes out[0..7]. The literal 0.707106781187 is sqrt(2)/2 rounded to 12 digits. */
+static void fft_base_8(COMPLEX * in, COMPLEX * out)
+{
+  REAL tmpr, tmpi;
+  {
+    REAL r1_0, i1_0;
+    REAL r1_1, i1_1;
+    REAL r1_2, i1_2;
+    REAL r1_3, i1_3;
+    REAL r1_4, i1_4;
+    REAL r1_5, i1_5;
+    REAL r1_6, i1_6;
+    REAL r1_7, i1_7;
+    {
+      REAL r2_0, i2_0;
+      REAL r2_2, i2_2;
+      REAL r2_4, i2_4;
+      REAL r2_6, i2_6;
+      {
+        REAL r3_0, i3_0;
+        REAL r3_4, i3_4;
+        r3_0 = c_re(in[0]);
+        i3_0 = c_im(in[0]);
+        r3_4 = c_re(in[4]);
+        i3_4 = c_im(in[4]);
+        r2_0 = (r3_0 + r3_4);
+        i2_0 = (i3_0 + i3_4);
+        r2_4 = (r3_0 - r3_4);
+        i2_4 = (i3_0 - i3_4);
+      }
+      {
+        REAL r3_2, i3_2;
+        REAL r3_6, i3_6;
+        r3_2 = c_re(in[2]);
+        i3_2 = c_im(in[2]);
+        r3_6 = c_re(in[6]);
+        i3_6 = c_im(in[6]);
+        r2_2 = (r3_2 + r3_6);
+        i2_2 = (i3_2 + i3_6);
+        r2_6 = (r3_2 - r3_6);
+        i2_6 = (i3_2 - i3_6);
+      }
+      r1_0 = (r2_0 + r2_2);
+      i1_0 = (i2_0 + i2_2);
+      r1_4 = (r2_0 - r2_2);
+      i1_4 = (i2_0 - i2_2);
+      r1_2 = (r2_4 + i2_6);
+      i1_2 = (i2_4 - r2_6);
+      r1_6 = (r2_4 - i2_6);
+      i1_6 = (i2_4 + r2_6);
+    }
+    {
+      REAL r2_1, i2_1;
+      REAL r2_3, i2_3;
+      REAL r2_5, i2_5;
+      REAL r2_7, i2_7;
+      {
+        REAL r3_1, i3_1;
+        REAL r3_5, i3_5;
+        r3_1 = c_re(in[1]);
+        i3_1 = c_im(in[1]);
+        r3_5 = c_re(in[5]);
+        i3_5 = c_im(in[5]);
+        r2_1 = (r3_1 + r3_5);
+        i2_1 = (i3_1 + i3_5);
+        r2_5 = (r3_1 - r3_5);
+        i2_5 = (i3_1 - i3_5);
+      }
+      {
+        REAL r3_3, i3_3;
+        REAL r3_7, i3_7;
+        r3_3 = c_re(in[3]);
+        i3_3 = c_im(in[3]);
+        r3_7 = c_re(in[7]);
+        i3_7 = c_im(in[7]);
+        r2_3 = (r3_3 + r3_7);
+        i2_3 = (i3_3 + i3_7);
+        r2_7 = (r3_3 - r3_7);
+        i2_7 = (i3_3 - i3_7);
+      }
+      r1_1 = (r2_1 + r2_3);
+      i1_1 = (i2_1 + i2_3);
+      r1_5 = (r2_1 - r2_3);
+      i1_5 = (i2_1 - i2_3);
+      r1_3 = (r2_5 + i2_7);
+      i1_3 = (i2_5 - r2_7);
+      r1_7 = (r2_5 - i2_7);
+      i1_7 = (i2_5 + r2_7);
+    }
+    c_re(out[0]) = (r1_0 + r1_1);
+    c_im(out[0]) = (i1_0 + i1_1);
+    c_re(out[4]) = (r1_0 - r1_1);
+    c_im(out[4]) = (i1_0 - i1_1);
+    tmpr = (0.707106781187 * (r1_3 + i1_3));
+    tmpi = (0.707106781187 * (i1_3 - r1_3));
+    c_re(out[1]) = (r1_2 + tmpr);
+    c_im(out[1]) = (i1_2 + tmpi);
+    c_re(out[5]) = (r1_2 - tmpr);
+    c_im(out[5]) = (i1_2 - tmpi);
+    c_re(out[2]) = (r1_4 + i1_5);
+    c_im(out[2]) = (i1_4 - r1_5);
+    c_re(out[6]) = (r1_4 - i1_5);
+    c_im(out[6]) = (i1_4 + r1_5);
+    tmpr = (0.707106781187 * (i1_7 - r1_7));
+    tmpi = (0.707106781187 * (r1_7 + i1_7));
+    c_re(out[3]) = (r1_6 + tmpr);
+    c_im(out[3]) = (i1_6 - tmpi);
+    c_re(out[7]) = (r1_6 - tmpr);
+    c_im(out[7]) = (i1_6 + tmpi);
+  }
+}
+
+fibril static void fft_twiddle_8(int a, int b, COMPLEX * in, COMPLEX * out, /* 8-input butterfly with W scaling, applied to each index i in [a, b) */
+    COMPLEX * W, int nW, int nWdn, int m) /* nW is not read here, only forwarded to the recursive calls */
+{ /* per index i: reads in[i + k*m], writes out[i + k*m] for k = 0..7 */
+  int l1, i;
+  COMPLEX *jp, *kp;
+  REAL tmpr, tmpi, wr, wi; /* scratch values, overwritten by every complex product below */
+  if ((b - a) < 128) { /* fewer than 128 indices: handle them in this call */
+    for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+        i++, l1 += nWdn, kp++) { /* l1 tracks nWdn * i, kp tracks out + i */
+      jp = in + i; /* jp[k * m] is input k of this butterfly */
+      {
+        REAL r1_0, i1_0;
+        REAL r1_1, i1_1;
+        REAL r1_2, i1_2;
+        REAL r1_3, i1_3;
+        REAL r1_4, i1_4;
+        REAL r1_5, i1_5;
+        REAL r1_6, i1_6;
+        REAL r1_7, i1_7;
+        { /* even inputs 0, 4, 2, 6 -> r1_/i1_ slots 0, 2, 4, 6 */
+          REAL r2_0, i2_0;
+          REAL r2_2, i2_2;
+          REAL r2_4, i2_4;
+          REAL r2_6, i2_6;
+          {
+            REAL r3_0, i3_0;
+            REAL r3_4, i3_4;
+            r3_0 = c_re(jp[0 * m]); /* input 0 is taken as-is, without a W factor */
+            i3_0 = c_im(jp[0 * m]);
+            wr = c_re(W[4 * l1]);
+            wi = c_im(W[4 * l1]);
+            tmpr = c_re(jp[4 * m]);
+            tmpi = c_im(jp[4 * m]);
+            r3_4 = ((wr * tmpr) - (wi * tmpi)); /* (r3_4, i3_4) = W[4 * l1] * jp[4 * m] as a complex product */
+            i3_4 = ((wi * tmpr) + (wr * tmpi));
+            r2_0 = (r3_0 + r3_4); /* sum / difference pair of inputs 0 and 4 */
+            i2_0 = (i3_0 + i3_4);
+            r2_4 = (r3_0 - r3_4);
+            i2_4 = (i3_0 - i3_4);
+          }
+          {
+            REAL r3_2, i3_2;
+            REAL r3_6, i3_6;
+            wr = c_re(W[2 * l1]);
+            wi = c_im(W[2 * l1]);
+            tmpr = c_re(jp[2 * m]);
+            tmpi = c_im(jp[2 * m]);
+            r3_2 = ((wr * tmpr) - (wi * tmpi));
+            i3_2 = ((wi * tmpr) + (wr * tmpi));
+            wr = c_re(W[6 * l1]);
+            wi = c_im(W[6 * l1]);
+            tmpr = c_re(jp[6 * m]);
+            tmpi = c_im(jp[6 * m]);
+            r3_6 = ((wr * tmpr) - (wi * tmpi));
+            i3_6 = ((wi * tmpr) + (wr * tmpi));
+            r2_2 = (r3_2 + r3_6);
+            i2_2 = (i3_2 + i3_6);
+            r2_6 = (r3_2 - r3_6);
+            i2_6 = (i3_2 - i3_6);
+          }
+          r1_0 = (r2_0 + r2_2);
+          i1_0 = (i2_0 + i2_2);
+          r1_4 = (r2_0 - r2_2);
+          i1_4 = (i2_0 - i2_2);
+          r1_2 = (r2_4 + i2_6); /* adds (r2_6, i2_6) multiplied by -i, i.e. (i2_6, -r2_6) */
+          i1_2 = (i2_4 - r2_6);
+          r1_6 = (r2_4 - i2_6);
+          i1_6 = (i2_4 + r2_6);
+        }
+        { /* same scheme for the odd inputs 1, 5, 3, 7 -> r1_/i1_ slots 1, 3, 5, 7 */
+          REAL r2_1, i2_1;
+          REAL r2_3, i2_3;
+          REAL r2_5, i2_5;
+          REAL r2_7, i2_7;
+          {
+            REAL r3_1, i3_1;
+            REAL r3_5, i3_5;
+            wr = c_re(W[1 * l1]);
+            wi = c_im(W[1 * l1]);
+            tmpr = c_re(jp[1 * m]);
+            tmpi = c_im(jp[1 * m]);
+            r3_1 = ((wr * tmpr) - (wi * tmpi));
+            i3_1 = ((wi * tmpr) + (wr * tmpi));
+            wr = c_re(W[5 * l1]);
+            wi = c_im(W[5 * l1]);
+            tmpr = c_re(jp[5 * m]);
+            tmpi = c_im(jp[5 * m]);
+            r3_5 = ((wr * tmpr) - (wi * tmpi));
+            i3_5 = ((wi * tmpr) + (wr * tmpi));
+            r2_1 = (r3_1 + r3_5);
+            i2_1 = (i3_1 + i3_5);
+            r2_5 = (r3_1 - r3_5);
+            i2_5 = (i3_1 - i3_5);
+          }
+          {
+            REAL r3_3, i3_3;
+            REAL r3_7, i3_7;
+            wr = c_re(W[3 * l1]);
+            wi = c_im(W[3 * l1]);
+            tmpr = c_re(jp[3 * m]);
+            tmpi = c_im(jp[3 * m]);
+            r3_3 = ((wr * tmpr) - (wi * tmpi));
+            i3_3 = ((wi * tmpr) + (wr * tmpi));
+            wr = c_re(W[7 * l1]);
+            wi = c_im(W[7 * l1]);
+            tmpr = c_re(jp[7 * m]);
+            tmpi = c_im(jp[7 * m]);
+            r3_7 = ((wr * tmpr) - (wi * tmpi));
+            i3_7 = ((wi * tmpr) + (wr * tmpi));
+            r2_3 = (r3_3 + r3_7);
+            i2_3 = (i3_3 + i3_7);
+            r2_7 = (r3_3 - r3_7);
+            i2_7 = (i3_3 - i3_7);
+          }
+          r1_1 = (r2_1 + r2_3);
+          i1_1 = (i2_1 + i2_3);
+          r1_5 = (r2_1 - r2_3);
+          i1_5 = (i2_1 - i2_3);
+          r1_3 = (r2_5 + i2_7);
+          i1_3 = (i2_5 - r2_7);
+          r1_7 = (r2_5 - i2_7);
+          i1_7 = (i2_5 + r2_7);
+        }
+        c_re(kp[0 * m]) = (r1_0 + r1_1); /* final stage: slot 2k +/- scaled slot 2k+1 goes to outputs k and k + 4 */
+        c_im(kp[0 * m]) = (i1_0 + i1_1);
+        c_re(kp[4 * m]) = (r1_0 - r1_1);
+        c_im(kp[4 * m]) = (i1_0 - i1_1);
+        tmpr = (0.707106781187 * (r1_3 + i1_3)); /* 0.707106781187 ~= 1/sqrt(2); (tmpr, tmpi) = (r1_3, i1_3) * (1 - i)/sqrt(2) */
+        tmpi = (0.707106781187 * (i1_3 - r1_3));
+        c_re(kp[1 * m]) = (r1_2 + tmpr);
+        c_im(kp[1 * m]) = (i1_2 + tmpi);
+        c_re(kp[5 * m]) = (r1_2 - tmpr);
+        c_im(kp[5 * m]) = (i1_2 - tmpi);
+        c_re(kp[2 * m]) = (r1_4 + i1_5); /* factor -i again: no multiplication needed */
+        c_im(kp[2 * m]) = (i1_4 - r1_5);
+        c_re(kp[6 * m]) = (r1_4 - i1_5);
+        c_im(kp[6 * m]) = (i1_4 + r1_5);
+        tmpr = (0.707106781187 * (i1_7 - r1_7)); /* here (tmpr, -tmpi) = (r1_7, i1_7) * (-1 - i)/sqrt(2); hence the flipped tmpi signs below */
+        tmpi = (0.707106781187 * (r1_7 + i1_7));
+        c_re(kp[3 * m]) = (r1_6 + tmpr);
+        c_im(kp[3 * m]) = (i1_6 - tmpi);
+        c_re(kp[7 * m]) = (r1_6 - tmpr);
+        c_im(kp[7 * m]) = (i1_6 + tmpi);
+      }
+    }
+  } else { /* 128 or more indices: halve the range and recurse */
+    int ab = (a + b) / 2; /* midpoint; the halves are [a, ab) and [ab, b) */
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_twiddle_8, (a, ab, in, out, W, nW, nWdn, m)); /* first half via fibril_fork; presumably may run concurrently - confirm in the fibril headers */
+    fft_twiddle_8(ab, b, in, out, W, nW, nWdn, m); /* second half by a direct recursive call */
+
+    fibril_join(&fr); /* presumably waits for the forked half - confirm in the fibril headers */
+  }
+}
+
+fibril static void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * out, int m) /* copies out[i + k*m] = in[8*i + k] for i in [a, b), k = 0..7 */
+{
+  int i;
+  const COMPLEX *ip;
+  COMPLEX *jp;
+  if ((b - a) < 128) { /* fewer than 128 indices: copy them in this call */
+    ip = in + a * 8; /* ip walks in sequentially, 8 elements per index i */
+    for (i = a; i < b; ++i) {
+      jp = out + i; /* jp restarts at out[i] and advances by 2 * m after each pair */
+      jp[0] = ip[0]; /* k = 0 */
+      jp[m] = ip[1]; /* k = 1 */
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0]; /* k = 2 */
+      jp[m] = ip[1]; /* k = 3 */
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0]; /* k = 4 */
+      jp[m] = ip[1]; /* k = 5 */
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0]; /* k = 6 */
+      jp[m] = ip[1]; /* k = 7 */
+      ip += 2; /* jp is not advanced here; it is reassigned at the top of the loop */
+    }
+  } else { /* 128 or more indices: halve the range and recurse */
+    int ab = (a + b) / 2; /* midpoint; the halves are [a, ab) and [ab, b) */
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_unshuffle_8, (a, ab, in, out, m)); /* first half via fibril_fork; presumably may run concurrently - confirm in the fibril headers */
+    fft_unshuffle_8(ab, b, in, out, m); /* second half by a direct recursive call */
+
+    fibril_join(&fr); /* presumably waits for the forked half - confirm in the fibril headers */
+  }
+}
+
+static void fft_base_16(COMPLEX * in, COMPLEX * out) /* 16-input butterfly network: reads in[0..15], writes out[0..15]; no W table is involved */
+{
+  REAL tmpr, tmpi; /* scratch pair, overwritten before each use */
+  {
+    REAL r1_0, i1_0;
+    REAL r1_1, i1_1;
+    REAL r1_2, i1_2;
+    REAL r1_3, i1_3;
+    REAL r1_4, i1_4;
+    REAL r1_5, i1_5;
+    REAL r1_6, i1_6;
+    REAL r1_7, i1_7;
+    REAL r1_8, i1_8;
+    REAL r1_9, i1_9;
+    REAL r1_10, i1_10;
+    REAL r1_11, i1_11;
+    REAL r1_12, i1_12;
+    REAL r1_13, i1_13;
+    REAL r1_14, i1_14;
+    REAL r1_15, i1_15;
+    { /* even inputs in[0], in[2], ..., in[14] -> even r1_/i1_ slots */
+      REAL r2_0, i2_0;
+      REAL r2_2, i2_2;
+      REAL r2_4, i2_4;
+      REAL r2_6, i2_6;
+      REAL r2_8, i2_8;
+      REAL r2_10, i2_10;
+      REAL r2_12, i2_12;
+      REAL r2_14, i2_14;
+      {
+        REAL r3_0, i3_0;
+        REAL r3_4, i3_4;
+        REAL r3_8, i3_8;
+        REAL r3_12, i3_12;
+        {
+          REAL r4_0, i4_0;
+          REAL r4_8, i4_8;
+          r4_0 = c_re(in[0]);
+          i4_0 = c_im(in[0]);
+          r4_8 = c_re(in[8]);
+          i4_8 = c_im(in[8]);
+          r3_0 = (r4_0 + r4_8); /* sum / difference pair of in[0] and in[8] */
+          i3_0 = (i4_0 + i4_8);
+          r3_8 = (r4_0 - r4_8);
+          i3_8 = (i4_0 - i4_8);
+        }
+        {
+          REAL r4_4, i4_4;
+          REAL r4_12, i4_12;
+          r4_4 = c_re(in[4]);
+          i4_4 = c_im(in[4]);
+          r4_12 = c_re(in[12]);
+          i4_12 = c_im(in[12]);
+          r3_4 = (r4_4 + r4_12);
+          i3_4 = (i4_4 + i4_12);
+          r3_12 = (r4_4 - r4_12);
+          i3_12 = (i4_4 - i4_12);
+        }
+        r2_0 = (r3_0 + r3_4);
+        i2_0 = (i3_0 + i3_4);
+        r2_8 = (r3_0 - r3_4);
+        i2_8 = (i3_0 - i3_4);
+        r2_4 = (r3_8 + i3_12); /* adds (r3_12, i3_12) multiplied by -i, i.e. (i3_12, -r3_12) */
+        i2_4 = (i3_8 - r3_12);
+        r2_12 = (r3_8 - i3_12);
+        i2_12 = (i3_8 + r3_12);
+      }
+      {
+        REAL r3_2, i3_2;
+        REAL r3_6, i3_6;
+        REAL r3_10, i3_10;
+        REAL r3_14, i3_14;
+        {
+          REAL r4_2, i4_2;
+          REAL r4_10, i4_10;
+          r4_2 = c_re(in[2]);
+          i4_2 = c_im(in[2]);
+          r4_10 = c_re(in[10]);
+          i4_10 = c_im(in[10]);
+          r3_2 = (r4_2 + r4_10);
+          i3_2 = (i4_2 + i4_10);
+          r3_10 = (r4_2 - r4_10);
+          i3_10 = (i4_2 - i4_10);
+        }
+        {
+          REAL r4_6, i4_6;
+          REAL r4_14, i4_14;
+          r4_6 = c_re(in[6]);
+          i4_6 = c_im(in[6]);
+          r4_14 = c_re(in[14]);
+          i4_14 = c_im(in[14]);
+          r3_6 = (r4_6 + r4_14);
+          i3_6 = (i4_6 + i4_14);
+          r3_14 = (r4_6 - r4_14);
+          i3_14 = (i4_6 - i4_14);
+        }
+        r2_2 = (r3_2 + r3_6);
+        i2_2 = (i3_2 + i3_6);
+        r2_10 = (r3_2 - r3_6);
+        i2_10 = (i3_2 - i3_6);
+        r2_6 = (r3_10 + i3_14);
+        i2_6 = (i3_10 - r3_14);
+        r2_14 = (r3_10 - i3_14);
+        i2_14 = (i3_10 + r3_14);
+      }
+      r1_0 = (r2_0 + r2_2);
+      i1_0 = (i2_0 + i2_2);
+      r1_8 = (r2_0 - r2_2);
+      i1_8 = (i2_0 - i2_2);
+      tmpr = (0.707106781187 * (r2_6 + i2_6)); /* 0.707106781187 ~= 1/sqrt(2); (tmpr, tmpi) = (r2_6, i2_6) * (1 - i)/sqrt(2) */
+      tmpi = (0.707106781187 * (i2_6 - r2_6));
+      r1_2 = (r2_4 + tmpr);
+      i1_2 = (i2_4 + tmpi);
+      r1_10 = (r2_4 - tmpr);
+      i1_10 = (i2_4 - tmpi);
+      r1_4 = (r2_8 + i2_10);
+      i1_4 = (i2_8 - r2_10);
+      r1_12 = (r2_8 - i2_10);
+      i1_12 = (i2_8 + r2_10);
+      tmpr = (0.707106781187 * (i2_14 - r2_14)); /* here tmpi enters with flipped sign in the four lines below */
+      tmpi = (0.707106781187 * (r2_14 + i2_14));
+      r1_6 = (r2_12 + tmpr);
+      i1_6 = (i2_12 - tmpi);
+      r1_14 = (r2_12 - tmpr);
+      i1_14 = (i2_12 + tmpi);
+    }
+    { /* odd inputs in[1], in[3], ..., in[15] -> odd r1_/i1_ slots, same scheme as above */
+      REAL r2_1, i2_1;
+      REAL r2_3, i2_3;
+      REAL r2_5, i2_5;
+      REAL r2_7, i2_7;
+      REAL r2_9, i2_9;
+      REAL r2_11, i2_11;
+      REAL r2_13, i2_13;
+      REAL r2_15, i2_15;
+      {
+        REAL r3_1, i3_1;
+        REAL r3_5, i3_5;
+        REAL r3_9, i3_9;
+        REAL r3_13, i3_13;
+        {
+          REAL r4_1, i4_1;
+          REAL r4_9, i4_9;
+          r4_1 = c_re(in[1]);
+          i4_1 = c_im(in[1]);
+          r4_9 = c_re(in[9]);
+          i4_9 = c_im(in[9]);
+          r3_1 = (r4_1 + r4_9);
+          i3_1 = (i4_1 + i4_9);
+          r3_9 = (r4_1 - r4_9);
+          i3_9 = (i4_1 - i4_9);
+        }
+        {
+          REAL r4_5, i4_5;
+          REAL r4_13, i4_13;
+          r4_5 = c_re(in[5]);
+          i4_5 = c_im(in[5]);
+          r4_13 = c_re(in[13]);
+          i4_13 = c_im(in[13]);
+          r3_5 = (r4_5 + r4_13);
+          i3_5 = (i4_5 + i4_13);
+          r3_13 = (r4_5 - r4_13);
+          i3_13 = (i4_5 - i4_13);
+        }
+        r2_1 = (r3_1 + r3_5);
+        i2_1 = (i3_1 + i3_5);
+        r2_9 = (r3_1 - r3_5);
+        i2_9 = (i3_1 - i3_5);
+        r2_5 = (r3_9 + i3_13);
+        i2_5 = (i3_9 - r3_13);
+        r2_13 = (r3_9 - i3_13);
+        i2_13 = (i3_9 + r3_13);
+      }
+      {
+        REAL r3_3, i3_3;
+        REAL r3_7, i3_7;
+        REAL r3_11, i3_11;
+        REAL r3_15, i3_15;
+        {
+          REAL r4_3, i4_3;
+          REAL r4_11, i4_11;
+          r4_3 = c_re(in[3]);
+          i4_3 = c_im(in[3]);
+          r4_11 = c_re(in[11]);
+          i4_11 = c_im(in[11]);
+          r3_3 = (r4_3 + r4_11);
+          i3_3 = (i4_3 + i4_11);
+          r3_11 = (r4_3 - r4_11);
+          i3_11 = (i4_3 - i4_11);
+        }
+        {
+          REAL r4_7, i4_7;
+          REAL r4_15, i4_15;
+          r4_7 = c_re(in[7]);
+          i4_7 = c_im(in[7]);
+          r4_15 = c_re(in[15]);
+          i4_15 = c_im(in[15]);
+          r3_7 = (r4_7 + r4_15);
+          i3_7 = (i4_7 + i4_15);
+          r3_15 = (r4_7 - r4_15);
+          i3_15 = (i4_7 - i4_15);
+        }
+        r2_3 = (r3_3 + r3_7);
+        i2_3 = (i3_3 + i3_7);
+        r2_11 = (r3_3 - r3_7);
+        i2_11 = (i3_3 - i3_7);
+        r2_7 = (r3_11 + i3_15);
+        i2_7 = (i3_11 - r3_15);
+        r2_15 = (r3_11 - i3_15);
+        i2_15 = (i3_11 + r3_15);
+      }
+      r1_1 = (r2_1 + r2_3);
+      i1_1 = (i2_1 + i2_3);
+      r1_9 = (r2_1 - r2_3);
+      i1_9 = (i2_1 - i2_3);
+      tmpr = (0.707106781187 * (r2_7 + i2_7));
+      tmpi = (0.707106781187 * (i2_7 - r2_7));
+      r1_3 = (r2_5 + tmpr);
+      i1_3 = (i2_5 + tmpi);
+      r1_11 = (r2_5 - tmpr);
+      i1_11 = (i2_5 - tmpi);
+      r1_5 = (r2_9 + i2_11);
+      i1_5 = (i2_9 - r2_11);
+      r1_13 = (r2_9 - i2_11);
+      i1_13 = (i2_9 + r2_11);
+      tmpr = (0.707106781187 * (i2_15 - r2_15));
+      tmpi = (0.707106781187 * (r2_15 + i2_15));
+      r1_7 = (r2_13 + tmpr);
+      i1_7 = (i2_13 - tmpi);
+      r1_15 = (r2_13 - tmpr);
+      i1_15 = (i2_13 + tmpi);
+    }
+    c_re(out[0]) = (r1_0 + r1_1); /* final stage: slot 2k +/- scaled slot 2k+1 goes to out[k] and out[k + 8] */
+    c_im(out[0]) = (i1_0 + i1_1);
+    c_re(out[8]) = (r1_0 - r1_1);
+    c_im(out[8]) = (i1_0 - i1_1);
+    tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); /* 0.923879532511 ~= cos(pi/8), 0.382683432365 ~= sin(pi/8) */
+    tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3));
+    c_re(out[1]) = (r1_2 + tmpr);
+    c_im(out[1]) = (i1_2 + tmpi);
+    c_re(out[9]) = (r1_2 - tmpr);
+    c_im(out[9]) = (i1_2 - tmpi);
+    tmpr = (0.707106781187 * (r1_5 + i1_5));
+    tmpi = (0.707106781187 * (i1_5 - r1_5));
+    c_re(out[2]) = (r1_4 + tmpr);
+    c_im(out[2]) = (i1_4 + tmpi);
+    c_re(out[10]) = (r1_4 - tmpr);
+    c_im(out[10]) = (i1_4 - tmpi);
+    tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7));
+    tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7));
+    c_re(out[3]) = (r1_6 + tmpr);
+    c_im(out[3]) = (i1_6 + tmpi);
+    c_re(out[11]) = (r1_6 - tmpr);
+    c_im(out[11]) = (i1_6 - tmpi);
+    c_re(out[4]) = (r1_8 + i1_9); /* k = 4: the factor is -i, so no multiplication is needed */
+    c_im(out[4]) = (i1_8 - r1_9);
+    c_re(out[12]) = (r1_8 - i1_9);
+    c_im(out[12]) = (i1_8 + r1_9);
+    tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); /* for k = 5..7 tmpi enters with flipped sign: out[k] uses -tmpi, out[k + 8] uses +tmpi */
+    tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11));
+    c_re(out[5]) = (r1_10 + tmpr);
+    c_im(out[5]) = (i1_10 - tmpi);
+    c_re(out[13]) = (r1_10 - tmpr);
+    c_im(out[13]) = (i1_10 + tmpi);
+    tmpr = (0.707106781187 * (i1_13 - r1_13));
+    tmpi = (0.707106781187 * (r1_13 + i1_13));
+    c_re(out[6]) = (r1_12 + tmpr);
+    c_im(out[6]) = (i1_12 - tmpi);
+    c_re(out[14]) = (r1_12 - tmpr);
+    c_im(out[14]) = (i1_12 + tmpi);
+    tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15));
+    tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15));
+    c_re(out[7]) = (r1_14 + tmpr);
+    c_im(out[7]) = (i1_14 - tmpi);
+    c_re(out[15]) = (r1_14 - tmpr);
+    c_im(out[15]) = (i1_14 + tmpi);
+  }
+}
+
+fibril static void fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, /* 16-input butterfly with W scaling, applied to each index i in [a, b) */
+    COMPLEX * W, int nW, int nWdn, int m) /* nW is not read here, only forwarded to the recursive calls */
+{ /* per index i: reads in[i + k*m], writes out[i + k*m] for k = 0..15 */
+  int l1, i;
+  COMPLEX *jp, *kp;
+  REAL tmpr, tmpi, wr, wi; /* scratch values, overwritten by every complex product below */
+  if ((b - a) < 128) { /* fewer than 128 indices: handle them in this call */
+    for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+        i++, l1 += nWdn, kp++) { /* l1 tracks nWdn * i, kp tracks out + i */
+      jp = in + i; /* jp[k * m] is input k of this butterfly */
+      {
+        REAL r1_0, i1_0;
+        REAL r1_1, i1_1;
+        REAL r1_2, i1_2;
+        REAL r1_3, i1_3;
+        REAL r1_4, i1_4;
+        REAL r1_5, i1_5;
+        REAL r1_6, i1_6;
+        REAL r1_7, i1_7;
+        REAL r1_8, i1_8;
+        REAL r1_9, i1_9;
+        REAL r1_10, i1_10;
+        REAL r1_11, i1_11;
+        REAL r1_12, i1_12;
+        REAL r1_13, i1_13;
+        REAL r1_14, i1_14;
+        REAL r1_15, i1_15;
+        { /* even inputs 0, 2, ..., 14 -> even r1_/i1_ slots */
+          REAL r2_0, i2_0;
+          REAL r2_2, i2_2;
+          REAL r2_4, i2_4;
+          REAL r2_6, i2_6;
+          REAL r2_8, i2_8;
+          REAL r2_10, i2_10;
+          REAL r2_12, i2_12;
+          REAL r2_14, i2_14;
+          {
+            REAL r3_0, i3_0;
+            REAL r3_4, i3_4;
+            REAL r3_8, i3_8;
+            REAL r3_12, i3_12;
+            {
+              REAL r4_0, i4_0;
+              REAL r4_8, i4_8;
+              r4_0 = c_re(jp[0 * m]); /* input 0 is taken as-is, without a W factor */
+              i4_0 = c_im(jp[0 * m]);
+              wr = c_re(W[8 * l1]);
+              wi = c_im(W[8 * l1]);
+              tmpr = c_re(jp[8 * m]);
+              tmpi = c_im(jp[8 * m]);
+              r4_8 = ((wr * tmpr) - (wi * tmpi)); /* (r4_8, i4_8) = W[8 * l1] * jp[8 * m] as a complex product */
+              i4_8 = ((wi * tmpr) + (wr * tmpi));
+              r3_0 = (r4_0 + r4_8); /* sum / difference pair of inputs 0 and 8 */
+              i3_0 = (i4_0 + i4_8);
+              r3_8 = (r4_0 - r4_8);
+              i3_8 = (i4_0 - i4_8);
+            }
+            {
+              REAL r4_4, i4_4;
+              REAL r4_12, i4_12;
+              wr = c_re(W[4 * l1]);
+              wi = c_im(W[4 * l1]);
+              tmpr = c_re(jp[4 * m]);
+              tmpi = c_im(jp[4 * m]);
+              r4_4 = ((wr * tmpr) - (wi * tmpi));
+              i4_4 = ((wi * tmpr) + (wr * tmpi));
+              wr = c_re(W[12 * l1]);
+              wi = c_im(W[12 * l1]);
+              tmpr = c_re(jp[12 * m]);
+              tmpi = c_im(jp[12 * m]);
+              r4_12 = ((wr * tmpr) - (wi * tmpi));
+              i4_12 = ((wi * tmpr) + (wr * tmpi));
+              r3_4 = (r4_4 + r4_12);
+              i3_4 = (i4_4 + i4_12);
+              r3_12 = (r4_4 - r4_12);
+              i3_12 = (i4_4 - i4_12);
+            }
+            r2_0 = (r3_0 + r3_4);
+            i2_0 = (i3_0 + i3_4);
+            r2_8 = (r3_0 - r3_4);
+            i2_8 = (i3_0 - i3_4);
+            r2_4 = (r3_8 + i3_12); /* adds (r3_12, i3_12) multiplied by -i, i.e. (i3_12, -r3_12) */
+            i2_4 = (i3_8 - r3_12);
+            r2_12 = (r3_8 - i3_12);
+            i2_12 = (i3_8 + r3_12);
+          }
+          {
+            REAL r3_2, i3_2;
+            REAL r3_6, i3_6;
+            REAL r3_10, i3_10;
+            REAL r3_14, i3_14;
+            {
+              REAL r4_2, i4_2;
+              REAL r4_10, i4_10;
+              wr = c_re(W[2 * l1]);
+              wi = c_im(W[2 * l1]);
+              tmpr = c_re(jp[2 * m]);
+              tmpi = c_im(jp[2 * m]);
+              r4_2 = ((wr * tmpr) - (wi * tmpi));
+              i4_2 = ((wi * tmpr) + (wr * tmpi));
+              wr = c_re(W[10 * l1]);
+              wi = c_im(W[10 * l1]);
+              tmpr = c_re(jp[10 * m]);
+              tmpi = c_im(jp[10 * m]);
+              r4_10 = ((wr * tmpr) - (wi * tmpi));
+              i4_10 = ((wi * tmpr) + (wr * tmpi));
+              r3_2 = (r4_2 + r4_10);
+              i3_2 = (i4_2 + i4_10);
+              r3_10 = (r4_2 - r4_10);
+              i3_10 = (i4_2 - i4_10);
+            }
+            {
+              REAL r4_6, i4_6;
+              REAL r4_14, i4_14;
+              wr = c_re(W[6 * l1]);
+              wi = c_im(W[6 * l1]);
+              tmpr = c_re(jp[6 * m]);
+              tmpi = c_im(jp[6 * m]);
+              r4_6 = ((wr * tmpr) - (wi * tmpi));
+              i4_6 = ((wi * tmpr) + (wr * tmpi));
+              wr = c_re(W[14 * l1]);
+              wi = c_im(W[14 * l1]);
+              tmpr = c_re(jp[14 * m]);
+              tmpi = c_im(jp[14 * m]);
+              r4_14 = ((wr * tmpr) - (wi * tmpi));
+              i4_14 = ((wi * tmpr) + (wr * tmpi));
+              r3_6 = (r4_6 + r4_14);
+              i3_6 = (i4_6 + i4_14);
+              r3_14 = (r4_6 - r4_14);
+              i3_14 = (i4_6 - i4_14);
+            }
+            r2_2 = (r3_2 + r3_6);
+            i2_2 = (i3_2 + i3_6);
+            r2_10 = (r3_2 - r3_6);
+            i2_10 = (i3_2 - i3_6);
+            r2_6 = (r3_10 + i3_14);
+            i2_6 = (i3_10 - r3_14);
+            r2_14 = (r3_10 - i3_14);
+            i2_14 = (i3_10 + r3_14);
+          }
+          r1_0 = (r2_0 + r2_2);
+          i1_0 = (i2_0 + i2_2);
+          r1_8 = (r2_0 - r2_2);
+          i1_8 = (i2_0 - i2_2);
+          tmpr = (0.707106781187 * (r2_6 + i2_6)); /* 0.707106781187 ~= 1/sqrt(2); (tmpr, tmpi) = (r2_6, i2_6) * (1 - i)/sqrt(2) */
+          tmpi = (0.707106781187 * (i2_6 - r2_6));
+          r1_2 = (r2_4 + tmpr);
+          i1_2 = (i2_4 + tmpi);
+          r1_10 = (r2_4 - tmpr);
+          i1_10 = (i2_4 - tmpi);
+          r1_4 = (r2_8 + i2_10);
+          i1_4 = (i2_8 - r2_10);
+          r1_12 = (r2_8 - i2_10);
+          i1_12 = (i2_8 + r2_10);
+          tmpr = (0.707106781187 * (i2_14 - r2_14)); /* here tmpi enters with flipped sign in the four lines below */
+          tmpi = (0.707106781187 * (r2_14 + i2_14));
+          r1_6 = (r2_12 + tmpr);
+          i1_6 = (i2_12 - tmpi);
+          r1_14 = (r2_12 - tmpr);
+          i1_14 = (i2_12 + tmpi);
+        }
+        { /* odd inputs 1, 3, ..., 15 -> odd r1_/i1_ slots, same scheme as above */
+          REAL r2_1, i2_1;
+          REAL r2_3, i2_3;
+          REAL r2_5, i2_5;
+          REAL r2_7, i2_7;
+          REAL r2_9, i2_9;
+          REAL r2_11, i2_11;
+          REAL r2_13, i2_13;
+          REAL r2_15, i2_15;
+          {
+            REAL r3_1, i3_1;
+            REAL r3_5, i3_5;
+            REAL r3_9, i3_9;
+            REAL r3_13, i3_13;
+            {
+              REAL r4_1, i4_1;
+              REAL r4_9, i4_9;
+              wr = c_re(W[1 * l1]);
+              wi = c_im(W[1 * l1]);
+              tmpr = c_re(jp[1 * m]);
+              tmpi = c_im(jp[1 * m]);
+              r4_1 = ((wr * tmpr) - (wi * tmpi));
+              i4_1 = ((wi * tmpr) + (wr * tmpi));
+              wr = c_re(W[9 * l1]);
+              wi = c_im(W[9 * l1]);
+              tmpr = c_re(jp[9 * m]);
+              tmpi = c_im(jp[9 * m]);
+              r4_9 = ((wr * tmpr) - (wi * tmpi));
+              i4_9 = ((wi * tmpr) + (wr * tmpi));
+              r3_1 = (r4_1 + r4_9);
+              i3_1 = (i4_1 + i4_9);
+              r3_9 = (r4_1 - r4_9);
+              i3_9 = (i4_1 - i4_9);
+            }
+            {
+              REAL r4_5, i4_5;
+              REAL r4_13, i4_13;
+              wr = c_re(W[5 * l1]);
+              wi = c_im(W[5 * l1]);
+              tmpr = c_re(jp[5 * m]);
+              tmpi = c_im(jp[5 * m]);
+              r4_5 = ((wr * tmpr) - (wi * tmpi));
+              i4_5 = ((wi * tmpr) + (wr * tmpi));
+              wr = c_re(W[13 * l1]);
+              wi = c_im(W[13 * l1]);
+              tmpr = c_re(jp[13 * m]);
+              tmpi = c_im(jp[13 * m]);
+              r4_13 = ((wr * tmpr) - (wi * tmpi));
+              i4_13 = ((wi * tmpr) + (wr * tmpi));
+              r3_5 = (r4_5 + r4_13);
+              i3_5 = (i4_5 + i4_13);
+              r3_13 = (r4_5 - r4_13);
+              i3_13 = (i4_5 - i4_13);
+            }
+            r2_1 = (r3_1 + r3_5);
+            i2_1 = (i3_1 + i3_5);
+            r2_9 = (r3_1 - r3_5);
+            i2_9 = (i3_1 - i3_5);
+            r2_5 = (r3_9 + i3_13);
+            i2_5 = (i3_9 - r3_13);
+            r2_13 = (r3_9 - i3_13);
+            i2_13 = (i3_9 + r3_13);
+          }
+          {
+            REAL r3_3, i3_3;
+            REAL r3_7, i3_7;
+            REAL r3_11, i3_11;
+            REAL r3_15, i3_15;
+            {
+              REAL r4_3, i4_3;
+              REAL r4_11, i4_11;
+              wr = c_re(W[3 * l1]);
+              wi = c_im(W[3 * l1]);
+              tmpr = c_re(jp[3 * m]);
+              tmpi = c_im(jp[3 * m]);
+              r4_3 = ((wr * tmpr) - (wi * tmpi));
+              i4_3 = ((wi * tmpr) + (wr * tmpi));
+              wr = c_re(W[11 * l1]);
+              wi = c_im(W[11 * l1]);
+              tmpr = c_re(jp[11 * m]);
+              tmpi = c_im(jp[11 * m]);
+              r4_11 = ((wr * tmpr) - (wi * tmpi));
+              i4_11 = ((wi * tmpr) + (wr * tmpi));
+              r3_3 = (r4_3 + r4_11);
+              i3_3 = (i4_3 + i4_11);
+              r3_11 = (r4_3 - r4_11);
+              i3_11 = (i4_3 - i4_11);
+            }
+            {
+              REAL r4_7, i4_7;
+              REAL r4_15, i4_15;
+              wr = c_re(W[7 * l1]);
+              wi = c_im(W[7 * l1]);
+              tmpr = c_re(jp[7 * m]);
+              tmpi = c_im(jp[7 * m]);
+              r4_7 = ((wr * tmpr) - (wi * tmpi));
+              i4_7 = ((wi * tmpr) + (wr * tmpi));
+              wr = c_re(W[15 * l1]);
+              wi = c_im(W[15 * l1]);
+              tmpr = c_re(jp[15 * m]);
+              tmpi = c_im(jp[15 * m]);
+              r4_15 = ((wr * tmpr) - (wi * tmpi));
+              i4_15 = ((wi * tmpr) + (wr * tmpi));
+              r3_7 = (r4_7 + r4_15);
+              i3_7 = (i4_7 + i4_15);
+              r3_15 = (r4_7 - r4_15);
+              i3_15 = (i4_7 - i4_15);
+            }
+            r2_3 = (r3_3 + r3_7);
+            i2_3 = (i3_3 + i3_7);
+            r2_11 = (r3_3 - r3_7);
+            i2_11 = (i3_3 - i3_7);
+            r2_7 = (r3_11 + i3_15);
+            i2_7 = (i3_11 - r3_15);
+            r2_15 = (r3_11 - i3_15);
+            i2_15 = (i3_11 + r3_15);
+          }
+          r1_1 = (r2_1 + r2_3);
+          i1_1 = (i2_1 + i2_3);
+          r1_9 = (r2_1 - r2_3);
+          i1_9 = (i2_1 - i2_3);
+          tmpr = (0.707106781187 * (r2_7 + i2_7));
+          tmpi = (0.707106781187 * (i2_7 - r2_7));
+          r1_3 = (r2_5 + tmpr);
+          i1_3 = (i2_5 + tmpi);
+          r1_11 = (r2_5 - tmpr);
+          i1_11 = (i2_5 - tmpi);
+          r1_5 = (r2_9 + i2_11);
+          i1_5 = (i2_9 - r2_11);
+          r1_13 = (r2_9 - i2_11);
+          i1_13 = (i2_9 + r2_11);
+          tmpr = (0.707106781187 * (i2_15 - r2_15));
+          tmpi = (0.707106781187 * (r2_15 + i2_15));
+          r1_7 = (r2_13 + tmpr);
+          i1_7 = (i2_13 - tmpi);
+          r1_15 = (r2_13 - tmpr);
+          i1_15 = (i2_13 + tmpi);
+        }
+        c_re(kp[0 * m]) = (r1_0 + r1_1); /* final stage: slot 2k +/- scaled slot 2k+1 goes to outputs k and k + 8 */
+        c_im(kp[0 * m]) = (i1_0 + i1_1);
+        c_re(kp[8 * m]) = (r1_0 - r1_1);
+        c_im(kp[8 * m]) = (i1_0 - i1_1);
+        tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); /* 0.923879532511 ~= cos(pi/8), 0.382683432365 ~= sin(pi/8) */
+        tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3));
+        c_re(kp[1 * m]) = (r1_2 + tmpr);
+        c_im(kp[1 * m]) = (i1_2 + tmpi);
+        c_re(kp[9 * m]) = (r1_2 - tmpr);
+        c_im(kp[9 * m]) = (i1_2 - tmpi);
+        tmpr = (0.707106781187 * (r1_5 + i1_5));
+        tmpi = (0.707106781187 * (i1_5 - r1_5));
+        c_re(kp[2 * m]) = (r1_4 + tmpr);
+        c_im(kp[2 * m]) = (i1_4 + tmpi);
+        c_re(kp[10 * m]) = (r1_4 - tmpr);
+        c_im(kp[10 * m]) = (i1_4 - tmpi);
+        tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7));
+        tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7));
+        c_re(kp[3 * m]) = (r1_6 + tmpr);
+        c_im(kp[3 * m]) = (i1_6 + tmpi);
+        c_re(kp[11 * m]) = (r1_6 - tmpr);
+        c_im(kp[11 * m]) = (i1_6 - tmpi);
+        c_re(kp[4 * m]) = (r1_8 + i1_9); /* k = 4: the factor is -i, so no multiplication is needed */
+        c_im(kp[4 * m]) = (i1_8 - r1_9);
+        c_re(kp[12 * m]) = (r1_8 - i1_9);
+        c_im(kp[12 * m]) = (i1_8 + r1_9);
+        tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); /* for k = 5..7 tmpi enters with flipped sign: output k uses -tmpi, output k + 8 uses +tmpi */
+        tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11));
+        c_re(kp[5 * m]) = (r1_10 + tmpr);
+        c_im(kp[5 * m]) = (i1_10 - tmpi);
+        c_re(kp[13 * m]) = (r1_10 - tmpr);
+        c_im(kp[13 * m]) = (i1_10 + tmpi);
+        tmpr = (0.707106781187 * (i1_13 - r1_13));
+        tmpi = (0.707106781187 * (r1_13 + i1_13));
+        c_re(kp[6 * m]) = (r1_12 + tmpr);
+        c_im(kp[6 * m]) = (i1_12 - tmpi);
+        c_re(kp[14 * m]) = (r1_12 - tmpr);
+        c_im(kp[14 * m]) = (i1_12 + tmpi);
+        tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15));
+        tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15));
+        c_re(kp[7 * m]) = (r1_14 + tmpr);
+        c_im(kp[7 * m]) = (i1_14 - tmpi);
+        c_re(kp[15 * m]) = (r1_14 - tmpr);
+        c_im(kp[15 * m]) = (i1_14 + tmpi);
+      }
+    }
+  } else { /* 128 or more indices: halve the range and recurse */
+    int ab = (a + b) / 2; /* midpoint; the halves are [a, ab) and [ab, b) */
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_twiddle_16, (a, ab, in, out, W, nW, nWdn, m)); /* first half via fibril_fork; presumably may run concurrently - confirm in the fibril headers */
+    fft_twiddle_16(ab, b, in, out, W, nW, nWdn, m); /* second half by a direct recursive call */
+
+    fibril_join(&fr); /* presumably waits for the forked half - confirm in the fibril headers */
+  }
+}
+
+fibril static void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m) /* copies out[i + k*m] = in[16*i + k] for i in [a, b), k = 0..15 */
+{
+  int i;
+  const COMPLEX *ip;
+  COMPLEX *jp;
+  if ((b - a) < 128) { /* fewer than 128 indices: copy them in this call */
+    ip = in + a * 16; /* ip walks in sequentially, 16 elements per index i */
+    for (i = a; i < b; ++i) {
+      jp = out + i; /* jp restarts at out[i] and advances by 2 * m after each pair */
+      jp[0] = ip[0]; /* k = 0 */
+      jp[m] = ip[1]; /* k = 1 */
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0]; /* k = 2 */
+      jp[m] = ip[1]; /* k = 3 */
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0]; /* k = 4 */
+      jp[m] = ip[1]; /* k = 5 */
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0]; /* k = 6 */
+      jp[m] = ip[1]; /* k = 7 */
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0]; /* k = 8 */
+      jp[m] = ip[1]; /* k = 9 */
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0]; /* k = 10 */
+      jp[m] = ip[1]; /* k = 11 */
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0]; /* k = 12 */
+      jp[m] = ip[1]; /* k = 13 */
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0]; /* k = 14 */
+      jp[m] = ip[1]; /* k = 15 */
+      ip += 2; /* jp is not advanced here; it is reassigned at the top of the loop */
+    }
+  } else { /* 128 or more indices: halve the range and recurse */
+    int ab = (a + b) / 2; /* midpoint; the halves are [a, ab) and [ab, b) */
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_unshuffle_16, (a, ab, in, out, m)); /* first half via fibril_fork; presumably may run concurrently - confirm in the fibril headers */
+    fft_unshuffle_16(ab, b, in, out, m); /* second half by a direct recursive call */
+
+    fibril_join(&fr); /* presumably waits for the forked half - confirm in the fibril headers */
+  }
+}
+
+static void fft_base_32(COMPLEX * in, COMPLEX * out)
+{
+  REAL tmpr, tmpi;
+  {
+    REAL r1_0, i1_0;
+    REAL r1_1, i1_1;
+    REAL r1_2, i1_2;
+    REAL r1_3, i1_3;
+    REAL r1_4, i1_4;
+    REAL r1_5, i1_5;
+    REAL r1_6, i1_6;
+    REAL r1_7, i1_7;
+    REAL r1_8, i1_8;
+    REAL r1_9, i1_9;
+    REAL r1_10, i1_10;
+    REAL r1_11, i1_11;
+    REAL r1_12, i1_12;
+    REAL r1_13, i1_13;
+    REAL r1_14, i1_14;
+    REAL r1_15, i1_15;
+    REAL r1_16, i1_16;
+    REAL r1_17, i1_17;
+    REAL r1_18, i1_18;
+    REAL r1_19, i1_19;
+    REAL r1_20, i1_20;
+    REAL r1_21, i1_21;
+    REAL r1_22, i1_22;
+    REAL r1_23, i1_23;
+    REAL r1_24, i1_24;
+    REAL r1_25, i1_25;
+    REAL r1_26, i1_26;
+    REAL r1_27, i1_27;
+    REAL r1_28, i1_28;
+    REAL r1_29, i1_29;
+    REAL r1_30, i1_30;
+    REAL r1_31, i1_31;
+    {
+      REAL r2_0, i2_0;
+      REAL r2_2, i2_2;
+      REAL r2_4, i2_4;
+      REAL r2_6, i2_6;
+      REAL r2_8, i2_8;
+      REAL r2_10, i2_10;
+      REAL r2_12, i2_12;
+      REAL r2_14, i2_14;
+      REAL r2_16, i2_16;
+      REAL r2_18, i2_18;
+      REAL r2_20, i2_20;
+      REAL r2_22, i2_22;
+      REAL r2_24, i2_24;
+      REAL r2_26, i2_26;
+      REAL r2_28, i2_28;
+      REAL r2_30, i2_30;
+      {
+        REAL r3_0, i3_0;
+        REAL r3_4, i3_4;
+        REAL r3_8, i3_8;
+        REAL r3_12, i3_12;
+        REAL r3_16, i3_16;
+        REAL r3_20, i3_20;
+        REAL r3_24, i3_24;
+        REAL r3_28, i3_28;
+        {
+          REAL r4_0, i4_0;
+          REAL r4_8, i4_8;
+          REAL r4_16, i4_16;
+          REAL r4_24, i4_24;
+          {
+            REAL r5_0, i5_0;
+            REAL r5_16, i5_16;
+            r5_0 = c_re(in[0]);
+            i5_0 = c_im(in[0]);
+            r5_16 = c_re(in[16]);
+            i5_16 = c_im(in[16]);
+            r4_0 = (r5_0 + r5_16);
+            i4_0 = (i5_0 + i5_16);
+            r4_16 = (r5_0 - r5_16);
+            i4_16 = (i5_0 - i5_16);
+          }
+          {
+            REAL r5_8, i5_8;
+            REAL r5_24, i5_24;
+            r5_8 = c_re(in[8]);
+            i5_8 = c_im(in[8]);
+            r5_24 = c_re(in[24]);
+            i5_24 = c_im(in[24]);
+            r4_8 = (r5_8 + r5_24);
+            i4_8 = (i5_8 + i5_24);
+            r4_24 = (r5_8 - r5_24);
+            i4_24 = (i5_8 - i5_24);
+          }
+          r3_0 = (r4_0 + r4_8);
+          i3_0 = (i4_0 + i4_8);
+          r3_16 = (r4_0 - r4_8);
+          i3_16 = (i4_0 - i4_8);
+          r3_8 = (r4_16 + i4_24);
+          i3_8 = (i4_16 - r4_24);
+          r3_24 = (r4_16 - i4_24);
+          i3_24 = (i4_16 + r4_24);
+        }
+        {
+          REAL r4_4, i4_4;
+          REAL r4_12, i4_12;
+          REAL r4_20, i4_20;
+          REAL r4_28, i4_28;
+          {
+            REAL r5_4, i5_4;
+            REAL r5_20, i5_20;
+            r5_4 = c_re(in[4]);
+            i5_4 = c_im(in[4]);
+            r5_20 = c_re(in[20]);
+            i5_20 = c_im(in[20]);
+            r4_4 = (r5_4 + r5_20);
+            i4_4 = (i5_4 + i5_20);
+            r4_20 = (r5_4 - r5_20);
+            i4_20 = (i5_4 - i5_20);
+          }
+          {
+            REAL r5_12, i5_12;
+            REAL r5_28, i5_28;
+            r5_12 = c_re(in[12]);
+            i5_12 = c_im(in[12]);
+            r5_28 = c_re(in[28]);
+            i5_28 = c_im(in[28]);
+            r4_12 = (r5_12 + r5_28);
+            i4_12 = (i5_12 + i5_28);
+            r4_28 = (r5_12 - r5_28);
+            i4_28 = (i5_12 - i5_28);
+          }
+          r3_4 = (r4_4 + r4_12);
+          i3_4 = (i4_4 + i4_12);
+          r3_20 = (r4_4 - r4_12);
+          i3_20 = (i4_4 - i4_12);
+          r3_12 = (r4_20 + i4_28);
+          i3_12 = (i4_20 - r4_28);
+          r3_28 = (r4_20 - i4_28);
+          i3_28 = (i4_20 + r4_28);
+        }
+        r2_0 = (r3_0 + r3_4);
+        i2_0 = (i3_0 + i3_4);
+        r2_16 = (r3_0 - r3_4);
+        i2_16 = (i3_0 - i3_4);
+        tmpr = (0.707106781187 * (r3_12 + i3_12));
+        tmpi = (0.707106781187 * (i3_12 - r3_12));
+        r2_4 = (r3_8 + tmpr);
+        i2_4 = (i3_8 + tmpi);
+        r2_20 = (r3_8 - tmpr);
+        i2_20 = (i3_8 - tmpi);
+        r2_8 = (r3_16 + i3_20);
+        i2_8 = (i3_16 - r3_20);
+        r2_24 = (r3_16 - i3_20);
+        i2_24 = (i3_16 + r3_20);
+        tmpr = (0.707106781187 * (i3_28 - r3_28));
+        tmpi = (0.707106781187 * (r3_28 + i3_28));
+        r2_12 = (r3_24 + tmpr);
+        i2_12 = (i3_24 - tmpi);
+        r2_28 = (r3_24 - tmpr);
+        i2_28 = (i3_24 + tmpi);
+      }
+      {
+        REAL r3_2, i3_2;
+        REAL r3_6, i3_6;
+        REAL r3_10, i3_10;
+        REAL r3_14, i3_14;
+        REAL r3_18, i3_18;
+        REAL r3_22, i3_22;
+        REAL r3_26, i3_26;
+        REAL r3_30, i3_30;
+        {
+          REAL r4_2, i4_2;
+          REAL r4_10, i4_10;
+          REAL r4_18, i4_18;
+          REAL r4_26, i4_26;
+          {
+            REAL r5_2, i5_2;
+            REAL r5_18, i5_18;
+            r5_2 = c_re(in[2]);
+            i5_2 = c_im(in[2]);
+            r5_18 = c_re(in[18]);
+            i5_18 = c_im(in[18]);
+            r4_2 = (r5_2 + r5_18);
+            i4_2 = (i5_2 + i5_18);
+            r4_18 = (r5_2 - r5_18);
+            i4_18 = (i5_2 - i5_18);
+          }
+          {
+            REAL r5_10, i5_10;
+            REAL r5_26, i5_26;
+            r5_10 = c_re(in[10]);
+            i5_10 = c_im(in[10]);
+            r5_26 = c_re(in[26]);
+            i5_26 = c_im(in[26]);
+            r4_10 = (r5_10 + r5_26);
+            i4_10 = (i5_10 + i5_26);
+            r4_26 = (r5_10 - r5_26);
+            i4_26 = (i5_10 - i5_26);
+          }
+          r3_2 = (r4_2 + r4_10);
+          i3_2 = (i4_2 + i4_10);
+          r3_18 = (r4_2 - r4_10);
+          i3_18 = (i4_2 - i4_10);
+          r3_10 = (r4_18 + i4_26);
+          i3_10 = (i4_18 - r4_26);
+          r3_26 = (r4_18 - i4_26);
+          i3_26 = (i4_18 + r4_26);
+        }
+        {
+          REAL r4_6, i4_6;
+          REAL r4_14, i4_14;
+          REAL r4_22, i4_22;
+          REAL r4_30, i4_30;
+          {
+            REAL r5_6, i5_6;
+            REAL r5_22, i5_22;
+            r5_6 = c_re(in[6]);
+            i5_6 = c_im(in[6]);
+            r5_22 = c_re(in[22]);
+            i5_22 = c_im(in[22]);
+            r4_6 = (r5_6 + r5_22);
+            i4_6 = (i5_6 + i5_22);
+            r4_22 = (r5_6 - r5_22);
+            i4_22 = (i5_6 - i5_22);
+          }
+          {
+            REAL r5_14, i5_14;
+            REAL r5_30, i5_30;
+            r5_14 = c_re(in[14]);
+            i5_14 = c_im(in[14]);
+            r5_30 = c_re(in[30]);
+            i5_30 = c_im(in[30]);
+            r4_14 = (r5_14 + r5_30);
+            i4_14 = (i5_14 + i5_30);
+            r4_30 = (r5_14 - r5_30);
+            i4_30 = (i5_14 - i5_30);
+          }
+          r3_6 = (r4_6 + r4_14);
+          i3_6 = (i4_6 + i4_14);
+          r3_22 = (r4_6 - r4_14);
+          i3_22 = (i4_6 - i4_14);
+          r3_14 = (r4_22 + i4_30);
+          i3_14 = (i4_22 - r4_30);
+          r3_30 = (r4_22 - i4_30);
+          i3_30 = (i4_22 + r4_30);
+        }
+        r2_2 = (r3_2 + r3_6);
+        i2_2 = (i3_2 + i3_6);
+        r2_18 = (r3_2 - r3_6);
+        i2_18 = (i3_2 - i3_6);
+        tmpr = (0.707106781187 * (r3_14 + i3_14));
+        tmpi = (0.707106781187 * (i3_14 - r3_14));
+        r2_6 = (r3_10 + tmpr);
+        i2_6 = (i3_10 + tmpi);
+        r2_22 = (r3_10 - tmpr);
+        i2_22 = (i3_10 - tmpi);
+        r2_10 = (r3_18 + i3_22);
+        i2_10 = (i3_18 - r3_22);
+        r2_26 = (r3_18 - i3_22);
+        i2_26 = (i3_18 + r3_22);
+        tmpr = (0.707106781187 * (i3_30 - r3_30));
+        tmpi = (0.707106781187 * (r3_30 + i3_30));
+        r2_14 = (r3_26 + tmpr);
+        i2_14 = (i3_26 - tmpi);
+        r2_30 = (r3_26 - tmpr);
+        i2_30 = (i3_26 + tmpi);
+      }
+      r1_0 = (r2_0 + r2_2);
+      i1_0 = (i2_0 + i2_2);
+      r1_16 = (r2_0 - r2_2);
+      i1_16 = (i2_0 - i2_2);
+      tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6));
+      tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6));
+      r1_2 = (r2_4 + tmpr);
+      i1_2 = (i2_4 + tmpi);
+      r1_18 = (r2_4 - tmpr);
+      i1_18 = (i2_4 - tmpi);
+      tmpr = (0.707106781187 * (r2_10 + i2_10));
+      tmpi = (0.707106781187 * (i2_10 - r2_10));
+      r1_4 = (r2_8 + tmpr);
+      i1_4 = (i2_8 + tmpi);
+      r1_20 = (r2_8 - tmpr);
+      i1_20 = (i2_8 - tmpi);
+      tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14));
+      tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14));
+      r1_6 = (r2_12 + tmpr);
+      i1_6 = (i2_12 + tmpi);
+      r1_22 = (r2_12 - tmpr);
+      i1_22 = (i2_12 - tmpi);
+      r1_8 = (r2_16 + i2_18);
+      i1_8 = (i2_16 - r2_18);
+      r1_24 = (r2_16 - i2_18);
+      i1_24 = (i2_16 + r2_18);
+      tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22));
+      tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22));
+      r1_10 = (r2_20 + tmpr);
+      i1_10 = (i2_20 - tmpi);
+      r1_26 = (r2_20 - tmpr);
+      i1_26 = (i2_20 + tmpi);
+      tmpr = (0.707106781187 * (i2_26 - r2_26));
+      tmpi = (0.707106781187 * (r2_26 + i2_26));
+      r1_12 = (r2_24 + tmpr);
+      i1_12 = (i2_24 - tmpi);
+      r1_28 = (r2_24 - tmpr);
+      i1_28 = (i2_24 + tmpi);
+      tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30));
+      tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30));
+      r1_14 = (r2_28 + tmpr);
+      i1_14 = (i2_28 - tmpi);
+      r1_30 = (r2_28 - tmpr);
+      i1_30 = (i2_28 + tmpi);
+    }
+    {
+      REAL r2_1, i2_1;
+      REAL r2_3, i2_3;
+      REAL r2_5, i2_5;
+      REAL r2_7, i2_7;
+      REAL r2_9, i2_9;
+      REAL r2_11, i2_11;
+      REAL r2_13, i2_13;
+      REAL r2_15, i2_15;
+      REAL r2_17, i2_17;
+      REAL r2_19, i2_19;
+      REAL r2_21, i2_21;
+      REAL r2_23, i2_23;
+      REAL r2_25, i2_25;
+      REAL r2_27, i2_27;
+      REAL r2_29, i2_29;
+      REAL r2_31, i2_31;
+      {
+        REAL r3_1, i3_1;
+        REAL r3_5, i3_5;
+        REAL r3_9, i3_9;
+        REAL r3_13, i3_13;
+        REAL r3_17, i3_17;
+        REAL r3_21, i3_21;
+        REAL r3_25, i3_25;
+        REAL r3_29, i3_29;
+        {
+          REAL r4_1, i4_1;
+          REAL r4_9, i4_9;
+          REAL r4_17, i4_17;
+          REAL r4_25, i4_25;
+          {
+            REAL r5_1, i5_1;
+            REAL r5_17, i5_17;
+            r5_1 = c_re(in[1]);
+            i5_1 = c_im(in[1]);
+            r5_17 = c_re(in[17]);
+            i5_17 = c_im(in[17]);
+            r4_1 = (r5_1 + r5_17);
+            i4_1 = (i5_1 + i5_17);
+            r4_17 = (r5_1 - r5_17);
+            i4_17 = (i5_1 - i5_17);
+          }
+          {
+            REAL r5_9, i5_9;
+            REAL r5_25, i5_25;
+            r5_9 = c_re(in[9]);
+            i5_9 = c_im(in[9]);
+            r5_25 = c_re(in[25]);
+            i5_25 = c_im(in[25]);
+            r4_9 = (r5_9 + r5_25);
+            i4_9 = (i5_9 + i5_25);
+            r4_25 = (r5_9 - r5_25);
+            i4_25 = (i5_9 - i5_25);
+          }
+          r3_1 = (r4_1 + r4_9);
+          i3_1 = (i4_1 + i4_9);
+          r3_17 = (r4_1 - r4_9);
+          i3_17 = (i4_1 - i4_9);
+          r3_9 = (r4_17 + i4_25);
+          i3_9 = (i4_17 - r4_25);
+          r3_25 = (r4_17 - i4_25);
+          i3_25 = (i4_17 + r4_25);
+        }
+        {
+          REAL r4_5, i4_5;
+          REAL r4_13, i4_13;
+          REAL r4_21, i4_21;
+          REAL r4_29, i4_29;
+          {
+            REAL r5_5, i5_5;
+            REAL r5_21, i5_21;
+            r5_5 = c_re(in[5]);
+            i5_5 = c_im(in[5]);
+            r5_21 = c_re(in[21]);
+            i5_21 = c_im(in[21]);
+            r4_5 = (r5_5 + r5_21);
+            i4_5 = (i5_5 + i5_21);
+            r4_21 = (r5_5 - r5_21);
+            i4_21 = (i5_5 - i5_21);
+          }
+          {
+            REAL r5_13, i5_13;
+            REAL r5_29, i5_29;
+            r5_13 = c_re(in[13]);
+            i5_13 = c_im(in[13]);
+            r5_29 = c_re(in[29]);
+            i5_29 = c_im(in[29]);
+            r4_13 = (r5_13 + r5_29);
+            i4_13 = (i5_13 + i5_29);
+            r4_29 = (r5_13 - r5_29);
+            i4_29 = (i5_13 - i5_29);
+          }
+          r3_5 = (r4_5 + r4_13);
+          i3_5 = (i4_5 + i4_13);
+          r3_21 = (r4_5 - r4_13);
+          i3_21 = (i4_5 - i4_13);
+          r3_13 = (r4_21 + i4_29);
+          i3_13 = (i4_21 - r4_29);
+          r3_29 = (r4_21 - i4_29);
+          i3_29 = (i4_21 + r4_29);
+        }
+        r2_1 = (r3_1 + r3_5);
+        i2_1 = (i3_1 + i3_5);
+        r2_17 = (r3_1 - r3_5);
+        i2_17 = (i3_1 - i3_5);
+        tmpr = (0.707106781187 * (r3_13 + i3_13));
+        tmpi = (0.707106781187 * (i3_13 - r3_13));
+        r2_5 = (r3_9 + tmpr);
+        i2_5 = (i3_9 + tmpi);
+        r2_21 = (r3_9 - tmpr);
+        i2_21 = (i3_9 - tmpi);
+        r2_9 = (r3_17 + i3_21);
+        i2_9 = (i3_17 - r3_21);
+        r2_25 = (r3_17 - i3_21);
+        i2_25 = (i3_17 + r3_21);
+        tmpr = (0.707106781187 * (i3_29 - r3_29));
+        tmpi = (0.707106781187 * (r3_29 + i3_29));
+        r2_13 = (r3_25 + tmpr);
+        i2_13 = (i3_25 - tmpi);
+        r2_29 = (r3_25 - tmpr);
+        i2_29 = (i3_25 + tmpi);
+      }
+      {
+        REAL r3_3, i3_3;
+        REAL r3_7, i3_7;
+        REAL r3_11, i3_11;
+        REAL r3_15, i3_15;
+        REAL r3_19, i3_19;
+        REAL r3_23, i3_23;
+        REAL r3_27, i3_27;
+        REAL r3_31, i3_31;
+        {
+          REAL r4_3, i4_3;
+          REAL r4_11, i4_11;
+          REAL r4_19, i4_19;
+          REAL r4_27, i4_27;
+          {
+            REAL r5_3, i5_3;
+            REAL r5_19, i5_19;
+            r5_3 = c_re(in[3]);
+            i5_3 = c_im(in[3]);
+            r5_19 = c_re(in[19]);
+            i5_19 = c_im(in[19]);
+            r4_3 = (r5_3 + r5_19);
+            i4_3 = (i5_3 + i5_19);
+            r4_19 = (r5_3 - r5_19);
+            i4_19 = (i5_3 - i5_19);
+          }
+          {
+            REAL r5_11, i5_11;
+            REAL r5_27, i5_27;
+            r5_11 = c_re(in[11]);
+            i5_11 = c_im(in[11]);
+            r5_27 = c_re(in[27]);
+            i5_27 = c_im(in[27]);
+            r4_11 = (r5_11 + r5_27);
+            i4_11 = (i5_11 + i5_27);
+            r4_27 = (r5_11 - r5_27);
+            i4_27 = (i5_11 - i5_27);
+          }
+          r3_3 = (r4_3 + r4_11);
+          i3_3 = (i4_3 + i4_11);
+          r3_19 = (r4_3 - r4_11);
+          i3_19 = (i4_3 - i4_11);
+          r3_11 = (r4_19 + i4_27);
+          i3_11 = (i4_19 - r4_27);
+          r3_27 = (r4_19 - i4_27);
+          i3_27 = (i4_19 + r4_27);
+        }
+        {
+          REAL r4_7, i4_7;
+          REAL r4_15, i4_15;
+          REAL r4_23, i4_23;
+          REAL r4_31, i4_31;
+          {
+            REAL r5_7, i5_7;
+            REAL r5_23, i5_23;
+            r5_7 = c_re(in[7]);
+            i5_7 = c_im(in[7]);
+            r5_23 = c_re(in[23]);
+            i5_23 = c_im(in[23]);
+            r4_7 = (r5_7 + r5_23);
+            i4_7 = (i5_7 + i5_23);
+            r4_23 = (r5_7 - r5_23);
+            i4_23 = (i5_7 - i5_23);
+          }
+          {
+            REAL r5_15, i5_15;
+            REAL r5_31, i5_31;
+            r5_15 = c_re(in[15]);
+            i5_15 = c_im(in[15]);
+            r5_31 = c_re(in[31]);
+            i5_31 = c_im(in[31]);
+            r4_15 = (r5_15 + r5_31);
+            i4_15 = (i5_15 + i5_31);
+            r4_31 = (r5_15 - r5_31);
+            i4_31 = (i5_15 - i5_31);
+          }
+          r3_7 = (r4_7 + r4_15);
+          i3_7 = (i4_7 + i4_15);
+          r3_23 = (r4_7 - r4_15);
+          i3_23 = (i4_7 - i4_15);
+          r3_15 = (r4_23 + i4_31);
+          i3_15 = (i4_23 - r4_31);
+          r3_31 = (r4_23 - i4_31);
+          i3_31 = (i4_23 + r4_31);
+        }
+        r2_3 = (r3_3 + r3_7);
+        i2_3 = (i3_3 + i3_7);
+        r2_19 = (r3_3 - r3_7);
+        i2_19 = (i3_3 - i3_7);
+        tmpr = (0.707106781187 * (r3_15 + i3_15));
+        tmpi = (0.707106781187 * (i3_15 - r3_15));
+        r2_7 = (r3_11 + tmpr);
+        i2_7 = (i3_11 + tmpi);
+        r2_23 = (r3_11 - tmpr);
+        i2_23 = (i3_11 - tmpi);
+        r2_11 = (r3_19 + i3_23);
+        i2_11 = (i3_19 - r3_23);
+        r2_27 = (r3_19 - i3_23);
+        i2_27 = (i3_19 + r3_23);
+        tmpr = (0.707106781187 * (i3_31 - r3_31));
+        tmpi = (0.707106781187 * (r3_31 + i3_31));
+        r2_15 = (r3_27 + tmpr);
+        i2_15 = (i3_27 - tmpi);
+        r2_31 = (r3_27 - tmpr);
+        i2_31 = (i3_27 + tmpi);
+      }
+      r1_1 = (r2_1 + r2_3);
+      i1_1 = (i2_1 + i2_3);
+      r1_17 = (r2_1 - r2_3);
+      i1_17 = (i2_1 - i2_3);
+      tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7));
+      tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7));
+      r1_3 = (r2_5 + tmpr);
+      i1_3 = (i2_5 + tmpi);
+      r1_19 = (r2_5 - tmpr);
+      i1_19 = (i2_5 - tmpi);
+      tmpr = (0.707106781187 * (r2_11 + i2_11));
+      tmpi = (0.707106781187 * (i2_11 - r2_11));
+      r1_5 = (r2_9 + tmpr);
+      i1_5 = (i2_9 + tmpi);
+      r1_21 = (r2_9 - tmpr);
+      i1_21 = (i2_9 - tmpi);
+      tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15));
+      tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15));
+      r1_7 = (r2_13 + tmpr);
+      i1_7 = (i2_13 + tmpi);
+      r1_23 = (r2_13 - tmpr);
+      i1_23 = (i2_13 - tmpi);
+      r1_9 = (r2_17 + i2_19);
+      i1_9 = (i2_17 - r2_19);
+      r1_25 = (r2_17 - i2_19);
+      i1_25 = (i2_17 + r2_19);
+      tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23));
+      tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23));
+      r1_11 = (r2_21 + tmpr);
+      i1_11 = (i2_21 - tmpi);
+      r1_27 = (r2_21 - tmpr);
+      i1_27 = (i2_21 + tmpi);
+      tmpr = (0.707106781187 * (i2_27 - r2_27));
+      tmpi = (0.707106781187 * (r2_27 + i2_27));
+      r1_13 = (r2_25 + tmpr);
+      i1_13 = (i2_25 - tmpi);
+      r1_29 = (r2_25 - tmpr);
+      i1_29 = (i2_25 + tmpi);
+      tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31));
+      tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31));
+      r1_15 = (r2_29 + tmpr);
+      i1_15 = (i2_29 - tmpi);
+      r1_31 = (r2_29 - tmpr);
+      i1_31 = (i2_29 + tmpi);
+    }
+    c_re(out[0]) = (r1_0 + r1_1);
+    c_im(out[0]) = (i1_0 + i1_1);
+    c_re(out[16]) = (r1_0 - r1_1);
+    c_im(out[16]) = (i1_0 - i1_1);
+    tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3));
+    tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3));
+    c_re(out[1]) = (r1_2 + tmpr);
+    c_im(out[1]) = (i1_2 + tmpi);
+    c_re(out[17]) = (r1_2 - tmpr);
+    c_im(out[17]) = (i1_2 - tmpi);
+    tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5));
+    tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5));
+    c_re(out[2]) = (r1_4 + tmpr);
+    c_im(out[2]) = (i1_4 + tmpi);
+    c_re(out[18]) = (r1_4 - tmpr);
+    c_im(out[18]) = (i1_4 - tmpi);
+    tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7));
+    tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7));
+    c_re(out[3]) = (r1_6 + tmpr);
+    c_im(out[3]) = (i1_6 + tmpi);
+    c_re(out[19]) = (r1_6 - tmpr);
+    c_im(out[19]) = (i1_6 - tmpi);
+    tmpr = (0.707106781187 * (r1_9 + i1_9));
+    tmpi = (0.707106781187 * (i1_9 - r1_9));
+    c_re(out[4]) = (r1_8 + tmpr);
+    c_im(out[4]) = (i1_8 + tmpi);
+    c_re(out[20]) = (r1_8 - tmpr);
+    c_im(out[20]) = (i1_8 - tmpi);
+    tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11));
+    tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11));
+    c_re(out[5]) = (r1_10 + tmpr);
+    c_im(out[5]) = (i1_10 + tmpi);
+    c_re(out[21]) = (r1_10 - tmpr);
+    c_im(out[21]) = (i1_10 - tmpi);
+    tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13));
+    tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13));
+    c_re(out[6]) = (r1_12 + tmpr);
+    c_im(out[6]) = (i1_12 + tmpi);
+    c_re(out[22]) = (r1_12 - tmpr);
+    c_im(out[22]) = (i1_12 - tmpi);
+    tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15));
+    tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15));
+    c_re(out[7]) = (r1_14 + tmpr);
+    c_im(out[7]) = (i1_14 + tmpi);
+    c_re(out[23]) = (r1_14 - tmpr);
+    c_im(out[23]) = (i1_14 - tmpi);
+    c_re(out[8]) = (r1_16 + i1_17);
+    c_im(out[8]) = (i1_16 - r1_17);
+    c_re(out[24]) = (r1_16 - i1_17);
+    c_im(out[24]) = (i1_16 + r1_17);
+    tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19));
+    tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19));
+    c_re(out[9]) = (r1_18 + tmpr);
+    c_im(out[9]) = (i1_18 - tmpi);
+    c_re(out[25]) = (r1_18 - tmpr);
+    c_im(out[25]) = (i1_18 + tmpi);
+    tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21));
+    tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21));
+    c_re(out[10]) = (r1_20 + tmpr);
+    c_im(out[10]) = (i1_20 - tmpi);
+    c_re(out[26]) = (r1_20 - tmpr);
+    c_im(out[26]) = (i1_20 + tmpi);
+    tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23));
+    tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23));
+    c_re(out[11]) = (r1_22 + tmpr);
+    c_im(out[11]) = (i1_22 - tmpi);
+    c_re(out[27]) = (r1_22 - tmpr);
+    c_im(out[27]) = (i1_22 + tmpi);
+    tmpr = (0.707106781187 * (i1_25 - r1_25));
+    tmpi = (0.707106781187 * (r1_25 + i1_25));
+    c_re(out[12]) = (r1_24 + tmpr);
+    c_im(out[12]) = (i1_24 - tmpi);
+    c_re(out[28]) = (r1_24 - tmpr);
+    c_im(out[28]) = (i1_24 + tmpi);
+    tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27));
+    tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27));
+    c_re(out[13]) = (r1_26 + tmpr);
+    c_im(out[13]) = (i1_26 - tmpi);
+    c_re(out[29]) = (r1_26 - tmpr);
+    c_im(out[29]) = (i1_26 + tmpi);
+    tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29));
+    tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29));
+    c_re(out[14]) = (r1_28 + tmpr);
+    c_im(out[14]) = (i1_28 - tmpi);
+    c_re(out[30]) = (r1_28 - tmpr);
+    c_im(out[30]) = (i1_28 + tmpi);
+    tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31));
+    tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31));
+    c_re(out[15]) = (r1_30 + tmpr);
+    c_im(out[15]) = (i1_30 - tmpi);
+    c_re(out[31]) = (r1_30 - tmpr);
+    c_im(out[31]) = (i1_30 + tmpi);
+  }
+}
+
+fibril static void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out,
+    COMPLEX * W, int nW, int nWdn, int m)
+{
+  int l1, i;
+  COMPLEX *jp, *kp;
+  REAL tmpr, tmpi, wr, wi;
+  if ((b - a) < 128) {
+    for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+        i++, l1 += nWdn, kp++) {
+      jp = in + i;
+      {
+        REAL r1_0, i1_0;
+        REAL r1_1, i1_1;
+        REAL r1_2, i1_2;
+        REAL r1_3, i1_3;
+        REAL r1_4, i1_4;
+        REAL r1_5, i1_5;
+        REAL r1_6, i1_6;
+        REAL r1_7, i1_7;
+        REAL r1_8, i1_8;
+        REAL r1_9, i1_9;
+        REAL r1_10, i1_10;
+        REAL r1_11, i1_11;
+        REAL r1_12, i1_12;
+        REAL r1_13, i1_13;
+        REAL r1_14, i1_14;
+        REAL r1_15, i1_15;
+        REAL r1_16, i1_16;
+        REAL r1_17, i1_17;
+        REAL r1_18, i1_18;
+        REAL r1_19, i1_19;
+        REAL r1_20, i1_20;
+        REAL r1_21, i1_21;
+        REAL r1_22, i1_22;
+        REAL r1_23, i1_23;
+        REAL r1_24, i1_24;
+        REAL r1_25, i1_25;
+        REAL r1_26, i1_26;
+        REAL r1_27, i1_27;
+        REAL r1_28, i1_28;
+        REAL r1_29, i1_29;
+        REAL r1_30, i1_30;
+        REAL r1_31, i1_31;
+        {
+          REAL r2_0, i2_0;
+          REAL r2_2, i2_2;
+          REAL r2_4, i2_4;
+          REAL r2_6, i2_6;
+          REAL r2_8, i2_8;
+          REAL r2_10, i2_10;
+          REAL r2_12, i2_12;
+          REAL r2_14, i2_14;
+          REAL r2_16, i2_16;
+          REAL r2_18, i2_18;
+          REAL r2_20, i2_20;
+          REAL r2_22, i2_22;
+          REAL r2_24, i2_24;
+          REAL r2_26, i2_26;
+          REAL r2_28, i2_28;
+          REAL r2_30, i2_30;
+          {
+            REAL r3_0, i3_0;
+            REAL r3_4, i3_4;
+            REAL r3_8, i3_8;
+            REAL r3_12, i3_12;
+            REAL r3_16, i3_16;
+            REAL r3_20, i3_20;
+            REAL r3_24, i3_24;
+            REAL r3_28, i3_28;
+            {
+              REAL r4_0, i4_0;
+              REAL r4_8, i4_8;
+              REAL r4_16, i4_16;
+              REAL r4_24, i4_24;
+              {
+                REAL r5_0, i5_0;
+                REAL r5_16, i5_16;
+                r5_0 = c_re(jp[0 * m]);
+                i5_0 = c_im(jp[0 * m]);
+                wr = c_re(W[16 * l1]);
+                wi = c_im(W[16 * l1]);
+                tmpr = c_re(jp[16 * m]);
+                tmpi = c_im(jp[16 * m]);
+                r5_16 = ((wr * tmpr) - (wi * tmpi));
+                i5_16 = ((wi * tmpr) + (wr * tmpi));
+                r4_0 = (r5_0 + r5_16);
+                i4_0 = (i5_0 + i5_16);
+                r4_16 = (r5_0 - r5_16);
+                i4_16 = (i5_0 - i5_16);
+              }
+              {
+                REAL r5_8, i5_8;
+                REAL r5_24, i5_24;
+                wr = c_re(W[8 * l1]);
+                wi = c_im(W[8 * l1]);
+                tmpr = c_re(jp[8 * m]);
+                tmpi = c_im(jp[8 * m]);
+                r5_8 = ((wr * tmpr) - (wi * tmpi));
+                i5_8 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[24 * l1]);
+                wi = c_im(W[24 * l1]);
+                tmpr = c_re(jp[24 * m]);
+                tmpi = c_im(jp[24 * m]);
+                r5_24 = ((wr * tmpr) - (wi * tmpi));
+                i5_24 = ((wi * tmpr) + (wr * tmpi));
+                r4_8 = (r5_8 + r5_24);
+                i4_8 = (i5_8 + i5_24);
+                r4_24 = (r5_8 - r5_24);
+                i4_24 = (i5_8 - i5_24);
+              }
+              r3_0 = (r4_0 + r4_8);
+              i3_0 = (i4_0 + i4_8);
+              r3_16 = (r4_0 - r4_8);
+              i3_16 = (i4_0 - i4_8);
+              r3_8 = (r4_16 + i4_24);
+              i3_8 = (i4_16 - r4_24);
+              r3_24 = (r4_16 - i4_24);
+              i3_24 = (i4_16 + r4_24);
+            }
+            {
+              REAL r4_4, i4_4;
+              REAL r4_12, i4_12;
+              REAL r4_20, i4_20;
+              REAL r4_28, i4_28;
+              {
+                REAL r5_4, i5_4;
+                REAL r5_20, i5_20;
+                wr = c_re(W[4 * l1]);
+                wi = c_im(W[4 * l1]);
+                tmpr = c_re(jp[4 * m]);
+                tmpi = c_im(jp[4 * m]);
+                r5_4 = ((wr * tmpr) - (wi * tmpi));
+                i5_4 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[20 * l1]);
+                wi = c_im(W[20 * l1]);
+                tmpr = c_re(jp[20 * m]);
+                tmpi = c_im(jp[20 * m]);
+                r5_20 = ((wr * tmpr) - (wi * tmpi));
+                i5_20 = ((wi * tmpr) + (wr * tmpi));
+                r4_4 = (r5_4 + r5_20);
+                i4_4 = (i5_4 + i5_20);
+                r4_20 = (r5_4 - r5_20);
+                i4_20 = (i5_4 - i5_20);
+              }
+              {
+                REAL r5_12, i5_12;
+                REAL r5_28, i5_28;
+                wr = c_re(W[12 * l1]);
+                wi = c_im(W[12 * l1]);
+                tmpr = c_re(jp[12 * m]);
+                tmpi = c_im(jp[12 * m]);
+                r5_12 = ((wr * tmpr) - (wi * tmpi));
+                i5_12 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[28 * l1]);
+                wi = c_im(W[28 * l1]);
+                tmpr = c_re(jp[28 * m]);
+                tmpi = c_im(jp[28 * m]);
+                r5_28 = ((wr * tmpr) - (wi * tmpi));
+                i5_28 = ((wi * tmpr) + (wr * tmpi));
+                r4_12 = (r5_12 + r5_28);
+                i4_12 = (i5_12 + i5_28);
+                r4_28 = (r5_12 - r5_28);
+                i4_28 = (i5_12 - i5_28);
+              }
+              r3_4 = (r4_4 + r4_12);
+              i3_4 = (i4_4 + i4_12);
+              r3_20 = (r4_4 - r4_12);
+              i3_20 = (i4_4 - i4_12);
+              r3_12 = (r4_20 + i4_28);
+              i3_12 = (i4_20 - r4_28);
+              r3_28 = (r4_20 - i4_28);
+              i3_28 = (i4_20 + r4_28);
+            }
+            r2_0 = (r3_0 + r3_4);
+            i2_0 = (i3_0 + i3_4);
+            r2_16 = (r3_0 - r3_4);
+            i2_16 = (i3_0 - i3_4);
+            tmpr = (0.707106781187 * (r3_12 + i3_12));
+            tmpi = (0.707106781187 * (i3_12 - r3_12));
+            r2_4 = (r3_8 + tmpr);
+            i2_4 = (i3_8 + tmpi);
+            r2_20 = (r3_8 - tmpr);
+            i2_20 = (i3_8 - tmpi);
+            r2_8 = (r3_16 + i3_20);
+            i2_8 = (i3_16 - r3_20);
+            r2_24 = (r3_16 - i3_20);
+            i2_24 = (i3_16 + r3_20);
+            tmpr = (0.707106781187 * (i3_28 - r3_28));
+            tmpi = (0.707106781187 * (r3_28 + i3_28));
+            r2_12 = (r3_24 + tmpr);
+            i2_12 = (i3_24 - tmpi);
+            r2_28 = (r3_24 - tmpr);
+            i2_28 = (i3_24 + tmpi);
+          }
+          {
+            REAL r3_2, i3_2;
+            REAL r3_6, i3_6;
+            REAL r3_10, i3_10;
+            REAL r3_14, i3_14;
+            REAL r3_18, i3_18;
+            REAL r3_22, i3_22;
+            REAL r3_26, i3_26;
+            REAL r3_30, i3_30;
+            {
+              REAL r4_2, i4_2;
+              REAL r4_10, i4_10;
+              REAL r4_18, i4_18;
+              REAL r4_26, i4_26;
+              {
+                REAL r5_2, i5_2;
+                REAL r5_18, i5_18;
+                wr = c_re(W[2 * l1]);
+                wi = c_im(W[2 * l1]);
+                tmpr = c_re(jp[2 * m]);
+                tmpi = c_im(jp[2 * m]);
+                r5_2 = ((wr * tmpr) - (wi * tmpi));
+                i5_2 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[18 * l1]);
+                wi = c_im(W[18 * l1]);
+                tmpr = c_re(jp[18 * m]);
+                tmpi = c_im(jp[18 * m]);
+                r5_18 = ((wr * tmpr) - (wi * tmpi));
+                i5_18 = ((wi * tmpr) + (wr * tmpi));
+                r4_2 = (r5_2 + r5_18);
+                i4_2 = (i5_2 + i5_18);
+                r4_18 = (r5_2 - r5_18);
+                i4_18 = (i5_2 - i5_18);
+              }
+              {
+                REAL r5_10, i5_10;
+                REAL r5_26, i5_26;
+                wr = c_re(W[10 * l1]);
+                wi = c_im(W[10 * l1]);
+                tmpr = c_re(jp[10 * m]);
+                tmpi = c_im(jp[10 * m]);
+                r5_10 = ((wr * tmpr) - (wi * tmpi));
+                i5_10 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[26 * l1]);
+                wi = c_im(W[26 * l1]);
+                tmpr = c_re(jp[26 * m]);
+                tmpi = c_im(jp[26 * m]);
+                r5_26 = ((wr * tmpr) - (wi * tmpi));
+                i5_26 = ((wi * tmpr) + (wr * tmpi));
+                r4_10 = (r5_10 + r5_26);
+                i4_10 = (i5_10 + i5_26);
+                r4_26 = (r5_10 - r5_26);
+                i4_26 = (i5_10 - i5_26);
+              }
+              r3_2 = (r4_2 + r4_10);
+              i3_2 = (i4_2 + i4_10);
+              r3_18 = (r4_2 - r4_10);
+              i3_18 = (i4_2 - i4_10);
+              r3_10 = (r4_18 + i4_26);
+              i3_10 = (i4_18 - r4_26);
+              r3_26 = (r4_18 - i4_26);
+              i3_26 = (i4_18 + r4_26);
+            }
+            {
+              REAL r4_6, i4_6;
+              REAL r4_14, i4_14;
+              REAL r4_22, i4_22;
+              REAL r4_30, i4_30;
+              {
+                REAL r5_6, i5_6;
+                REAL r5_22, i5_22;
+                wr = c_re(W[6 * l1]);
+                wi = c_im(W[6 * l1]);
+                tmpr = c_re(jp[6 * m]);
+                tmpi = c_im(jp[6 * m]);
+                r5_6 = ((wr * tmpr) - (wi * tmpi));
+                i5_6 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[22 * l1]);
+                wi = c_im(W[22 * l1]);
+                tmpr = c_re(jp[22 * m]);
+                tmpi = c_im(jp[22 * m]);
+                r5_22 = ((wr * tmpr) - (wi * tmpi));
+                i5_22 = ((wi * tmpr) + (wr * tmpi));
+                r4_6 = (r5_6 + r5_22);
+                i4_6 = (i5_6 + i5_22);
+                r4_22 = (r5_6 - r5_22);
+                i4_22 = (i5_6 - i5_22);
+              }
+              {
+                REAL r5_14, i5_14;
+                REAL r5_30, i5_30;
+                wr = c_re(W[14 * l1]);
+                wi = c_im(W[14 * l1]);
+                tmpr = c_re(jp[14 * m]);
+                tmpi = c_im(jp[14 * m]);
+                r5_14 = ((wr * tmpr) - (wi * tmpi));
+                i5_14 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[30 * l1]);
+                wi = c_im(W[30 * l1]);
+                tmpr = c_re(jp[30 * m]);
+                tmpi = c_im(jp[30 * m]);
+                r5_30 = ((wr * tmpr) - (wi * tmpi));
+                i5_30 = ((wi * tmpr) + (wr * tmpi));
+                r4_14 = (r5_14 + r5_30);
+                i4_14 = (i5_14 + i5_30);
+                r4_30 = (r5_14 - r5_30);
+                i4_30 = (i5_14 - i5_30);
+              }
+              r3_6 = (r4_6 + r4_14);
+              i3_6 = (i4_6 + i4_14);
+              r3_22 = (r4_6 - r4_14);
+              i3_22 = (i4_6 - i4_14);
+              r3_14 = (r4_22 + i4_30);
+              i3_14 = (i4_22 - r4_30);
+              r3_30 = (r4_22 - i4_30);
+              i3_30 = (i4_22 + r4_30);
+            }
+            r2_2 = (r3_2 + r3_6);
+            i2_2 = (i3_2 + i3_6);
+            r2_18 = (r3_2 - r3_6);
+            i2_18 = (i3_2 - i3_6);
+            tmpr = (0.707106781187 * (r3_14 + i3_14));
+            tmpi = (0.707106781187 * (i3_14 - r3_14));
+            r2_6 = (r3_10 + tmpr);
+            i2_6 = (i3_10 + tmpi);
+            r2_22 = (r3_10 - tmpr);
+            i2_22 = (i3_10 - tmpi);
+            r2_10 = (r3_18 + i3_22);
+            i2_10 = (i3_18 - r3_22);
+            r2_26 = (r3_18 - i3_22);
+            i2_26 = (i3_18 + r3_22);
+            tmpr = (0.707106781187 * (i3_30 - r3_30));
+            tmpi = (0.707106781187 * (r3_30 + i3_30));
+            r2_14 = (r3_26 + tmpr);
+            i2_14 = (i3_26 - tmpi);
+            r2_30 = (r3_26 - tmpr);
+            i2_30 = (i3_26 + tmpi);
+          }
+          r1_0 = (r2_0 + r2_2);
+          i1_0 = (i2_0 + i2_2);
+          r1_16 = (r2_0 - r2_2);
+          i1_16 = (i2_0 - i2_2);
+          tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6));
+          tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6));
+          r1_2 = (r2_4 + tmpr);
+          i1_2 = (i2_4 + tmpi);
+          r1_18 = (r2_4 - tmpr);
+          i1_18 = (i2_4 - tmpi);
+          tmpr = (0.707106781187 * (r2_10 + i2_10));
+          tmpi = (0.707106781187 * (i2_10 - r2_10));
+          r1_4 = (r2_8 + tmpr);
+          i1_4 = (i2_8 + tmpi);
+          r1_20 = (r2_8 - tmpr);
+          i1_20 = (i2_8 - tmpi);
+          tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14));
+          tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14));
+          r1_6 = (r2_12 + tmpr);
+          i1_6 = (i2_12 + tmpi);
+          r1_22 = (r2_12 - tmpr);
+          i1_22 = (i2_12 - tmpi);
+          r1_8 = (r2_16 + i2_18);
+          i1_8 = (i2_16 - r2_18);
+          r1_24 = (r2_16 - i2_18);
+          i1_24 = (i2_16 + r2_18);
+          tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22));
+          tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22));
+          r1_10 = (r2_20 + tmpr);
+          i1_10 = (i2_20 - tmpi);
+          r1_26 = (r2_20 - tmpr);
+          i1_26 = (i2_20 + tmpi);
+          tmpr = (0.707106781187 * (i2_26 - r2_26));
+          tmpi = (0.707106781187 * (r2_26 + i2_26));
+          r1_12 = (r2_24 + tmpr);
+          i1_12 = (i2_24 - tmpi);
+          r1_28 = (r2_24 - tmpr);
+          i1_28 = (i2_24 + tmpi);
+          tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30));
+          tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30));
+          r1_14 = (r2_28 + tmpr);
+          i1_14 = (i2_28 - tmpi);
+          r1_30 = (r2_28 - tmpr);
+          i1_30 = (i2_28 + tmpi);
+        }
+        {
+          REAL r2_1, i2_1;
+          REAL r2_3, i2_3;
+          REAL r2_5, i2_5;
+          REAL r2_7, i2_7;
+          REAL r2_9, i2_9;
+          REAL r2_11, i2_11;
+          REAL r2_13, i2_13;
+          REAL r2_15, i2_15;
+          REAL r2_17, i2_17;
+          REAL r2_19, i2_19;
+          REAL r2_21, i2_21;
+          REAL r2_23, i2_23;
+          REAL r2_25, i2_25;
+          REAL r2_27, i2_27;
+          REAL r2_29, i2_29;
+          REAL r2_31, i2_31;
+          {
+            REAL r3_1, i3_1;
+            REAL r3_5, i3_5;
+            REAL r3_9, i3_9;
+            REAL r3_13, i3_13;
+            REAL r3_17, i3_17;
+            REAL r3_21, i3_21;
+            REAL r3_25, i3_25;
+            REAL r3_29, i3_29;
+            {
+              REAL r4_1, i4_1;
+              REAL r4_9, i4_9;
+              REAL r4_17, i4_17;
+              REAL r4_25, i4_25;
+              {
+                REAL r5_1, i5_1;
+                REAL r5_17, i5_17;
+                wr = c_re(W[1 * l1]);
+                wi = c_im(W[1 * l1]);
+                tmpr = c_re(jp[1 * m]);
+                tmpi = c_im(jp[1 * m]);
+                r5_1 = ((wr * tmpr) - (wi * tmpi));
+                i5_1 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[17 * l1]);
+                wi = c_im(W[17 * l1]);
+                tmpr = c_re(jp[17 * m]);
+                tmpi = c_im(jp[17 * m]);
+                r5_17 = ((wr * tmpr) - (wi * tmpi));
+                i5_17 = ((wi * tmpr) + (wr * tmpi));
+                r4_1 = (r5_1 + r5_17);
+                i4_1 = (i5_1 + i5_17);
+                r4_17 = (r5_1 - r5_17);
+                i4_17 = (i5_1 - i5_17);
+              }
+              {
+                REAL r5_9, i5_9;
+                REAL r5_25, i5_25;
+                wr = c_re(W[9 * l1]);
+                wi = c_im(W[9 * l1]);
+                tmpr = c_re(jp[9 * m]);
+                tmpi = c_im(jp[9 * m]);
+                r5_9 = ((wr * tmpr) - (wi * tmpi));
+                i5_9 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[25 * l1]);
+                wi = c_im(W[25 * l1]);
+                tmpr = c_re(jp[25 * m]);
+                tmpi = c_im(jp[25 * m]);
+                r5_25 = ((wr * tmpr) - (wi * tmpi));
+                i5_25 = ((wi * tmpr) + (wr * tmpi));
+                r4_9 = (r5_9 + r5_25);
+                i4_9 = (i5_9 + i5_25);
+                r4_25 = (r5_9 - r5_25);
+                i4_25 = (i5_9 - i5_25);
+              }
+              r3_1 = (r4_1 + r4_9);
+              i3_1 = (i4_1 + i4_9);
+              r3_17 = (r4_1 - r4_9);
+              i3_17 = (i4_1 - i4_9);
+              r3_9 = (r4_17 + i4_25);
+              i3_9 = (i4_17 - r4_25);
+              r3_25 = (r4_17 - i4_25);
+              i3_25 = (i4_17 + r4_25);
+            }
+            {
+              REAL r4_5, i4_5;
+              REAL r4_13, i4_13;
+              REAL r4_21, i4_21;
+              REAL r4_29, i4_29;
+              {
+                REAL r5_5, i5_5;
+                REAL r5_21, i5_21;
+                wr = c_re(W[5 * l1]);
+                wi = c_im(W[5 * l1]);
+                tmpr = c_re(jp[5 * m]);
+                tmpi = c_im(jp[5 * m]);
+                r5_5 = ((wr * tmpr) - (wi * tmpi));
+                i5_5 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[21 * l1]);
+                wi = c_im(W[21 * l1]);
+                tmpr = c_re(jp[21 * m]);
+                tmpi = c_im(jp[21 * m]);
+                r5_21 = ((wr * tmpr) - (wi * tmpi));
+                i5_21 = ((wi * tmpr) + (wr * tmpi));
+                r4_5 = (r5_5 + r5_21);
+                i4_5 = (i5_5 + i5_21);
+                r4_21 = (r5_5 - r5_21);
+                i4_21 = (i5_5 - i5_21);
+              }
+              {
+                REAL r5_13, i5_13;
+                REAL r5_29, i5_29;
+                wr = c_re(W[13 * l1]);
+                wi = c_im(W[13 * l1]);
+                tmpr = c_re(jp[13 * m]);
+                tmpi = c_im(jp[13 * m]);
+                r5_13 = ((wr * tmpr) - (wi * tmpi));
+                i5_13 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[29 * l1]);
+                wi = c_im(W[29 * l1]);
+                tmpr = c_re(jp[29 * m]);
+                tmpi = c_im(jp[29 * m]);
+                r5_29 = ((wr * tmpr) - (wi * tmpi));
+                i5_29 = ((wi * tmpr) + (wr * tmpi));
+                r4_13 = (r5_13 + r5_29);
+                i4_13 = (i5_13 + i5_29);
+                r4_29 = (r5_13 - r5_29);
+                i4_29 = (i5_13 - i5_29);
+              }
+              r3_5 = (r4_5 + r4_13);
+              i3_5 = (i4_5 + i4_13);
+              r3_21 = (r4_5 - r4_13);
+              i3_21 = (i4_5 - i4_13);
+              r3_13 = (r4_21 + i4_29);
+              i3_13 = (i4_21 - r4_29);
+              r3_29 = (r4_21 - i4_29);
+              i3_29 = (i4_21 + r4_29);
+            }
+            r2_1 = (r3_1 + r3_5);
+            i2_1 = (i3_1 + i3_5);
+            r2_17 = (r3_1 - r3_5);
+            i2_17 = (i3_1 - i3_5);
+            tmpr = (0.707106781187 * (r3_13 + i3_13));
+            tmpi = (0.707106781187 * (i3_13 - r3_13));
+            r2_5 = (r3_9 + tmpr);
+            i2_5 = (i3_9 + tmpi);
+            r2_21 = (r3_9 - tmpr);
+            i2_21 = (i3_9 - tmpi);
+            r2_9 = (r3_17 + i3_21);
+            i2_9 = (i3_17 - r3_21);
+            r2_25 = (r3_17 - i3_21);
+            i2_25 = (i3_17 + r3_21);
+            tmpr = (0.707106781187 * (i3_29 - r3_29));
+            tmpi = (0.707106781187 * (r3_29 + i3_29));
+            r2_13 = (r3_25 + tmpr);
+            i2_13 = (i3_25 - tmpi);
+            r2_29 = (r3_25 - tmpr);
+            i2_29 = (i3_25 + tmpi);
+          }
+          {
+            REAL r3_3, i3_3;
+            REAL r3_7, i3_7;
+            REAL r3_11, i3_11;
+            REAL r3_15, i3_15;
+            REAL r3_19, i3_19;
+            REAL r3_23, i3_23;
+            REAL r3_27, i3_27;
+            REAL r3_31, i3_31;
+            {
+              REAL r4_3, i4_3;
+              REAL r4_11, i4_11;
+              REAL r4_19, i4_19;
+              REAL r4_27, i4_27;
+              {
+                REAL r5_3, i5_3;
+                REAL r5_19, i5_19;
+                wr = c_re(W[3 * l1]);
+                wi = c_im(W[3 * l1]);
+                tmpr = c_re(jp[3 * m]);
+                tmpi = c_im(jp[3 * m]);
+                r5_3 = ((wr * tmpr) - (wi * tmpi));
+                i5_3 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[19 * l1]);
+                wi = c_im(W[19 * l1]);
+                tmpr = c_re(jp[19 * m]);
+                tmpi = c_im(jp[19 * m]);
+                r5_19 = ((wr * tmpr) - (wi * tmpi));
+                i5_19 = ((wi * tmpr) + (wr * tmpi));
+                r4_3 = (r5_3 + r5_19);
+                i4_3 = (i5_3 + i5_19);
+                r4_19 = (r5_3 - r5_19);
+                i4_19 = (i5_3 - i5_19);
+              }
+              {
+                REAL r5_11, i5_11;
+                REAL r5_27, i5_27;
+                wr = c_re(W[11 * l1]);
+                wi = c_im(W[11 * l1]);
+                tmpr = c_re(jp[11 * m]);
+                tmpi = c_im(jp[11 * m]);
+                r5_11 = ((wr * tmpr) - (wi * tmpi));
+                i5_11 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[27 * l1]);
+                wi = c_im(W[27 * l1]);
+                tmpr = c_re(jp[27 * m]);
+                tmpi = c_im(jp[27 * m]);
+                r5_27 = ((wr * tmpr) - (wi * tmpi));
+                i5_27 = ((wi * tmpr) + (wr * tmpi));
+                r4_11 = (r5_11 + r5_27);
+                i4_11 = (i5_11 + i5_27);
+                r4_27 = (r5_11 - r5_27);
+                i4_27 = (i5_11 - i5_27);
+              }
+              r3_3 = (r4_3 + r4_11);
+              i3_3 = (i4_3 + i4_11);
+              r3_19 = (r4_3 - r4_11);
+              i3_19 = (i4_3 - i4_11);
+              r3_11 = (r4_19 + i4_27);
+              i3_11 = (i4_19 - r4_27);
+              r3_27 = (r4_19 - i4_27);
+              i3_27 = (i4_19 + r4_27);
+            }
+            {
+              REAL r4_7, i4_7;
+              REAL r4_15, i4_15;
+              REAL r4_23, i4_23;
+              REAL r4_31, i4_31;
+              {
+                REAL r5_7, i5_7;
+                REAL r5_23, i5_23;
+                wr = c_re(W[7 * l1]);
+                wi = c_im(W[7 * l1]);
+                tmpr = c_re(jp[7 * m]);
+                tmpi = c_im(jp[7 * m]);
+                r5_7 = ((wr * tmpr) - (wi * tmpi));
+                i5_7 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[23 * l1]);
+                wi = c_im(W[23 * l1]);
+                tmpr = c_re(jp[23 * m]);
+                tmpi = c_im(jp[23 * m]);
+                r5_23 = ((wr * tmpr) - (wi * tmpi));
+                i5_23 = ((wi * tmpr) + (wr * tmpi));
+                r4_7 = (r5_7 + r5_23);
+                i4_7 = (i5_7 + i5_23);
+                r4_23 = (r5_7 - r5_23);
+                i4_23 = (i5_7 - i5_23);
+              }
+              {
+                REAL r5_15, i5_15;
+                REAL r5_31, i5_31;
+                wr = c_re(W[15 * l1]);
+                wi = c_im(W[15 * l1]);
+                tmpr = c_re(jp[15 * m]);
+                tmpi = c_im(jp[15 * m]);
+                r5_15 = ((wr * tmpr) - (wi * tmpi));
+                i5_15 = ((wi * tmpr) + (wr * tmpi));
+                wr = c_re(W[31 * l1]);
+                wi = c_im(W[31 * l1]);
+                tmpr = c_re(jp[31 * m]);
+                tmpi = c_im(jp[31 * m]);
+                r5_31 = ((wr * tmpr) - (wi * tmpi));
+                i5_31 = ((wi * tmpr) + (wr * tmpi));
+                r4_15 = (r5_15 + r5_31);
+                i4_15 = (i5_15 + i5_31);
+                r4_31 = (r5_15 - r5_31);
+                i4_31 = (i5_15 - i5_31);
+              }
+              r3_7 = (r4_7 + r4_15);
+              i3_7 = (i4_7 + i4_15);
+              r3_23 = (r4_7 - r4_15);
+              i3_23 = (i4_7 - i4_15);
+              r3_15 = (r4_23 + i4_31);
+              i3_15 = (i4_23 - r4_31);
+              r3_31 = (r4_23 - i4_31);
+              i3_31 = (i4_23 + r4_31);
+            }
+            r2_3 = (r3_3 + r3_7);
+            i2_3 = (i3_3 + i3_7);
+            r2_19 = (r3_3 - r3_7);
+            i2_19 = (i3_3 - i3_7);
+            tmpr = (0.707106781187 * (r3_15 + i3_15));
+            tmpi = (0.707106781187 * (i3_15 - r3_15));
+            r2_7 = (r3_11 + tmpr);
+            i2_7 = (i3_11 + tmpi);
+            r2_23 = (r3_11 - tmpr);
+            i2_23 = (i3_11 - tmpi);
+            r2_11 = (r3_19 + i3_23);
+            i2_11 = (i3_19 - r3_23);
+            r2_27 = (r3_19 - i3_23);
+            i2_27 = (i3_19 + r3_23);
+            tmpr = (0.707106781187 * (i3_31 - r3_31));
+            tmpi = (0.707106781187 * (r3_31 + i3_31));
+            r2_15 = (r3_27 + tmpr);
+            i2_15 = (i3_27 - tmpi);
+            r2_31 = (r3_27 - tmpr);
+            i2_31 = (i3_27 + tmpi);
+          }
+          r1_1 = (r2_1 + r2_3);
+          i1_1 = (i2_1 + i2_3);
+          r1_17 = (r2_1 - r2_3);
+          i1_17 = (i2_1 - i2_3);
+          tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7));
+          tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7));
+          r1_3 = (r2_5 + tmpr);
+          i1_3 = (i2_5 + tmpi);
+          r1_19 = (r2_5 - tmpr);
+          i1_19 = (i2_5 - tmpi);
+          tmpr = (0.707106781187 * (r2_11 + i2_11));
+          tmpi = (0.707106781187 * (i2_11 - r2_11));
+          r1_5 = (r2_9 + tmpr);
+          i1_5 = (i2_9 + tmpi);
+          r1_21 = (r2_9 - tmpr);
+          i1_21 = (i2_9 - tmpi);
+          tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15));
+          tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15));
+          r1_7 = (r2_13 + tmpr);
+          i1_7 = (i2_13 + tmpi);
+          r1_23 = (r2_13 - tmpr);
+          i1_23 = (i2_13 - tmpi);
+          r1_9 = (r2_17 + i2_19);
+          i1_9 = (i2_17 - r2_19);
+          r1_25 = (r2_17 - i2_19);
+          i1_25 = (i2_17 + r2_19);
+          tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23));
+          tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23));
+          r1_11 = (r2_21 + tmpr);
+          i1_11 = (i2_21 - tmpi);
+          r1_27 = (r2_21 - tmpr);
+          i1_27 = (i2_21 + tmpi);
+          tmpr = (0.707106781187 * (i2_27 - r2_27));
+          tmpi = (0.707106781187 * (r2_27 + i2_27));
+          r1_13 = (r2_25 + tmpr);
+          i1_13 = (i2_25 - tmpi);
+          r1_29 = (r2_25 - tmpr);
+          i1_29 = (i2_25 + tmpi);
+          tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31));
+          tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31));
+          r1_15 = (r2_29 + tmpr);
+          i1_15 = (i2_29 - tmpi);
+          r1_31 = (r2_29 - tmpr);
+          i1_31 = (i2_29 + tmpi);
+        }
+        c_re(kp[0 * m]) = (r1_0 + r1_1);
+        c_im(kp[0 * m]) = (i1_0 + i1_1);
+        c_re(kp[16 * m]) = (r1_0 - r1_1);
+        c_im(kp[16 * m]) = (i1_0 - i1_1);
+        tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3));
+        tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3));
+        c_re(kp[1 * m]) = (r1_2 + tmpr);
+        c_im(kp[1 * m]) = (i1_2 + tmpi);
+        c_re(kp[17 * m]) = (r1_2 - tmpr);
+        c_im(kp[17 * m]) = (i1_2 - tmpi);
+        tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5));
+        tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5));
+        c_re(kp[2 * m]) = (r1_4 + tmpr);
+        c_im(kp[2 * m]) = (i1_4 + tmpi);
+        c_re(kp[18 * m]) = (r1_4 - tmpr);
+        c_im(kp[18 * m]) = (i1_4 - tmpi);
+        tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7));
+        tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7));
+        c_re(kp[3 * m]) = (r1_6 + tmpr);
+        c_im(kp[3 * m]) = (i1_6 + tmpi);
+        c_re(kp[19 * m]) = (r1_6 - tmpr);
+        c_im(kp[19 * m]) = (i1_6 - tmpi);
+        tmpr = (0.707106781187 * (r1_9 + i1_9));
+        tmpi = (0.707106781187 * (i1_9 - r1_9));
+        c_re(kp[4 * m]) = (r1_8 + tmpr);
+        c_im(kp[4 * m]) = (i1_8 + tmpi);
+        c_re(kp[20 * m]) = (r1_8 - tmpr);
+        c_im(kp[20 * m]) = (i1_8 - tmpi);
+        tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11));
+        tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11));
+        c_re(kp[5 * m]) = (r1_10 + tmpr);
+        c_im(kp[5 * m]) = (i1_10 + tmpi);
+        c_re(kp[21 * m]) = (r1_10 - tmpr);
+        c_im(kp[21 * m]) = (i1_10 - tmpi);
+        tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13));
+        tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13));
+        c_re(kp[6 * m]) = (r1_12 + tmpr);
+        c_im(kp[6 * m]) = (i1_12 + tmpi);
+        c_re(kp[22 * m]) = (r1_12 - tmpr);
+        c_im(kp[22 * m]) = (i1_12 - tmpi);
+        tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15));
+        tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15));
+        c_re(kp[7 * m]) = (r1_14 + tmpr);
+        c_im(kp[7 * m]) = (i1_14 + tmpi);
+        c_re(kp[23 * m]) = (r1_14 - tmpr);
+        c_im(kp[23 * m]) = (i1_14 - tmpi);
+        c_re(kp[8 * m]) = (r1_16 + i1_17);
+        c_im(kp[8 * m]) = (i1_16 - r1_17);
+        c_re(kp[24 * m]) = (r1_16 - i1_17);
+        c_im(kp[24 * m]) = (i1_16 + r1_17);
+        tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19));
+        tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19));
+        c_re(kp[9 * m]) = (r1_18 + tmpr);
+        c_im(kp[9 * m]) = (i1_18 - tmpi);
+        c_re(kp[25 * m]) = (r1_18 - tmpr);
+        c_im(kp[25 * m]) = (i1_18 + tmpi);
+        tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21));
+        tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21));
+        c_re(kp[10 * m]) = (r1_20 + tmpr);
+        c_im(kp[10 * m]) = (i1_20 - tmpi);
+        c_re(kp[26 * m]) = (r1_20 - tmpr);
+        c_im(kp[26 * m]) = (i1_20 + tmpi);
+        tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23));
+        tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23));
+        c_re(kp[11 * m]) = (r1_22 + tmpr);
+        c_im(kp[11 * m]) = (i1_22 - tmpi);
+        c_re(kp[27 * m]) = (r1_22 - tmpr);
+        c_im(kp[27 * m]) = (i1_22 + tmpi);
+        tmpr = (0.707106781187 * (i1_25 - r1_25));
+        tmpi = (0.707106781187 * (r1_25 + i1_25));
+        c_re(kp[12 * m]) = (r1_24 + tmpr);
+        c_im(kp[12 * m]) = (i1_24 - tmpi);
+        c_re(kp[28 * m]) = (r1_24 - tmpr);
+        c_im(kp[28 * m]) = (i1_24 + tmpi);
+        tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27));
+        tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27));
+        c_re(kp[13 * m]) = (r1_26 + tmpr);
+        c_im(kp[13 * m]) = (i1_26 - tmpi);
+        c_re(kp[29 * m]) = (r1_26 - tmpr);
+        c_im(kp[29 * m]) = (i1_26 + tmpi);
+        tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29));
+        tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29));
+        c_re(kp[14 * m]) = (r1_28 + tmpr);
+        c_im(kp[14 * m]) = (i1_28 - tmpi);
+        c_re(kp[30 * m]) = (r1_28 - tmpr);
+        c_im(kp[30 * m]) = (i1_28 + tmpi);
+        tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31));
+        tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31));
+        c_re(kp[15 * m]) = (r1_30 + tmpr);
+        c_im(kp[15 * m]) = (i1_30 - tmpi);
+        c_re(kp[31 * m]) = (r1_30 - tmpr);
+        c_im(kp[31 * m]) = (i1_30 + tmpi);
+      }
+    }
+  } else {
+    int ab = (a + b) / 2;
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_twiddle_32, (a, ab, in, out, W, nW, nWdn, m));
+    fft_twiddle_32(ab, b, in, out, W, nW, nWdn, m);
+
+    fibril_join(&fr);
+  }
+}
+
+fibril static void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{ /* For each i in [a, b): out[i + k * m] = in[32 * i + k], k = 0..31; ranges of 128 or more indices are halved and forked. */
+  int i;
+  const COMPLEX *ip;
+  COMPLEX *jp;
+  if ((b - a) < 128) { /* small range: copy serially */
+    ip = in + a * 32; /* every index consumes 32 consecutive input elements */
+    for (i = a; i < b; ++i) {
+      jp = out + i; /* element k of the current group lands at out[i + k * m] */
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+      jp += 2 * m;
+      jp[0] = ip[0];
+      jp[m] = ip[1];
+      ip += 2;
+    }
+  } else {
+    int ab = (a + b) / 2; /* midpoint: [a, ab) is forked, [ab, b) runs in this call */
+
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, fft_unshuffle_32, (a, ab, in, out, m));
+    fft_unshuffle_32(ab, b, in, out, m);
+
+    fibril_join(&fr); /* join the forked half before returning */
+  }
+}
+
+/* end of machine-generated code */
+
+#endif /* end of include guard: FFT_H */
diff --git a/benchmarks/fib.cpp b/benchmarks/fib.cpp
new file mode 100644
index 00000000..37cbd0e0
--- /dev/null
+++ b/benchmarks/fib.cpp
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include "test.h"
+
+int n = 42; /* input: test() computes fib(n), verify() recomputes it with fib_fast(n) */
+int m;      /* result: written by test(), checked by verify() */
+
+static int fib_fast(int n)
+{
+	if (n < 2) return n;
+
+	/* Serial reference: advance the pair (F(k-1), F(k)) from k = 1 up to k = n. */
+	int prev = 0, curr = 1;
+
+	for (int k = 2; k <= n; ++k) {
+		int next = prev + curr;
+		prev = curr;
+		curr = next;
+	}
+	return curr;
+}
+
+fibril int fib(int n) /* Fork/join Fibonacci: returns n for n < 2, otherwise fib(n - 1) + fib(n - 2). */
+{
+	if (n < 2) return n;
+
+	int x, y;
+	fibril_t fr;
+	fibril_init(&fr);
+
+	fibril_fork(&fr, &x, fib, (n - 1)); /* 4-argument form: the result of fib(n - 1) is stored through &x */
+
+	y = fib(n - 2); /* the second operand is computed by a direct call */
+	fibril_join(&fr); /* join the forked call, which writes x, before x is read below */
+
+	return x + y;
+}
+
+int verify()
+{
+	/* Reference value from the serial implementation. */
+	const int expect = fib_fast(n);
+
+	if (expect == m) return 0;
+
+	/* Mismatch: report the computed and the expected value, then signal failure. */
+	printf("fib(%d)=%d (expected %d)\n", n, m, expect);
+	return 1;
+}
+
+void init() {} /* empty: this benchmark has no setup (presumably called by the harness in test.h; confirm) */
+void prep() {} /* empty: nothing to prepare */
+
+void test() { /* runs the benchmark: stores fib(n) in the global m */
+	m = fib(n);
+}
+
diff --git a/benchmarks/fibril.h b/benchmarks/fibril.h
new file mode 100644
index 00000000..c4360d5e
--- /dev/null
+++ b/benchmarks/fibril.h
@@ -0,0 +1,69 @@
+#ifndef FIBRIL_H
+#define FIBRIL_H
+
+#define FIBRIL_SUCCESS 0
+#define FIBRIL_FAILURE (-1) /* parenthesized so the sign cannot bind to neighbouring tokens */
+
+/**
+ * These are special arguments to fibril_rt_init().
+ * FIBRIL_NPROCS tells the runtime to fetch the number of processors
+ * from the environment variable FIBRIL_NPROCS (getenv("FIBRIL_NPROCS")).
+ * FIBRIL_NPROCS_ONLN tells the runtime to use all available processors
+ * in the system (sysconf(_SC_NPROCESSORS_ONLN)).
+ */
+#define FIBRIL_NPROCS 0
+#define FIBRIL_NPROCS_ONLN (-1)
+
+
+
+/** Backend selection: the first FIBRIL_<BACKEND> macro that is defined picks the header. Serial version. */
+#if defined(FIBRIL_SERIAL)
+#include "serial/serial.h"
+
+/** Cilkplus version. */
+#elif defined(FIBRIL_CILKPLUS)
+#include "cilkplus/cilkplus.h"
+
+/** TBB version. */
+#elif defined(FIBRIL_TBB)
+#include "tbb/tbb.h"
+
+/** OpenMP version. */
+#elif defined(FIBRIL_OPENMP)
+#include "openmp/openmp.h"
+
+/** Emper continuation version. */
+#elif defined(FIBRIL_EMPER_CONTINUATION)
+#include "emper_continuation/emper_continuation.h"
+
+/** Emper fiber version. */
+#elif defined(FIBRIL_EMPER_FIBER)
+#include "emper_fiber/emper_fiber.h"
+
+/** Fibril versions: the fibril_lf variant is checked first, then plain fibril. */
+#elif defined(FIBRIL_FIBRIL_LF)
+#include "fibril_lf/fibrile.h"
+#elif defined(FIBRIL_FIBRIL)
+#include "fibril/fibrile.h"
+#endif
+
+/** fibril_fork(fr, fn, (args)) or fibril_fork(fr, &ret, fn, (args)): selects _fibril_fork_<N> from its own argument count N. */
+#define fibril_fork(...) _fibril_fork_(_fibril_nth(__VA_ARGS__), __VA_ARGS__)
+#define _fibril_fork_(n, ...) _fibril_concat(_fibril_fork_, n)(__VA_ARGS__)
+
+/** If nargs is 3, use the no-return-value version (the parenthesized argument list counts as one argument). */
+#define _fibril_fork_3(...) fibril_fork_nrt(__VA_ARGS__)
+
+/** If nargs is 4, use the with-return-value version (the extra argument is the result pointer). */
+#define _fibril_fork_4(...) fibril_fork_wrt(__VA_ARGS__)
+
+/** _fibril_nth(...) expands to its argument count (0..16): the arguments are passed twice, GNU ", ## __VA_ARGS__" drops the comma for an empty list, and the 33rd argument is picked. */
+#define _fibril_nth(...) _fibril_nth_(__VA_ARGS__, ## __VA_ARGS__, \
+    16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, \
+    8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 0)
+#define _fibril_nth_(_1, _1_, _2, _2_, _3, _3_, _4, _4_, _5, _5_, \
+    _6, _6_, _7, _7_, _8, _8_, _9, _9_, _10, _10_, _11, _11_, _12, _12_, \
+    _13, _13_, _14, _14_, _15, _15_, _16, _16_, N, ...) N
+#define _fibril_concat(left, right) left##right /* pastes the two tokens into one identifier */
+
+#endif /* end of include guard: FIBRIL_H */
diff --git a/benchmarks/fibril/CMakeLists.txt b/benchmarks/fibril/CMakeLists.txt
new file mode 100644
index 00000000..a2f98fd8
--- /dev/null
+++ b/benchmarks/fibril/CMakeLists.txt
@@ -0,0 +1,23 @@
+# Benchmarks linked against the Fibril runtime library.
+# FIBRIL_FIBRIL makes ../fibril.h include "fibril/fibrile.h".
+add_definitions(-DFIBRIL_FIBRIL)
+
+# Locate libfibril. FIBRIL_ROOT (CMake variable or environment variable) may
+# name a prefix containing lib/; the directory used by the original author is
+# kept as a fallback so existing setups continue to work.
+find_library(FIBRIL_LIB fibril
+  HINTS ${FIBRIL_ROOT} ENV FIBRIL_ROOT
+  PATHS /home/nicolas/uni/ma/fibril/build/lib
+  PATH_SUFFIXES lib)
+
+if(NOT FIBRIL_LIB)
+  message(FATAL_ERROR "libfibril not found: set FIBRIL_ROOT or pass -DFIBRIL_LIB=<library file>")
+endif()
+
+# One <name>_fibril executable per benchmark source in the parent directory.
+foreach(benchmark
+    cholesky fft fib heat integrate knapsack
+    lu matmul nqueens quicksort rectmul strassen)
+  add_executable(${benchmark}_fibril ../${benchmark}.cpp)
+  target_link_libraries(${benchmark}_fibril "${FIBRIL_LIB}")
+endforeach()
diff --git a/benchmarks/fibril/fibrile.h b/benchmarks/fibril/fibrile.h
new file mode 100644
index 00000000..8d24685f
--- /dev/null
+++ b/benchmarks/fibril/fibrile.h
@@ -0,0 +1,97 @@
+#ifndef FIBRILE_H
+#define FIBRILE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "fibrili.h"
+
+/** fibril: marks a function so that it is compiled with the "no-omit-frame-pointer" optimize option (GCC attribute). */
+#define fibril __attribute__((optimize("no-omit-frame-pointer")))
+
+/** fibril_t: the handle type passed to fibril_init/fibril_fork/fibril_join; an alias of struct _fibril_t. */
+typedef struct _fibril_t fibril_t;
+
+/** fibril_init: resets *frptr (lock = 0, unmapped = 0, count = -1) and records the current rbp/rsp as stack bottom/top. */
+__attribute__((always_inline)) extern inline
+void fibril_init(fibril_t * frptr)
+{
+  register void * rbp asm ("rbp"); /* local bound to the x86-64 rbp register */
+  register void * rsp asm ("rsp"); /* local bound to the x86-64 rsp register */
+
+  frptr->lock = 0;
+  frptr->unmapped = 0;
+  frptr->count = -1; /* fibril_join() only calls into the runtime when count > -1 */
+  frptr->stack.btm = rbp; /* always_inline: presumably the enclosing function's frame; confirm */
+  frptr->stack.top = rsp;
+}
+
+/** fibril_join: calls fibrili_join(frptr), wrapped in fibrili_membar, only when frptr->count > -1; otherwise does nothing. */
+__attribute__((always_inline)) extern inline
+void fibril_join(fibril_t * frptr)
+{
+  if (frptr->count > -1) { /* count is -1 right after fibril_init() */
+    fibrili_membar(fibrili_join(frptr));
+  }
+}
+
+#include "fork.h"
+
+#ifdef __cplusplus
+
+/** fibril_fork_nrt(fp, fn, ag), C++: a captureless noinline lambda pushes the fibril, calls fn with the forwarded arguments and calls fibrili_resume(f) if fibrili_pop() returns 0. */
+#define fibril_fork_nrt(fp, fn, ag) do { \
+  auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f) __attribute__((noinline, hot, optimize(3))) { \
+    fibrili_push(f); \
+    fn(_fibril_args ag); \
+    if (!fibrili_pop()) fibrili_resume(f); \
+  }; \
+  fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \
+} while (0)
+
+/** fibril_fork_wrt(fp, rtp, fn, ag), C++: same sequence as fibril_fork_nrt, but the return value of fn is stored through rtp. */
+#define fibril_fork_wrt(fp, rtp, fn, ag) do { \
+  auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f, __typeof__(rtp) p) __attribute__((noinline, hot, optimize(3))) { \
+    fibrili_push(f); \
+    *p = fn(_fibril_args ag); \
+    if (!fibrili_pop()) fibrili_resume(f); \
+  }; \
+  fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp, rtp)); \
+} while (0)
+
+#else
+
+/** fibril_fork_nrt(fp, fn, ag), C: same sequence, with the wrapper written as a function defined inside the block (GNU C nested function). */
+#define fibril_fork_nrt(fp, fn, ag) do { \
+  __attribute__((noinline, hot, optimize(3))) \
+  void _fibril_##fn##_fork(_fibril_defs ag fibril_t * f) { \
+    fibrili_push(f); \
+    fn(_fibril_args ag); \
+    if (!fibrili_pop()) fibrili_resume(f); \
+  } \
+  fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \
+} while (0)
+
+/** fibril_fork_wrt(fp, rtp, fn, ag), C: nested-function variant that also stores the return value of fn through rtp. */
+#define fibril_fork_wrt(fp, rtp, fn, ag) do { \
+  __attribute__((noinline, hot, optimize(3))) \
+  void _fibril_##fn##_fork(_fibril_defs ag fibril_t * f, __typeof__(rtp) p) { \
+    fibrili_push(f); \
+    *p = fn(_fibril_args ag); \
+    if (!fibrili_pop()) fibrili_resume(f); \
+  } \
+  fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp, rtp)); \
+} while (0)
+
+#endif
+
+extern int fibril_rt_init(int nprocs); /* runtime entry point, defined in the library; fibril.h documents FIBRIL_NPROCS and FIBRIL_NPROCS_ONLN as special nprocs values */
+extern int fibril_rt_exit(void);       /* (void) makes this a real prototype in C as well as C++ */
+extern int fibril_rt_nprocs(void);     /* takes no arguments; defined in the library */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* end of include guard: FIBRILE_H */
diff --git a/benchmarks/fibril/fibrili.h b/benchmarks/fibril/fibrili.h
new file mode 100644
index 00000000..11e94397
--- /dev/null
+++ b/benchmarks/fibril/fibrili.h
@@ -0,0 +1,90 @@
+#ifndef FIBRILI_H
+#define FIBRILI_H
+
+struct _fibril_t { /* state behind a fibril_t handle: reset by fibril_init(), queued by fibrili_push() */
+  char lock;      /* set to 0 by fibril_init() */
+  char unmapped;  /* set to 0 by fibril_init() */
+  int count;      /* -1 after fibril_init(); fibril_join() calls the runtime only when > -1 */
+  struct {
+    void * btm;   /* rbp value recorded by fibril_init() */
+    void * top;   /* rsp value recorded by fibril_init() */
+    void * ptr;   /* not written in this header; NOTE(review): presumably managed by the runtime, confirm */
+  } stack;
+  void * pc;      /* return address stored by fibrili_push() */
+};
+
+extern __thread struct _fibrili_deque_t { /* one instance per thread (__thread); holds the pointers queued by fibrili_push() */
+  char lock;          /* taken by fibrili_pop() when head has passed the slot it tried to take */
+  int  head;          /* compared against tail in fibrili_pop(); reset to 0 together with tail there */
+  int  tail;          /* next free slot: fibrili_push() stores at tail++, fibrili_pop() decrements it */
+  void * stack;       /* not used in this header */
+  void * buff[1000];  /* slots; NOTE(review): fibrili_push() performs no bounds check against 1000 */
+} fibrili_deq;
+
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 7))
+/* GCC newer than 4.7, including every minor release of GCC 5 and later: fence/spin-lock/unlock on the __atomic builtins. */
+#define fibrili_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST)
+#define fibrili_lock(l) do { \
+  __asm__ ( "pause" : : : "memory" ); \
+} while (__atomic_test_and_set(&(l), __ATOMIC_ACQUIRE))
+#define fibrili_unlock(l) __atomic_clear(&(l), __ATOMIC_RELEASE)
+
+#else
+#if defined(__x86_64__) || defined(_M_X64_)
+/* Other compilers on x86-64: same three operations on the legacy __sync builtins; the lock loops (pause, test-and-set) until the flag was clear. */
+#define fibrili_fence() __sync_synchronize()
+#define fibrili_lock(l) do { \
+  __asm__ ( "pause" ::: "memory" ); \
+} while (__sync_lock_test_and_set(&(l), 1))
+#define fibrili_unlock(l) __sync_lock_release(&(l))
+
+#endif
+#endif
+
+__attribute__((noinline)) extern
+void fibrili_join(struct _fibril_t * frptr); /* called by fibril_join() when frptr->count > -1; only declared here */
+__attribute__((noreturn)) extern
+void fibrili_resume(struct _fibril_t * frptr); /* declared noreturn; called by the fork wrappers when fibrili_pop() returns 0 */
+
+#define fibrili_push(frptr) do { /* store the enclosing function's return address in frptr->pc, then append frptr at the deque tail */ \
+  (frptr)->pc = __builtin_return_address(0); \
+  fibrili_deq.buff[fibrili_deq.tail++] = (frptr); \
+} while (0)
+
+__attribute__((hot)) static
+int fibrili_pop(void) /* Takes the newest entry off this thread's deque: returns 1 on success, 0 if tail is 0 or head is past the entry even under the lock. */
+{
+  int tail = fibrili_deq.tail;
+
+  if (tail == 0) return 0; /* nothing has been pushed */
+
+  fibrili_deq.tail = --tail; /* tentatively take the last slot */
+
+  fibrili_fence(); /* fence between the tail store above and the head load below */
+
+  if (fibrili_deq.head > tail) { /* head is past the slot; presumably advanced by another thread, confirm in the runtime */
+    fibrili_deq.tail = tail + 1; /* undo the tentative decrement before taking the lock */
+
+    fibrili_lock(fibrili_deq.lock);
+
+    if (fibrili_deq.head > tail) { /* still past the slot under the lock: reset the deque and report failure */
+      fibrili_deq.head = 0;
+      fibrili_deq.tail = 0;
+
+      fibrili_unlock(fibrili_deq.lock);
+      return 0;
+    }
+
+    fibrili_deq.tail = tail; /* the slot is still there: take it while holding the lock */
+    fibrili_unlock(fibrili_deq.lock);
+  }
+
+  return 1;
+}
+
+#define fibrili_membar(call) do { /* evaluate call, then declare rbx, r12-r15 and memory clobbered so the compiler reloads them afterwards */ \
+  call; \
+  __asm__ ( "nop" : : : "rbx", "r12", "r13", "r14", "r15", "memory" ); \
+} while (0)
+
+#endif /* end of include guard: FIBRILI_H */
diff --git a/benchmarks/fibril/fork.h b/benchmarks/fibril/fork.h
new file mode 100644
index 00000000..8ab080b0
--- /dev/null
+++ b/benchmarks/fibril/fork.h
@@ -0,0 +1,70 @@
+#ifndef FIBRIL_FORK_H
+#define FIBRIL_FORK_H
+/* _fibril_defs(args...): for N arguments (0..16) expands to "__typeof__(arg) aN, ..., __typeof__(arg) a1," with a comma after every declaration; relies on _fibril_nth and _fibril_concat from fibril.h. */
+#define _fibril_defs(...) \
+  _fibril_defs_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_defs_(n, ...) \
+  _fibril_concat(_fibril_defs_, n)(__VA_ARGS__)
+#define _fibril_defs_16(a,...) __typeof__(a) a16,_fibril_defs_15(__VA_ARGS__)
+#define _fibril_defs_15(a,...) __typeof__(a) a15,_fibril_defs_14(__VA_ARGS__)
+#define _fibril_defs_14(a,...) __typeof__(a) a14,_fibril_defs_13(__VA_ARGS__)
+#define _fibril_defs_13(a,...) __typeof__(a) a13,_fibril_defs_12(__VA_ARGS__)
+#define _fibril_defs_12(a,...) __typeof__(a) a12,_fibril_defs_11(__VA_ARGS__)
+#define _fibril_defs_11(a,...) __typeof__(a) a11,_fibril_defs_10(__VA_ARGS__)
+#define _fibril_defs_10(a,...) __typeof__(a) a10,_fibril_defs_9 (__VA_ARGS__)
+#define _fibril_defs_9(a, ...) __typeof__(a) a9, _fibril_defs_8 (__VA_ARGS__)
+#define _fibril_defs_8(a, ...) __typeof__(a) a8, _fibril_defs_7 (__VA_ARGS__)
+#define _fibril_defs_7(a, ...) __typeof__(a) a7, _fibril_defs_6 (__VA_ARGS__)
+#define _fibril_defs_6(a, ...) __typeof__(a) a6, _fibril_defs_5 (__VA_ARGS__)
+#define _fibril_defs_5(a, ...) __typeof__(a) a5, _fibril_defs_4 (__VA_ARGS__)
+#define _fibril_defs_4(a, ...) __typeof__(a) a4, _fibril_defs_3 (__VA_ARGS__)
+#define _fibril_defs_3(a, ...) __typeof__(a) a3, _fibril_defs_2 (__VA_ARGS__)
+#define _fibril_defs_2(a, ...) __typeof__(a) a2, _fibril_defs_1 (__VA_ARGS__)
+#define _fibril_defs_1(a)      __typeof__(a) a1,
+#define _fibril_defs_0()
+/* _fibril_args(args...): for N arguments (0..16) expands to the names "aN, ..., a1" with no trailing comma; the argument values themselves are discarded. */
+#define _fibril_args(...) \
+  _fibril_args_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_args_(n, ...) \
+  _fibril_concat(_fibril_args_, n)(__VA_ARGS__)
+#define _fibril_args_16(a,...) a16,_fibril_args_15(__VA_ARGS__)
+#define _fibril_args_15(a,...) a15,_fibril_args_14(__VA_ARGS__)
+#define _fibril_args_14(a,...) a14,_fibril_args_13(__VA_ARGS__)
+#define _fibril_args_13(a,...) a13,_fibril_args_12(__VA_ARGS__)
+#define _fibril_args_12(a,...) a12,_fibril_args_11(__VA_ARGS__)
+#define _fibril_args_11(a,...) a11,_fibril_args_10(__VA_ARGS__)
+#define _fibril_args_10(a,...) a10,_fibril_args_9 (__VA_ARGS__)
+#define _fibril_args_9(a, ...) a9, _fibril_args_8 (__VA_ARGS__)
+#define _fibril_args_8(a, ...) a8, _fibril_args_7 (__VA_ARGS__)
+#define _fibril_args_7(a, ...) a7, _fibril_args_6 (__VA_ARGS__)
+#define _fibril_args_6(a, ...) a6, _fibril_args_5 (__VA_ARGS__)
+#define _fibril_args_5(a, ...) a5, _fibril_args_4 (__VA_ARGS__)
+#define _fibril_args_4(a, ...) a4, _fibril_args_3 (__VA_ARGS__)
+#define _fibril_args_3(a, ...) a3, _fibril_args_2 (__VA_ARGS__)
+#define _fibril_args_2(a, ...) a2, _fibril_args_1 (__VA_ARGS__)
+#define _fibril_args_1(a)      a1
+#define _fibril_args_0()
+/* _fibril_expand(args...): for 1..16 arguments expands to the arguments themselves followed by a comma; expands to nothing for an empty list. */
+#define _fibril_expand(...) \
+  _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_expand_(n, ...) \
+  _fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
+#define _fibril_expand_16(...) __VA_ARGS__,
+#define _fibril_expand_15(...) __VA_ARGS__,
+#define _fibril_expand_14(...) __VA_ARGS__,
+#define _fibril_expand_13(...) __VA_ARGS__,
+#define _fibril_expand_12(...) __VA_ARGS__,
+#define _fibril_expand_11(...) __VA_ARGS__,
+#define _fibril_expand_10(...) __VA_ARGS__,
+#define _fibril_expand_9( ...) __VA_ARGS__,
+#define _fibril_expand_8( ...) __VA_ARGS__,
+#define _fibril_expand_7( ...) __VA_ARGS__,
+#define _fibril_expand_6( ...) __VA_ARGS__,
+#define _fibril_expand_5( ...) __VA_ARGS__,
+#define _fibril_expand_4( ...) __VA_ARGS__,
+#define _fibril_expand_3( ...) __VA_ARGS__,
+#define _fibril_expand_2( ...) __VA_ARGS__,
+#define _fibril_expand_1( ...) __VA_ARGS__,
+#define _fibril_expand_0()
+
+#endif /* end of include guard: FIBRIL_FORK_H */
diff --git a/benchmarks/fibril_lf/CMakeLists.txt b/benchmarks/fibril_lf/CMakeLists.txt
new file mode 100644
index 00000000..891e7720
--- /dev/null
+++ b/benchmarks/fibril_lf/CMakeLists.txt
@@ -0,0 +1,41 @@
+
+add_definitions(-DFIBRIL_FIBRIL_LF)
+# Locate libfibril: -DFIBRIL_LF_ROOT=<prefix> or the FIBRIL_LF_ROOT environment variable win; the original developer path stays as a last-resort fallback.
+find_library(FIBRIL_LF_LIB fibril HINTS ${FIBRIL_LF_ROOT} ENV FIBRIL_LF_ROOT PATHS /home/nicolas/uni/ma/fibril_wf/build/lib PATH_SUFFIXES lib)
+
+# One executable per benchmark source, each linked against the library found above.
+add_executable(cholesky_fibril_lf ../cholesky.cpp)
+target_link_libraries(cholesky_fibril_lf "${FIBRIL_LF_LIB}")
+
+add_executable(fft_fibril_lf ../fft.cpp)
+target_link_libraries(fft_fibril_lf "${FIBRIL_LF_LIB}")
+
+add_executable(fib_fibril_lf ../fib.cpp)
+target_link_libraries(fib_fibril_lf "${FIBRIL_LF_LIB}")
+
+add_executable(heat_fibril_lf ../heat.cpp)
+target_link_libraries(heat_fibril_lf "${FIBRIL_LF_LIB}")
+
+add_executable(integrate_fibril_lf ../integrate.cpp)
+target_link_libraries(integrate_fibril_lf "${FIBRIL_LF_LIB}")
+
+add_executable(knapsack_fibril_lf ../knapsack.cpp)
+target_link_libraries(knapsack_fibril_lf "${FIBRIL_LF_LIB}")
+
+add_executable(lu_fibril_lf ../lu.cpp)
+target_link_libraries(lu_fibril_lf "${FIBRIL_LF_LIB}")
+
+add_executable(matmul_fibril_lf ../matmul.cpp)
+target_link_libraries(matmul_fibril_lf "${FIBRIL_LF_LIB}")
+
+add_executable(nqueens_fibril_lf ../nqueens.cpp)
+target_link_libraries(nqueens_fibril_lf "${FIBRIL_LF_LIB}")
+
+add_executable(quicksort_fibril_lf ../quicksort.cpp)
+target_link_libraries(quicksort_fibril_lf "${FIBRIL_LF_LIB}")
+
+add_executable(rectmul_fibril_lf ../rectmul.cpp)
+target_link_libraries(rectmul_fibril_lf "${FIBRIL_LF_LIB}")
+
+add_executable(strassen_fibril_lf ../strassen.cpp)
+target_link_libraries(strassen_fibril_lf "${FIBRIL_LF_LIB}")
diff --git a/benchmarks/fibril_lf/fork.h b/benchmarks/fibril_lf/fork.h
new file mode 100644
index 00000000..8ab080b0
--- /dev/null
+++ b/benchmarks/fibril_lf/fork.h
@@ -0,0 +1,70 @@
+#ifndef FIBRIL_FORK_H
+#define FIBRIL_FORK_H
+
+#define _fibril_defs(...) /* dispatch on _fibril_nth(...) (defined elsewhere; presumably the argument count) */ \
+  _fibril_defs_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_defs_(n, ...) /* build the name _fibril_defs_<n> via _fibril_concat (defined elsewhere) and invoke it */ \
+  _fibril_concat(_fibril_defs_, n)(__VA_ARGS__)
+#define _fibril_defs_16(a,...) __typeof__(a) a16,_fibril_defs_15(__VA_ARGS__) /* each step: declare aN with the type of the leading argument, then recurse on the rest */
+#define _fibril_defs_15(a,...) __typeof__(a) a15,_fibril_defs_14(__VA_ARGS__)
+#define _fibril_defs_14(a,...) __typeof__(a) a14,_fibril_defs_13(__VA_ARGS__)
+#define _fibril_defs_13(a,...) __typeof__(a) a13,_fibril_defs_12(__VA_ARGS__)
+#define _fibril_defs_12(a,...) __typeof__(a) a12,_fibril_defs_11(__VA_ARGS__)
+#define _fibril_defs_11(a,...) __typeof__(a) a11,_fibril_defs_10(__VA_ARGS__)
+#define _fibril_defs_10(a,...) __typeof__(a) a10,_fibril_defs_9 (__VA_ARGS__)
+#define _fibril_defs_9(a, ...) __typeof__(a) a9, _fibril_defs_8 (__VA_ARGS__)
+#define _fibril_defs_8(a, ...) __typeof__(a) a8, _fibril_defs_7 (__VA_ARGS__)
+#define _fibril_defs_7(a, ...) __typeof__(a) a7, _fibril_defs_6 (__VA_ARGS__)
+#define _fibril_defs_6(a, ...) __typeof__(a) a6, _fibril_defs_5 (__VA_ARGS__)
+#define _fibril_defs_5(a, ...) __typeof__(a) a5, _fibril_defs_4 (__VA_ARGS__)
+#define _fibril_defs_4(a, ...) __typeof__(a) a4, _fibril_defs_3 (__VA_ARGS__)
+#define _fibril_defs_3(a, ...) __typeof__(a) a3, _fibril_defs_2 (__VA_ARGS__)
+#define _fibril_defs_2(a, ...) __typeof__(a) a2, _fibril_defs_1 (__VA_ARGS__)
+#define _fibril_defs_1(a)      __typeof__(a) a1, /* the final declaration keeps a trailing comma */
+#define _fibril_defs_0() /* zero-argument case: expands to nothing */
+
+#define _fibril_args(...) /* dispatch on _fibril_nth(...) (defined elsewhere; presumably the argument count) */ \
+  _fibril_args_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_args_(n, ...) /* build the name _fibril_args_<n> via _fibril_concat (defined elsewhere) and invoke it */ \
+  _fibril_concat(_fibril_args_, n)(__VA_ARGS__)
+#define _fibril_args_16(a,...) a16,_fibril_args_15(__VA_ARGS__) /* each step: emit the name aN (the argument text itself is dropped), then recurse on the rest */
+#define _fibril_args_15(a,...) a15,_fibril_args_14(__VA_ARGS__)
+#define _fibril_args_14(a,...) a14,_fibril_args_13(__VA_ARGS__)
+#define _fibril_args_13(a,...) a13,_fibril_args_12(__VA_ARGS__)
+#define _fibril_args_12(a,...) a12,_fibril_args_11(__VA_ARGS__)
+#define _fibril_args_11(a,...) a11,_fibril_args_10(__VA_ARGS__)
+#define _fibril_args_10(a,...) a10,_fibril_args_9 (__VA_ARGS__)
+#define _fibril_args_9(a, ...) a9, _fibril_args_8 (__VA_ARGS__)
+#define _fibril_args_8(a, ...) a8, _fibril_args_7 (__VA_ARGS__)
+#define _fibril_args_7(a, ...) a7, _fibril_args_6 (__VA_ARGS__)
+#define _fibril_args_6(a, ...) a6, _fibril_args_5 (__VA_ARGS__)
+#define _fibril_args_5(a, ...) a5, _fibril_args_4 (__VA_ARGS__)
+#define _fibril_args_4(a, ...) a4, _fibril_args_3 (__VA_ARGS__)
+#define _fibril_args_3(a, ...) a3, _fibril_args_2 (__VA_ARGS__)
+#define _fibril_args_2(a, ...) a2, _fibril_args_1 (__VA_ARGS__)
+#define _fibril_args_1(a)      a1 /* last name: no trailing comma, unlike _fibril_defs_1 */
+#define _fibril_args_0() /* zero-argument case: expands to nothing */
+
+#define _fibril_expand(...) /* dispatch on _fibril_nth(...) (defined elsewhere; presumably the argument count) */ \
+  _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_expand_(n, ...) /* build the name _fibril_expand_<n> via _fibril_concat (defined elsewhere) and invoke it */ \
+  _fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
+#define _fibril_expand_16(...) __VA_ARGS__, /* _fibril_expand_1 .. _16: the arguments unchanged, followed by one comma */
+#define _fibril_expand_15(...) __VA_ARGS__,
+#define _fibril_expand_14(...) __VA_ARGS__,
+#define _fibril_expand_13(...) __VA_ARGS__,
+#define _fibril_expand_12(...) __VA_ARGS__,
+#define _fibril_expand_11(...) __VA_ARGS__,
+#define _fibril_expand_10(...) __VA_ARGS__,
+#define _fibril_expand_9( ...) __VA_ARGS__,
+#define _fibril_expand_8( ...) __VA_ARGS__,
+#define _fibril_expand_7( ...) __VA_ARGS__,
+#define _fibril_expand_6( ...) __VA_ARGS__,
+#define _fibril_expand_5( ...) __VA_ARGS__,
+#define _fibril_expand_4( ...) __VA_ARGS__,
+#define _fibril_expand_3( ...) __VA_ARGS__,
+#define _fibril_expand_2( ...) __VA_ARGS__,
+#define _fibril_expand_1( ...) __VA_ARGS__,
+#define _fibril_expand_0() /* zero-argument case: expands to nothing */
+
+#endif /* end of include guard: FIBRIL_FORK_H */
diff --git a/benchmarks/heat.cpp b/benchmarks/heat.cpp
new file mode 100644
index 00000000..82da1ea4
--- /dev/null
+++ b/benchmarks/heat.cpp
@@ -0,0 +1,205 @@
+/*
+ * Heat diffusion (Jacobi-type iteration)
+ *
+ * Volker Strumpen, Boston                                 August 1996
+ *
+ * Copyright (c) 1996 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "test.h"
+
+#define f(x,y)     (sin(x)*sin(y))
+#define randa(x,t) (0.0)
+#define randb(x,t) (exp(-2*(t))*sin(x))
+#define randc(y,t) (0.0)
+#define randd(y,t) (exp(-2*(t))*sin(y))
+#define solu(x,y,t) (exp(-2*(t))*sin(x)*sin(y))
+
+int n = 4096;
+
+int nx, ny, nt;
+double xu, xo, yu, yo, tu, to;
+
+double dx, dy, dt;
+double dtdxsq, dtdysq;
+
+double **  odd;
+double ** even;
+
+fibril static void heat(double ** m, int il, int iu) /* write the t = 0 field into rows il .. iu-1 of m */
+{
+  if (iu - il > 1) {
+    int im = (il + iu) / 2;
+    /* Several rows: split at the midpoint, pass [il, im) to fibril_fork and handle [im, iu) here. */
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, heat, (m, il, im));
+    heat(m, im, iu);
+
+    fibril_join(&fr);
+    return;
+  }
+  /* At most one row left: fill row il. */
+  int i = il;
+  int j;
+  double * row = m[i];
+
+  if (i == 0) { /* first row: boundary values from randc */
+    for (j = 0; j < ny; ++j) {
+      row[j] = randc(yu + j * dy, 0);
+    }
+  } else if (i == nx - 1) { /* last row: boundary values from randd */
+    for (j = 0; j < ny; ++j) {
+      row[j] = randd(yu + j * dy, 0);
+    }
+  } else { /* inner row: randa / randb at the two ends, f(x, y) in between */
+    row[0] = randa(xu + i * dx, 0);
+    for (j = 1; j < ny - 1; ++j) {
+      row[j] = f(xu + i * dx, yu + j * dy);
+    }
+    row[ny - 1] = randb(xu + i * dx, 0);
+  }
+}
+
+fibril void diffuse(double ** out, double ** in, int il, int iu, double t) /* compute rows il .. iu-1 of out from in, boundaries taken at time t */
+{
+  if (iu - il > 1) {
+    int im = (il + iu) / 2;
+    /* Several rows: split at the midpoint, pass [il, im) to fibril_fork and handle [im, iu) here. */
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, diffuse, (out, in, il, im, t));
+    diffuse(out, in, im, iu, t);
+
+    fibril_join(&fr);
+    return;
+  }
+  /* At most one row left: compute row il of out. Only `out` is written; `in` is read-only here. */
+  int i = il;
+  int j;
+  double * row = out[i];
+
+  if (i == 0) { /* first row: boundary values from randc */
+    for (j = 0; j < ny; ++j) {
+      row[j] = randc(yu + j * dy, t);
+    }
+  } else if (i == nx - 1) { /* last row: boundary values from randd */
+    for (j = 0; j < ny; ++j) {
+      row[j] = randd(yu + j * dy, t);
+    }
+  } else {
+    row[0] = randa(xu + i * dx, t);
+    for (j = 1; j < ny - 1; ++j) { /* interior cell: old value plus weighted second differences along j and along i */
+      row[j] = in[i][j]
+        + dtdysq * (in[i][j + 1] - 2 * in[i][j] + in[i][j - 1])
+        + dtdxsq * (in[i + 1][j] - 2 * in[i][j] + in[i - 1][j]);
+    }
+    row[ny - 1] = randb(xu + i * dx, t);
+  }
+}
+
+void init() /* set the grid and time parameters and allocate the two nx x ny buffers */
+{
+  nx = n;
+  ny = 1024;
+  nt = 100;
+  xu = 0.0;
+  xo = 1.570796326794896558; /* approximately pi / 2 */
+  yu = 0.0;
+  yo = 1.570796326794896558; /* approximately pi / 2 */
+  tu = 0.0;
+  to = 0.0000001;
+  /* Step sizes: nx and ny count grid points, hence nx - 1 and ny - 1 intervals. */
+  dx = (xo - xu) / (nx - 1);
+  dy = (yo - yu) / (ny - 1);
+  dt = (to - tu) / nt;
+  /* Coefficients of the update formula in diffuse(). */
+  dtdxsq = dt / (dx * dx);
+  dtdysq = dt / (dy * dy);
+  /* Row-pointer tables; malloc results are not checked and nothing in this file frees them. */
+  even = (double**) malloc(sizeof(double *) * nx);
+  odd  = (double**) malloc(sizeof(double *) * nx);
+
+  int i;
+  for (i = 0; i < nx; ++i) { /* one separately allocated row of ny doubles per buffer */
+    even[i] = (double*) malloc(sizeof(double) * ny);
+    odd [i] = (double*) malloc(sizeof(double) * ny);
+  }
+}
+
+void prep() /* (re)writes the t = 0 field into `even`, the buffer test() reads first */
+{
+  heat(even, 0, nx);
+}
+
+void test() /* advance the field by exactly nt time steps of size dt, ping-ponging between `even` and `odd` */
+{
+  double t = tu;
+  int i;
+  /* Two steps per pass; `i < nt` (was `<=`) keeps an odd nt from running nt + 2 steps. Even nt is unaffected. */
+  for (i = 1; i < nt; i += 2) {
+    diffuse(odd, even, 0, nx, t += dt);
+    diffuse(even, odd, 0, nx, t += dt);
+  }
+  /* Odd nt: one extra step leaves the result in `odd`, which is the buffer verify() inspects in that case. */
+  if (nt % 2) {
+    diffuse(odd, even, 0, nx, t += dt);
+  }
+}
+
+int verify() /* returns 0 when the final field matches solu() at time `to` within 1e-12, otherwise prints the failing metric and returns 1 */
+{
+  double **mat;
+  double mae = 0.0; /* largest absolute error of any cell */
+  double mre = 0.0; /* largest relative error of any cell */
+  double me = 0.0; /* running sum of absolute errors, divided into a mean below */
+  /* test() finishes in `odd` when nt is odd, otherwise in `even`. */
+  mat = nt % 2 ? odd : even;
+
+  int a, b;
+
+  for (a = 0; a < nx; ++a) {
+    for (b = 0; b < ny; ++b) {
+      double tmp = fabs(mat[a][b] - solu(xu + a * dx, yu + b * dy, to));
+
+      me += tmp;
+      if (tmp > mae) mae = tmp;
+      if (mat[a][b] != 0.0) tmp = tmp / mat[a][b]; /* relative error; cells equal to 0 keep the absolute value */
+      if (tmp > mre) mre = tmp;
+    }
+  }
+
+  me = me / (nx * ny);
+  /* The three checks behave like an else-if chain because every failing branch returns. */
+  if (mae > 1e-12) {
+    printf("Local maximal absolute error %10e\n", mae);
+    return 1;
+  } if (mre > 1e-12) {
+    printf("Local maximal relative error %10e\n", mre);
+    return 1;
+  } if (me > 1e-12) {
+    printf("Global Mean absolute error %10e\n", me);
+    return 1;
+  }
+
+  return 0;
+}
+
diff --git a/benchmarks/integrate.cpp b/benchmarks/integrate.cpp
new file mode 100644
index 00000000..3e888c72
--- /dev/null
+++ b/benchmarks/integrate.cpp
@@ -0,0 +1,79 @@
+#include <stdio.h>
+#include "test.h"
+
+int n = 10000;
+
+static double m;
+static const double epsilon = 1.0e-9;
+
+static double f(double x) /* integrand: x^3 + x, written as (x^2 + 1) * x */
+{
+  return (x * x + 1.0) * x;
+}
+
+static /* serial reference: same recursion as integrate() but without fork/join */
+double integrate_serial(double x1, double y1, double x2, double y2, double area)
+{
+  double half = (x2 - x1) / 2;
+  double x0 = x1 + half;
+  double y0 = f(x0);
+  /* Trapezoid areas of the two halves [x1, x0] and [x0, x2]; y1 and y2 are supplied by the caller. */
+  double area_x1x0 = (y1 + y0) / 2 * half;
+  double area_x0x2 = (y0 + y2) / 2 * half;
+  double area_x1x2 = area_x1x0 + area_x0x2;
+  /* Stop once the refined estimate is within epsilon of the caller's estimate `area`. */
+  if (area_x1x2 - area < epsilon && area - area_x1x2 < epsilon) {
+    return area_x1x2;
+  }
+  /* Otherwise refine each half, passing its trapezoid area as the estimate to beat. */
+  area_x1x0 = integrate_serial(x1, y1, x0, y0, area_x1x0);
+  area_x0x2 = integrate_serial(x0, y0, x2, y2, area_x0x2);
+
+  return area_x1x0 + area_x0x2;
+}
+
+static fibril /* fork/join version of integrate_serial(): identical arithmetic, left half handed to fibril_fork */
+double integrate(double x1, double y1, double x2, double y2, double area)
+{
+  double half = (x2 - x1) / 2;
+  double x0 = x1 + half;
+  double y0 = f(x0);
+  /* Trapezoid areas of the two halves [x1, x0] and [x0, x2]; y1 and y2 are supplied by the caller. */
+  double area_x1x0 = (y1 + y0) / 2 * half;
+  double area_x0x2 = (y0 + y2) / 2 * half;
+  double area_x1x2 = area_x1x0 + area_x0x2;
+  /* Stop once the refined estimate is within epsilon of the caller's estimate `area`. */
+  if (area_x1x2 - area < epsilon && area - area_x1x2 < epsilon) {
+    return area_x1x2;
+  }
+
+  fibril_t fr;
+  fibril_init(&fr);
+  /* The forked call receives &area_x1x0 as its result slot; the right half is computed by this call. */
+  fibril_fork(&fr, &area_x1x0, integrate, (x1, y1, x0, y0, area_x1x0));
+  area_x0x2 = integrate(x0, y0, x2, y2, area_x0x2);
+  /* area_x1x0 is only read after the join. */
+  fibril_join(&fr);
+  return area_x1x0 + area_x0x2;
+}
+
+void init() {} /* benchmark hook: nothing to set up */
+void prep() {} /* benchmark hook: nothing to reset */
+
+void test() /* integrates f over [0, n] with an initial area estimate of 0 and stores the result in m */
+{
+  m = integrate(0, f(0), n, f(n), 0);
+}
+
+int verify() /* returns 0 when m is within epsilon of the serial result, otherwise prints both values and returns 1 */
+{
+  double expect = integrate_serial(0, f(0), n, f(n), 0);
+  /* Two one-sided comparisons, i.e. |m - expect| < epsilon. */
+  if (m - expect < epsilon && expect - m < epsilon) {
+    return 0;
+  }
+
+  printf("integrate(%d)=%lf (expected %lf)\n", n, m, expect);
+  return 1;
+}
+
diff --git a/benchmarks/knapsack.cpp b/benchmarks/knapsack.cpp
new file mode 100644
index 00000000..49bd5852
--- /dev/null
+++ b/benchmarks/knapsack.cpp
@@ -0,0 +1,165 @@
+/*
+ * Cilk program to solve the 0-1 knapsack problem using a branch-and-bound
+ * technique.
+ *
+ * Author: Matteo Frigo
+ */
+/*
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ * Copyright (c) 2000 Matteo Frigo
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "test.h"
+
+struct item { /* one knapsack candidate */
+  int value; /* added to the running value when the item is taken */
+  int weight; /* subtracted from the remaining capacity when the item is taken */
+};
+
+int n = 32;
+static int capacity = 900;
+static int sol;
+
+static struct item items[] = {
+  { 15, 23 },
+  { 22, 12 },
+  { 17, 42 },
+  { 1, 13 },
+  { 32, 21 },
+  { 65, 43 },
+  { 23, 56 },
+  { 4, 7 },
+  { 4, 8 },
+  { 32, 42 },
+  { 51, 32 },
+  { 22, 12 },
+  { 17, 24 },
+  { 12, 13 },
+  { 23, 21 },
+  { 56, 47 },
+  { 23, 65 },
+  { 6, 7 },
+  { 4, 7 },
+  { 32, 42 },
+  { 22, 42 },
+  { 59, 32 },
+  { 23, 12 },
+  { 12, 24 },
+  { 12, 13 },
+  { 23, 21 },
+  { 39, 48 },
+  { 22, 65 },
+  { 6, 7 },
+  { 4, 7 },
+  { 33, 42 },
+  { 18, 53 }
+};
+
+static int best_so_far = INT_MIN;
+
+/* qsort comparator: orders items by decreasing value/weight ratio. Uses qsort's real callback signature so the call through the function pointer is well defined. */
+static int compare(const void *pa, const void *pb)
+{
+  const struct item *a = (const struct item *) pa;
+  const struct item *b = (const struct item *) pb;
+  double c = ((double) a->value / a->weight) -
+    ((double) b->value / b->weight);
+
+  /* A higher ratio must sort first, hence the inverted sign. */
+  return (c > 0) ? -1 : (c < 0) ? 1 : 0;
+}
+
+/*
+ * Return the best total value reachable with the n items starting at e and
+ * remaining capacity c; v is the value collected so far. INT_MIN = infeasible or pruned.
+ */
+fibril static int knapsack(struct item *e, int c, int n, int v)
+{
+  int with, without, best;
+  double ub;
+
+  /* base case: full knapsack or no items */
+  if (c < 0)
+    return INT_MIN;
+
+  if (n == 0 || c == 0)
+    return v;		/* feasible solution, with value v */
+  /* Bound: c * e->value / e->weight is evaluated in int arithmetic before being added to (double) v. */
+  ub = (double) v + c * e->value / e->weight;
+
+  if (ub < best_so_far) {
+    /* prune ! */
+    return INT_MIN;
+  }
+
+  fibril_t fr;
+  fibril_init(&fr);
+  /*
+   * compute the best solution without the current item in the knapsack
+   */
+  fibril_fork(&fr, &without, knapsack, (e + 1, c, n - 1, v));
+
+  /* compute the best solution with the current item in the knapsack */
+  with = knapsack(e + 1, c - e->weight, n - 1, v + e->value);
+
+  fibril_join(&fr);
+  /* `without` is only read after the join. */
+  best = with > without ? with : without;
+
+  /*
+   * notice the race condition here. The program is still
+   * correct, in the sense that the best solution so far
+   * is at least best_so_far. Moreover best_so_far gets updated
+   * when returning, so eventually it should get the right
+   * value. The program is highly non-deterministic.
+   */
+  if (best > best_so_far)
+    best_so_far = best;
+
+  return best;
+}
+
+void init() /* sorts the first n entries of items[] in place */
+{
+  /* sort the items on decreasing order of value/weight; NOTE(review): items[] has 32 entries and n defaults to 32 -- confirm n can never exceed that */
+  qsort(items, n, sizeof(struct item),
+      (int (*)(const void *, const void *)) compare);
+}
+
+void prep() {} /* benchmark hook: nothing to reset (best_so_far is not reset here) */
+
+void test() /* solves the instance over the first n items with the full capacity and stores the result in sol */
+{
+  sol = knapsack(items, capacity, n, 0);
+}
+
+int verify() /* returns 0 when sol equals the hard-coded expected value, otherwise prints both and returns 1 */
+{
+  int expected = 733; /* NOTE(review): presumably the optimum for the default n = 32 / capacity = 900 instance -- confirm */
+
+  if (sol != expected) {
+    printf("sol: %d (expected: %d)\n", sol, expected);
+    return 1;
+  }
+
+  return 0;
+}
+
diff --git a/benchmarks/lu.cpp b/benchmarks/lu.cpp
new file mode 100644
index 00000000..e971e45f
--- /dev/null
+++ b/benchmarks/lu.cpp
@@ -0,0 +1,458 @@
+/****************************************************************************\
+ * LU decomposition
+ * Robert Blumofe
+ *
+ * Copyright (c) 1996, Robert Blumofe.  All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+\****************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "test.h"
+
+/* Define the size of a block. */
+#ifndef BLOCK_SIZE
+#define BLOCK_SIZE 16
+#endif
+
+/* Define the default matrix size. */
+#ifndef DEFAULT_SIZE
+#ifndef BENCHMARK
+#define DEFAULT_SIZE (16 * BLOCK_SIZE)
+#else
+#define DEFAULT_SIZE 4096
+#endif
+#endif
+
+/* A block is a 2D array of doubles. */
+typedef double Block[BLOCK_SIZE][BLOCK_SIZE];
+#define BLOCK(B,I,J) (B[I][J])
+
+/* A matrix is a 1D array of blocks. */
+typedef Block * Matrix;
+#define MATRIX(M,I,J) ((M)[(I)*nBlocks+(J)])
+
+/** Matrix size. */
+int n = DEFAULT_SIZE;
+
+/** The global matrix and a copy of the matrix. */
+static Matrix M, Msave;
+
+/* Matrix size in blocks. */
+static int nBlocks;
+
+/****************************************************************************\
+ * Utility routines.
+ \****************************************************************************/
+
+/*
+ * init_matrix - Fill the nb x nb blocks of M with rand() / RAND_MAX values, then multiply every diagonal element by 10.
+ */
+static void init_matrix(Matrix M, int nb)
+{
+  int I, J, K, i, j, k;
+
+  /* Seed the generator with the fixed value 1. */
+  srand(1);
+
+  /* For each element of each block, fill in random value (MATRIX() strides by the global nBlocks, not by nb). */
+  for (I = 0; I < nb; I++)
+    for (J = 0; J < nb; J++)
+      for (i = 0; i < BLOCK_SIZE; i++)
+        for (j = 0; j < BLOCK_SIZE; j++)
+          BLOCK(MATRIX(M, I, J), i, j) = ((double)rand()) / (double)RAND_MAX;
+
+  /* Inflate diagonal entries: element (k, k) of every diagonal block (K, K) is scaled by 10. */
+  for (K = 0; K < nb; K++)
+    for (k = 0; k < BLOCK_SIZE; k++)
+      BLOCK(MATRIX(M, K, K), k, k) *= 10.0;
+}
+
+/*
+ * print_matrix - Print the (nb * BLOCK_SIZE)-square matrix M to stdout, one row per line.
+ */
+static void print_matrix(Matrix M, int nb)
+{
+  int i, j;
+  (void) print_matrix; /* self-reference; presumably there to silence an unused-function warning */
+
+  /* Print out matrix: (i, j) are element coordinates, split into block index and in-block offset. */
+  for (i = 0; i < nb * BLOCK_SIZE; i++) {
+    for (j = 0; j < nb * BLOCK_SIZE; j++)
+      printf(" %6.4f",
+          BLOCK(MATRIX(M, i / BLOCK_SIZE, j / BLOCK_SIZE),
+            i % BLOCK_SIZE, j % BLOCK_SIZE));
+    printf("\n");
+  }
+}
+
+/*
+ * test_result - Check that matrix LU contains LU decomposition of M. Returns non-zero when the largest element-wise error exceeds 0.00001.
+ */
+static int test_result(Matrix LU, Matrix M, int nb)
+{
+  int I, J, K, i, j, k;
+  double diff, max_diff;
+  double v;
+  (void) test_result; /* self-reference; presumably there to silence an unused-function warning */
+
+  /* Initialize test. */
+  max_diff = 0.0;
+
+  /* Find maximum difference between any element of M and the product rebuilt from LU. */
+  for (i = 0; i < nb * BLOCK_SIZE; i++)
+    for (j = 0; j < nb * BLOCK_SIZE; j++) {
+      I = i / BLOCK_SIZE;
+      J = j / BLOCK_SIZE;
+      v = 0.0;
+      for (k = 0; k < i && k <= j; k++) { /* strictly-lower entries of row i times column j */
+        K = k / BLOCK_SIZE;
+        v += BLOCK(MATRIX(LU, I, K), i % BLOCK_SIZE,
+            k % BLOCK_SIZE) *
+          BLOCK(MATRIX(LU, K, J), k % BLOCK_SIZE,
+              j % BLOCK_SIZE);
+      }
+      if (k == i && k <= j) { /* diagonal term: the stored (i, j) entry is added with an implicit factor of 1 */
+        K = k / BLOCK_SIZE;
+        v += BLOCK(MATRIX(LU, K, J), k % BLOCK_SIZE,
+            j % BLOCK_SIZE);
+      }
+      diff = fabs(BLOCK(MATRIX(M, I, J), i % BLOCK_SIZE,
+            j % BLOCK_SIZE) - v);
+      if (diff > max_diff)
+        max_diff = diff;
+    }
+
+  /* Check maximum difference against threshold. */
+  return (max_diff > 0.00001);
+}
+
+/****************************************************************************\
+ * Element operations.
+ \****************************************************************************/
+/*
+ * elem_daxmy - Compute y' = y - ax where a is a double and x and y are
+ * vectors of n doubles; y is updated in place and n <= 0 is a no-op.
+ */
+static void elem_daxmy(double a, double *x, double *y, int n)
+{
+  for (n--; n >= 0; n--) y[n] -= a * x[n]; /* walks from the last element down to index 0 */
+}
+
+/****************************************************************************\
+ * Block operations.
+ \****************************************************************************/
+
+/*
+ * block_lu - Factor block B in place; no pivoting (no row exchange) is performed.
+ */
+static void block_lu(Block B)
+{
+  int i, k;
+
+  /* Factor block: divide B(i,k) by the pivot B(k,k), then subtract that multiple of row k from the rest of row i. */
+  for (k = 0; k < BLOCK_SIZE; k++)
+    for (i = k + 1; i < BLOCK_SIZE; i++) {
+      BLOCK(B, i, k) /= BLOCK(B, k, k);
+      elem_daxmy(BLOCK(B, i, k), &BLOCK(B, k, k + 1),
+          &BLOCK(B, i, k + 1), BLOCK_SIZE - k - 1);
+    }
+}
+
+/*
+ * block_lower_solve - Perform forward substitution to solve for B' in
+ * LB' = B. B is overwritten; only entries of L below the diagonal are read.
+ */
+static void block_lower_solve(Block B, Block L)
+{
+  int i, k;
+
+  /* Perform forward substitution: row i of B loses L(i,k) times row k of B for every k < i. */
+  for (i = 1; i < BLOCK_SIZE; i++)
+    for (k = 0; k < i; k++)
+      elem_daxmy(BLOCK(L, i, k), &BLOCK(B, k, 0),
+          &BLOCK(B, i, 0), BLOCK_SIZE);
+}
+
+/*
+ * block_upper_solve - Perform forward substitution to solve for B' in
+ * B'U = B. B is overwritten row by row.
+ */
+static void block_upper_solve(Block B, Block U)
+{
+  int i, k;
+
+  /* Perform forward substitution: divide B(i,k) by U(k,k), then subtract that multiple of the rest of U's row k from the rest of B's row i. */
+  for (i = 0; i < BLOCK_SIZE; i++)
+    for (k = 0; k < BLOCK_SIZE; k++) {
+      BLOCK(B, i, k) /= BLOCK(U, k, k);
+      elem_daxmy(BLOCK(B, i, k), &BLOCK(U, k, k + 1),
+          &BLOCK(B, i, k + 1), BLOCK_SIZE - k - 1);
+    }
+}
+
+/*
+ * block_schur - Compute Schur complement B' = B - AC; B is updated in place.
+ */
+static void block_schur(Block B, Block A, Block C)
+{
+  int i, k;
+
+  /* Compute Schur complement: row i of B loses A(i,k) times row k of C for every k. */
+  for (i = 0; i < BLOCK_SIZE; i++)
+    for (k = 0; k < BLOCK_SIZE; k++)
+      elem_daxmy(BLOCK(A, i, k), &BLOCK(C, k, 0),
+          &BLOCK(B, i, 0), BLOCK_SIZE);
+}
+
+
+/****************************************************************************\
+ * Divide-and-conquer matrix LU decomposition.
+ \****************************************************************************/
+
+/**
+ * schur - Compute M' = M - VW for nb x nb block matrices; M is updated in place.
+ */
+fibril static void schur(Matrix M, Matrix V, Matrix W, int nb)
+{
+  Matrix M00, M01, M10, M11;
+  Matrix V00, V01, V10, V11;
+  Matrix W00, W01, W10, W11;
+  int hnb;
+
+  /* Check base case. */
+  if (nb == 1) {
+    block_schur(*M, *V, *W);
+    return;
+  }
+
+  /* Break matrices into 4 pieces (quadrants are addressed through MATRIX(), i.e. with the global nBlocks stride). */
+  hnb = nb / 2;
+  M00 = &MATRIX(M, 0, 0);
+  M01 = &MATRIX(M, 0, hnb);
+  M10 = &MATRIX(M, hnb, 0);
+  M11 = &MATRIX(M, hnb, hnb);
+  V00 = &MATRIX(V, 0, 0);
+  V01 = &MATRIX(V, 0, hnb);
+  V10 = &MATRIX(V, hnb, 0);
+  V11 = &MATRIX(V, hnb, hnb);
+  W00 = &MATRIX(W, 0, 0);
+  W01 = &MATRIX(W, 0, hnb);
+  W10 = &MATRIX(W, hnb, 0);
+  W11 = &MATRIX(W, hnb, hnb);
+
+  /* Form Schur complement with recursive calls: first the products using V's left column and W's top row. */
+  fibril_t fr;
+  fibril_init(&fr);
+
+  fibril_fork(&fr, schur, (M00, V00, W00, hnb));
+  fibril_fork(&fr, schur, (M01, V00, W01, hnb));
+  fibril_fork(&fr, schur, (M10, V10, W00, hnb));
+  schur(M11, V10, W01, hnb);
+  fibril_join(&fr);
+  /* Second round updates the same four M quadrants, so it starts only after the join above. */
+  fibril_fork(&fr, schur, (M00, V01, W10, hnb));
+  fibril_fork(&fr, schur, (M01, V01, W11, hnb));
+  fibril_fork(&fr, schur, (M10, V11, W10, hnb));
+  schur(M11, V11, W11, hnb);
+  fibril_join(&fr);
+
+  return;
+}
+
+/*
+ * lower_solve - Compute M' where LM' = M. (Forward declaration; aux_lower_solve below recurses into it.)
+ */
+fibril static void lower_solve(Matrix M, Matrix L, int nb);
+/* aux_lower_solve - solve for the block pair (Ma, Mb), each nb x nb blocks, against the four quadrants of L. */
+static void aux_lower_solve(Matrix Ma, Matrix Mb, Matrix L, int nb)
+{
+  Matrix L00, L01, L10, L11;
+  (void) L01; /* L01 is computed below but never used */
+
+  /* Break L matrix into 4 pieces. */
+  L00 = &MATRIX(L, 0, 0);
+  L01 = &MATRIX(L, 0, nb);
+  L10 = &MATRIX(L, nb, 0);
+  L11 = &MATRIX(L, nb, nb);
+
+  /* Solve with recursive calls: Ma against L00, then Mb -= L10 * Ma, then Mb against L11. */
+  lower_solve(Ma, L00, nb);
+  schur(Mb, L10, Ma, nb);
+  lower_solve(Mb, L11, nb);
+}
+
+fibril static void lower_solve(Matrix M, Matrix L, int nb) /* overwrites M with M' where LM' = M */
+{
+  Matrix M00, M01, M10, M11;
+  int hnb;
+
+  /* Check base case. */
+  if (nb == 1) {
+    block_lower_solve(*M, *L);
+    return;
+  }
+
+  /* Break matrices into 4 pieces. */
+  hnb = nb / 2;
+  M00 = &MATRIX(M, 0, 0);
+  M01 = &MATRIX(M, 0, hnb);
+  M10 = &MATRIX(M, hnb, 0);
+  M11 = &MATRIX(M, hnb, hnb);
+
+  /* Solve with recursive calls: the left column pair (M00, M10) is forked, the right pair (M01, M11) runs here. */
+  fibril_t fr;
+  fibril_init(&fr);
+
+  fibril_fork(&fr, aux_lower_solve, (M00, M10, L, hnb));
+  aux_lower_solve(M01, M11, L, hnb);
+
+  fibril_join(&fr);
+
+  return;
+}
+
+/*
+ * upper_solve - Compute M' where M'U = M. (Forward declaration; aux_upper_solve below recurses into it.)
+ */
+fibril static void upper_solve(Matrix M, Matrix U, int nb);
+/* aux_upper_solve - solve for the block pair (Ma, Mb), each nb x nb blocks, against the four quadrants of U. */
+static void aux_upper_solve(Matrix Ma, Matrix Mb, Matrix U, int nb)
+{
+  Matrix U00, U01, U10, U11;
+  (void) U10; /* U10 is computed below but never used */
+
+  /* Break U matrix into 4 pieces. */
+  U00 = &MATRIX(U, 0, 0);
+  U01 = &MATRIX(U, 0, nb);
+  U10 = &MATRIX(U, nb, 0);
+  U11 = &MATRIX(U, nb, nb);
+
+  /* Solve with recursive calls: Ma against U00, then Mb -= Ma * U01, then Mb against U11. */
+  upper_solve(Ma, U00, nb);
+  schur(Mb, Ma, U01, nb);
+  upper_solve(Mb, U11, nb);
+
+  return;
+}
+
+fibril static void upper_solve(Matrix M, Matrix U, int nb) /* overwrites M with M' where M'U = M */
+{
+  Matrix M00, M01, M10, M11;
+  int hnb;
+
+  /* Check base case. */
+  if (nb == 1) {
+    block_upper_solve(*M, *U);
+    return;
+  }
+
+  /* Break matrices into 4 pieces. */
+  hnb = nb / 2;
+  M00 = &MATRIX(M, 0, 0);
+  M01 = &MATRIX(M, 0, hnb);
+  M10 = &MATRIX(M, hnb, 0);
+  M11 = &MATRIX(M, hnb, hnb);
+
+  /* Solve with recursive calls: the top row pair (M00, M01) is forked, the bottom pair (M10, M11) runs here. */
+  fibril_t fr;
+  fibril_init(&fr);
+
+  fibril_fork(&fr, aux_upper_solve, (M00, M01, U, hnb));
+  aux_upper_solve(M10, M11, U, hnb);
+
+  fibril_join(&fr);
+
+  return;
+}
+
+/*
+ * lu - Perform LU decomposition of the nb x nb block matrix M, in place.
+ */
+fibril void lu(Matrix M, int nb)
+{
+  Matrix M00, M01, M10, M11;
+  int hnb;
+
+  /* Check base case. */
+  if (nb == 1) {
+    block_lu(*M);
+    return;
+  }
+
+  /* Break matrix into 4 pieces. NOTE(review): repeated halving only covers every block when nb is a power of two -- confirm callers guarantee that. */
+  hnb = nb / 2;
+  M00 = &MATRIX(M, 0, 0);
+  M01 = &MATRIX(M, 0, hnb);
+  M10 = &MATRIX(M, hnb, 0);
+  M11 = &MATRIX(M, hnb, hnb);
+
+  /* Decompose upper left. */
+  lu(M00, hnb);
+
+  /* Solve for upper right and lower left; both read the factored M00 and write different quadrants. */
+  fibril_t fr;
+  fibril_init(&fr);
+
+  fibril_fork(&fr, lower_solve, (M01, M00, hnb));
+  upper_solve(M10, M00, hnb);
+
+  fibril_join(&fr);
+
+  /* Compute Schur complement of lower right: M11 -= M10 * M01. */
+  schur(M11, M10, M01, hnb);
+
+  /* Decompose lower right. */
+  lu(M11, hnb);
+
+  return;
+}
+
+void init() /* allocates and fills M; outside BENCHMARK builds also keeps a pristine copy in Msave */
+{
+  nBlocks = n / BLOCK_SIZE; /* integer division: a remainder of n is not covered by any block */
+  M = (Matrix) malloc(n * n * sizeof(double)); /* result is not checked */
+  init_matrix(M, nBlocks);
+  (void) Msave; /* keeps Msave referenced when BENCHMARK is defined */
+#ifndef BENCHMARK
+  Msave = (Matrix) malloc(n * n * sizeof(double));
+  memcpy((void *) Msave, (void *) M, n * n * sizeof(double));
+#endif
+
+}
+
+void prep() /* restores M from Msave; does nothing in BENCHMARK builds */
+{
+#ifndef BENCHMARK
+  memcpy((void *) M, (void *) Msave, n * n * sizeof(double));
+#endif
+}
+
+void test() /* factors the global matrix M in place */
+{
+  lu(M, nBlocks);
+}
+
+int verify() /* 0 = pass; BENCHMARK builds skip the check and always return 0 */
+{
+#ifndef BENCHMARK
+  return test_result(M, Msave, nBlocks); /* M holds the factors, Msave the original matrix */
+#else
+  return 0;
+#endif
+}
diff --git a/benchmarks/matmul.cpp b/benchmarks/matmul.cpp
new file mode 100644
index 00000000..74275679
--- /dev/null
+++ b/benchmarks/matmul.cpp
@@ -0,0 +1,142 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "test.h"
+
+int n = 2048;
+
+static float *  a;
+static float *  b;
+static float ** c;
+
+fibril static void compute(float *, int, int, float *, int, int, /* forward declaration: the computeXY helpers below call it */
+    float **, int, int, int);
+
+static void compute00(float * a, int ai, int aj, float * b, int bi, int bj, /* quadrant (ci, cj) of c: two size-n products, run one after the other */
+    float ** c, int ci, int cj, int n)
+{
+  compute(a, ai, aj,     b, bi,     bj, c, ci, cj, n);
+  compute(a, ai, aj + n, b, bi + n, bj, c, ci, cj, n);
+}
+
+static void compute01(float * a, int ai, int aj, float * b, int bi, int bj, /* quadrant (ci, cj + n) of c: two size-n products, run one after the other */
+    float ** c, int ci, int cj, int n)
+{
+  compute(a, ai, aj,     b, bi,     bj + n, c, ci, cj + n, n);
+  compute(a, ai, aj + n, b, bi + n, bj + n, c, ci, cj + n, n);
+}
+
+static void compute10(float * a, int ai, int aj, float * b, int bi, int bj, /* quadrant (ci + n, cj) of c: two size-n products, run one after the other */
+    float ** c, int ci, int cj, int n)
+{
+  compute(a, ai + n, aj,     b, bi,     bj, c, ci + n, cj, n);
+  compute(a, ai + n, aj + n, b, bi + n, bj, c, ci + n, cj, n);
+}
+
+static void compute11(float * a, int ai, int aj, float * b, int bi, int bj, /* quadrant (ci + n, cj + n) of c: two size-n products, run one after the other */
+    float ** c, int ci, int cj, int n)
+{
+  compute(a, ai + n, aj,     b, bi,     bj + n, c, ci + n, cj + n, n);
+  compute(a, ai + n, aj + n, b, bi + n, bj + n, c, ci + n, cj + n, n);
+}
+
+static void multiply(float * a, int ai, int aj, float * b, int bi, int bj, /* 2 x 2 base case: accumulates one 2 x 2 product into c at (ci, cj) */
+    float ** c, int ci, int cj)
+{
+  int a0 = ai;
+  int a1 = ai + 1;
+  /* Partial sums for the four output cells. */
+  float s00 = 0.0F;
+  float s01 = 0.0F;
+  float s10 = 0.0F;
+  float s11 = 0.0F;
+
+  int b0 = bi;
+  int b1 = bi + 1;
+  /* NOTE(review): a and b are indexed as row + column with no row stride; harmless while init() fills both with 1.0 -- confirm this is intended. */
+  s00 += a[a0 + aj] * b[b0 + bj];
+  s10 += a[a1 + aj] * b[b0 + bj];
+  s01 += a[a0 + aj] * b[b0 + bj + 1];
+  s11 += a[a1 + aj] * b[b0 + bj + 1];
+
+  s00 += a[a0 + aj + 1] * b[b1 + bj];
+  s10 += a[a1 + aj + 1] * b[b1 + bj];
+  s01 += a[a0 + aj + 1] * b[b1 + bj + 1];
+  s11 += a[a1 + aj + 1] * b[b1 + bj + 1];
+  /* c is accumulated into (+=), not overwritten. */
+  c[ci]    [cj]     += s00;
+  c[ci]    [cj + 1] += s01;
+  c[ci + 1][cj]     += s10;
+  c[ci + 1][cj + 1] += s11;
+}
+
+fibril static void compute(float * a, int ai, int aj, float * b, int bi, int bj, /* accumulate a size-n product into c at (ci, cj) */
+    float ** c, int ci, int cj, int n)
+{
+  if (n == 2) { /* base case; NOTE(review): only reached when n halves down to exactly 2 -- confirm n is a power of two >= 2 */
+    multiply(a, ai, aj, b, bi, bj, c, ci, cj);
+  } else {
+    int h = n / 2;
+    /* Each helper writes a different quadrant of c: three are forked, the fourth runs here. */
+    fibril_t fr;
+    fibril_init(&fr);
+
+    fibril_fork(&fr, compute00, (a, ai, aj, b, bi, bj, c, ci, cj, h));
+    fibril_fork(&fr, compute10, (a, ai, aj, b, bi, bj, c, ci, cj, h));
+    fibril_fork(&fr, compute01, (a, ai, aj, b, bi, bj, c, ci, cj, h));
+    compute11(a, ai, aj, b, bi, bj, c, ci, cj, h);
+
+    fibril_join(&fr);
+  }
+}
+
+void init() /* allocates a, b (flat n*n floats) and c (n rows of n floats) and sets every element of a and b to 1 */
+{
+  a = (float*) malloc(sizeof(float) * n * n);
+  b = (float*) malloc(sizeof(float) * n * n);
+  c = (float**) malloc(sizeof(float *) * n);
+  /* malloc results are not checked; c's rows are allocated here and zeroed in prep(). */
+  int i, j;
+  (void) j; /* j is unused in this function */
+  for (i = 0; i < n; ++i) {
+    c[i] = (float*) malloc(sizeof(float) * n);
+  }
+
+  for (i = 0; i < n * n; ++i) {
+    a[i] = 1.0F;
+  }
+
+  for (i = 0; i < n * n; ++i) {
+    b[i] = 1.0F;
+  }
+}
+
+void prep() /* zeroes c; needed before each run because multiply() accumulates into it */
+{
+  int i, j;
+
+  for (i = 0; i < n; ++i) {
+    for (j = 0; j < n; ++j) {
+      c[i][j] = 0;
+    }
+  }
+}
+
+void test() /* runs the full n x n product, accumulating into c */
+{
+  compute(a, 0, 0, b, 0, 0, c, 0, 0, n);
+}
+
+int verify() { /* returns 0 when every c[i][j] equals n, otherwise reports the first mismatch and returns 1 */
+  int i, j;
+  /* With a and b all ones, each output cell is a sum of n ones. */
+  for (i = 0; i < n; ++i) {
+    for (j = 0; j < n; j++) {
+      if (c[i][j] != n) {
+        printf("c[%d][%d]=%f (expected %f)\n", i, j, c[i][j], (float) n);
+        return 1;
+      }
+    }
+  }
+
+  return 0;
+}
diff --git a/benchmarks/nqueens.cpp b/benchmarks/nqueens.cpp
new file mode 100644
index 00000000..784c8702
--- /dev/null
+++ b/benchmarks/nqueens.cpp
@@ -0,0 +1,70 @@
+#include <stdio.h>
+#include "test.h"
+
+int n = 14;
+int m;
+
+fibril static int nqueens(const int * a, int n, int d, int i) /* counts completions when value i is appended at index d to the d placements in a */
+{
+  //int aa[d + 1];
+  int aa[16]; /* fixed size: indices up to n - 1 are written, so n must not exceed 16 */
+  int j;
+  /* Copy the existing placements while checking each one against candidate i. */
+  for (j = 0; j < d; ++j) {
+    aa[j] = a[j];
+
+    int diff = a[j] - i;
+    int dist = d - j;
+    /* Conflict: same value, or value difference equal to the index distance in either direction. */
+    if (diff == 0 || dist == diff || dist + diff == 0) return 0;
+  }
+  /* d == -1 is the root call from test(): nothing is stored and ++d moves to index 0. */
+  if (d >= 0) aa[d] = i;
+  if (++d == n) return 1;
+
+  //int res[n];
+  int res[16]; /* one result slot per forked child; same 16-entry limit as aa */
+  a = aa;
+
+  fibril_t fr;
+  fibril_init(&fr);
+  /* One forked call per candidate value for the next index; each writes its count into res[i]. */
+  for (i = 0; i < n; ++i) {
+    fibril_fork(&fr, &res[i], nqueens, (a, n, d, i));
+  }
+
+  fibril_join(&fr);
+  /* res[] is only read after the join. */
+  int sum = 0;
+
+  for (i = 0; i < n; ++i) {
+    sum += res[i];
+  }
+
+  return sum;
+}
+
+void init() {} /* benchmark hook: nothing to set up */
+void prep() {} /* benchmark hook: nothing to reset */
+
+void test() /* root call: no placements yet (a = NULL, d = -1); the count is stored in m */
+{
+  m = nqueens(NULL, n, -1, 0);
+}
+
+int verify() /* returns 0 when m equals the table entry for n, otherwise prints both and returns 1 */
+{
+  static int res[16] = { /* expected counts indexed by n - 1; only n in 1..16 is covered */
+    1, 0, 0, 2, 10, 4, 40, 92, 352, 724, 2680,
+    14200, 73712, 365596, 2279184, 14772512
+  };
+
+  int failed;
+
+  if ((failed = (m != res[n - 1]))) {
+    printf("nqueens(%d)=%d (expected %d)\n", n, m, res[n - 1]);
+  }
+
+  return failed;
+}
+
diff --git a/benchmarks/openmp/CMakeLists.txt b/benchmarks/openmp/CMakeLists.txt
new file mode 100644
index 00000000..4e5b97f0
--- /dev/null
+++ b/benchmarks/openmp/CMakeLists.txt
@@ -0,0 +1,28 @@
+
+add_definitions(-DFIBRIL_OPENMP) # presumably selects openmp.h as the fibril backend -- confirm in test.h
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") # enable OpenMP pragmas and link the OpenMP runtime
+
+# One <benchmark>_openmp executable per shared benchmark source in the parent directory.
+add_executable(cholesky_openmp ../cholesky.cpp)
+
+add_executable(fft_openmp ../fft.cpp)
+
+add_executable(fib_openmp ../fib.cpp)
+
+add_executable(heat_openmp ../heat.cpp)
+
+add_executable(integrate_openmp ../integrate.cpp)
+
+add_executable(knapsack_openmp ../knapsack.cpp)
+
+add_executable(lu_openmp ../lu.cpp)
+
+add_executable(matmul_openmp ../matmul.cpp)
+
+add_executable(nqueens_openmp ../nqueens.cpp)
+
+add_executable(quicksort_openmp ../quicksort.cpp)
+
+add_executable(rectmul_openmp ../rectmul.cpp)
+
+add_executable(strassen_openmp ../strassen.cpp)
diff --git a/benchmarks/openmp/openmp.h b/benchmarks/openmp/openmp.h
new file mode 100644
index 00000000..127a779b
--- /dev/null
+++ b/benchmarks/openmp/openmp.h
@@ -0,0 +1,101 @@
+#ifndef OPENMP_H
+#define OPENMP_H
+
+#include <omp.h>
+#include <thread>
+#include <functional>
+
+#define fibril /* function marker used by the benchmarks; expands to nothing in this backend */
+#define fibril_t __attribute__((unused)) int /* placeholder frame type; fibril_init ignores it and fibril_join never reads it */
+#define fibril_init(fp) /* expands to nothing: there is no per-frame state to set up */
+
+__attribute__((always_inline)) /* Join: issues an OpenMP taskwait; the frame pointer f is unused. */
+inline static void fibril_join(__attribute__((unused)) fibril_t *f) {
+#pragma omp taskwait
+}
+
+#if 0 /* Disabled variant: forks through a type-erased std::function; the template version in the #else branch is compiled instead. */
+__attribute__((always_inline))
+inline static void _omp_fork(std::function<void(void)> f) {
+#pragma omp task untied firstprivate(f)
+	{
+		f();
+	}
+}
+
+#define fibril_fork_nrt(fp, fn, ag) _omp_fork([=]{ fn ag; })
+
+#define fibril_fork_wrt(fp, rtp, fn, ag) do { \
+  __typeof__(rtp) pt = rtp; \
+  _omp_fork([=]{ *pt = fn ag; }); \
+} while (0)
+
+#else
+
+#define _fibril_expand(...) /* applied as `_fibril_expand (a, b)`: yields the bare argument list a, b */ \
+  _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
+#define _fibril_expand_(n, ...) /* _fibril_nth and _fibril_concat are not defined in this header; the includer must provide them */ \
+  _fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
+#define _fibril_expand_16(...) __VA_ARGS__
+#define _fibril_expand_15(...) __VA_ARGS__
+#define _fibril_expand_14(...) __VA_ARGS__
+#define _fibril_expand_13(...) __VA_ARGS__
+#define _fibril_expand_12(...) __VA_ARGS__
+#define _fibril_expand_11(...) __VA_ARGS__
+#define _fibril_expand_10(...) __VA_ARGS__
+#define _fibril_expand_9( ...) __VA_ARGS__
+#define _fibril_expand_8( ...) __VA_ARGS__
+#define _fibril_expand_7( ...) __VA_ARGS__
+#define _fibril_expand_6( ...) __VA_ARGS__
+#define _fibril_expand_5( ...) __VA_ARGS__
+#define _fibril_expand_4( ...) __VA_ARGS__
+#define _fibril_expand_3( ...) __VA_ARGS__
+#define _fibril_expand_2( ...) __VA_ARGS__
+#define _fibril_expand_1( ...) __VA_ARGS__
+#define _fibril_expand_0() /* empty argument list expands to nothing */
+
+template<class F, class ...As> /* Spawns f(as...) as an untied OpenMP task; the result, if any, is discarded. */
+__attribute__((always_inline))
+inline static void _omp_fork0(F f, As... as) {
+#pragma omp task untied
+	{ /* No default(shared): f and as are by-value locals that end with this call, so the deferred task needs its own (firstprivate) copies. */
+		f(as...);
+	}
+}
+
+template<class F, class R, class ...As> /* Spawns an untied OpenMP task that stores f(as...) through the pointer r. */
+__attribute__((always_inline))
+inline static void _omp_fork1(F f, R r, As... as) {
+#pragma omp task untied
+	{ /* No default(shared): f, r and as are by-value locals that end with this call, so the deferred task needs its own (firstprivate) copies. */
+		*r = f(as...);
+	}
+}
+
+#define fibril_fork_nrt(fp, fn, ag) _omp_fork0(fn, _fibril_expand ag) /* fork fn ag without a result; fp is ignored */
+#define fibril_fork_wrt(fp, rtp, fn, ag) _omp_fork1(fn, rtp, _fibril_expand ag) /* fork fn ag and store its result in *rtp; fp is ignored */
+#endif
+
+
+static int NTHREADS; /* thread count chosen by _omp_init(); 0 until it has run */
+int fibril_rt_nprocs() { int hw = (int) std::thread::hardware_concurrency(); return NTHREADS ? NTHREADS : (hw > 0 ? hw : 1); }
+
+__attribute__((always_inline))
+inline static void _omp_init(int n, std::function<void(void)> f) {
+	/* Runs f() inside a parallel region of n threads, or of all hardware threads when n <= 0 or n is not below that count. */
+	int nprocs = (int) std::thread::hardware_concurrency();
+	if (nprocs < 1) nprocs = 1; /* hardware_concurrency() returns 0 when the count is unknown; num_threads must be positive */
+	NTHREADS = (n > 0 && n < nprocs) ? n : nprocs;
+	/* The block below is the only section of the construct, so f() is invoked once
+	 * rather than once per thread. */
+#pragma omp parallel sections num_threads(NTHREADS) default(shared)
+	{
+		f();
+	}
+}
+
+#define fibril_rt_init(n) _omp_init(n, [&]() {
+/* The two macros bracket user code: everything between them becomes the body of the lambda passed to _omp_init(). */
+#define fibril_rt_exit() })
+
+#endif /* end of include guard: OPENMP_H */
diff --git a/benchmarks/quicksort.cpp b/benchmarks/quicksort.cpp
new file mode 100644
index 00000000..c4eb013f
--- /dev/null
+++ b/benchmarks/quicksort.cpp
@@ -0,0 +1,84 @@
+#include <math.h>
+#include <stdlib.h>
+#include "test.h"
+
+int n = 8; /* size exponent: init() allocates 10^n ints */
+static int * a, * b; /* a: array sorted by test(); b: random input that prep() copies into a */
+static size_t size; /* element count of a and b, computed in init() */
+
+fibril void quicksort(int * a, size_t n) /* Sorts a[0..n-1] in place, ascending; both parameters shadow the file-level a and n. */
+{
+  if (n < 2) return;
+
+  int pivot = a[n / 2];
+
+  int *left  = a;
+  int *right = a + n - 1;
+
+  while (left <= right) { /* partition: afterwards [a, right] holds values <= pivot and [left, a + n) values >= pivot */
+    if (*left < pivot) {
+      left++;
+    } else if (*right > pivot) {
+      right--;
+    } else {
+      int tmp = *left;
+      *left = *right;
+      *right = tmp;
+      left++;
+      right--;
+    }
+  }
+
+  fibril_t fr;
+  fibril_init(&fr);
+
+  fibril_fork(&fr, quicksort, (a, right - a + 1)); /* lower part [a, right] is handed to fibril_fork */
+  quicksort(left, a + n - left); /* upper part [left, a + n) is sorted by this call itself */
+
+  fibril_join(&fr);
+}
+
+int verify() /* Returns 0 if a[0..size-1] is in non-decreasing order, 1 at the first descent. */
+{
+  if (size < 2) return 0;
+
+  int prev = a[0];
+  size_t i;
+  for (i = 1; i < size; ++i) {
+    if (prev > a[i]) return 1;
+    prev = a[i];
+  }
+
+  return 0;
+}
+
+void init()
+{
+  size = 1;
+  /* size = 10^n elements; allocates a and b and fills b with rand() values (a is filled later by prep()). */
+  size_t i;
+  for (i = 0; i < (size_t) n; ++i) {
+    size *= 10;
+  }
+
+  a = (int*) malloc(sizeof(int) * size);
+  b = (int*) malloc(sizeof(int) * size);
+  if (a == NULL || b == NULL) abort(); /* the default n = 8 needs two 10^8-int buffers; stop instead of writing through NULL */
+  for (i = 0; i < size; ++i) {
+    b[i] = rand();
+  }
+}
+
+void prep() /* Copies the unsorted input b into the working array a, element by element. */
+{
+  size_t i;
+  for (i = 0; i < size; ++i) {
+    a[i] = b[i];
+  }
+}
+
+void test()
+{ /* Benchmark body: sorts all size elements of a in place. */
+  quicksort(a, size);
+}
+
diff --git a/benchmarks/rectmul.cpp b/benchmarks/rectmul.cpp
new file mode 100644
index 00000000..4bf5b05b
--- /dev/null
+++ b/benchmarks/rectmul.cpp
@@ -0,0 +1,365 @@
+/*
+ * Program to multiply two rectangular matrices A(n,m) * B(m,n), where
+ * (n < m) and (n mod 16 = 0) and (m mod n = 0). (Otherwise fill with 0s
+ * to fit the shape.)
+ *
+ * written by Harald Prokop (prokop@mit.edu) Fall 97.
+ */
+/*
+ * Copyright (c) 2003 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "test.h"
+
+#define BLOCK_EDGE 16 /* edge length of one block, in elements */
+#define BLOCK_SIZE (BLOCK_EDGE * BLOCK_EDGE) /* elements per block */
+
+typedef double block[BLOCK_SIZE]; /* one BLOCK_EDGE x BLOCK_EDGE tile stored as a flat array of doubles */
+
+#ifndef BENCHMARK
+int n = 512;
+#else
+int n = 4096;
+#endif
+
+static block * A, * B, * R; /* operand and result matrices as arrays of blocks, allocated in init() */
+static int x, y, z; /* matrix dimensions in blocks; init() sets all three to n / BLOCK_EDGE */
+
+/* Compute R = R + A*B, where R, A and B are BLOCK_EDGE x BLOCK_EDGE row-major blocks.
+ * Works on a 2x2 tile of R at a time with the inner product fully unrolled; the edge length 16 is hard-coded. */
+static void mult_add_block(block * A, block * B, block * R)
+{
+  int i, j;
+
+  for (j = 0; j < 16; j += 2) {	/* 2 columns at a time */
+    double *bp = &((double *) B)[j];
+    for (i = 0; i < 16; i += 2) {		/* 2 rows at a time */
+      double *ap = &((double *) A)[i * 16];
+      double *rp = &((double *) R)[j + i * 16];
+	  // sR_C accumulates the element at row i + R, column j + C of the result.
+	  // (These accumulators were once declared `register`.)
+      double s0_0, s0_1;
+      double s1_0, s1_1;
+      s0_0 = rp[0];
+      s0_1 = rp[1];
+      s1_0 = rp[16];
+      s1_1 = rp[17];
+      s0_0 += ap[0] * bp[0];
+      s0_1 += ap[0] * bp[1];
+      s1_0 += ap[16] * bp[0];
+      s1_1 += ap[16] * bp[1];
+      s0_0 += ap[1] * bp[16];
+      s0_1 += ap[1] * bp[17];
+      s1_0 += ap[17] * bp[16];
+      s1_1 += ap[17] * bp[17];
+      s0_0 += ap[2] * bp[32];
+      s0_1 += ap[2] * bp[33];
+      s1_0 += ap[18] * bp[32];
+      s1_1 += ap[18] * bp[33];
+      s0_0 += ap[3] * bp[48];
+      s0_1 += ap[3] * bp[49];
+      s1_0 += ap[19] * bp[48];
+      s1_1 += ap[19] * bp[49];
+      s0_0 += ap[4] * bp[64];
+      s0_1 += ap[4] * bp[65];
+      s1_0 += ap[20] * bp[64];
+      s1_1 += ap[20] * bp[65];
+      s0_0 += ap[5] * bp[80];
+      s0_1 += ap[5] * bp[81];
+      s1_0 += ap[21] * bp[80];
+      s1_1 += ap[21] * bp[81];
+      s0_0 += ap[6] * bp[96];
+      s0_1 += ap[6] * bp[97];
+      s1_0 += ap[22] * bp[96];
+      s1_1 += ap[22] * bp[97];
+      s0_0 += ap[7] * bp[112];
+      s0_1 += ap[7] * bp[113];
+      s1_0 += ap[23] * bp[112];
+      s1_1 += ap[23] * bp[113];
+      s0_0 += ap[8] * bp[128];
+      s0_1 += ap[8] * bp[129];
+      s1_0 += ap[24] * bp[128];
+      s1_1 += ap[24] * bp[129];
+      s0_0 += ap[9] * bp[144];
+      s0_1 += ap[9] * bp[145];
+      s1_0 += ap[25] * bp[144];
+      s1_1 += ap[25] * bp[145];
+      s0_0 += ap[10] * bp[160];
+      s0_1 += ap[10] * bp[161];
+      s1_0 += ap[26] * bp[160];
+      s1_1 += ap[26] * bp[161];
+      s0_0 += ap[11] * bp[176];
+      s0_1 += ap[11] * bp[177];
+      s1_0 += ap[27] * bp[176];
+      s1_1 += ap[27] * bp[177];
+      s0_0 += ap[12] * bp[192];
+      s0_1 += ap[12] * bp[193];
+      s1_0 += ap[28] * bp[192];
+      s1_1 += ap[28] * bp[193];
+      s0_0 += ap[13] * bp[208];
+      s0_1 += ap[13] * bp[209];
+      s1_0 += ap[29] * bp[208];
+      s1_1 += ap[29] * bp[209];
+      s0_0 += ap[14] * bp[224];
+      s0_1 += ap[14] * bp[225];
+      s1_0 += ap[30] * bp[224];
+      s1_1 += ap[30] * bp[225];
+      s0_0 += ap[15] * bp[240];
+      s0_1 += ap[15] * bp[241];
+      s1_0 += ap[31] * bp[240];
+      s1_1 += ap[31] * bp[241];
+      rp[0] = s0_0;
+      rp[1] = s0_1;
+      rp[16] = s1_0;
+      rp[17] = s1_1;
+    }
+  }
+}
+
+
+/* Compute R = A*B, where R, A and B are BLOCK_EDGE x BLOCK_EDGE row-major blocks.
+ * Same unrolled 2x2 tiling as mult_add_block, but the old contents of R are never read. */
+static void multiply_block(block * A, block * B, block * R)
+{
+  int i, j;
+
+  for (j = 0; j < 16; j += 2) {	/* 2 columns at a time */
+    double *bp = &((double *) B)[j];
+    for (i = 0; i < 16; i += 2) {		/* 2 rows at a time */
+      double *ap = &((double *) A)[i * 16];
+      double *rp = &((double *) R)[j + i * 16];
+	  // sR_C accumulates the element at row i + R, column j + C of the result.
+	  // (These accumulators were once declared `register`.)
+      double s0_0, s0_1;
+      double s1_0, s1_1;
+      s0_0 = ap[0] * bp[0];
+      s0_1 = ap[0] * bp[1];
+      s1_0 = ap[16] * bp[0];
+      s1_1 = ap[16] * bp[1];
+      s0_0 += ap[1] * bp[16];
+      s0_1 += ap[1] * bp[17];
+      s1_0 += ap[17] * bp[16];
+      s1_1 += ap[17] * bp[17];
+      s0_0 += ap[2] * bp[32];
+      s0_1 += ap[2] * bp[33];
+      s1_0 += ap[18] * bp[32];
+      s1_1 += ap[18] * bp[33];
+      s0_0 += ap[3] * bp[48];
+      s0_1 += ap[3] * bp[49];
+      s1_0 += ap[19] * bp[48];
+      s1_1 += ap[19] * bp[49];
+      s0_0 += ap[4] * bp[64];
+      s0_1 += ap[4] * bp[65];
+      s1_0 += ap[20] * bp[64];
+      s1_1 += ap[20] * bp[65];
+      s0_0 += ap[5] * bp[80];
+      s0_1 += ap[5] * bp[81];
+      s1_0 += ap[21] * bp[80];
+      s1_1 += ap[21] * bp[81];
+      s0_0 += ap[6] * bp[96];
+      s0_1 += ap[6] * bp[97];
+      s1_0 += ap[22] * bp[96];
+      s1_1 += ap[22] * bp[97];
+      s0_0 += ap[7] * bp[112];
+      s0_1 += ap[7] * bp[113];
+      s1_0 += ap[23] * bp[112];
+      s1_1 += ap[23] * bp[113];
+      s0_0 += ap[8] * bp[128];
+      s0_1 += ap[8] * bp[129];
+      s1_0 += ap[24] * bp[128];
+      s1_1 += ap[24] * bp[129];
+      s0_0 += ap[9] * bp[144];
+      s0_1 += ap[9] * bp[145];
+      s1_0 += ap[25] * bp[144];
+      s1_1 += ap[25] * bp[145];
+      s0_0 += ap[10] * bp[160];
+      s0_1 += ap[10] * bp[161];
+      s1_0 += ap[26] * bp[160];
+      s1_1 += ap[26] * bp[161];
+      s0_0 += ap[11] * bp[176];
+      s0_1 += ap[11] * bp[177];
+      s1_0 += ap[27] * bp[176];
+      s1_1 += ap[27] * bp[177];
+      s0_0 += ap[12] * bp[192];
+      s0_1 += ap[12] * bp[193];
+      s1_0 += ap[28] * bp[192];
+      s1_1 += ap[28] * bp[193];
+      s0_0 += ap[13] * bp[208];
+      s0_1 += ap[13] * bp[209];
+      s1_0 += ap[29] * bp[208];
+      s1_1 += ap[29] * bp[209];
+      s0_0 += ap[14] * bp[224];
+      s0_1 += ap[14] * bp[225];
+      s1_0 += ap[30] * bp[224];
+      s1_1 += ap[30] * bp[225];
+      s0_0 += ap[15] * bp[240];
+      s0_1 += ap[15] * bp[241];
+      s1_0 += ap[31] * bp[240];
+      s1_1 += ap[31] * bp[241];
+      rp[0] = s0_0;
+      rp[1] = s0_1;
+      rp[16] = s1_0;
+      rp[17] = s1_1;
+    }
+  }
+}
+
+
+int check_matrix(block * R, long x, long y, long o, double v)
+{
+  int a, b;
+
+  if (x * y == 1) {
+    /**
+     * Base case, a single block: returns 1 as soon as one of its
+     * BLOCK_SIZE elements differs from v, 0 if all of them equal v.
+     */
+    int i;
+    for (i = 0; i < BLOCK_SIZE; i++)
+      if (((double *) R)[i] != v)
+        return 1;
+
+    return 0;
+  }
+
+  if (x>y) { /* halve the larger dimension; o is the row stride of R in blocks */
+    a = check_matrix(R, x / 2, y, o, v);
+    b = check_matrix(R + (x / 2) * o,(x + 1) / 2, y, o, v);
+  } else {
+    a = check_matrix(R, x, y / 2, o, v);
+    b = check_matrix(R + (y / 2), x, (y + 1) / 2, o, v);
+  }
+
+  return a + b; /* number of blocks containing a mismatch; 0 means the x-by-y region equals v everywhere */
+}
+
+/* Add matrix T into matrix R (R += T). Both are x-by-y blocks in size;
+ * ot and oR are the row strides, in blocks, of T and R respectively.
+ */
+fibril void add_matrix(block * T, long ot, block * R, long oR, long x, long y)
+{
+  if (x + y == 2) { /* single block: add its BLOCK_SIZE elements, four per iteration */
+    long i;
+    for (i = 0; i < BLOCK_SIZE; i += 4) {
+      ((double *) R)[i + 0] += ((double *) T)[i + 0];
+      ((double *) R)[i + 1] += ((double *) T)[i + 1];
+      ((double *) R)[i + 2] += ((double *) T)[i + 2];
+      ((double *) R)[i + 3] += ((double *) T)[i + 3];
+    }
+    return;
+  }
+
+  fibril_t fr;
+  fibril_init(&fr);
+
+  if (x > y) { /* halve the larger dimension; one half goes to fibril_fork, the other runs in this call */
+    fibril_fork(&fr, add_matrix, (T, ot, R, oR, x/2, y));
+    add_matrix(T+(x/2)*ot, ot, R+(x/2)*oR, oR, (x+1)/2, y);
+  } else {
+    fibril_fork(&fr, add_matrix, (T, ot, R, oR, x, y/2));
+    add_matrix(T+(y/2), ot, R+(y/2), oR, x, (y+1)/2);
+  }
+
+  fibril_join(&fr);
+}
+
+void init_matrix(block * R, long x, long y, long o, double v) /* Sets every element of the x-by-y block matrix R (row stride o blocks) to v. */
+{
+  if (x + y ==2) { /* single block: fill its BLOCK_SIZE elements */
+    int i;
+    for (i = 0; i < BLOCK_SIZE; i++)
+      ((double *) R)[i] = v;
+    return;
+  }
+
+  if (x > y) { /* halve the larger dimension and recurse on both halves */
+    init_matrix(R, x/2, y, o, v);
+    init_matrix(R+(x/2) * o, (x+1)/2, y, o, v);
+  } else {
+    init_matrix(R, x, y/2, o, v);
+    init_matrix(R+(y/2), x, (y+1)/2, o, v);
+  }
+}
+
+fibril static void multiply_matrix(block * A, long oa, block * B, long ob,
+    long x, long y, long z, block * R, long oR, int add)
+{ /* R (x-by-z blocks) = A (x-by-y) * B (y-by-z), or R += A*B when add != 0; oa, ob and oR are row strides in blocks. */
+  if (x + y + z == 3) {
+    if (add)
+      return mult_add_block(A, B, R);
+    else
+      return multiply_block(A, B, R);
+  }
+  /* Otherwise split the largest of x, y, z in two; one half is handed to fibril_fork, the other runs here. */
+  fibril_t fr;
+  fibril_init(&fr);
+
+  if (x >= y && x >= z) {
+    fibril_fork(&fr, multiply_matrix, (A, oa, B, ob, x/2, y, z, R, oR, add));
+    multiply_matrix(A+(x/2)*oa, oa, B, ob, (x+1)/2, y, z, R+(x/2)*oR, oR, add);
+    fibril_join(&fr);
+  } else if (y > x && y > z) {
+    fibril_fork(&fr, multiply_matrix,
+        (A+(y/2), oa, B+(y/2)*ob, ob, x, (y+1)/2, z, R, oR, add));
+    block * tmp = (block*) malloc(x * z * sizeof(block));
+    if (tmp == NULL) { perror("malloc"); abort(); } /* never write the partial product through NULL */
+    multiply_matrix(A, oa, B, ob, x, y/2, z, tmp, z, 0);
+    fibril_join(&fr);
+    /* Both halves of the inner dimension contribute to the same R, so the second partial product is added afterwards. */
+    add_matrix(tmp, z, R, oR, x, z);
+    free(tmp);
+  } else {
+    fibril_fork(&fr, multiply_matrix, (A, oa, B, ob, x, y, z/2, R, oR, add));
+    multiply_matrix(A, oa, B+(z/2), ob, x, y, (z+1)/2, R+(z/2), oR, add);
+    fibril_join(&fr);
+  }
+}
+
+void init() {
+  x = n / BLOCK_EDGE;
+  y = n / BLOCK_EDGE;
+  z = n / BLOCK_EDGE;
+  /* Allocate A (x-by-y), B (y-by-z) and R (x-by-z) in blocks; A and B are filled with 1.0, R is reset by prep(). */
+  A = (block*) malloc(x * y * sizeof(block));
+  B = (block*) malloc(y * z * sizeof(block));
+  R = (block*) malloc(x * z * sizeof(block));
+  if (A == NULL || B == NULL || R == NULL) { perror("malloc"); abort(); } /* init_matrix would write through NULL */
+  init_matrix(A, x, y, y, 1.0);
+  init_matrix(B, y, z, z, 1.0);
+}
+
+void prep() { /* Zero the x-by-z result matrix R. */
+  init_matrix(R, x, z, z, 0.0);
+}
+
+void test() { /* Benchmark body: R = A * B (final argument add = 0 overwrites R). */
+  multiply_matrix(A, y, B, z, x, y, z, R, z, 0);
+}
+
+int verify() { /* Returns 1 and prints a message if R is wrong; the check is compiled out when BENCHMARK is defined. */
+#ifndef BENCHMARK
+  if (check_matrix(R, x, z, z, y * 16)) { /* A and B are all ones, so each result element is the inner dimension y * 16 */
+    printf("WRONG RESULT!\n");
+    return 1;
+  };
+#endif
+
+  return 0;
+}
diff --git a/benchmarks/serial/CMakeLists.txt b/benchmarks/serial/CMakeLists.txt
new file mode 100644
index 00000000..88ca85c3
--- /dev/null
+++ b/benchmarks/serial/CMakeLists.txt
@@ -0,0 +1,27 @@
+
+add_definitions(-DFIBRIL_SERIAL) # presumably selects serial.h as the fibril backend -- confirm in test.h
+
+# One <benchmark>_serial executable per shared benchmark source in the parent directory.
+add_executable(cholesky_serial ../cholesky.cpp)
+
+add_executable(fft_serial ../fft.cpp)
+
+add_executable(fib_serial ../fib.cpp)
+
+add_executable(heat_serial ../heat.cpp)
+
+add_executable(integrate_serial ../integrate.cpp)
+
+add_executable(knapsack_serial ../knapsack.cpp)
+
+add_executable(lu_serial ../lu.cpp)
+
+add_executable(matmul_serial ../matmul.cpp)
+
+add_executable(nqueens_serial ../nqueens.cpp)
+
+add_executable(quicksort_serial ../quicksort.cpp)
+
+add_executable(rectmul_serial ../rectmul.cpp)
+
+add_executable(strassen_serial ../strassen.cpp)
diff --git a/benchmarks/serial/serial.h b/benchmarks/serial/serial.h
new file mode 100644
index 00000000..482577e4
--- /dev/null
+++ b/benchmarks/serial/serial.h
@@ -0,0 +1,18 @@
+#ifndef SERIAL_H
+#define SERIAL_H
+
+/* Serial backend of the fibril API: a fork is an ordinary call made on the spot,
+ * init/join/exit expand to nothing, and one processor is reported. */
+#define fibril
+#define fibril_t __attribute__((unused)) int
+#define fibril_init(fp)
+#define fibril_join(fp)
+
+#define fibril_fork_nrt(fp, fn, ag) (fn ag)
+#define fibril_fork_wrt(fp, rtp, fn, ag) (*(rtp) = fn ag) /* rtp is parenthesized so any pointer expression works */
+
+#define fibril_rt_init(n) ((void) (n))
+#define fibril_rt_exit()
+#define fibril_rt_nprocs(n) (1)
+
+#endif /* end of include guard: SERIAL_H */
diff --git a/benchmarks/strassen.cpp b/benchmarks/strassen.cpp
new file mode 100644
index 00000000..a49c66ad
--- /dev/null
+++ b/benchmarks/strassen.cpp
@@ -0,0 +1,644 @@
+/*
+ * Copyright (c) 1996 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to use, copy, modify, and distribute the Software without
+ * restriction, provided the Software, including any modified copies made
+ * under this license, is not distributed for a fee, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE MASSACHUSETTS INSTITUTE OF TECHNOLOGY BE LIABLE
+ * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name of the Massachusetts
+ * Institute of Technology shall not be used in advertising or otherwise
+ * to promote the sale, use or other dealings in this Software without
+ * prior written authorization from the Massachusetts Institute of
+ * Technology.
+ *
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "test.h"
+
+#define SizeAtWhichDivideAndConquerIsMoreEfficient 64 /* at or below this size OptimizedStrassenMultiply calls MultiplyByDivideAndConquer */
+#define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16 /* quadrants no larger than this are handled by the FastNaive kernels */
+#define CacheBlockSizeInBytes 32
+
+/* The real numbers we are using --- either double or float */
+typedef double REAL;
+typedef unsigned long PTR; /* integer type for byte-wise pointer arithmetic; assumes unsigned long can hold a pointer */
+
+/* maximum tolerable relative error (for the checking routine) */
+#define EPSILON (1.0E-6)
+
+/*
+ * Matrices are stored in row-major order; A is a pointer to
+ * the first element of the matrix, and an is the number of elements
+ * between two rows. This macro produces the element A[i,j]
+ * given A, an, i and j
+ */
+#define ELEM(A, an, i, j) (A[(i) * (an) + (j)])
+
+#ifndef BENCHMARK
+int n = 512;
+#else
+int n = 4096;
+#endif
+
+static REAL * A, * B, * C; /* file-level matrix pointers; presumably operands A, B and result C -- set up outside this excerpt */
+
+/*
+ * Naive sequential algorithm, for comparison purposes: C = A x B for n x n row-major matrices with row strides an, bn, cn
+ */
+void matrixmul(int n, REAL * A, int an, REAL * B, int bn, REAL * C, int cn)
+{
+  int i, j, k;
+  REAL s;
+
+  for (i = 0; i < n; ++i)
+    for (j = 0; j < n; ++j) {
+      s = 0.0;
+      for (k = 0; k < n; ++k)
+        s += ELEM(A, an, i, k) * ELEM(B, bn, k, j);
+
+      ELEM(C, cn, i, j) = s;
+    }
+}
+
+/*****************************************************************************
+ **
+ ** FastNaiveMatrixMultiply
+ **
+ ** For small to medium sized matrices A, B, and C of size
+ ** MatrixSize * MatrixSize this function performs the operation
+ ** C = A x B efficiently.
+ **
+ ** Note MatrixSize must be divisible by 8.
+ **
+ ** INPUT:
+ **    C = (*C WRITE) Address of top left element of matrix C.
+ **    A = (*A IS READ ONLY) Address of top left element of matrix A.
+ **    B = (*B IS READ ONLY) Address of top left element of matrix B.
+ **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
+ **    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
+ **    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
+ **    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
+ **
+ ** OUTPUT:
+ **    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
+ **
+ *****************************************************************************/
+static void FastNaiveMatrixMultiply(
+    REAL * C, REAL * A, REAL * B, unsigned MatrixSize,
+    unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
+{
+  /* Assumes size of real is 8 bytes */
+  PTR RowWidthBInBytes = RowWidthB  << 3;
+  PTR RowWidthAInBytes = RowWidthA << 3;
+  PTR MatrixWidthInBytes = MatrixSize << 3;
+  PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; /* bytes from the end of one C row to the start of the next */
+  unsigned Horizontal, Vertical;
+
+  REAL *ARowStart = A;
+  for (Vertical = 0; Vertical < MatrixSize; Vertical++) { /* one row of A and C per iteration */
+    for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { /* eight adjacent columns of C per iteration */
+      REAL *BColumnStart = B + Horizontal;
+      REAL FirstARowValue = *ARowStart++;
+
+      REAL Sum0 = FirstARowValue * (*BColumnStart);
+      REAL Sum1 = FirstARowValue * (*(BColumnStart+1));
+      REAL Sum2 = FirstARowValue * (*(BColumnStart+2));
+      REAL Sum3 = FirstARowValue * (*(BColumnStart+3));
+      REAL Sum4 = FirstARowValue * (*(BColumnStart+4));
+      REAL Sum5 = FirstARowValue * (*(BColumnStart+5));
+      REAL Sum6 = FirstARowValue * (*(BColumnStart+6));
+      REAL Sum7 = FirstARowValue * (*(BColumnStart+7));
+
+      unsigned Products;
+      for (Products = 1; Products < MatrixSize; Products++) { /* starts at 1: the first product initialized the sums above */
+        REAL ARowValue = *ARowStart++;
+        BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); /* move down one row of B */
+
+        Sum0 += ARowValue * (*BColumnStart);
+        Sum1 += ARowValue * (*(BColumnStart+1));
+        Sum2 += ARowValue * (*(BColumnStart+2));
+        Sum3 += ARowValue * (*(BColumnStart+3));
+        Sum4 += ARowValue * (*(BColumnStart+4));
+        Sum5 += ARowValue * (*(BColumnStart+5));
+        Sum6 += ARowValue * (*(BColumnStart+6));
+        Sum7 += ARowValue * (*(BColumnStart+7));
+      }
+      ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); /* rewind to the start of the current A row */
+
+      *(C) = Sum0;
+      *(C+1) = Sum1;
+      *(C+2) = Sum2;
+      *(C+3) = Sum3;
+      *(C+4) = Sum4;
+      *(C+5) = Sum5;
+      *(C+6) = Sum6;
+      *(C+7) = Sum7;
+      C+=8;
+    }
+
+    ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); /* advance to the next row of A */
+    C = (REAL*) ( ((PTR) C) + RowIncrementC ); /* skip to the start of the next row of C */
+  }
+}
+
+/*****************************************************************************
+ **
+ ** FastAdditiveNaiveMatrixMultiply
+ **
+ ** For small to medium sized matrices A, B, and C of size
+ ** MatrixSize * MatrixSize this function performs the operation
+ ** C += A x B efficiently.
+ **
+ ** Note MatrixSize must be divisible by 8.
+ **
+ ** INPUT:
+ **    C = (*C READ/WRITE) Address of top left element of matrix C.
+ **    A = (*A IS READ ONLY) Address of top left element of matrix A.
+ **    B = (*B IS READ ONLY) Address of top left element of matrix B.
+ **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
+ **    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
+ **    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
+ **    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
+ **
+ ** OUTPUT:
+ **    C = (*C READ/WRITE) Matrix C contains C + A x B.
+ **
+ *****************************************************************************/
+static void FastAdditiveNaiveMatrixMultiply(
+    REAL * C, REAL * A, REAL * B, unsigned MatrixSize,
+    unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
+{
+  /* Assumes size of real is 8 bytes */
+  PTR RowWidthBInBytes = RowWidthB  << 3;
+  PTR RowWidthAInBytes = RowWidthA << 3;
+  PTR MatrixWidthInBytes = MatrixSize << 3;
+  PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; /* bytes from the end of one C row to the start of the next */
+  unsigned Horizontal, Vertical;
+
+  REAL *ARowStart = A;
+  for (Vertical = 0; Vertical < MatrixSize; Vertical++) { /* one row of A and C per iteration */
+    for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { /* eight adjacent columns of C per iteration */
+      REAL *BColumnStart = B + Horizontal;
+
+      REAL Sum0 = *C; /* the sums start from the existing contents of C (C += A x B) */
+      REAL Sum1 = *(C+1);
+      REAL Sum2 = *(C+2);
+      REAL Sum3 = *(C+3);
+      REAL Sum4 = *(C+4);
+      REAL Sum5 = *(C+5);
+      REAL Sum6 = *(C+6);
+      REAL Sum7 = *(C+7);
+
+      unsigned Products;
+      for (Products = 0; Products < MatrixSize; Products++) {
+        REAL ARowValue = *ARowStart++;
+
+        Sum0 += ARowValue * (*BColumnStart);
+        Sum1 += ARowValue * (*(BColumnStart+1));
+        Sum2 += ARowValue * (*(BColumnStart+2));
+        Sum3 += ARowValue * (*(BColumnStart+3));
+        Sum4 += ARowValue * (*(BColumnStart+4));
+        Sum5 += ARowValue * (*(BColumnStart+5));
+        Sum6 += ARowValue * (*(BColumnStart+6));
+        Sum7 += ARowValue * (*(BColumnStart+7));
+
+        BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); /* move down one row of B */
+
+      }
+      ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); /* rewind to the start of the current A row */
+
+      *(C) = Sum0;
+      *(C+1) = Sum1;
+      *(C+2) = Sum2;
+      *(C+3) = Sum3;
+      *(C+4) = Sum4;
+      *(C+5) = Sum5;
+      *(C+6) = Sum6;
+      *(C+7) = Sum7;
+      C+=8;
+    }
+
+    ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); /* advance to the next row of A */
+    C = (REAL*) ( ((PTR) C) + RowIncrementC ); /* skip to the start of the next row of C */
+  }
+}
+
+
+/*****************************************************************************
+ **
+ ** MultiplyByDivideAndConquer
+ **
+ ** For medium to medium-large (would you like fries with that) sized
+ ** matrices A, B, and C of size MatrixSize * MatrixSize this function
+ ** efficiently performs the operation
+ **    C  = A x B (if AdditiveMode == 0)
+ **    C += A x B (if AdditiveMode != 0)
+ **
+ ** Note MatrixSize must be divisible by 16.
+ **
+ ** INPUT:
+ **    C = (*C READ/WRITE) Address of top left element of matrix C.
+ **    A = (*A IS READ ONLY) Address of top left element of matrix A.
+ **    B = (*B IS READ ONLY) Address of top left element of matrix B.
+ **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
+ **    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
+ **    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
+ **    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
+ **    AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B
+ **
+ ** OUTPUT:
+ **    C (+)= A x B. (+ if AdditiveMode != 0)
+ **
+ *****************************************************************************/
+void MultiplyByDivideAndConquer(
+    REAL * C, REAL * A, REAL * B, unsigned MatrixSize,
+    unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB,
+    int AdditiveMode)
+{
+#define A00 A
+#define B00 B
+#define C00 C
+  /* The top-left quadrants alias the matrices themselves; X01 is top-right, X10 bottom-left, X11 bottom-right. */
+  REAL  *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11;
+  unsigned QuadrantSize = MatrixSize >> 1;
+
+  /* partition the matrix */
+  A01 = A00 + QuadrantSize;
+  A10 = A00 + RowWidthA * QuadrantSize;
+  A11 = A10 + QuadrantSize;
+
+  B01 = B00 + QuadrantSize;
+  B10 = B00 + RowWidthB * QuadrantSize;
+  B11 = B10 + QuadrantSize;
+
+  C01 = C00 + QuadrantSize;
+  C10 = C00 + RowWidthC * QuadrantSize;
+  C11 = C10 + QuadrantSize;
+  /* Each C quadrant is the sum of two products: the first uses AdditiveMode, the second always accumulates. */
+  if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) {
+    MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB, AdditiveMode);
+    MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB, AdditiveMode);
+    MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB, AdditiveMode);
+    MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB, AdditiveMode);
+    MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB, 1);
+    MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB, 1);
+    MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB, 1);
+    MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB, 1);
+  } else {
+    if (AdditiveMode) {
+      FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
+          RowWidthC, RowWidthA, RowWidthB);
+      FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
+          RowWidthC, RowWidthA, RowWidthB);
+      FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
+          RowWidthC, RowWidthA, RowWidthB);
+      FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
+          RowWidthC, RowWidthA, RowWidthB);
+    } else {
+      FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
+          RowWidthC, RowWidthA, RowWidthB);
+
+      FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
+          RowWidthC, RowWidthA, RowWidthB);
+
+      FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
+          RowWidthC, RowWidthA, RowWidthB);
+
+      FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
+          RowWidthC, RowWidthA, RowWidthB);
+    }
+    /* Second product of every quadrant: always accumulated into what the first product left in C. */
+    FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB);
+    FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB);
+    FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB);
+    FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize,
+        RowWidthC, RowWidthA, RowWidthB);
+  }
+
+  return;
+}
+
+
+/*****************************************************************************
+ **
+ ** OptimizedStrassenMultiply
+ **
+ ** For large matrices A, B, and C of size MatrixSize * MatrixSize this
+ ** function performs the operation C = A x B efficiently.
+ **
+ ** INPUT:
+ **    C = (*C WRITE) Address of top left element of matrix C.
+ **    A = (*A IS READ ONLY) Address of top left element of matrix A.
+ **    B = (*B IS READ ONLY) Address of top left element of matrix B.
+ **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
+ **    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
+ **    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
+ **    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
+ ** OUTPUT:
+ **    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
+ **
+ *****************************************************************************/
+fibril static void OptimizedStrassenMultiply(
+    REAL * C, REAL * A, REAL * B, unsigned MatrixSize,
+    unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
+{
+  unsigned QuadrantSize = MatrixSize >> 1; /* MatrixSize / 2 */
+  unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize *
+    QuadrantSize + 32;
+  unsigned Column, Row;
+
+  /************************************************************************
+   ** For each matrix A, B, and C, we'll want pointers to each quadrant
+   ** in the matrix. These quadrants will be addressed as follows:
+   **  --        --
+   **  | A11  A12 |
+   **  |          |
+   **  | A21  A22 |
+   **  --        --
+   ************************************************************************/
+  REAL /**A11, *B11, *C11,*/ *A12, *B12, *C12,
+       *A21, *B21, *C21, *A22, *B22, *C22;
+
+  REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
+#define NumberOfVariables 11
+
+  PTR TempMatrixOffset = 0;
+  PTR MatrixOffsetA = 0;
+  PTR MatrixOffsetB = 0;
+
+  char *Heap;
+  void *StartHeap;
+
+  /* Distance in bytes between the end of a matrix row and the start of the next row (NOTE(review): "<< 3" presumably assumes sizeof(REAL) == 8 - confirm) */
+  PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3;
+  PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3;
+  PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3;
+
+  if (MatrixSize <= SizeAtWhichDivideAndConquerIsMoreEfficient) {
+    MultiplyByDivideAndConquer(C, A, B, MatrixSize,
+        RowWidthC, RowWidthA, RowWidthB, 0);
+    return;
+  }
+
+  /* Initialize quadrant matrices */
+#define A11 A
+#define B11 B
+#define C11 C
+  A12 = A11 + QuadrantSize;
+  B12 = B11 + QuadrantSize;
+  C12 = C11 + QuadrantSize;
+  A21 = A + (RowWidthA * QuadrantSize);
+  B21 = B + (RowWidthB * QuadrantSize);
+  C21 = C + (RowWidthC * QuadrantSize);
+  A22 = A21 + QuadrantSize;
+  B22 = B21 + QuadrantSize;
+  C22 = C21 + QuadrantSize;
+
+  /* Allocate heap space for the NumberOfVariables temporaries (NOTE(review): the malloc result is not checked) */
+  StartHeap = Heap = (char*) malloc(QuadrantSizeInBytes * NumberOfVariables);
+  /* ensure that heap is on cache boundary */
+  if ( ((PTR) Heap) & 31)
+    Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) );
+
+  /* Distribute the heap space over the variables */
+  S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+
+  /***************************************************************************
+   ** Step through all columns row by row (vertically)
+   ** (jumps in memory by RowWidth => bad locality)
+   ** (but we want the best locality on the innermost loop)
+   ***************************************************************************/
+  for (Row = 0; Row < QuadrantSize; Row++) {
+
+    /*************************************************************************
+     ** Step through each row horizontally (addressing elements in each column)
+     ** (jumps linearly through memory => good locality)
+     *************************************************************************/
+    for (Column = 0; Column < QuadrantSize; Column++) {
+
+      /***********************************************************
+       ** Within this loop, the following holds for MatrixOffset:
+       ** MatrixOffset = (Row * RowWidth) + Column
+       ** (note: that the unit of the offset is number of reals)
+       ***********************************************************/
+      /* Element of Global Matrix, such as A, B, C */
+#define E(Matrix)   (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
+#define EA(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
+#define EB(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )
+
+      /* FIXME - may pay to expand these out - got higher speed-ups below */
+      /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
+      E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );
+
+      /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
+      E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);
+
+      /* S3 = A11 - A21 */
+      E(S3) = EA(A11) - EA(A21);
+
+      /* S7 = B22 - B12 */
+      E(S7) = EB(B22) - EB(B12);
+
+      TempMatrixOffset += sizeof(REAL);
+      MatrixOffsetA += sizeof(REAL);
+      MatrixOffsetB += sizeof(REAL);
+    } /* end column loop */
+
+    MatrixOffsetA += RowIncrementA;
+    MatrixOffsetB += RowIncrementB;
+  } /* end row loop */
+  /* Seven recursive quadrant products: six are forked, the seventh runs inline before the join. */
+  fibril_t fr;
+  fibril_init(&fr);
+
+  /* M2 = A11 x B11 */
+  fibril_fork(&fr, OptimizedStrassenMultiply,
+      (M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB));
+
+  /* M5 = S1 * S5 */
+  fibril_fork(&fr, OptimizedStrassenMultiply,
+      (M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize));
+
+  /* Step 1 of T1 = S2 x S6 + M2 */
+  fibril_fork(&fr, OptimizedStrassenMultiply,
+      (T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize));
+
+  /* Step 1 of T2 = T1 + S3 x S7 */
+  fibril_fork(&fr, OptimizedStrassenMultiply,
+      (C22, S3, S7, QuadrantSize, RowWidthC, QuadrantSize, QuadrantSize));
+
+  /* Step 1 of C11 = M2 + A12 * B21 */
+  fibril_fork(&fr, OptimizedStrassenMultiply,
+      (C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB));
+
+  /* Step 1 of C12 = S4 x B22 + T1 + M5 */
+  fibril_fork(&fr, OptimizedStrassenMultiply,
+    (C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB));
+
+  /* Step 1 of C21 = T2 - A22 * S8 */
+  OptimizedStrassenMultiply(C21, A22, S8, QuadrantSize, RowWidthC,
+      RowWidthA, QuadrantSize);
+
+  fibril_join(&fr);
+  /* Combine the partial products into C, four columns per iteration (NOTE(review): presumably QuadrantSize is a multiple of 4 here - TODO confirm). */
+  for (Row = 0; Row < QuadrantSize; Row++) {
+    for (Column = 0; Column < QuadrantSize; Column += 4) {
+      REAL LocalM5_0 = *(M5);
+      REAL LocalM5_1 = *(M5+1);
+      REAL LocalM5_2 = *(M5+2);
+      REAL LocalM5_3 = *(M5+3);
+      REAL LocalM2_0 = *(M2);
+      REAL LocalM2_1 = *(M2+1);
+      REAL LocalM2_2 = *(M2+2);
+      REAL LocalM2_3 = *(M2+3);
+      REAL T1_0 = *(T1sMULT) + LocalM2_0;
+      REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
+      REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
+      REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
+      REAL T2_0 = *(C22) + T1_0;
+      REAL T2_1 = *(C22+1) + T1_1;
+      REAL T2_2 = *(C22+2) + T1_2;
+      REAL T2_3 = *(C22+3) + T1_3;
+      (*(C11))   += LocalM2_0;
+      (*(C11+1)) += LocalM2_1;
+      (*(C11+2)) += LocalM2_2;
+      (*(C11+3)) += LocalM2_3;
+      (*(C12))   += LocalM5_0 + T1_0;
+      (*(C12+1)) += LocalM5_1 + T1_1;
+      (*(C12+2)) += LocalM5_2 + T1_2;
+      (*(C12+3)) += LocalM5_3 + T1_3;
+      (*(C22))   = LocalM5_0 + T2_0;
+      (*(C22+1)) = LocalM5_1 + T2_1;
+      (*(C22+2)) = LocalM5_2 + T2_2;
+      (*(C22+3)) = LocalM5_3 + T2_3;
+      (*(C21  )) = (- *(C21  )) + T2_0;
+      (*(C21+1)) = (- *(C21+1)) + T2_1;
+      (*(C21+2)) = (- *(C21+2)) + T2_2;
+      (*(C21+3)) = (- *(C21+3)) + T2_3;
+      M5 += 4;
+      M2 += 4;
+      T1sMULT += 4;
+      C11 += 4;
+      C12 += 4;
+      C21 += 4;
+      C22 += 4;
+    }
+
+    C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC);
+    C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC);
+    C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC);
+    C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC);
+  }
+
+  free(StartHeap);
+}
+/* Compute C = A x B for n x n matrices; an, bn and cn are the row widths of A, B and C. */
+static void strassen(int n, REAL * A, int an, REAL * B, int bn,
+    REAL * C, int cn) {
+  OptimizedStrassenMultiply(C, A, B, n, cn, an, bn); /* widths go in C, A, B order */
+}
+
+/*
+ * Set an n by n matrix A to random values.  The distance between
+ * rows is an
+ */
+void init_matrix(int n, REAL *A, int an)
+{
+  /* Row-major sweep; one rand() draw per element, in row-then-column order. */
+  for (int row = 0; row < n; ++row) {
+    for (int col = 0; col < n; ++col)
+      ELEM(A, an, row, col) = (double) rand() / (double) RAND_MAX;
+  }
+}
+
+/*
+ * Compare two matrices element-wise. Returns 1 as soon as an element of B
+ * deviates from A by a relative error above EPSILON, and 0 otherwise.
+ */
+int compare_matrix(int n, REAL *A, int an, REAL *B, int bn)
+{
+  int i, j;
+
+  for (i = 0; i < n; ++i)
+    for (j = 0; j < n; ++j) {
+      /* absolute error and magnitude of the reference element */
+      REAL ref = ELEM(A, an, i, j);
+      REAL err = ref - ELEM(B, bn, i, j);
+      if (err < 0.0)
+        err = -err;
+      if (ref < 0.0)
+        ref = -ref;
+      /* err / ref > EPSILON without dividing, so a zero or negative
+       * reference can neither yield NaN nor flip the sign of the error */
+      if (err > EPSILON * ref)
+        return 1;
+    }
+  return 0;
+}
+
+void init() { /* allocate the n x n matrices A, B, C and randomize the two inputs */
+  const size_t bytes = sizeof(REAL) * n * n; /* size_t math: n * n is no longer evaluated in int */
+  A = (REAL*) malloc(bytes);
+  B = (REAL*) malloc(bytes);
+  C = (REAL*) malloc(bytes);
+  init_matrix(n, A, n);
+  init_matrix(n, B, n);
+}
+
+void prep() { /* harness hook called before each test() run; nothing to set up here */
+}
+
+void test() { /* harness workload: multiply the global n x n matrices A and B into C */
+  strassen(n, A, n, B, n, C, n);
+}
+
+int verify() { /* returns 0 on success; without BENCHMARK, C is checked against matrixmul() */
+  int fail = 0;
+#ifndef BENCHMARK
+  REAL * E = (REAL*) malloc(sizeof(REAL) * n * n); /* reference result, sized in size_t */
+  matrixmul(n, A, n, B, n, E, n);
+  fail = compare_matrix(n, E, n, C, n);
+  if (fail > 0) printf("WRONG RESULT!\n");
+  free(E); /* the reference matrix was leaked before */
+#endif
+
+  return fail;
+}
diff --git a/benchmarks/tbb/CMakeLists.txt b/benchmarks/tbb/CMakeLists.txt
new file mode 100644
index 00000000..2ad0b5b8
--- /dev/null
+++ b/benchmarks/tbb/CMakeLists.txt
@@ -0,0 +1,43 @@
+# TBB flavour of the benchmarks: every benchmark source is built once and linked against the tbb library.
+add_definitions(-DFIBRIL_TBB)
+
+#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ltbb")
+
+find_library(TBB_LIB tbb)
+# NOTE(review): the find_library() result is used unchecked below - confirm TBB is meant to be a hard requirement.
+
+add_executable(cholesky_tbb ../cholesky.cpp)
+target_link_libraries(cholesky_tbb "${TBB_LIB}")
+
+add_executable(fft_tbb ../fft.cpp)
+target_link_libraries(fft_tbb "${TBB_LIB}")
+
+add_executable(fib_tbb ../fib.cpp)
+target_link_libraries(fib_tbb "${TBB_LIB}")
+
+add_executable(heat_tbb ../heat.cpp)
+target_link_libraries(heat_tbb "${TBB_LIB}")
+
+add_executable(integrate_tbb ../integrate.cpp)
+target_link_libraries(integrate_tbb "${TBB_LIB}")
+
+add_executable(knapsack_tbb ../knapsack.cpp)
+target_link_libraries(knapsack_tbb "${TBB_LIB}")
+
+add_executable(lu_tbb ../lu.cpp)
+target_link_libraries(lu_tbb "${TBB_LIB}")
+
+add_executable(matmul_tbb ../matmul.cpp)
+target_link_libraries(matmul_tbb "${TBB_LIB}")
+
+add_executable(nqueens_tbb ../nqueens.cpp)
+target_link_libraries(nqueens_tbb "${TBB_LIB}")
+
+add_executable(quicksort_tbb ../quicksort.cpp)
+target_link_libraries(quicksort_tbb "${TBB_LIB}")
+
+add_executable(rectmul_tbb ../rectmul.cpp)
+target_link_libraries(rectmul_tbb "${TBB_LIB}")
+
+add_executable(strassen_tbb ../strassen.cpp)
+target_link_libraries(strassen_tbb "${TBB_LIB}")
diff --git a/benchmarks/tbb/tbb.h b/benchmarks/tbb/tbb.h
new file mode 100644
index 00000000..98434ced
--- /dev/null
+++ b/benchmarks/tbb/tbb.h
@@ -0,0 +1,36 @@
+#ifndef TBB_H
+#define TBB_H
+
+#include <tbb/task_group.h>
+#include <tbb/task_scheduler_init.h>
+/* Map the fibril fork/join API onto a tbb::task_group. */
+#define fibril
+#define fibril_t tbb::task_group
+#define fibril_init(fp)
+#define fibril_join(fp) (fp)->wait()
+/* Fork without / with a result; `ag` is the parenthesised argument list and the lambda captures by copy. */
+#define fibril_fork_nrt(fp, fn, ag) (fp)->run([=]{ fn ag; })
+#define fibril_fork_wrt(fp, rtp, fn, ag) do { \
+  __typeof__(rtp) pt = rtp; \
+  (fp)->run([=]{ *pt = fn ag; }); \
+} while (0)
+/* Worker count: NTHREADS when it is non-zero, otherwise std::thread::hardware_concurrency(). */
+#include <thread>
+
+int NTHREADS;
+int fibril_rt_nprocs() { return (NTHREADS) ? NTHREADS : std::thread::hardware_concurrency(); }
+/* Use n workers if 0 < n <= fibril_rt_nprocs(), else that maximum; expands to a statement plus a scheduler declaration, so use it at block scope. */
+#define fibril_rt_init(n) \
+  do { \
+    int max_nprocs = fibril_rt_nprocs(); \
+    if ((n) > 0 && (n) <= max_nprocs) { \
+      NTHREADS = (n); \
+    } else { \
+      NTHREADS = max_nprocs; \
+    } \
+  } while(0); \
+tbb::task_scheduler_init _fibril_rt_init(NTHREADS)
+
+#define fibril_rt_exit()
+
+#endif /* end of include guard: TBB_H */
diff --git a/benchmarks/test.h b/benchmarks/test.h
new file mode 100644
index 00000000..813a1fd3
--- /dev/null
+++ b/benchmarks/test.h
@@ -0,0 +1,148 @@
+#ifndef TEST_H
+#define TEST_H
+
+#if HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+extern void init();
+extern void prep();
+extern void test();
+extern int verify();
+
+extern int n;
+
+#include <stdlib.h>
+#include "fibril.h"
+
+#ifdef BENCHMARK
+
+#include <stdio.h>
+#include <float.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+static void sort(float * a, int n)
+{
+	/* Bubble sort: keep sweeping until one full pass performs no swap. */
+	int swapped;
+
+	do {
+		swapped = 0;
+		for (int k = 0; k + 1 < n; ++k) {
+			if (!(a[k + 1] < a[k])) continue;
+			const float lower = a[k + 1];
+			a[k + 1] = a[k];
+			a[k] = lower;
+			swapped = 1;
+		}
+	} while (swapped);
+	/* a[0..n-1] is now in ascending order */
+}
+
+size_t static inline time_elapsed(size_t val)
+{
+	struct timeval now; /* filled with the current wall-clock time */
+	gettimeofday(&now, NULL);
+	return (size_t) (now.tv_sec * 1000000 + now.tv_usec) - val; /* microseconds elapsed since val */
+}
+
+static void bench(const char * name, int nprocs)
+{
+	const static int iter = 10;
+	float times[iter];
+	/* name is argv[0]; without a '/' strrchr() yields NULL, so fall back to the full name */
+	const char * slash = strrchr(name, '/');
+	printf("===========================================\n");
+	printf("  Benchmark: %s\n", slash ? slash + 1 : name);
+	printf("  Input size: %d\n", n);
+	printf("  Number of iterations: %d\n", iter);
+	printf("  Number of processors: %d\n", nprocs);
+	/* baseline resource usage, subtracted from the post-run figures below */
+	struct rusage ru;
+	getrusage(RUSAGE_SELF, &ru);
+	long rss = ru.ru_maxrss;
+	long flt = ru.ru_minflt;
+
+	/* warm up */
+	prep();
+	test();
+
+	/* benchmark */
+	int i;
+	for (i = 0; i < iter; ++i) {
+		prep();
+		size_t usecs = time_elapsed(0);
+		test();
+		usecs = time_elapsed(usecs);
+		times[i] = usecs / 1000000.0;
+		printf("  #%d execution time: %f s\n", i, times[i]);
+	}
+	sort(times, iter);
+	/* fixed ranks of the sorted samples; these indices assume iter == 10 */
+	float p10 = times[1];
+	float p90 = times[8];
+	float med = times[5];
+
+	getrusage(RUSAGE_SELF, &ru);
+	rss = ru.ru_maxrss - rss;
+	flt = ru.ru_minflt - flt;
+
+	printf("  Execution time summary:\n");
+	printf("    Median: %f s\n", med);
+	printf("    10th %%: %f s\n", p10);
+	printf("    90th %%: %f s\n", p90);
+	printf("  Resources summary: \n");
+	printf("    Max RSS: %ld (KB)\n", ru.ru_maxrss);
+	printf("    Runtime RSS: %ld (KB)\n", rss);
+	printf("    # of page faults: %ld\n", flt);
+}
+
+#endif
+
+#include <stdlib.h>
+
+int main(int argc, const char * argv[])
+{ /* Harness entry point: argv[1] (if positive) overrides n; EMPER_BENCH_NPROCS selects the worker count. */
+	if (argc > 1 && (argc = atoi(argv[1])) > 0) { /* argc is reused for the parsed input size */
+		n = argc;
+	}
+
+	init();
+	int result;
+
+	int nthreads = 0;
+	char *env = getenv("EMPER_BENCH_NPROCS");
+	if (env) nthreads = atoi(env);
+
+	fibril_rt_init(nthreads);
+	int nprocs = fibril_rt_nprocs();
+
+#ifdef BENCHMARK
+	bench(argv[0], nprocs);
+#else
+	(void) nprocs;
+	prep();
+	test();
+#endif
+
+	result = verify();
+	fibril_rt_exit();
+
+#ifdef BENCHMARK
+#ifdef FIBRIL_STATS
+	const char * stat; /* passing NULL for %s is undefined, so an unset variable prints "(null)" explicitly */
+	printf("  Statistics summary:\n");
+	printf("    # of steals: %s\n", (stat = getenv("FIBRIL_N_STEALS")) ? stat : "(null)");
+	printf("    # of suspensions: %s\n", (stat = getenv("FIBRIL_N_SUSPENSIONS")) ? stat : "(null)");
+	printf("    # of stacks used: %s\n", (stat = getenv("FIBRIL_N_STACKS")) ? stat : "(null)");
+	printf("    # of pages used: %s\n", (stat = getenv("FIBRIL_N_PAGES")) ? stat : "(null)");
+#endif
+	printf("===========================================\n");
+#endif
+
+	return result; /* exit status is whatever verify() returned */
+}
+
+#endif /* end of include guard: TEST_H */
diff --git a/emper/CMakeLists.txt b/emper/CMakeLists.txt
index e9a18674..62472c71 100644
--- a/emper/CMakeLists.txt
+++ b/emper/CMakeLists.txt
@@ -11,6 +11,7 @@ add_files(EMPER_SOURCE Debug.cpp)
 add_files(EMPER_SOURCE ContextManager.cpp)
 add_files(EMPER_SOURCE BinaryPrivateSemaphore.cpp)
 add_files(EMPER_SOURCE CountingPrivateSemaphore.cpp)
+add_files(EMPER_SOURCE Fibril.cpp)
 add_files(EMPER_SOURCE Semaphore.cpp)
 
 add_files(EMPER_INCLUDE ".")
diff --git a/emper/Context.hpp b/emper/Context.hpp
index 582f5e05..fecb69ac 100644
--- a/emper/Context.hpp
+++ b/emper/Context.hpp
@@ -3,6 +3,7 @@
 #include <cassert>
 #include <functional>
 #include <cstring>
+#include <sys/mman.h>
 
 #include <valgrind/valgrind.h>
 
@@ -10,6 +11,20 @@
 #include "Debug.hpp"
 
 class Context;
+class Dispatcher;
+class Fiber;
+
+#define PAGE_SIZE		(4 * 1024)
+#ifdef EMPER_BENCH_STACK_SIZE
+#define STACK_SIZE		EMPER_BENCH_STACK_SIZE
+#else
+#define STACK_SIZE		(0x10000)
+#endif
+#ifdef EMPER_FIBRIL_STATS
+#include <atomic>
+extern std::atomic<uint64_t> statsUnmapp;
+#endif
+
 
 extern "C" [[noreturn]] void switch_and_load_context(void** toTos);
 // *Not* marked as 'noreturn' because save_and_switch_context does
@@ -20,10 +35,12 @@ extern "C" [[noreturn]] void switch_context(void** toTos);
 
 class ALIGN_TO_CACHE_LINE Context : Logger<LogSubsystem::C> {
 private:
-	static const unsigned int CONTEXT_SIZE = 0xffff; // 1024 * 1024 * 4;
+	static const unsigned int CONTEXT_SIZE = STACK_SIZE; // 0xffff;
 
 	static thread_local Context* currentContext;
 
+	const Fiber* currentFiber;
+
 	void* const tos;
 
 	//	unsigned valgrindStackId;
@@ -52,6 +69,21 @@ private:
 		context->mainFunction();
 	}
 
+	friend Dispatcher;
+
+	/** Record `fiber` as the one running on the calling thread's current context. */
+	static void setCurrentFiber(const Fiber* fiber) {
+		assert(currentContext);
+		currentContext->currentFiber = fiber;
+	}
+
+	/** Fiber of the calling thread's current context, or nullptr if there is no such context. */
+	static const Fiber* getCurrentFiber() {
+		// No assert: Dispatcher::isDispatchedControlFlow() needs nullptr, not an abort, here.
+		if (!currentContext) return nullptr;
+		return currentContext->currentFiber;
+	}
+
 public:
 	// cppcheck-suppress noExplicitConstructor selfInitialization
 	Context(func_t mainFunction)
@@ -106,13 +138,35 @@ public:
 		return tos;
 	}
 
+#ifdef EMPER_MADVISE
+	inline void unmap(void *from) const { // madvise(MADV_FREE) the whole pages between `context` and `from`
+		const size_t PAGE_SIZE_MASK = 4 * 1024 - 1; // NOTE(review): repeats the 4 KiB of the PAGE_SIZE macro
+		const uintptr_t start = ((uintptr_t) context + PAGE_SIZE_MASK) & ~PAGE_SIZE_MASK; // `context` rounded up to a page boundary
+		const uintptr_t end = (uintptr_t) from & ~PAGE_SIZE_MASK; // `from` rounded down to a page boundary
+		//if (madvise((void*) start, (end - start), MADV_DONTNEED)) {
+		if (madvise((void*) start, (end - start), MADV_FREE)) { // NOTE(review): end < start is not guarded against
+			perror("madvise");
+			// die()?
+		}
+#ifdef EMPER_FIBRIL_STATS
+		statsUnmapp++;
+#endif
+	}
+#endif
+
 	/**
 	 * Start this context. 
 	 */
 	[[noreturn]] inline void start() {
 		LOGD("starting");
 		currentContext = this;
-		switch_context(&savedStackpointer);
+		asm( // switch to this context's stack and jump straight into kickoff; never returns
+				"mov %0, %%rsp\n\t"
+				"jmp *%1\n\t"
+				:: "r" ((void**) tos - 1), "r" (kickoff) : "memory" // new rsp is one pointer slot below tos
+			);
+		__builtin_unreachable();
+		//switch_context(&savedStackpointer);
 	}
 
 	/**
@@ -154,4 +208,5 @@ public:
 		return currentContext;
 	}
 
+	friend class Fibril;
 };
diff --git a/emper/ContextManager.cpp b/emper/ContextManager.cpp
index 88a651c8..8d4b2f7c 100644
--- a/emper/ContextManager.cpp
+++ b/emper/ContextManager.cpp
@@ -5,10 +5,14 @@
 #include "Runtime.hpp"
 #include "Debug.hpp"
 #include "Context.hpp"
+#include "Continuation.hpp"
+#include "Fibril.hpp"
+#include "Dispatcher.hpp"
 
 ContextManager::ContextManager(Runtime& runtime) : MemoryManager(runtime), runtime(runtime) {
+#ifdef EMPER_CM_WITH_MEMORY_MANAGER
 	auto newWorkerHook = [this]() {
-		for (unsigned int i = 0; i < CONTEXT_MANAGER_FIRST_LAYER_QUEUE_SIZE * 2; ++i) {
+		for (unsigned int i = 0; i < CONTEXT_MANAGER_FIRST_LAYER_QUEUE_SIZE; ++i) {
 			Context* context = new Context(this->runtime.dispatcher.getDispatchLoop());
 			putFreeContext(context);
 		}
@@ -16,10 +20,11 @@ ContextManager::ContextManager(Runtime& runtime) : MemoryManager(runtime), runti
 	// Note that it is important that this hook is executed *after*
 	// the one of the MemoryManager superclass.
 	runtime.addNewWorkerHook(newWorkerHook);
+#endif
 }
 
 Context* ContextManager::getFreeContext() {
-#ifdef CM_WITH_MEMORY_MANAGER
+#ifdef EMPER_CM_WITH_MEMORY_MANAGER
 	bool malloced;
 	void* memory = getMemory(&malloced);
 	if (malloced) {
@@ -32,18 +37,40 @@ Context* ContextManager::getFreeContext() {
 }
 
 void ContextManager::putFreeContext(Context* context) {
-#ifdef CM_WITH_MEMORY_MANAGER
+#ifdef EMPER_CM_WITH_MEMORY_MANAGER
 	putMemory(context);
 #else
 	delete context;
 #endif
 }
 
+thread_local static Continuation *cont; // per-thread re-entry continuation; points at the local `c` of start()
+
 void ContextManager::start() {
+	uintptr_t val;
+	Continuation c;
+	cont = &c;
+	// Re-entry point: ContextManager::resume(v) long-jumps back here, and setJmp() then yields v instead of 0.
+	val = cont->setJmp();
+	if (Runtime::getRuntime()->isShuttingDown())
+		pthread_exit(nullptr);
+	// Only acts if a Fibril was registered in Fibril::toResume; it may resume that Fibril and never return.
+	Fibril::tryResumeFiber(val);
+	// Nothing was resumed: keep going on the current context if there still is one, else on a fresh one.
+	Context* currentContext = Context::getCurrentContext();
+	if (currentContext) {
+		/* use currentContext to execute Fibers */
+		currentContext->start();
+	}
+
 	Context* freeContext = getFreeContext();
 	freeContext->start();
 }
 
+void ContextManager::resume(uintptr_t val) { // jump back into start() on this thread
+	cont->longJmp(val); // the setJmp() call site in start() then sees val
+}
+
 /**
  * Save the current context and start a new one.
  */
@@ -64,5 +91,12 @@ void ContextManager::discardAndResume(Context* context) {
 			LOGD("Freeing context " << contextToFree);
 			putFreeContext(contextToFree);
 	});
+
+	// Since we are going to discard this context, it will never reach
+	// the end of its dispatch loop, and hence we need to ensure that
+	// the fiber is recycled.
+	const Fiber* currentFiber = Dispatcher::getCurrentFiberPtr();
+	Dispatcher::recycle(currentFiber);
+
 	contextToFree->discardAndResume(context);
 }
diff --git a/emper/ContextManager.hpp b/emper/ContextManager.hpp
index e39f5b32..b57a2e2b 100644
--- a/emper/ContextManager.hpp
+++ b/emper/ContextManager.hpp
@@ -8,7 +8,7 @@
 
 class Context;
 
-#define CONTEXT_MANAGER_FIRST_LAYER_QUEUE_SIZE 64
+#define CONTEXT_MANAGER_FIRST_LAYER_QUEUE_SIZE 16
 
 class ContextManager : public Logger<LogSubsystem::CM>, protected MemoryManager<Context, 128, CONTEXT_MANAGER_FIRST_LAYER_QUEUE_SIZE> {
 
@@ -30,4 +30,7 @@ public:
 
 	[[noreturn]] void discardAndResume(Context* context);
 
+	//[[noreturn]] void resume(); /* FIXME noreturn leads to SEGFAULTs in fibril_join, because compiler thinks join never returns! */
+	void resume(uintptr_t);
+
 };
diff --git a/emper/Continuation.hpp b/emper/Continuation.hpp
new file mode 100644
index 00000000..3140cbac
--- /dev/null
+++ b/emper/Continuation.hpp
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <cstdint> // uintptr_t, used by Continuation::setJmp()/longJmp()
+// membar(call): evaluate `call`, then tell the compiler that rbx, r12-r15 and memory are clobbered. No trailing ';' so that `membar(x);` is one statement and nests in if/else.
+#define membar(call) do { \
+	call; \
+	asm ( "nop" ::: "rbx", "r12", "r13", "r14", "r15", "memory" ); \
+} while (0)
+
+
+class Continuation {
+public:
+	void *bp; // saved frame pointer (rbp)
+	void *sp; // saved stack pointer (rsp)
+	void *ip; // address to jump to when the continuation is executed
+	// Captures rbp/rsp of the constructing frame (always inlined); ip stays nullptr until setJmp() or a user of the class fills it in.
+	inline __attribute__((always_inline))
+	Continuation() : ip(nullptr) {
+		register void *rbp asm("rbp");
+		register void *rsp asm("rsp");
+
+		bp = rbp;
+		sp = rsp;
+	};
+	// Jump to ip with the saved rbp, but with the caller-supplied stack pointer _sp. Does not return.
+	inline __attribute__((always_inline, noreturn))
+	void execute(const void* _sp) {
+		asm (
+				"mov %0, %%rsp\n\t"
+				"mov %1, %%rbp\n\t"
+				"jmp *%2\n\t"
+				:: "r" (_sp), "r" (bp), "r" (ip) : "memory"
+			);
+		__builtin_unreachable();
+	};
+	// Store the return address of the set_rip() call in ip and return 0; a later longJmp(v) is meant to make this call site see v.
+	inline __attribute__((always_inline, returns_twice))
+	uintptr_t setJmp() {
+		auto set_rip = [] (Continuation* c) __attribute__((noinline, hot, optimize(3))) {
+			c->ip = __builtin_return_address(0);
+			return 0;
+		};
+
+		uintptr_t res;
+		membar(res = set_rip(this));
+		return res;
+	};
+	// Restore the saved sp and bp, pass ret in rax (the "a" constraint) and jump to ip. Does not return.
+	inline __attribute__((always_inline, noreturn))
+	void longJmp(uintptr_t ret) {
+		asm (
+				"mov %0, %%rsp\n\t"
+				"mov %1, %%rbp\n\t"
+				"jmp *%2\n\t"
+				:: "r" (sp), "r" (bp), "r" (ip), "a" (ret) : "memory"
+			);
+		__builtin_unreachable();
+	};
+
+};
+
diff --git a/emper/Dispatcher.cpp b/emper/Dispatcher.cpp
index dfe4e280..ac0c76e8 100644
--- a/emper/Dispatcher.cpp
+++ b/emper/Dispatcher.cpp
@@ -6,8 +6,6 @@
 #include "Runtime.hpp"
 #include "Debug.hpp"
 
-thread_local const Fiber* Dispatcher::currentFiber;
-
 func_t Dispatcher::getDispatchLoop() {
 	return std::bind(&Dispatcher::dispatchLoop, this);
 }
diff --git a/emper/Dispatcher.hpp b/emper/Dispatcher.hpp
index 16a6b884..c1eb7771 100644
--- a/emper/Dispatcher.hpp
+++ b/emper/Dispatcher.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "Common.hpp"
+#include "Context.hpp"
 #include "Fiber.hpp"
 #include "Debug.hpp"
 
@@ -17,9 +18,9 @@ protected:
 
 	func_t getDispatchLoop();
 
-	inline void dispatch(const Fiber* fiber) {
+	inline void dispatch(Fiber* fiber) {
 		LOGD("executing fiber " << fiber);
-		currentFiber = fiber;
+		Context::setCurrentFiber(fiber);
 		fiber->run();
 	}
 
@@ -35,8 +36,8 @@ protected:
 		return fiber->doAtomicDecrRefCount();
 	}
 
-	inline void recycle(const Fiber* fiber) {
-		delete fiber;
+	static inline void recycle(const Fiber* fiber) {
+		delete fiber; /* TODO don't delete Fibrils */
 	}
 
 	void putRuntimeWorkerToSleep();
@@ -52,12 +53,11 @@ public:
 	}
 
 	static const Fiber* getCurrentFiberPtr() {
-		assert(currentFiber);
-		return currentFiber;
+		return Context::getCurrentFiber();
 	}
 
 	static bool isDispatchedControlFlow() {
-		return currentFiber != nullptr;
+		return getCurrentFiberPtr() != nullptr;
 	}
 
 	friend ContextManager;
diff --git a/emper/Fiber.cpp b/emper/Fiber.cpp
index c6e578ca..45580cf0 100644
--- a/emper/Fiber.cpp
+++ b/emper/Fiber.cpp
@@ -2,7 +2,7 @@
 
 #include <ostream>
 
-void Fiber::run() const {
+void Fiber::run() {
 	LOGD("run() calling "
 		 << function.target<FIBER_FUN_TEMPLATE_ARG>()
 		 << " (" << function.target_type().name()
diff --git a/emper/Fiber.hpp b/emper/Fiber.hpp
index 4a88f185..088c9677 100644
--- a/emper/Fiber.hpp
+++ b/emper/Fiber.hpp
@@ -16,20 +16,27 @@ class Scheduler;
 class Dispatcher;
 class LawsScheduler;
 
-class ALIGN_TO_CACHE_LINE Fiber : public Logger<LogSubsystem::F> {
+class Fiber : public Logger<LogSubsystem::F> {
 public:
 	typedef std::function<FIBER_FUN_TEMPLATE_ARG> fiber_fun_t;
 	typedef std::function<FIBER_FUN0_TEMPLATE_ARG> fiber_fun0_t;
 
 	static const workeraffinity_t NOT_AFFINE = -1;
 
+	enum Type {
+		FiberType,
+		FibrilType
+	};
+
+	Type type = FiberType;
+
 private:
 	const fiber_fun_t function;
 	void* const arg;
 
 	std::atomic<bool> runnable = { true };
 
-	ALIGN_TO_CACHE_LINE std::atomic_uint referenceCounter = { 1 };
+	std::atomic_uint referenceCounter = { 1 };
 
 	workeraffinity_t* const affinity;
 
@@ -68,7 +75,7 @@ protected:
 
 	virtual ~Fiber() = default;
 
-	virtual void run() const;
+	virtual void run();
 
 private:
 	inline void setMpscNext(Fiber* next) {
diff --git a/emper/Fibril.cpp b/emper/Fibril.cpp
new file mode 100644
index 00000000..0e2ee524
--- /dev/null
+++ b/emper/Fibril.cpp
@@ -0,0 +1,5 @@
+#include "Fibril.hpp"
+
+
+// Per-thread hand-over slot: written by Fibril::resume()/join_func(), read by Fibril::tryResumeFiber().
+thread_local Fibril *Fibril::toResume = nullptr;
diff --git a/emper/Fibril.hpp b/emper/Fibril.hpp
new file mode 100644
index 00000000..eddfa742
--- /dev/null
+++ b/emper/Fibril.hpp
@@ -0,0 +1,283 @@
+#pragma once
+
+#ifdef EMPER_LOCKED_FIBRIL
+#ifdef EMPER_FIBRIL_SYNC
+#include "FibrilLock.hpp"
+#else
+#include <mutex>
+#endif
+#endif
+
+#include "Runtime.hpp"
+#include "Fiber.hpp"
+#include "Continuation.hpp"
+#include "ContextManager.hpp"
+#include "Context.hpp"
+#include "atomic"
+
+
+class Fibril : public Fiber { // fork/join handle: a Fiber whose `cont` is the code following a fork()/join() call
+#ifdef EMPER_LOCKED_FIBRIL
+	// Locked variant: tryResume() updates the plain int child counter while holding m.
+#ifndef EMPER_LOCKED_WS_QUEUE
+#error "EMPER_LOCKED_FIBRIL only works in combination with EMPER_LOCKED_WS_QUEUE!"
+#endif
+
+public:
+#ifdef EMPER_FIBRIL_SYNC
+	FibrilLock m;
+#else
+	std::mutex m;
+#endif
+private:
+	int activeChildrenCount = 0;
+
+#else /* ! EMPER_LOCKED_FIBRIL */
+	// Lock-free variant: atomic counter; reserveStealCount is decremented in run() and handed to resume() by join_func().
+private:
+	std::atomic<uint32_t> activeChildrenCount = 0;
+	uint32_t reserveStealCount = 0;
+
+#ifdef EMPER_MADVISE
+	std::atomic<bool> resumable = false;
+#endif
+
+#endif /* EMPER_LOCKED_FIBRIL */
+
+public:
+	Continuation cont; // bp/sp captured at construction; ip is set by fork() and join_func()
+
+private:
+	Context *stack; // context that was current when this Fibril was constructed
+
+	static thread_local Fibril *toResume; // Fibril to be handled by the next tryResumeFiber() on this thread
+
+	// tryResume(): decrement activeChildrenCount; if it is still positive return to the caller, otherwise switch back to `stack` and execute the continuation.
+#ifdef EMPER_LOCKED_FIBRIL
+	inline void tryResume(__attribute__((unused)) uint32_t val) {
+		int c;
+
+		m.lock();
+		c = --activeChildrenCount;
+
+		if (c > 0) {
+			if (stack == Context::currentContext) {
+#ifdef EMPER_MADVISE
+				/* unmap unused stack pages */
+				Context::currentContext->unmap(cont.sp);
+#endif
+				Context::currentContext = nullptr;
+			}
+			m.unlock();
+			/* random steal */
+			return;
+		} else {
+			m.unlock();
+			if (stack != Context::currentContext) {
+				Runtime::getRuntime()->getContextManager().putFreeContext(Context::currentContext);
+				// XXX has to check for hook?
+				Context::currentContext = stack;
+			}
+			/* resume, no return */
+			cont.execute(cont.sp);
+		}
+	}
+
+#else /* ! EMPER_LOCKED_FIBRIL */
+	// Lock-free tryResume(): subtracts `val` from the atomic counter; with EMPER_MADVISE the `resumable` flag arbitrates between the stack owner and the last joiner.
+	inline void tryResume(uint32_t val) {
+		uint32_t c;
+		Context *s;
+
+		s = stack;
+
+		//c = activeChildrenCount.fetch_sub(1, std::memory_order_acq_rel) - 1;
+		c = activeChildrenCount.fetch_sub(val, std::memory_order_relaxed) - val;
+
+		if (c > 0) {
+			if (s == Context::currentContext) {
+#ifdef EMPER_MADVISE
+				/* unmap unused stack pages */
+				Context::currentContext->unmap(cont.sp);
+				/* set resumable to 'true' */
+				if (true == resumable.exchange(true, std::memory_order_acq_rel)) {
+					/* last one joined, but could not resume
+					 * because of us, so we can resume.
+					 * resume, no return */
+					cont.execute(cont.sp);
+				}
+#endif
+				Context::currentContext = nullptr;
+			}
+			/* random steal */
+			return;
+		} else {
+			if (stack != Context::currentContext) {
+#ifdef EMPER_MADVISE
+				/* get resumable, signal we tried to resume */
+				if (false == resumable.exchange(true, std::memory_order_acq_rel)) {
+					/* stack owner is unmapping pages, we don't
+					 * wait, stack owner sees we were here.
+					 * random steal */
+					return;
+				}
+				/* we can proceed resume */
+#endif
+				Runtime::getRuntime()->getContextManager().putFreeContext(Context::currentContext);
+				// XXX has to check for hook?
+				Context::currentContext = stack;
+			}
+			/* resume, no return */
+			cont.execute(cont.sp);
+		}
+	}
+#endif /* ! EMPER_LOCKED_FIBRIL */
+
+public:
+
+	inline __attribute__((always_inline))
+	Fibril() : Fiber(nullptr, nullptr, nullptr), cont() {
+		stack = Context::currentContext; // TODO check if this is correct
+		type = FibrilType;
+	}
+
+	~Fibril() = default;
+
+	// run(): invoked when this Fibril is dispatched as a Fiber (presumably by a worker that stole it); executes the continuation on the current context's stack.
+	void run() override {
+#ifdef EMPER_LOCKED_FIBRIL
+		if (!activeChildrenCount) activeChildrenCount = 2;
+		else activeChildrenCount++;
+		m.unlock();
+#else
+		reserveStealCount -= 1;
+#endif
+
+#ifdef EMPER_FIBRIL_STATS
+		statsSteals++;
+#endif
+
+		/* Reserve 128 byte at the bottom. */
+		/* FIXME clean up, make nice looking */
+		cont.execute((void**)Context::currentContext->getTos() - 16);
+		/* This seems to be necessary, because the compiler will pop args off the stack.
+		 * On the new stack, if we don't reserve space for this, it will lead to access
+		 * outside the stack area.
+		 */
+	}
+	// resume(): register this Fibril in toResume and jump to ContextManager::start() with value 1.
+	__attribute__((noreturn))
+	inline void resume() {
+		toResume = this;
+		Runtime::getRuntime()->getContextManager().resume(1);
+		__builtin_unreachable();
+	}
+	// tryResumeFiber(): forward `val` to the Fibril registered in toResume, if any.
+	inline static void tryResumeFiber(uint32_t val) {
+		if (toResume != nullptr) {
+			toResume->tryResume(val);
+			// XXX has to set toResume to nullptr???
+		}
+	}
+
+private:
+	__attribute__((noinline, hot, optimize(3)))
+	void join_func() { // store the caller's return address as continuation ip, then jump to ContextManager::start()
+		cont.ip = __builtin_return_address(0);
+		toResume = this;
+#ifdef EMPER_LOCKED_FIBRIL
+		Runtime::getRuntime()->getContextManager().resume(1);
+#else
+		Runtime::getRuntime()->getContextManager().resume(reserveStealCount);
+#endif
+	}
+	// fork(): set cont.ip to the code after the fork call, push this Fibril via Runtime::pushBottom(), run fun(args...) directly, then popBottom(); if the pop fails (presumably because the continuation was stolen) go through resume().
+public:
+	template<class RET, class... PARs, class... ARGs>
+	inline __attribute__((always_inline))
+	void fork(RET *ret, RET(*fun)(PARs...), ARGs ...args) {
+		auto fork_func = [](ARGs ...args, Fibril *fr, RET *ret, RET(*fun)(PARs...)) __attribute__((noinline, hot, optimize(3))) {
+			fr->cont.ip = __builtin_return_address(0);
+			Runtime* runtime = Runtime::getRuntime();
+			runtime->pushBottom(*fr);
+			*ret = fun(args...);
+			if (!runtime->popBottom()) { /* TODO laws Scheduler pushes to queues of other threads, handle that */
+				fr->resume();
+			}
+		};
+		membar(fork_func(args..., this, ret, fun));
+	}
+
+	template<class... PARs, class... ARGs>
+	inline __attribute__((always_inline))
+	void fork(void(*fun)(PARs...), ARGs ...args) {
+		auto fork_func = [](ARGs ...args, Fibril *fr, void(*fun)(PARs...)) __attribute__((noinline, hot, optimize(3))) {
+			fr->cont.ip = __builtin_return_address(0);
+			Runtime* runtime = Runtime::getRuntime();
+			runtime->pushBottom(*fr);
+			fun(args...);
+			if (!runtime->popBottom()) { /* TODO laws Scheduler pushes to queues of other threads, handle that */
+				fr->resume();
+			}
+		};
+		membar(fork_func(args..., this, fun));
+	}
+
+//	template<class RET, class T, class... ARGs>
+//	inline __attribute__((always_inline))
+//	void fork(RET *ret, std::function<T> fun, ARGs ...args) {
+//		auto fork_func = [](Fibril *fr, RET *ret, std::function<T> fun, ARGs ...args) __attribute__((noinline, hot, optimize(3))) {
+//			fr->cont.ip = __builtin_return_address(0);
+//			Runtime* runtime = Runtime::getRuntime();
+//			runtime->pushBottom(*fr);
+//			*ret = fun(args...);
+//			if (!runtime->popBottom()) { /* TODO laws Scheduler pushes to queues of other threads, handle that */
+//				fr->resume();
+//			}
+//		};
+//		membar(fork_func(this, ret, fun, args...));
+//	}
+
+	template<class T, class... ARGs>
+	inline __attribute__((always_inline))
+	void fork(std::function<T> fun, ARGs ...args) {
+		auto fork_func = [](Fibril *fr, std::function<T> fun, ARGs ...args) __attribute__((noinline, hot, optimize(3))) {
+			fr->cont.ip = __builtin_return_address(0);
+			Runtime* runtime = Runtime::getRuntime();
+			runtime->pushBottom(*fr);
+			fun(args...);
+			if (!runtime->popBottom()) { /* TODO laws Scheduler pushes to queues of other threads, handle that */
+				fr->resume();
+			}
+		};
+		membar(fork_func(this, fun, args...));
+	}
+	// join(): return immediately when the respective counter is zero, otherwise go through join_func().
+#ifdef EMPER_LOCKED_FIBRIL
+	inline __attribute__((always_inline))
+	void join() {
+		if (activeChildrenCount == 0)
+			return;
+
+		membar(join_func());
+	}
+
+#else
+
+	inline __attribute__((always_inline))
+	void join() {
+		if (reserveStealCount == 0) {
+			return;
+		}
+
+		membar(join_func());
+
+		reserveStealCount = 0;
+#ifdef EMPER_MADVISE
+		resumable.store(false, std::memory_order_relaxed);
+#endif /* EMPER_MADVISE */
+	}
+#endif /* EMPER_LOCKED_FIBRIL */
+
+};
+
diff --git a/emper/MemoryManager.hpp b/emper/MemoryManager.hpp
index c8494fa3..c7302a4b 100644
--- a/emper/MemoryManager.hpp
+++ b/emper/MemoryManager.hpp
@@ -11,11 +11,11 @@ class MemoryManager {
 private:
 	const workerid_t workerCount;
 
-	adt::WsClQueue<void*, WS_QUEUE_SIZE>** queues;
+	//adt::WsClQueue<void*, WS_QUEUE_SIZE>** queues;
 
 	static thread_local adt::BoundedBumpArray<void, WORKER_EXCLUSIVE_QUEUE_SIZE> workerExclusiveQueue;
 	
-	static thread_local adt::WsClQueue<void*, WS_QUEUE_SIZE> queue;
+	//static thread_local adt::WsClQueue<void*, WS_QUEUE_SIZE> queue;
 
 	static void* mallocMemory() {
 		void* memory;
@@ -35,6 +35,7 @@ public:
 		if (memory)
 			return memory;
 
+#if 0
 		bool poped = queue.popTop(&memory);
 
 		if (likely(poped)) return memory;
@@ -50,6 +51,7 @@ public:
 			poped = queues[victim]->popTop(&memory);
 			if (poped) return memory;
 		}
+#endif
 
 		*malloced = true;
 		// If everything fails, allocate the memory.
@@ -62,9 +64,11 @@ public:
 		if (pushed)
 			return;
 
+#if 0
 		pushed = queue.pushBottom(memory);
 		if (pushed)
 			return;
+#endif
 
 		free(memory);
 	}
@@ -73,15 +77,17 @@ public:
 template<typename T, intptr_t WS_QUEUE_SIZE, size_t WORKER_EXCLUSIVE_QUEUE_SIZE>
 thread_local adt::BoundedBumpArray<void, WORKER_EXCLUSIVE_QUEUE_SIZE> MemoryManager<T, WS_QUEUE_SIZE, WORKER_EXCLUSIVE_QUEUE_SIZE>::workerExclusiveQueue;
 
-template<typename T, intptr_t WS_QUEUE_SIZE, size_t WORKER_EXCLUSIVE_QUEUE_SIZE>
-thread_local adt::WsClQueue<void*, WS_QUEUE_SIZE> MemoryManager<T, WS_QUEUE_SIZE, WORKER_EXCLUSIVE_QUEUE_SIZE>::queue;
+//template<typename T, intptr_t WS_QUEUE_SIZE, size_t WORKER_EXCLUSIVE_QUEUE_SIZE>
+//thread_local adt::WsClQueue<void*, WS_QUEUE_SIZE> MemoryManager<T, WS_QUEUE_SIZE, WORKER_EXCLUSIVE_QUEUE_SIZE>::queue;
 
 template<typename T, intptr_t WS_QUEUE_SIZE, size_t WORKER_EXCLUSIVE_QUEUE_SIZE>
 MemoryManager<T, WS_QUEUE_SIZE, WORKER_EXCLUSIVE_QUEUE_SIZE>::MemoryManager(Runtime& runtime) : workerCount(runtime.getWorkerCount()) {
+#if 0
 	queues = new adt::WsClQueue<void*, WS_QUEUE_SIZE>*[workerCount];
 
 	auto newWorkerHook = [this]() {
 		queues[Runtime::getWorkerId()] = &queue;
 	};
 	runtime.addNewWorkerHook(newWorkerHook);
+#endif
 }
diff --git a/emper/Runtime.cpp b/emper/Runtime.cpp
index 693f2596..2c812bfd 100644
--- a/emper/Runtime.cpp
+++ b/emper/Runtime.cpp
@@ -20,6 +20,11 @@ thread_local unsigned int Runtime::seed;
 thread_local workerid_t Runtime::workerId;
 RuntimeStrategy& Runtime::DEFAULT_STRATEGY =  WsStrategy::INSTANCE;
 
+#ifdef EMPER_FIBRIL_STATS
+std::atomic<uint64_t> statsSteals = 0;
+std::atomic<uint64_t> statsUnmapp = 0;
+#endif
+
 Runtime::Runtime(workerid_t workerCount, RuntimeStrategy& strategy) : workerCount(workerCount)
 										 , workerLatch(workerCount)
 										 , strategy(strategy)
@@ -74,9 +79,11 @@ Runtime::Runtime(workerid_t workerCount, RuntimeStrategy& strategy) : workerCoun
 
 Runtime::~Runtime() {
 	DBG("Runtime " << this << " is terminating");
+	shutdown = true; // dispatch loops poll this through isShuttingDown()
+	notifyAboutNewWork(); // NOTE(review): presumably wakes sleeping workers so they see the flag -- confirm
 	for (workerid_t i = 0; i < workerCount; ++i) {
-		DBG("Runtime " << this << " is cancelling worker " << unsigned(i));
-		errno = pthread_cancel(threads[i]);
+		DBG("Runtime " << this << " is joining worker " << unsigned(i));
+		errno = pthread_join(threads[i], nullptr); // returns the error code; store it for DIE_MSG_ERRNO
 		if (errno) {
-			DIE_MSG_ERRNO("pthread_cancel() failed");
+			DIE_MSG_ERRNO("pthread_join() failed");
 		}
@@ -85,6 +92,12 @@ Runtime::~Runtime() {
 		std::lock_guard<std::mutex> lock(currentRuntimeMutex);
 		currentRuntime = nullptr;
 	}
+#ifdef EMPER_FIBRIL_STATS
+	printf("  Statistics summary:\n");
+	printf("    # of steals: %lu K\n", (statsSteals.load() + 500) / 1000);
+	printf("    # of unmapps: %lu K\n", (statsUnmapp.load() + 500) / 1000);
+	printf("===========================================\n");
+#endif
 	DBG("Runtime " << this << " terminated");
 }
 
@@ -145,16 +158,41 @@ void Runtime::executeAndWait(std::function<void()> f) {
 		ABORT("Ca not use executeAndWait() from within the Runtime");
 	}
 
-	std::mutex fiberFinished;
-	fiberFinished.lock();
+	pthread_mutex_t m; // protects fiberFinished
+	pthread_cond_t c; // signalled by the fiber once f() has returned
+	bool fiberFinished = false;
+
+	if (int err = pthread_mutex_init(&m, NULL); err) {
+		errno = err; // the pthread call returned the error code; DIE_MSG_ERRNO reports errno
+		DIE_MSG_ERRNO("pthread_mutex_init");
+	}
+	if (int err = pthread_cond_init(&c, NULL); err) {
+		errno = err;
+		DIE_MSG_ERRNO("pthread_cond_init");
+	}
 
 	Fiber* fiber = Fiber::from([&] {
 			f();
 
-			fiberFinished.unlock();
+			pthread_mutex_lock(&m); // the fiber now only touches a mutex it locked itself, unlike the removed code
+			fiberFinished = true;
+			pthread_cond_signal(&c);
+			pthread_mutex_unlock(&m);
 		});
 
 	schedule(*fiber);
 
-	fiberFinished.lock();
+	pthread_mutex_lock(&m);
+	while (!fiberFinished) { // re-check the flag after every wakeup; also covers the fiber finishing before we get here
+		pthread_cond_wait(&c, &m);
+	}
+	pthread_mutex_unlock(&m);
+	if (int err = pthread_mutex_destroy(&m); err) {
+		errno = err;
+		DIE_MSG_ERRNO("pthread_mutex_destroy");
+	}
+	if (int err = pthread_cond_destroy(&c); err) {
+		errno = err;
+		DIE_MSG_ERRNO("pthread_cond_destroy");
+	}
 }
diff --git a/emper/Runtime.hpp b/emper/Runtime.hpp
index 69b6ff3b..0f6df3eb 100644
--- a/emper/Runtime.hpp
+++ b/emper/Runtime.hpp
@@ -13,6 +13,11 @@
 
 class ContextManager;
 
+#ifdef EMPER_FIBRIL_STATS
+extern std::atomic<uint64_t> statsSteals;
+extern std::atomic<uint64_t> statsUnmapp;
+#endif
+
 class Runtime : public Logger<LogSubsystem::RUNTI> {
 private:
 	static std::mutex currentRuntimeMutex;
@@ -44,6 +49,8 @@ private:
 
 	static void printLastRuntimeStats();
 
+	std::atomic<bool> shutdown{false}; // written by ~Runtime(), polled via isShuttingDown(); atomic because volatile gives no inter-thread ordering
+
 protected:
 	void addNewWorkerHook(std::function<void(void)> hook) {
 		newWorkerHooks.push_back(hook);
@@ -78,12 +85,24 @@ public:
 
 	~Runtime();
 
+	inline bool isShuttingDown() { // true once ~Runtime() has set the shutdown flag
+		return shutdown;
+	}
+
 	inline void schedule(Fiber& fiber) {
 		scheduler.schedule(fiber);
 	}
 
 	Fiber* nextFiber();
 
+	inline void pushBottom(Fiber& fiber) { // forwards to the scheduler's pushBottom()
+		scheduler.pushBottom(fiber);
+	}
+
+	inline Fiber* popBottom() { // forwards to the scheduler's popBottom(); the schedulers in this patch return nullptr on a failed pop
+		return scheduler.popBottom();
+	}
+
 	// https://stackoverflow.com/a/3747462/194894
 	static inline int rand() {
 		seed = 214013 * seed + 2531011;
diff --git a/emper/Scheduler.hpp b/emper/Scheduler.hpp
index 043165b1..e3427ba6 100644
--- a/emper/Scheduler.hpp
+++ b/emper/Scheduler.hpp
@@ -31,4 +31,7 @@ public:
 
 	virtual Fiber* nextFiber() = 0;
 
+	virtual void pushBottom(Fiber& fiber) = 0;
+	virtual Fiber* popBottom() = 0;
+
 };
diff --git a/emper/SynchronizedFiber.hpp b/emper/SynchronizedFiber.hpp
index cbdf6b88..785fdc33 100644
--- a/emper/SynchronizedFiber.hpp
+++ b/emper/SynchronizedFiber.hpp
@@ -31,7 +31,7 @@ private:
 	explicit SynchronizedFiber(fiber_fun0_t function, PrivateSemaphore& semaphore) : SynchronizedFiber(function, nullptr, semaphore) {
 	}
 
-	void run() const override {
+	void run() override {
 		Fiber::run();
 		semaphore.signalAndExit();
 	}
diff --git a/emper/include/emper-common.h b/emper/include/emper-common.h
index f73f8cc4..b86f3b3d 100644
--- a/emper/include/emper-common.h
+++ b/emper/include/emper-common.h
@@ -10,7 +10,7 @@
 
 #endif
 
-typedef uint8_t workerid_t;
+typedef uint16_t workerid_t;
 typedef int16_t workeraffinity_t;
 
 #define UNUSED_ARG __attribute__((unused))
diff --git a/emper/include/emper.hpp b/emper/include/emper.hpp
index 9fb63c84..16494392 100644
--- a/emper/include/emper.hpp
+++ b/emper/include/emper.hpp
@@ -2,6 +2,7 @@
 
 #include <functional>
 #include <cassert>
+#include <cstdio>
 
 #include "Runtime.hpp"
 #include "Fiber.hpp"
@@ -57,3 +58,13 @@ void spawn(Fiber::fiber_fun0_t function, workeraffinity_t* affinity, S& semaphor
 	Fiber* fiber = SynchronizedFiber::from(function, affinity, semaphore);
 	async(fiber);
 }
+
+
+/* VVVVVVVVVVVVVVVV CONTINUATION VVVVVVVVVVVVVVVVVVVV */
+
+#include "Fibril.hpp"
+
+
+//#define fibril __attribute__((noinline, sysv_abi, optimize("no-omit-frame-pointer")))
+#define fibril __attribute__((optimize("no-omit-frame-pointer")))
+
diff --git a/emper/lib/adt/BoundedMpmcQueue.hpp b/emper/lib/adt/BoundedMpmcQueue.hpp
new file mode 100644
index 00000000..e3e195a3
--- /dev/null
+++ b/emper/lib/adt/BoundedMpmcQueue.hpp
@@ -0,0 +1,107 @@
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+
+namespace adt {
+
+	/**
+	 * Bounded, CAS-based container for multiple producers and consumers.
+	 *
+	 * Nodes live in a fixed array and are linked by index into two lists that
+	 * are updated with compare-exchange: the stored items (anchored at slot 0)
+	 * and the free nodes (anchored at slot CAPACITY - 1). The anchor slots never
+	 * hold an item, so at most CAPACITY - 2 items fit. get() hands out the most
+	 * recently stored item first.
+	 */
+	template<typename T, const uintptr_t CAPACITY>
+	class BoundedMpmcQueue {
+	private:
+		static_assert(CAPACITY >= 2 && (CAPACITY & (CAPACITY - 1)) == 0,
+		              "CAPACITY must be a power of two; its low bits address the nodes");
+
+		struct alignas(128) Node {
+			std::atomic<uint64_t> next;
+			T value;
+		};
+		Node buf[CAPACITY];
+
+		// Anchor whose link names the most recently stored item (index 0 = none).
+		static constexpr uint64_t QueueHead = 0;
+		// Anchor whose link names the first free node (its own index = none).
+		static constexpr uint64_t FreeListHead = CAPACITY - 1;
+
+		// Builds the next anchor link: advance the counter kept above the index
+		// bits, so a reused index never compares equal, and install 'value'.
+		inline uint64_t updateNext(uint64_t next, uint64_t value) {
+			return ((next + CAPACITY) & ~(CAPACITY - 1)) | value;
+		}
+
+		// Strips the counter bits from a link, leaving the node index.
+		inline uint64_t getIdx(uint64_t value) {
+			return value & (CAPACITY - 1);
+		}
+
+	public:
+		// Starts empty: slots 1 .. CAPACITY - 2 are chained into the free list.
+		BoundedMpmcQueue() {
+			buf[QueueHead].next.store(0, std::memory_order_relaxed);
+
+			for (uint64_t i = 1; i < FreeListHead; i++) {
+				buf[i].next.store(i + 1, std::memory_order_relaxed);
+			}
+
+			buf[FreeListHead].next.store(1, std::memory_order_release);
+		}
+
+		// Stores a copy of 'item'. Returns false when no free node is left.
+		inline bool put(T item) {
+			uint64_t head, index, next;
+
+			// Take a node off the free list.
+			head = buf[FreeListHead].next.load(std::memory_order_acquire);
+			do {
+				index = getIdx(head);
+				if (index == FreeListHead)
+					return false;
+				next = updateNext(head, buf[index].next.load(std::memory_order_acquire));
+			} while (!buf[FreeListHead].next.compare_exchange_weak(head, next, std::memory_order_acq_rel, std::memory_order_acquire));
+
+			// Fill the node, then link it in front of the stored items.
+			buf[index].value = item;
+			head = buf[QueueHead].next.load(std::memory_order_acquire);
+			do {
+				buf[index].next.store(getIdx(head), std::memory_order_relaxed);
+				next = updateNext(head, index);
+			} while (!buf[QueueHead].next.compare_exchange_weak(head, next, std::memory_order_acq_rel, std::memory_order_acquire));
+
+			return true;
+		}
+
+		// Moves the newest stored item into *itemPtr. Returns false when empty.
+		inline bool get(T *itemPtr) {
+			uint64_t head, index, next;
+
+			// Unlink the first node of the stored-items list.
+			head = buf[QueueHead].next.load(std::memory_order_acquire);
+			do {
+				index = getIdx(head);
+				if (!index)
+					return false;
+				next = updateNext(head, buf[index].next.load(std::memory_order_acquire));
+			} while (!buf[QueueHead].next.compare_exchange_weak(head, next, std::memory_order_acq_rel, std::memory_order_acquire));
+
+			// Copy the value out, then hand the node back to the free list.
+			*itemPtr = buf[index].value;
+			head = buf[FreeListHead].next.load(std::memory_order_acquire);
+			do {
+				buf[index].next.store(getIdx(head), std::memory_order_relaxed);
+				next = updateNext(head, index);
+			} while (!buf[FreeListHead].next.compare_exchange_weak(head, next, std::memory_order_acq_rel, std::memory_order_acquire));
+
+			return true;
+		}
+
+	};
+
+}
diff --git a/emper/lib/adt/FibrilDeque.hpp b/emper/lib/adt/FibrilDeque.hpp
new file mode 100644
index 00000000..a3138292
--- /dev/null
+++ b/emper/lib/adt/FibrilDeque.hpp
@@ -0,0 +1,85 @@
+#pragma once
+
+#include "FibrilLock.hpp"
+// Fixed-capacity deque: pushBottom()/popBottom() work on 'tail' and take 'lock'
+// only when they run into 'head'; popTop() advances 'head' while holding 'lock'.
+// NOTE(review): 'head' and 'tail' are plain ints, presumably accessed from several threads -- confirm this is intended.
+template<typename T, const uintptr_t CAPACITY>
+class FibrilDeque {
+private:
+	FibrilLock lock;
+	int head = 0; // index of the next slot popTop() takes
+	int tail = 0; // index of the next free slot for pushBottom()
+	T buff[CAPACITY];
+
+public:
+	bool pushBottom(const T item) { // appends 'item'; false once 'tail' has reached CAPACITY
+		if (tail == CAPACITY)
+			return false;
+
+		buff[tail++] = item;
+		//atomic_thread_fence(std::memory_order_seq_cst);
+		return true;
+	}
+	// Removes the most recently pushed item into *itemPtr.
+	// Returns false when empty or when 'head' has already moved past that slot.
+	bool popBottom(T* itemPtr) {
+		int t = tail;
+
+		if (t == 0)
+			return false;
+
+		tail = --t; // lower 'tail' first, then look at 'head'
+		atomic_thread_fence(std::memory_order_seq_cst);
+		*itemPtr = buff[tail];
+
+		if (head > t) { // 'head' is past the slot: undo the decrement and re-check under the lock
+			tail = t + 1;
+			lock.lock();
+
+			if (head > t) { // still past it with popTop() locked out: reset to the empty state
+				head = 0;
+				tail = 0;
+
+				lock.unlock();
+				return false;
+			}
+
+			tail = t;
+			lock.unlock();
+		}
+
+		return true;
+	}
+	// Removes the oldest item into *itemPtr while holding 'lock'; false when there is none.
+	bool popTop(T* itemPtr) {
+		if (head >= tail) // unlocked pre-check to skip the lock when nothing is there
+			return false;
+
+		lock.lock();
+
+		int h = head++; // claim slot h, then validate it against 'tail'
+		atomic_thread_fence(std::memory_order_seq_cst);
+
+		if (h >= tail) { // 'tail' no longer covers slot h: give the slot back
+			head--;
+
+			lock.unlock();
+			return false;
+		}
+
+		*itemPtr = buff[h];
+#ifdef EMPER_LOCKED_FIBRIL
+			if constexpr (std::is_same<Fiber*, T>::value) {
+				if ((*itemPtr)->type == Fiber::Type::FibrilType) {
+					static_cast<Fibril*>(*itemPtr)->m.lock(); // taken before the queue lock is released; NOTE(review): released elsewhere -- confirm where
+				}
+			}
+#endif
+
+		lock.unlock();
+		return true;
+	}
+
+};
+
diff --git a/emper/lib/adt/FibrilLock.hpp b/emper/lib/adt/FibrilLock.hpp
new file mode 100644
index 00000000..b28b7adf
--- /dev/null
+++ b/emper/lib/adt/FibrilLock.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <atomic>
+/**
+ * Minimal test-and-set spin lock built on std::atomic_flag.
+ */
+class FibrilLock {
+private:
+	std::atomic_flag value = ATOMIC_FLAG_INIT;
+
+public:
+	// Spins until the flag is acquired; 'pause' is issued only after a failed attempt.
+	__attribute__((always_inline))
+	inline void lock() {
+		while (value.test_and_set(std::memory_order_acquire)) {
+			asm( "pause" ::: "memory" );
+		}
+	}
+
+	__attribute__((always_inline))
+	inline void unlock() { // clears the flag so a spinning lock() can take it
+		value.clear(std::memory_order_release);
+	}
+
+};
diff --git a/emper/lib/adt/LockedQueue.hpp b/emper/lib/adt/LockedQueue.hpp
index fa24f875..c967b367 100644
--- a/emper/lib/adt/LockedQueue.hpp
+++ b/emper/lib/adt/LockedQueue.hpp
@@ -5,6 +5,10 @@
 #include <mutex>
 #include <deque>
 
+#include <type_traits>
+
+#include "Fibril.hpp"
+
 namespace adt {
 
 	template<typename I, const uintptr_t SIZE>
@@ -37,6 +41,13 @@ namespace adt {
 			if (deque.empty()) return false;
 
 			*itemPtr = deque.front();
+#ifdef EMPER_LOCKED_FIBRIL
+			if constexpr (std::is_same<Fiber*, I>::value) {
+				if ((*itemPtr)->type == Fiber::Type::FibrilType) {
+					static_cast<Fibril*>(*itemPtr)->m.lock();
+				}
+			}
+#endif
 
 			deque.pop_front();
 
diff --git a/emper/lib/adt/WsClV3Queue.hpp b/emper/lib/adt/WsClV3Queue.hpp
new file mode 100644
index 00000000..bc14299a
--- /dev/null
+++ b/emper/lib/adt/WsClV3Queue.hpp
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <atomic>
+
+
+
+namespace adt {
+	// Bounded deque on a ring buffer: pushBottom()/popBottom() work at 'bottom', popTop() at 'top'.
+	template<typename T, const uintptr_t CAPACITY>
+	class WsClV3Queue {
+	protected:
+
+		alignas(64) std::atomic<uint64_t> top; // index of the next element popTop() takes
+		alignas(64) std::atomic<uint64_t> bottom; // index of the next free slot for pushBottom()
+
+		alignas(64) T queue[CAPACITY]; // slots, addressed as index % CAPACITY
+
+
+	public:
+		WsClV3Queue() : top(1), bottom(1) { } // NOTE(review): presumably starts at 1 so popBottom()'s decrement cannot wrap on an empty queue -- confirm
+		// Stores 'item' at the bottom end.
+		// Returns false, without storing, when bottom - top equals CAPACITY.
+		bool pushBottom(const T item) {
+			uint64_t localTop, localBottom;
+
+			localBottom = bottom.load(std::memory_order_relaxed);
+			localTop = top.load(std::memory_order_acquire);
+			if ((localBottom - localTop) == CAPACITY)
+				return false;
+
+			queue[localBottom % CAPACITY] = item;
+			bottom.store(localBottom + 1, std::memory_order_release); // publish the slot after it has been written
+
+			return true;
+		}
+		// Takes the element at the bottom end into *itemPtr; false when none could be taken.
+		// *itemPtr is written before the outcome is known, so it is only meaningful on true.
+		bool popBottom(T* itemPtr) {
+			bool ret;
+			uint64_t localTop, localBottom;
+
+			localBottom = bottom.fetch_sub(1, std::memory_order_acq_rel) - 1; // lower 'bottom' first, then read 'top'
+			localTop = top.load(std::memory_order_acquire);
+
+			*itemPtr = queue[localBottom % CAPACITY];
+
+			if (localBottom < localTop) { // nothing was stored: restore 'bottom'
+				bottom.store(localTop, std::memory_order_relaxed);
+				return false;
+			} else if (localBottom > localTop) // the slot is not the one popTop() would take next
+				return true;
+
+			ret = top.compare_exchange_strong(localTop, localTop + 1, std::memory_order_release, std::memory_order_relaxed); // last element: the CAS on 'top' decides between this call and popTop()
+			bottom.store(localBottom + 1, std::memory_order_relaxed); // undo the decrement in either case
+
+			return ret;
+		}
+		// Takes the element at the top end into *itemPtr; false when bottom <= top.
+		// Retries for as long as the compare-exchange on 'top' fails.
+		bool popTop(T* itemPtr) {
+			uint64_t localTop, localBottom;
+
+			localTop = top.load(std::memory_order_relaxed);
+again:
+			localBottom = bottom.load(std::memory_order_acquire);
+			if (localBottom <= localTop)
+				return false;
+
+			*itemPtr = queue[localTop % CAPACITY]; // read before the slot is claimed; only kept if the CAS succeeds
+
+			if (!top.compare_exchange_weak(localTop, localTop + 1, std::memory_order_release, std::memory_order_acquire))
+				goto again; // the failed CAS has refreshed localTop
+
+			return true;
+		}
+
+	};
+
+}
diff --git a/emper/lib/adt/WsClV4Queue.hpp b/emper/lib/adt/WsClV4Queue.hpp
new file mode 100644
index 00000000..9d58b2d5
--- /dev/null
+++ b/emper/lib/adt/WsClV4Queue.hpp
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <atomic>
+
+
+
+namespace adt {
+	// Bounded deque on a ring buffer, like WsClV3Queue, plus a cached copy of 'top' for pushBottom().
+	template<typename T, const uintptr_t CAPACITY>
+	class WsClV4Queue {
+	protected:
+
+		alignas(64) std::atomic<uint64_t> top; // index of the next element popTop() takes
+		alignas(64) std::atomic<uint64_t> bottom; // index of the next free slot for pushBottom()
+		alignas(64) uint64_t top_private; // last value of 'top' seen by pushBottom(); not atomic
+
+		alignas(64) T queue[CAPACITY]; // slots, addressed as index % CAPACITY
+
+
+	public:
+		WsClV4Queue() : top(1), bottom(1), top_private(1) { }
+		// Stores 'item' at the bottom end; false, without storing, when the queue is full.
+		// The shared 'top' is only loaded when the cached top_private makes the queue look full.
+		bool pushBottom(const T item) {
+			uint64_t localTop, localBottom;
+
+			localBottom = bottom.load(std::memory_order_relaxed);
+			localTop = top_private;
+			if ((localBottom - localTop) == CAPACITY) {
+				localTop = top.load(std::memory_order_acquire); // the cache may be stale: check the real value
+				if ((localBottom - localTop) == CAPACITY)
+					return false;
+				top_private = localTop;
+			}
+
+			queue[localBottom % CAPACITY] = item;
+			bottom.store(localBottom + 1, std::memory_order_release); // publish the slot after it has been written
+
+			return true;
+		}
+		// Takes the element at the bottom end into *itemPtr; false when none could be taken.
+		// *itemPtr is written before the outcome is known, so it is only meaningful on true.
+		bool popBottom(T* itemPtr) {
+			bool ret;
+			uint64_t localTop, localBottom;
+
+			localBottom = bottom.fetch_sub(1, std::memory_order_acq_rel) - 1; // lower 'bottom' first, then read 'top'
+			localTop = top.load(std::memory_order_acquire);
+
+			*itemPtr = queue[localBottom % CAPACITY];
+
+			if (localBottom < localTop) { // nothing was stored: restore 'bottom'
+				bottom.store(localTop, std::memory_order_relaxed);
+				return false;
+			} else if (localBottom > localTop) // the slot is not the one popTop() would take next
+				return true;
+
+			ret = top.compare_exchange_strong(localTop, localTop + 1, std::memory_order_release, std::memory_order_relaxed); // last element: the CAS on 'top' decides between this call and popTop()
+			bottom.store(localBottom + 1, std::memory_order_relaxed); // undo the decrement in either case
+
+			return ret;
+		}
+		// Takes the element at the top end into *itemPtr; false when bottom <= top.
+		// Retries for as long as the compare-exchange on 'top' fails.
+		bool popTop(T* itemPtr) {
+			uint64_t localTop, localBottom;
+
+			localTop = top.load(std::memory_order_relaxed);
+again:
+			localBottom = bottom.load(std::memory_order_acquire);
+			if (localBottom <= localTop)
+				return false;
+
+			*itemPtr = queue[localTop % CAPACITY]; // read before the slot is claimed; only kept if the CAS succeeds
+
+			if (!top.compare_exchange_weak(localTop, localTop + 1, std::memory_order_release, std::memory_order_acquire))
+				goto again; // the failed CAS has refreshed localTop
+
+			return true;
+		}
+
+	};
+
+}
diff --git a/emper/strategies/laws/LawsDispatcher.cpp b/emper/strategies/laws/LawsDispatcher.cpp
index 5e354a35..b135e73a 100644
--- a/emper/strategies/laws/LawsDispatcher.cpp
+++ b/emper/strategies/laws/LawsDispatcher.cpp
@@ -2,11 +2,15 @@
 
 #include "Runtime.hpp"
 #include "LawsStrategy.hpp"
+#include "ContextManager.hpp"
 
 void LawsDispatcher::dispatchLoop() {
 	while (true) {
-		Fiber* const fiber = runtime.nextFiber();
+		Fiber* fiber = runtime.nextFiber();
 		if (!fiber) {
+			Runtime *runtime = Runtime::getRuntime();
+			if (runtime->isShuttingDown())
+				runtime->getContextManager().resume(0);
 #ifdef EMPER_WORKER_SLEEP
 			putRuntimeWorkerToSleep();
 #else
diff --git a/emper/strategies/laws/LawsScheduler.cpp b/emper/strategies/laws/LawsScheduler.cpp
index 26d1684d..6a5c832e 100644
--- a/emper/strategies/laws/LawsScheduler.cpp
+++ b/emper/strategies/laws/LawsScheduler.cpp
@@ -26,6 +26,34 @@ LawsScheduler::LawsScheduler(Runtime& runtime, LawsStrategy& lawsStrategy) : Sch
 	addNewWorkerHook(newWorkerHook);
 }
 
+// Marks 'fiber' runnable and puts it at the bottom of 'queue'. A rejected
+// push is fatal: work-stealing deliberately has no overflow queue
+// (EMPER_OVERFLOW_QUEUE), as checking one for work would add overhead.
+void LawsScheduler::pushBottom(Fiber& fiber) {
+	Fiber* const candidate = &fiber;
+	candidate->runnable = true;
+
+	if (unlikely(!queue.pushBottom(candidate))) {
+		ABORT("Could not push fiber " << candidate << " into queue");
+	}
+}
+
+// Takes the fiber at the bottom of 'queue'.
+//
+// Returns nullptr when the queue hands nothing back; unlike a rejected push
+// this is not treated as fatal, so callers have to check the result.
+Fiber* LawsScheduler::popBottom() {
+	Fiber* candidate;
+
+	if (unlikely(!queue.popBottom(&candidate))) {
+		// Whatever 'candidate' holds after a failed pop is not a valid
+		// result, so it is never handed out.
+		return nullptr;
+	}
+
+	return candidate;
+}
+
 void LawsScheduler::schedule(Fiber& fiber) {
 	LOGD("Scheduling fiber " << &fiber);
 
diff --git a/emper/strategies/laws/LawsScheduler.hpp b/emper/strategies/laws/LawsScheduler.hpp
index f3ce4269..b556568d 100644
--- a/emper/strategies/laws/LawsScheduler.hpp
+++ b/emper/strategies/laws/LawsScheduler.hpp
@@ -57,4 +57,7 @@ public:
 
 	Fiber* nextFiber() override;
 
+	void pushBottom(Fiber& fiber) override;
+	Fiber* popBottom() override;
+
 };
diff --git a/emper/strategies/ws/WsDispatcher.cpp b/emper/strategies/ws/WsDispatcher.cpp
index 20338201..d6864c46 100644
--- a/emper/strategies/ws/WsDispatcher.cpp
+++ b/emper/strategies/ws/WsDispatcher.cpp
@@ -2,11 +2,15 @@
 
 #include "Runtime.hpp"
 #include "Debug.hpp"
+#include "ContextManager.hpp"
 
 void WsDispatcher::dispatchLoop() {
 	while (true) {
-		const Fiber* fiber = runtime.nextFiber();
+		Fiber* fiber = runtime.nextFiber();
 		if (!fiber) {
+			Runtime *runtime = Runtime::getRuntime();
+			if (runtime->isShuttingDown())
+				runtime->getContextManager().resume(0);
 #ifdef EMPER_WORKER_SLEEP
 			putRuntimeWorkerToSleep();
 #else
diff --git a/emper/strategies/ws/WsScheduler.cpp b/emper/strategies/ws/WsScheduler.cpp
index 7766bf57..65718d11 100644
--- a/emper/strategies/ws/WsScheduler.cpp
+++ b/emper/strategies/ws/WsScheduler.cpp
@@ -16,6 +16,34 @@ WsScheduler::WsScheduler(Runtime& runtime, WsStrategy& wsStrategy) : Scheduler(r
 	addNewWorkerHook(newWorkerHook);
 }
 
+// Puts 'fiber' at the bottom of 'queue'.
+//
+// A rejected push is fatal: work-stealing deliberately has no overflow queue
+// (EMPER_OVERFLOW_QUEUE), as checking one for work would add overhead.
+void WsScheduler::pushBottom(Fiber& fiber) {
+	Fiber* const candidate = &fiber;
+
+	if (unlikely(!queue.pushBottom(candidate))) {
+		ABORT("Could not push fiber " << candidate << " into queue");
+	}
+}
+
+// Takes the fiber at the bottom of 'queue'.
+//
+// Returns nullptr when the queue hands nothing back; unlike a rejected push
+// this is not treated as fatal, so callers have to check the result.
+Fiber* WsScheduler::popBottom() {
+	Fiber* candidate;
+
+	if (unlikely(!queue.popBottom(&candidate))) {
+		// Whatever 'candidate' holds after a failed pop is not a valid
+		// result, so it is never handed out.
+		return nullptr;
+	}
+
+	return candidate;
+}
+
 void WsScheduler::schedule(Fiber& fiber) {
 	LOGD("Scheduling fiber " << &fiber);
 
@@ -62,6 +90,7 @@ Fiber* WsScheduler::nextFiber() {
 #ifdef EMPER_STATS
 	wsStrategy.nextFiberStolen.fetch_add(1, std::memory_order_relaxed);
 #endif
+			/* fibril->stack.ptr = victim->stack TODO */
 			return fiber;
 		}
 	}
diff --git a/emper/strategies/ws/WsScheduler.hpp b/emper/strategies/ws/WsScheduler.hpp
index 382caff2..30e32ee9 100644
--- a/emper/strategies/ws/WsScheduler.hpp
+++ b/emper/strategies/ws/WsScheduler.hpp
@@ -5,12 +5,18 @@
 #include "LockedQueue.hpp"
 #include "emper-common.h"
 
+#include "FibrilDeque.hpp"
+
 class WsStrategy;
 
 class WsScheduler: public Scheduler {
 	template <size_t SIZE>
 #ifdef EMPER_LOCKED_WS_QUEUE
+#ifdef EMPER_FIBRIL_SYNC
+	using WsQueue = FibrilDeque<Fiber*, SIZE>;
+#else
 	using WsQueue = adt::LockedQueue<Fiber*, SIZE>;
+#endif
 #else
 	using WsQueue = adt::WsClQueue<Fiber*, SIZE>;
 #endif
@@ -41,4 +47,7 @@ public:
 
 	Fiber* nextFiber() override;
 
+	void pushBottom(Fiber& fiber) override;
+	Fiber* popBottom() override;
+
 };
diff --git a/run_benchmarks.sh b/run_benchmarks.sh
new file mode 100755
index 00000000..f0d9ce6e
--- /dev/null
+++ b/run_benchmarks.sh
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+
+
+
+benchmark_dir=build/benchmarks
+result_dir=results
+
+benchmarks=( "serial" "emper_fiber" "tbb" "fibril" "cilkplus" )
+benchmark_serial="serial"
+
+max_cores=96
+step_size=12
+
+
+
+# Runs benchmark binary $1 with EMPER_BENCH_NPROCS=$2 and prints its stdout (buffered in ./tmp); stderr of the whole group is discarded.
+run_target() { # $1 target, $2 cores
+	(
+		EMPER_BENCH_NPROCS=$2 ./$1 > tmp
+		while [[ $? -ne 0 ]] && [[ $(cat tmp | wc -l) -ne 24 ]]; do # re-run only while the last run failed AND tmp is not 24 lines long -- NOTE(review): confirm '&&' rather than '||' is intended
+			EMPER_BENCH_NPROCS=$2 ./$1 > tmp
+		done
+		cat tmp
+	) 2> /dev/null
+}
+
+
+run_benchmarks() { # $1 target_dir, $2 output_parent_dir
+	# Take the suite name from the directory, not from the caller's loop variable (unset when run with arguments).
+	local suite_name; suite_name=$(basename "$1")
+	for target in "$1"/*; do
+		if [[ -f $target && -x $target ]]; then
+			output_dir="$2/$(basename "$target")"
+			[[ -d $output_dir ]] || mkdir -p "$output_dir"
+
+			run_target "$target" 1 > "$output_dir/001.txt"
+
+			if [[ $suite_name != "$benchmark_serial" ]]; then # the serial suite is only run on one core
+				for cores in $(seq $step_size $step_size $max_cores); do
+					run_target "$target" "$cores" > "$output_dir/$(printf "%03d" "$cores").txt"
+				done
+			fi
+		fi
+	done
+}
+
+
+
+
+
+if [[ $# -ge 1 ]]; then
+	target_dir=$benchmark_dir/$1
+	output_parent_dir=$result_dir/$1
+	if [[ $# -eq 2 ]]; then
+		output_parent_dir=$result_dir/$2
+	fi
+
+	run_benchmarks $target_dir $output_parent_dir
+
+	exit 0
+fi
+# Put the original Makefile back and drop the scratch files on every exit path, including a failed build.
+restore_makefile() { [[ -f Makefile.$$ ]] && mv Makefile.$$ Makefile; rm -f ./*.$$; }
+trap restore_makefile EXIT
+cp Makefile Makefile.$$
+
+cat Makefile.$$ | sed -e "s/-DEMPER_CM_WITH_MEMORY_MANAGER=..*\>/-DEMPER_CM_WITH_MEMORY_MANAGER=OFF/" > tmp.$$
+mv tmp.$$ Makefile
+
+make clean &> /dev/null || exit 1
+make release &> /dev/null || exit 1
+
+for benchmark_name in ${benchmarks[@]}; do
+	target_dir="$benchmark_dir/$benchmark_name"
+	output_parent_dir="$result_dir/$benchmark_name"
+
+	run_benchmarks $target_dir $output_parent_dir
+done
+
+
+
+target_dir=$benchmark_dir/emper_continuation
+output_parent_dir=$result_dir/emper_continuation
+
+cat Makefile.$$ | sed -e "s/-DEMPER_CM_WITH_MEMORY_MANAGER=..*\>/-DEMPER_CM_WITH_MEMORY_MANAGER=ON/" > tmp.$$
+
+for madv in "ON" "OFF" ; do
+	cat tmp.$$ | sed -e "s/-DEMPER_MADVISE=..*\>/-DEMPER_MADVISE=$madv/" > madv.$$
+
+	output_madv=$output_parent_dir
+	if [[ $madv == "ON" ]]; then
+		output_madv=${output_madv}_madv
+	fi
+
+	for lq in "ON" "OFF" ; do
+		cat madv.$$ | sed -e "s/-DEMPER_LOCKED_WS_QUEUE=..*\>/-DEMPER_LOCKED_WS_QUEUE=$lq/" > lq.$$
+
+		output_lq=${output_madv}
+		if [[ $lq == "ON" ]]; then
+			output_lq=${output_lq}_lq
+			for lf in "ON" "OFF" ; do
+				cat lq.$$ | sed -e "s/-DEMPER_LOCKED_FIBRIL=..*\>/-DEMPER_LOCKED_FIBRIL=$lf/" > lf.$$
+
+				output_lf=${output_lq}
+				if [[ $lf == "ON" ]]; then
+					output_lf=${output_lf}_lf
+				fi
+
+				mv lf.$$ Makefile
+				make clean &> /dev/null || exit 1
+				make release &> /dev/null || exit 1
+				run_benchmarks $target_dir $output_lf
+			done
+		else
+			mv lq.$$ Makefile
+			make clean &> /dev/null || exit 1
+			make release &> /dev/null || exit 1
+			run_benchmarks $target_dir $output_lq
+		fi
+	done
+done
+
+mv Makefile.$$ Makefile
+rm -f *.$$
+
+
+
+exit 0
diff --git a/test.sh b/test.sh
new file mode 100755
index 00000000..863a382d
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+#
+# Runs every executable in build/benchmarks/<dir> (default: emper_continuation)
+# num_tests times and prints how many of those runs exited with status 0.
+num_tests=100
+
+
+benchmark_dir=build/benchmarks
+
+
+if [[ $# -eq 1 ]]; then
+	target_dir=$benchmark_dir/$1
+else
+	target_dir=$benchmark_dir/emper_continuation
+fi
+# Stop if the directory is missing: the loop below would otherwise execute
+# whatever is executable in the current directory, this script included.
+cd "$target_dir" || exit 1
+
+
+for i in *; do
+	if [[ -f $i && -x $i ]]; then
+		passed=0
+		for j in $(seq $num_tests); do
+			printf "\r[ %5d / %5d ] %30s   |\t %5d / %5d   PASSED" "$j" "$num_tests" "$i" "$passed" "$num_tests"
+			(
+				"./$i"
+				exit $?
+			)  &> /dev/null
+			if [[ $? -eq 0 ]]; then
+				passed=$((passed + 1))
+			fi
+		done
+		printf "\r                  %30s   |\t %5d / %5d   PASSED\n" "$i" "$passed" "$num_tests"
+	fi
+done
+
+
+exit 0
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7cdd4e82..d3da0b0a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -14,6 +14,14 @@ add_executable(cpp_api_test CppApiTest.cpp)
 target_link_libraries(cpp_api_test emper)
 add_test(CppApiTest cpp_api_test)
 
+add_executable(cpp_continuation_api_test CppContinuationApiTest.cpp)
+target_link_libraries(cpp_continuation_api_test emper)
+add_test(CppContinuationApiTest cpp_continuation_api_test)
+
+add_executable(simple_continuation_fib_test SimpleContinuationFibTest.cpp)
+target_link_libraries(simple_continuation_fib_test Threads::Threads emper)
+add_test(SimpleContinuationFibTest simple_continuation_fib_test)
+
 add_executable(simple_actor_test SimpleActorTest.cpp)
 target_link_libraries(simple_actor_test emper)
 add_test(SimpleActorTest simple_actor_test)
@@ -21,3 +29,16 @@ add_test(SimpleActorTest simple_actor_test)
 add_executable(simple_laws_test SimpleLawsTest.cpp)
 target_link_libraries(simple_laws_test emper)
 add_test(SimpleLawsTest simple_laws_test)
+
+add_executable(continuation_sync_test ContinuationSyncTest.cpp)
+target_link_libraries(continuation_sync_test Threads::Threads emper)
+add_test(ContinuationSyncTest continuation_sync_test)
+
+add_executable(continuation_variable_parameter_test ContinuationVariableParameterTest.cpp)
+target_link_libraries(continuation_variable_parameter_test Threads::Threads emper)
+add_test(ContinuationVariableParameterTest continuation_variable_parameter_test)
+
+add_executable(simple_continuation_laws_test SimpleContinuationLawsTest.cpp)
+target_link_libraries(simple_continuation_laws_test emper)
+add_test(SimpleContinuationLawsTest simple_continuation_laws_test)
+
diff --git a/tests/ContinuationSyncTest.cpp b/tests/ContinuationSyncTest.cpp
new file mode 100644
index 00000000..7cb57596
--- /dev/null
+++ b/tests/ContinuationSyncTest.cpp
@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <list>
+#include <string>
+
+#include "emper.hpp"
+#include "PrivateSemaphore.hpp"
+#include "BinaryPrivateSemaphore.hpp"
+#include "CountingPrivateSemaphore.hpp"
+
+
+
+struct param {
+	BPS *sem1;
+	BPS *sem2;
+};
+
+
+fibril static void childFiber(void *p) {
+	param *params = static_cast<param*>(p);
+	//params->sem1->wait();
+	for (volatile uint64_t i = 0; i < (1UL<<16); i++) { }
+	params->sem2->signal();
+}
+
+
+fibril static void mainFiber(void) {
+	BPS sem1, sem2;
+	param params;
+	params.sem1 = &sem1;
+	params.sem2 = &sem2;
+	Fibril *fr = new Fibril();
+
+	fr->fork(childFiber, &params);
+
+	//sem1.signal();
+	sem2.wait();
+
+	fr->join();
+
+	delete fr;
+}
+
+// Test driver: success is reported by exit(EXIT_SUCCESS) from inside the scheduled fiber.
+int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
+	Runtime runtime;
+	//Runtime runtime(2);
+	exit(0); // NOTE(review): exits before any fiber is scheduled, so nothing below runs and the test always passes -- looks like a debugging leftover, confirm
+
+	Fiber* fibFiber = Fiber::from([] (UNUSED_ARG void* arg) fibril {
+				Fibril *fr = new Fibril();
+
+				fr->fork(mainFiber);
+
+				fr->join();
+
+				delete fr;
+
+				exit(EXIT_SUCCESS);
+		}, nullptr);
+
+	runtime.schedule(*fibFiber);
+
+	runtime.waitUntilFinished();
+
+    return EXIT_FAILURE; // only reached if the fiber never called exit(EXIT_SUCCESS)
+}
diff --git a/tests/ContinuationVariableParameterTest.cpp b/tests/ContinuationVariableParameterTest.cpp
new file mode 100644
index 00000000..1e2df933
--- /dev/null
+++ b/tests/ContinuationVariableParameterTest.cpp
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <list>
+#include <string>
+
+#include "emper.hpp"
+
+
+// Iterative reference Fibonacci used to check the forked version; any n below 2 is returned unchanged.
+int64_t fib_fast(int n) {
+	if (n < 2) {
+		return n;
+	}
+	int64_t prev = 0;
+	int64_t curr = 1;
+	for (int step = n - 1; step > 0; --step) {
+		const int64_t sum = prev + curr;
+		prev = curr;
+		curr = sum;
+	}
+	return curr;
+}
+
+
+fibril static void fib(int64_t *r, int64_t n) {
+	if (n < 2) {
+		*r = n;
+	} else {
+		int64_t a, b;
+		a = b = -1337;
+
+		char buffer[sizeof(Fibril)];
+		Fibril *fr = new (buffer) Fibril();
+
+		fr->fork(fib, &a, n - 1);
+		//fib(&a, n - 1);
+		//fr->fork(fib, &b, n - 2);
+		fib(&b, n - 2);
+
+		fr->join();
+
+		fr->~Fibril();
+
+		*r = a + b;
+	}
+}
+
+fibril static int64_t fib(int64_t n) {
+	if (n < 2) {
+		return n;
+	} else {
+		int64_t a, b;
+		a = b = -1337;
+
+		char buffer[sizeof(Fibril)];
+		Fibril *fr = new (buffer) Fibril();
+
+		fr->fork(&a, fib, n - 1);
+		//a = fib(n - 1);
+		//fr->fork(&b, fib, n - 2);
+		b = fib(n - 2);
+
+		fr->join();
+
+		fr->~Fibril();
+
+		return a + b;
+	}
+}
+
+
+int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
+	Runtime runtime;
+
+	Fiber* fibFiber = Fiber::from([] (UNUSED_ARG void* arg) {
+				//const int fibNum = 35;
+				//const int fibNum = 13;
+				const int fibNum = 42;
+				const int expected = fib_fast(fibNum);
+
+				int64_t result;
+
+				result = fib(fibNum);
+				//fib(&result, fibNum);
+
+				if (result != expected) {
+					exit(EXIT_FAILURE);
+				}
+
+				exit(EXIT_SUCCESS);
+		}, nullptr);
+
+	runtime.schedule(*fibFiber);
+
+	runtime.waitUntilFinished();
+
+    return EXIT_FAILURE;
+}
diff --git a/tests/CppApiTest.cpp b/tests/CppApiTest.cpp
index 71077710..03b8be1c 100644
--- a/tests/CppApiTest.cpp
+++ b/tests/CppApiTest.cpp
@@ -11,13 +11,13 @@ static void increaseCounterByOne() {
 }
 
 static void mainFiber(void) {
-	const unsigned int FIBER_COUNT = 100;
+	const unsigned int FIBER_COUNT = 10000;
 
 	CountingPrivateSemaphore cps;
 
 	for (unsigned int i = 0; i < FIBER_COUNT; ++i) {
 		spawn(&increaseCounterByOne, cps);
-	}		
+	}
 
 	cps.wait();
 
diff --git a/tests/CppContinuationApiTest.cpp b/tests/CppContinuationApiTest.cpp
new file mode 100644
index 00000000..971a778d
--- /dev/null
+++ b/tests/CppContinuationApiTest.cpp
@@ -0,0 +1,41 @@
+#include <atomic>
+
+#include "emper.hpp"
+
+
+
+static std::atomic_uint counter;
+
+fibril static void increaseCounterByOne() {
+	counter++;
+}
+
+fibril static void mainFiber(void) {
+	const unsigned int FIBER_COUNT = 10000;
+
+	Fibril *fr = new Fibril();
+
+	for (unsigned int i = 0; i < FIBER_COUNT; ++i) {
+		fr->fork(increaseCounterByOne);
+	}
+
+	fr->join();
+
+	delete fr;
+
+	if (counter != FIBER_COUNT) {
+		exit(EXIT_FAILURE);
+	}
+
+	exit(EXIT_SUCCESS);
+}
+
+int main(UNUSED_ARG int arg, UNUSED_ARG char *argv[]) {
+	Runtime runtime;
+
+	async(&mainFiber);
+
+	runtime.waitUntilFinished();
+
+	return EXIT_FAILURE;
+}
diff --git a/tests/SimpleContinuationFibTest.cpp b/tests/SimpleContinuationFibTest.cpp
new file mode 100644
index 00000000..41c894d9
--- /dev/null
+++ b/tests/SimpleContinuationFibTest.cpp
@@ -0,0 +1,94 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <list>
+#include <string>
+
+#include "emper.hpp"
+
+
+
+// Iterative Fibonacci reference: returns F(n) for n >= 2 and n itself otherwise.
+int64_t fib_fast(int n) {
+	int64_t current = n;
+	int64_t prev = 0;
+	int64_t last = 1;
+	for (int k = 2; k <= n; ++k) {
+		current = prev + last;
+		prev = last;
+		last = current;
+	}
+
+	return current;
+}
+
+
+struct fibParams {  // argument bundle handed to fib() through its void* parameter
+	int n;  // Fibonacci index to compute
+	int64_t* result;  // destination that fib() writes its answer to
+};
+
+// Recursive Fibonacci on the continuation API: forks fib(n-1) and fib(n-2), joins, and
+// stores their sum through params->result. voidParams must point to a fibParams.
+fibril static void fib(void *voidParams) {
+	fibParams* params = static_cast<fibParams*>(voidParams);
+	int n = params->n;
+	int64_t *result = params->result;
+
+	if (n < 2) {
+		*result = n;
+	} else {
+		// Placeholder values; each forked call overwrites its slot through the result pointer.
+		int64_t a, b;
+		a = b = -1337;
+
+		// The Fibril lives in this frame instead of on the heap. The raw storage must
+		// carry Fibril's alignment; a plain char array only guarantees alignment 1.
+		alignas(Fibril) char buffer[sizeof(Fibril)];
+		Fibril *fr = new (buffer) Fibril();
+
+		fibParams newParams1;
+		newParams1.n = n - 1;
+		newParams1.result = &a;
+		fibParams newParams2;
+		newParams2.n = n - 2;
+		newParams2.result = &b;
+
+		fr->fork(fib, &newParams1);
+		fr->fork(fib, &newParams2);
+		// a and b are only read after join().
+		fr->join();
+
+		// Placement-new object: run the destructor by hand instead of using delete.
+		fr->~Fibril();
+		*result = a + b;
+	}
+}
+
+
+// Runs fib(42) through the continuation API inside one Fiber and reports the
+// comparison with fib_fast(42) via the process exit status.
+int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
+	Runtime runtime;
+
+	Fiber* fibFiber = Fiber::from([] (UNUSED_ARG void* arg) {
+				const int fibNum = 42;
+				// Keep the reference value in fib_fast's own 64-bit return type; an int
+				// would silently truncate once fibNum is raised past 46.
+				const int64_t expected = fib_fast(fibNum);
+
+				int64_t result;
+				fibParams params = { fibNum, &result };
+				fib(&params);
+
+				if (result != expected) {
+					exit(EXIT_FAILURE);
+				}
+				exit(EXIT_SUCCESS);
+		}, nullptr);
+
+	runtime.schedule(*fibFiber);
+	runtime.waitUntilFinished();
+	// NOTE(review): presumably only reached if the fiber never called exit() — confirm.
+	return EXIT_FAILURE;
+}
diff --git a/tests/SimpleContinuationLawsTest.cpp b/tests/SimpleContinuationLawsTest.cpp
new file mode 100644
index 00000000..78e560a7
--- /dev/null
+++ b/tests/SimpleContinuationLawsTest.cpp
@@ -0,0 +1,106 @@
+#include "emper.hpp"
+
+#include "LawsStrategy.hpp"
+#include "Fiber.hpp"
+
+#include <random>
+
+
+static const unsigned int ROUND_COUNT = 10;  // fork/join rounds run by alphaFun
+static const unsigned int FIBER_LOOPS = 10;  // passes over the payload per fiberFun call
+static const unsigned int PAYLOAD_COUNT = 4096;  // uint64_t slots per FiberData
+
+
+typedef struct ALIGN_TO_CACHE_LINE {  // per-task data block that fiberFun mutates
+	// 4096 * 8 byte (64 bit) = 32 KiB = L1 cache size of most systems
+	uint64_t payload[PAYLOAD_COUNT];
+} FiberData;
+
+typedef struct ALIGN_TO_CACHE_LINE {  // wraps one affinity value in its own ALIGN_TO_CACHE_LINE struct
+	workeraffinity_t affinity;  // alphaFun initialises this to Fiber::NOT_AFFINE
+} AlignedWorkerAffinity;
+
+
+// Adds FIBER_LOOPS rounds of random 64-bit values onto every payload slot.
+static void fiberFun(FiberData* fiberData) {
+	std::random_device seedSource;
+	std::mt19937_64 engine(seedSource());
+	std::uniform_int_distribution<unsigned long long> dist(0, UINT64_MAX);
+
+	for (unsigned int loop = 0; loop < FIBER_LOOPS; ++loop) {
+		for (uint64_t& slot : fiberData->payload) {
+			slot += dist(engine);
+		}
+	}
+}
+
+// Drives the test: runs ROUND_COUNT fork/join rounds of fiberFun over
+// (worker count + 3) data blocks, sums all payloads with forked lambdas,
+// prints the sum to stderr and exits the process with EXIT_SUCCESS.
+fibril static void alphaFun() {
+	Runtime* runtime = Runtime::getRuntime();
+	const unsigned int FIBER_COUNT = runtime->getWorkerCount() + 3;
+
+	// NOTE(review): affinities are initialised but not passed to fork(); they were
+	// only consumed by the earlier Fiber::from(fun, data, &affinity) variant.
+	AlignedWorkerAffinity *affinities = new AlignedWorkerAffinity[FIBER_COUNT];
+	FiberData* fiberData = new FiberData[FIBER_COUNT];
+
+	for (unsigned int i = 0; i < FIBER_COUNT; ++i) {
+		FiberData& currentFiberData = fiberData[i];
+		memset(currentFiberData.payload, 0, sizeof(uint64_t) * PAYLOAD_COUNT);
+		affinities[i].affinity = Fiber::NOT_AFFINE;
+	}
+
+	// The Fibril is placement-constructed in this frame, so the raw storage must
+	// carry Fibril's alignment; a plain char array only guarantees alignment 1.
+	alignas(Fibril) char buffer[sizeof(Fibril)];
+	Fibril *fr = new (buffer) Fibril();
+
+	// Phase 1: each round forks one fiberFun per data block, then joins.
+	for (unsigned int round = 0; round < ROUND_COUNT; ++round) {
+		for (unsigned int i = 0; i < FIBER_COUNT; ++i) {
+			FiberData* myFiberData = &fiberData[i];
+			fr->fork(fiberFun, myFiberData);
+		}
+		fr->join();
+	}
+
+	// Phase 2: each forked lambda sums one block locally and adds the subtotal
+	// to the shared atomic exactly once.
+	std::atomic<uint64_t> finalResult(0);
+	for (unsigned int i = 0; i < FIBER_COUNT; ++i) {
+		FiberData* myFiberData = &fiberData[i];
+		fr->fork<void(void)>([myFiberData, &finalResult]() {
+				uint64_t mySum = 0;
+				for (unsigned int i = 0; i < PAYLOAD_COUNT; ++i) {
+					mySum += myFiberData->payload[i];
+				}
+				finalResult += mySum;
+			});
+	}
+	fr->join();
+
+	// Placement-new object: run the destructor by hand, there is nothing to delete.
+	fr->~Fibril();
+	// Both arrays come from new[]; releasing them with free() is undefined behaviour.
+	delete[] fiberData;
+	delete[] affinities;
+
+	std::cerr << "Result: " << finalResult << std::endl;
+
+	exit(EXIT_SUCCESS);
+}
+
+int main(UNUSED_ARG int args, UNUSED_ARG char *argv[]) {
+	RuntimeStrategy& lawsStrategy = LawsStrategy::INSTANCE;
+	Runtime runtime(lawsStrategy);
+	// alphaFun drives the whole test and calls exit(EXIT_SUCCESS) itself when done.
+	Fiber* alphaFiber = Fiber::from(&alphaFun);
+
+	runtime.schedule(*alphaFiber);
+
+	runtime.waitUntilFinished();
+	// NOTE(review): presumably only reached if alphaFun never called exit() — confirm.
+	return EXIT_FAILURE;
+}
diff --git a/tests/SimpleFibTest.cpp b/tests/SimpleFibTest.cpp
index d31da0d7..35e9918e 100644
--- a/tests/SimpleFibTest.cpp
+++ b/tests/SimpleFibTest.cpp
@@ -11,6 +11,23 @@
 #include "CountingPrivateSemaphore.hpp"
 #include "Debug.hpp"
 
+
+
+// Iterative Fibonacci reference: returns F(n) for n >= 2 and n itself otherwise.
+int64_t fib_fast(int n) {
+	int64_t current = n;
+	int64_t prev = 0;
+	int64_t last = 1;
+	for (int k = 2; k <= n; ++k) {
+		current = prev + last;
+		prev = last;
+		last = current;
+	}
+
+	return current;
+}
+
+
 typedef struct {
 	int n;
 	int* result;
@@ -59,7 +76,9 @@ int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
 	Runtime runtime;
 
 	Fiber* fibFiber = Fiber::from([] (UNUSED_ARG void* arg) {
-				const int fibNum = 13;
+				//const int fibNum = 13;
+				const int fibNum = 30;
+
 				int result;
 				BPS sem;
 				fibParams params = { fibNum, &result, &sem };
@@ -68,7 +87,7 @@ int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
 
 				sem.wait();
 
-				if (result != 233) {
+				if (result != fib_fast(fibNum)) {
 					exit(EXIT_FAILURE);
 				}
 
@@ -78,6 +97,6 @@ int main(UNUSED_ARG int argc, UNUSED_ARG char *argv[]) {
 	runtime.schedule(*fibFiber);
 
 	runtime.waitUntilFinished();
-	
+
     return EXIT_FAILURE;
 }
diff --git a/time.sh b/time.sh
new file mode 100755
index 00000000..8a66a882
--- /dev/null
+++ b/time.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Times every executable in one benchmark directory; prints one "name | seconds" row each.
+# Usage: time.sh [benchmark-subdir]   (defaults to emper_continuation)
+
+benchmark_dir=build/benchmarks
+
+if [[ $# -eq 1 ]]; then
+	target_dir=$benchmark_dir/$1
+else
+	target_dir=$benchmark_dir/emper_continuation
+fi
+
+# Abort if the directory is missing; otherwise the loop would run whatever is executable in the cwd.
+cd "$target_dir" || exit 1
+
+# Glob instead of parsing `ls` so names containing whitespace stay intact.
+for i in *; do
+	if [[ -f $i && -x $i ]]; then
+		(
+			# `false` seeds $? so the body runs once; it repeats until the benchmark exits 0.
+			false || while [[ $? -ne 0 ]]; do
+				/usr/bin/time -f "%e" "./$i" 2> tmp
+			done
+			# tmp also holds the benchmark's own stderr; time's report is the last line.
+			printf "%30s   |\t %.2f\n" "$i" "$(tail -n 1 tmp)"
+		)
+	fi
+done
+
+exit 0
-- 
GitLab