Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • abp-queue
  • async_network2
  • burak
  • cactus_stack_devel
  • cactus_stack_devel_one_commit
  • cast-if-future
  • ci-bump-test
  • client-load-change
  • completer-strategies
  • cppcheck
  • emper-fix-invalid-conv
  • flow
  • fused-continuation-and-completion-stealing
  • libc++
  • libstdc++-asserts
  • linux-version-construct-on-first-use
  • master
  • msan
  • new-delete-leaks
  • remote-put-get-free-context-cycle
  • thread_safe_log_config
  • attic/clang-release-tls-optimization-debug-and-fix
  • attic/continuation-stealing-dev
23 results

Target

Select target project
  • flow/emper
  • aj46ezos/emper
  • i4/manycore/emper
3 results
Select Git revision
  • burak
  • cactus_stack_devel
  • emper-fs-eval
  • io-sleep-main
  • io_uring_network
  • kickoff-without-ret
  • master
  • tsan_ci_target
  • worker_exclusive_uring_no_partial_completion
  • worker_exclusive_uring_weak
10 results
Show changes

Commits on Source 105

5 additional commits have been omitted to prevent performance issues.
Showing
with 4820 additions and 2 deletions
...@@ -56,9 +56,12 @@ endmacro() ...@@ -56,9 +56,12 @@ endmacro()
emper_option(WORKER_SLEEP "Enable sleeping worker support") emper_option(WORKER_SLEEP "Enable sleeping worker support")
emper_option(LOCKED_WS_QUEUE "Use a fully locked queue for work-stealing") emper_option(LOCKED_WS_QUEUE "Use a fully locked queue for work-stealing")
emper_option(LOCKED_FIBRIL "Use a fully locked Fibril. Only works with locked work-stealing queues")
emper_option(OVERFLOW_QUEUE "Use a overflow queue in case the primary queue is full") emper_option(OVERFLOW_QUEUE "Use a overflow queue in case the primary queue is full")
emper_option(LOCKED_MPSC_QUEUE "Use the locked variant for the MPSC queue") emper_option(LOCKED_MPSC_QUEUE "Use the locked variant for the MPSC queue")
emper_option(STATS "Collect stats and print them at the end of the execution") emper_option(STATS "Collect stats and print them at the end of the execution")
emper_option(MADVISE "Use madvise(MADV_DONTNEED) to unmap unused stack pages. Bound memory consumption")
emper_option(CM_WITH_MEMORY_MANAGER "Use context manager with a memory manager")
# Macro to add files to a var. Can even be used in subdirectories. # Macro to add files to a var. Can even be used in subdirectories.
# Source: http://stackoverflow.com/a/7049380/194894 # Source: http://stackoverflow.com/a/7049380/194894
...@@ -108,12 +111,14 @@ add_library(c_emper STATIC ${C_EMPER_SOURCE}) ...@@ -108,12 +111,14 @@ add_library(c_emper STATIC ${C_EMPER_SOURCE})
# set_property(TARGET c_emper PROPERTY INTERPROCEDURAL_OPTIMIZATION True) # set_property(TARGET c_emper PROPERTY INTERPROCEDURAL_OPTIMIZATION True)
target_link_libraries(c_emper emper) target_link_libraries(c_emper emper)
add_subdirectory("lib") #add_subdirectory("lib")
add_subdirectory("apps") add_subdirectory("apps")
add_subdirectory("tests") add_subdirectory("tests")
add_subdirectory("benchmarks")
add_subdirectory("eval") add_subdirectory("eval")
file(GLOB ALL_SOURCE_FILES *.cpp) file(GLOB ALL_SOURCE_FILES *.cpp)
......
...@@ -23,7 +23,11 @@ debug release relwithdebug: ...@@ -23,7 +23,11 @@ debug release relwithdebug:
rm -f build rm -f build
ln -rs build-$@ build ln -rs build-$@ build
cd build-$@; \ cd build-$@; \
[[ -f CMakeCache.txt ]] || cmake -DCMAKE_BUILD_TYPE=$@ .. \ [[ -f CMakeCache.txt ]] || cmake -DCMAKE_BUILD_TYPE=$@ \
-DEMPER_CM_WITH_MEMORY_MANAGER=OFF \
-DEMPER_LOCKED_WS_QUEUE=OFF \
-DEMPER_LOCKED_FIBRIL=OFF \
-DEMPER_MADVISE=OFF .. \
&& make $(COMMON_MAKE_ARGS) && make $(COMMON_MAKE_ARGS)
reldebug: relwithdebug reldebug: relwithdebug
......
# Benchmark back-ends built by default. Every entry names a subdirectory
# that provides its own CMakeLists.txt; they are added in the order listed.
foreach(benchmark_backend IN ITEMS
    tbb
    openmp
    serial
    emper_continuation
    emper_fiber
    fibril
    fibril_lf)
  add_subdirectory(${benchmark_backend})
endforeach()
# The cilkplus back-end is deliberately left out of the default build.
#add_subdirectory(cilkplus)
/*
* Sparse Cholesky code with little blocks at the leaves of the Quad tree
* Keith Randall -- Aske Plaat
*
* This code should run with any square sparse real symmetric matrix
* from MatrixMarket (http://math.nist.gov/MatrixMarket)
*
* run with `cholesky -f george-liu.mtx' for a given matrix, or
* `cholesky -n 1000 -z 10000' for a 1000x1000 random matrix with 10000
* nonzeros (caution: random matrices produce lots of fill).
*/
/*
* Copyright (c) 2000 Massachusetts Institute of Technology
* Copyright (c) 2000 Matteo Frigo
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#include <math.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "test.h"
/*************************************************************\
* Basic types
\*************************************************************/
/* Scalar type used for every matrix element. */
typedef double Real;
#define BLOCK_DEPTH 2 /* logarithm base 2 of BLOCK_SIZE */
#define BLOCK_SIZE (1<<BLOCK_DEPTH) /* 4 seems to be the optimum */
/* Dense BLOCK_SIZE x BLOCK_SIZE tile stored at the leaves of the quadtree. */
typedef Real Block[BLOCK_SIZE][BLOCK_SIZE];
/* Element (I, J) of block B. */
#define BLOCK(B,I,J) (B[I][J])
/*
 * Indices into InternalNode::child. The name encodes (row half, column
 * half) of the quadrant: _01 is the upper-right child, _10 the lower-left.
 */
#define _00 0
#define _01 1
#define _10 2
#define _11 3
/*
 * Quadrant indices of a transposed operand: the two off-diagonal
 * quadrants are swapped, the diagonal ones stay in place.
 */
#define TR_00 _00
#define TR_01 _10
#define TR_10 _01
#define TR_11 _11
/* Interior quadtree node; a NULL child stands for an all-zero quadrant. */
typedef struct InternalNode {
struct InternalNode *child[4];
} InternalNode;
/* Leaf payload: one dense block. Leaves are handled through InternalNode
 * pointers and cast back once the recursion reaches BLOCK_DEPTH. */
typedef struct {
Block block;
} LeafNode;
/* A matrix is referred to by the root of its quadtree. */
typedef InternalNode *Matrix;
/* A is the generated input matrix, R the working copy that gets factored. */
static Matrix A, R;
/* Quadtree depth, i.e. log2 of the padded matrix dimension (set in init()). */
static int depth;
/*
 * Problem size: n is the matrix dimension, nonzeros the number of stored
 * entries generated by init(). n has external linkage; presumably the test
 * harness declared in test.h reads it -- TODO confirm.
 */
#ifndef BENCHMARK
int n = 2000;
static int nonzeros = 10000;
#else
int n = 4000;
static int nonzeros = 40000;
#endif
/*************************************************************\
* Linear algebra on blocks
\*************************************************************/
/*
 * elem_daxmy - In-place vector update y[i] -= a * x[i] for the first n
 * elements. Elements are visited from index n-1 down to 0; a
 * non-positive n leaves y untouched.
 */
static void elem_daxmy(Real a, Real * x, Real * y, int n)
{
    while (n-- > 0)
        y[n] -= a * x[n];
}
/*
 * block_schur_full - Schur complement update of a whole block:
 * B' = B - A * Transpose(C). Every product term is subtracted from the
 * destination element directly, one at a time.
 */
static void block_schur_full(Block B, Block A, Block C)
{
    for (int row = 0; row < BLOCK_SIZE; row++) {
        Real *dst = &BLOCK(B, row, 0);
        Real *lhs = &BLOCK(A, row, 0);
        for (int col = 0; col < BLOCK_SIZE; col++) {
            Real *rhs = &BLOCK(C, col, 0);
            for (int t = 0; t < BLOCK_SIZE; t++)
                dst[col] -= lhs[t] * rhs[t];
        }
    }
}
/*
 * block_schur_half - Schur complement update restricted to the lower
 * triangle (diagonal included): B'[i][j] = B[i][j] - (A * Transpose(C))[i][j]
 * for j <= i. Elements above the diagonal are left untouched.
 */
static void block_schur_half(Block B, Block A, Block C)
{
    for (int row = 0; row < BLOCK_SIZE; row++) {
        Real *dst = &BLOCK(B, row, 0);
        Real *lhs = &BLOCK(A, row, 0);
        /* Only columns up to and including the diagonal are updated. */
        for (int col = 0; col <= row; col++) {
            Real *rhs = &BLOCK(C, col, 0);
            for (int t = 0; t < BLOCK_SIZE; t++)
                dst[col] -= lhs[t] * rhs[t];
        }
    }
}
/*
 * block_backsub - Triangular solve on one block, in place.
 *
 * For every row of B the columns are processed left to right:
 *   B[row][col] = (B[row][col] - sum_{k < col} U[col][k] * B[row][k])
 *                 / U[col][col]
 * Only the diagonal and the part of U below it are read.
 */
static void block_backsub(Block B, Block U)
{
    for (int col = 0; col < BLOCK_SIZE; col++) {
        for (int row = 0; row < BLOCK_SIZE; row++) {
            /* Eliminate the contribution of the already solved columns. */
            for (int k = 0; k < col; k++)
                BLOCK(B, row, col) -= BLOCK(U, col, k) * BLOCK(B, row, k);
            BLOCK(B, row, col) /= BLOCK(U, col, col);
        }
    }
}
/*
 * xblock_backsub - Alternative substitution kernel working on whole rows
 * through elem_daxmy (the original header called it block_lower_solve,
 * "solve for B' in LB' = B").
 *
 * No caller is visible in this file; the self-reference in the body
 * presumably exists only to silence an unused-function warning.
 *
 * NOTE(review): for k == i both vector arguments of elem_daxmy point to
 * the same row of B; the kernel has not been checked against
 * block_backsub -- confirm before reviving it.
 */
static void xblock_backsub(Block B, Block L)
{
    int i, k;
    /* Mark the function as used without calling it. */
    (void) xblock_backsub;
    /* Perform forward substitution. */
    for (i = 0; i < BLOCK_SIZE; i++)
        for (k = 0; k <= i; k++) {
            /* Scale by the diagonal element of L ... */
            BLOCK(B, i, k) /= BLOCK(L, k, k);
            /* ... then subtract L[i][k] times the first BLOCK_SIZE - k
             * elements of row k from row i. */
            elem_daxmy(BLOCK(L, i, k), &BLOCK(B, k, 0),
                       &BLOCK(B, i, 0), BLOCK_SIZE - k);
        }
}
/*
 * block_cholesky - Factor block B in place.
 *
 * Processes one column k at a time: the column from the diagonal down is
 * divided by sqrt(B[k][k]), then the remaining lower-triangular part
 * (columns k+1.., rows >= column) is reduced by the outer product of that
 * column with itself. Elements above the diagonal are never touched.
 *
 * A negative pivot aborts the program with exit status 9. A pivot of
 * exactly 0.0 is not rejected; the divisions then yield non-finite values.
 */
static void block_cholesky(Block B)
{
    int i, j, k;
    for (k = 0; k < BLOCK_SIZE; k++) {
        Real x;
        if (BLOCK(B, k, k) < 0.0) {
            printf("sqrt error: %f\n", BLOCK(B, k, k));
            printf("matrix is probably not numerically stable\n");
            exit(9);
        }
        x = sqrt(BLOCK(B, k, k));
        /* Scale column k, diagonal element included. */
        for (i = k; i < BLOCK_SIZE; i++) {
            BLOCK(B, i, k) /= x;
        }
        /*
         * Update the trailing lower triangle. The inner loop starts at
         * i = j, so only elements on or below the diagonal are written;
         * the former "Upper not empty" check (j > i) could never fire
         * and has been removed.
         */
        for (j = k + 1; j < BLOCK_SIZE; j++) {
            for (i = j; i < BLOCK_SIZE; i++) {
                BLOCK(B, i, j) -= BLOCK(B, i, k) * BLOCK(B, j, k);
            }
        }
    }
}
/*
 * block_zero - Set every element of block B to 0.0.
 */
static void block_zero(Block B)
{
    for (int row = 0; row < BLOCK_SIZE; row++) {
        Real *cells = &BLOCK(B, row, 0);
        for (int col = 0; col < BLOCK_SIZE; col++)
            cells[col] = 0.0;
    }
}
/*************************************************************\
* Allocation and initialization
\*************************************************************/
/*
 * new_block_leaf - Allocate one leaf (a BLOCK_SIZE x BLOCK_SIZE
 * submatrix) and hand it back typed as a generic tree node. The block
 * contents are left uninitialized. Terminates the program with exit
 * status 1 when the allocation fails.
 */
static inline InternalNode *new_block_leaf(void)
{
    void *storage = malloc(sizeof(LeafNode));
    if (!storage) {
        printf("out of memory!\n");
        exit(1);
    }
    return (InternalNode *) storage;
}
/*
 * new_internal - Allocate an interior quadtree node and attach the four
 * given quadrants (any of which may be NULL). Terminates the program with
 * exit status 1 when the allocation fails.
 */
static inline InternalNode *new_internal(InternalNode * a00, InternalNode * a01,
                                         InternalNode * a10, InternalNode * a11)
{
    InternalNode *fresh = (InternalNode*) malloc(sizeof(InternalNode));
    if (!fresh) {
        printf("out of memory!\n");
        exit(1);
    }
    /* Upper row of quadrants. */
    fresh->child[_00] = a00;
    fresh->child[_01] = a01;
    /* Lower row of quadrants. */
    fresh->child[_10] = a10;
    fresh->child[_11] = a11;
    return fresh;
}
/*
 * copy_matrix - Deep-copy the quadtree rooted at a.
 *
 * depth is the level of a; at BLOCK_DEPTH the node is a leaf and its
 * block is copied with memcpy. A NULL (all-zero) subtree is returned
 * unchanged. Resulting matrix may be laid out in memory better than the
 * source matrix.
 *
 * Three of the four quadrants are forked, the fourth is copied by the
 * calling strand; the results are only combined after the join.
 */
fibril static Matrix copy_matrix(int depth, Matrix a)
{
    Matrix r;
    if (!a)
        return a;
    if (depth == BLOCK_DEPTH) {
        LeafNode *A = (LeafNode *) a;
        LeafNode *R;
        r = new_block_leaf();
        R = (LeafNode *) r;
        memcpy(R->block, A->block, sizeof(Block));
    } else {
        Matrix r00, r01, r10, r11;
        depth--;
        fibril_t fr;
        fibril_init(&fr);
        /* Each fork stores its result through the pointer passed as the
         * second argument. */
        fibril_fork(&fr, &r00, copy_matrix, (depth, a->child[_00]));
        fibril_fork(&fr, &r01, copy_matrix, (depth, a->child[_01]));
        fibril_fork(&fr, &r10, copy_matrix, (depth, a->child[_10]));
        r11 = copy_matrix(depth, a->child[_11]);
        /* r00, r01 and r10 must not be read before this join. */
        fibril_join(&fr);
        r = new_internal(r00, r01, r10, r11);
    }
    return r;
}
/*
 * free_matrix - Release the quadtree rooted at a, whose level is depth.
 * Children are released before their parent; a NULL subtree is ignored.
 */
void free_matrix(int depth, Matrix a)
{
    if (a == NULL)
        return;
    if (depth != BLOCK_DEPTH) {
        /* Interior node: release the four quadrants first. */
        static const int quadrants[4] = { _00, _01, _10, _11 };
        for (int q = 0; q < 4; q++)
            free_matrix(depth - 1, a->child[quadrants[q]]);
    }
    free(a);
}
/*************************************************************\
* Simple matrix operations
\*************************************************************/
/*
 * get_matrix - Read the element at row r, column c of the quadtree a
 * whose level is depth. A missing (NULL) subtree reads as 0.0.
 */
static Real get_matrix(int depth, Matrix a, int r, int c)
{
    /* Descend one level per iteration until a leaf or a hole is reached. */
    while (a != NULL && depth != BLOCK_DEPTH) {
        depth--;
        const int half = 1 << depth;
        int quadrant;
        if (r < half) {
            quadrant = (c < half) ? _00 : _01;
        } else {
            quadrant = (c < half) ? _10 : _11;
            r -= half;
        }
        if (c >= half)
            c -= half;
        a = a->child[quadrant];
    }
    if (a == NULL)
        return 0.0;
    return BLOCK(((LeafNode *) a)->block, r, c);
}
/*
 * set_matrix - Store value at row r, column c of the quadtree a whose
 * level is depth, creating missing nodes on the way down. New leaves are
 * zero-filled before the element is written. Returns the (possibly newly
 * allocated) root of the subtree.
 */
static Matrix set_matrix(int depth, Matrix a, int r, int c, Real value)
{
    if (depth == BLOCK_DEPTH) {
        LeafNode *leaf = (LeafNode *) a;
        if (leaf == NULL) {
            a = new_block_leaf();
            leaf = (LeafNode *) a;
            block_zero(leaf->block);
        }
        BLOCK(leaf->block, r, c) = value;
        return a;
    }

    if (a == NULL)
        a = new_internal(NULL, NULL, NULL, NULL);

    depth--;
    const int half = 1 << depth;

    /* Pick the quadrant and translate (r, c) into its coordinates. */
    int quadrant;
    if (r < half) {
        quadrant = (c < half) ? _00 : _01;
    } else {
        quadrant = (c < half) ? _10 : _11;
        r -= half;
    }
    if (c >= half)
        c -= half;

    a->child[quadrant] = set_matrix(depth, a->child[quadrant], r, c, value);
    return a;
}
/*
 * mag - Sum of the squares of all elements of the quadtree a whose level
 * is depth. A NULL subtree contributes 0.0. Quadrants are accumulated in
 * the order _00, _01, _10, _11; leaf elements in row-major order.
 */
static Real mag(int depth, Matrix a)
{
    Real total = 0.0;
    if (a == NULL)
        return total;

    if (depth != BLOCK_DEPTH) {
        static const int quadrants[4] = { _00, _01, _10, _11 };
        for (int q = 0; q < 4; q++)
            total += mag(depth - 1, a->child[quadrants[q]]);
        return total;
    }

    LeafNode *leaf = (LeafNode *) a;
    for (int row = 0; row < BLOCK_SIZE; row++) {
        for (int col = 0; col < BLOCK_SIZE; col++) {
            const Real v = BLOCK(leaf->block, row, col);
            total += v * v;
        }
    }
    return total;
}
/*************************************************************\
* Cholesky algorithm
\*************************************************************/
/*
 * mul_and_subT - Perform R -= A * Transpose(B) on quadtree matrices.
 *
 * depth is the level of a, b and r. If lower==1, only the
 * lower-triangular part of R is updated. r may be NULL (all-zero); it is
 * then allocated on demand. Returns the root of the updated R, which is
 * NULL only if r was NULL and no quadrant produced a result.
 *
 * a and b are dereferenced without a NULL check; the recursive calls are
 * guarded by the callers' child tests.
 */
fibril static
Matrix mul_and_subT(int depth, int lower, Matrix a, Matrix b, Matrix r)
{
    if (depth == BLOCK_DEPTH) {
        /* Leaf level: run the dense kernel on the three blocks. */
        LeafNode *A = (LeafNode *) a;
        LeafNode *B = (LeafNode *) b;
        LeafNode *R;
        if (r == NULL) {
            r = new_block_leaf();
            R = (LeafNode *) r;
            block_zero(R->block);
        } else
            R = (LeafNode *) r;
        if (lower)
            block_schur_half(R->block, A->block, B->block);
        else
            block_schur_full(R->block, A->block, B->block);
    } else {
        Matrix r00, r01, r10, r11;
        depth--;
        /* Start from the existing quadrants of r, or from holes. */
        if (r != NULL) {
            r00 = r->child[_00];
            r01 = r->child[_01];
            r10 = r->child[_10];
            r11 = r->child[_11];
        } else {
            r00 = NULL;
            r01 = NULL;
            r10 = NULL;
            r11 = NULL;
        }
        fibril_t fr;
        fibril_init(&fr);
        /*
         * Phase 1: products that use the left column of a (a00, a10).
         * The TR_* indices select the quadrant of b that corresponds to
         * the transposed operand. A product is skipped when either
         * factor is a hole. The upper-right result r01 is skipped when
         * only the lower triangle is wanted, and the off-diagonal
         * results always get a full (lower == 0) update.
         */
        if (a->child[_00] && b->child[TR_00])
            fibril_fork(&fr, &r00, mul_and_subT, (depth, lower,
                        a->child[_00], b->child[TR_00],
                        r00));
        if (!lower && a->child[_00] && b->child[TR_01])
            fibril_fork(&fr, &r01, mul_and_subT, (depth, 0,
                        a->child[_00], b->child[TR_01],
                        r01));
        if (a->child[_10] && b->child[TR_00])
            fibril_fork(&fr, &r10, mul_and_subT, (depth, 0,
                        a->child[_10], b->child[TR_00],
                        r10));
        if (a->child[_10] && b->child[TR_01])
            fibril_fork(&fr, &r11, mul_and_subT, (depth, lower,
                        a->child[_10], b->child[TR_01],
                        r11));
        /* Phase 2 updates the same four result quadrants, so phase 1
         * has to be complete first. */
        fibril_join(&fr);
        /* Phase 2: products that use the right column of a (a01, a11). */
        if (a->child[_01] && b->child[TR_10])
            fibril_fork(&fr, &r00, mul_and_subT, (depth, lower,
                        a->child[_01], b->child[TR_10],
                        r00));
        if (!lower && a->child[_01] && b->child[TR_11])
            fibril_fork(&fr, &r01, mul_and_subT, (depth, 0,
                        a->child[_01], b->child[TR_11],
                        r01));
        if (a->child[_11] && b->child[TR_10])
            fibril_fork(&fr, &r10, mul_and_subT, (depth, 0,
                        a->child[_11], b->child[TR_10],
                        r10));
        if (a->child[_11] && b->child[TR_11])
            fibril_fork(&fr, &r11, mul_and_subT, (depth, lower,
                        a->child[_11], b->child[TR_11],
                        r11));
        fibril_join(&fr);
        /* Publish the results: allocate a node only if something was
         * produced, otherwise R stays a hole. */
        if (r == NULL) {
            if (r00 || r01 || r10 || r11)
                r = new_internal(r00, r01, r10, r11);
        } else {
            r->child[_00] = r00;
            r->child[_01] = r01;
            r->child[_10] = r10;
            r->child[_11] = r11;
        }
    }
    return r;
}
/*
 * backsub - Perform substitution to solve for B in BL = A on quadtree
 * matrices. Returns B in place of A.
 *
 * depth is the level of a and l. Only the quadrants _00, _10 and _11 of
 * l are read. The work is done in three fork/join phases because each
 * phase consumes the results of the previous one.
 */
fibril static Matrix backsub(int depth, Matrix a, Matrix l)
{
    if (depth == BLOCK_DEPTH) {
        /* Leaf level: dense substitution kernel. */
        LeafNode *A = (LeafNode *) a;
        LeafNode *L = (LeafNode *) l;
        block_backsub(A->block, L->block);
    } else {
        Matrix a00, a01, a10, a11;
        Matrix l00, l10, l11;
        depth--;
        a00 = a->child[_00];
        a01 = a->child[_01];
        a10 = a->child[_10];
        a11 = a->child[_11];
        l00 = l->child[_00];
        l10 = l->child[_10];
        l11 = l->child[_11];
        fibril_t fr;
        fibril_init(&fr);
        /* Phase 1: solve the left column of a against l00. */
        if (a00)
            fibril_fork(&fr, &a00, backsub, (depth, a00, l00));
        if (a10)
            fibril_fork(&fr, &a10, backsub, (depth, a10, l00));
        fibril_join(&fr);
        /* Phase 2: subtract (left column) * Transpose(l10) from the
         * right column of a; skipped where a factor is a hole. */
        if (a00 && l10)
            fibril_fork(&fr, &a01, mul_and_subT, (depth, 0, a00, l10, a01));
        if (a10 && l10)
            fibril_fork(&fr, &a11, mul_and_subT, (depth, 0, a10, l10, a11));
        fibril_join(&fr);
        /* Phase 3: solve the updated right column against l11. */
        if (a01)
            fibril_fork(&fr, &a01, backsub, (depth, a01, l11));
        if (a11)
            fibril_fork(&fr, &a11, backsub, (depth, a11, l11));
        fibril_join(&fr);
        /* Phase 2 may have allocated new quadrants, so store all four. */
        a->child[_00] = a00;
        a->child[_01] = a01;
        a->child[_10] = a10;
        a->child[_11] = a11;
    }
    return a;
}
/*
 * cholesky - Compute the Cholesky factorization of the quadtree matrix a
 * in place and return its root. depth is the level of a.
 *
 * Only the quadrants _00, _10 and _11 are read and written; _01 is never
 * touched. a (and the diagonal quadrants) are dereferenced without a NULL
 * check.
 */
fibril static Matrix cholesky(int depth, Matrix a)
{
    if (depth == BLOCK_DEPTH) {
        /* Leaf level: dense kernel. */
        LeafNode *A = (LeafNode *) a;
        block_cholesky(A->block);
    } else {
        Matrix a00, a10, a11;
        depth--;
        a00 = a->child[_00];
        a10 = a->child[_10];
        a11 = a->child[_11];
        if (!a10) {
            /* No coupling block: the two diagonal quadrants are
             * independent and are factored concurrently. */
            fibril_t fr;
            fibril_init(&fr);
            fibril_fork(&fr, &a00, cholesky, (depth, a00));
            a11 = cholesky(depth, a11);
            fibril_join(&fr);
        } else {
            /* Strictly sequential chain: factor a00, solve a10 against
             * it, subtract a10 * Transpose(a10) from the lower triangle
             * of a11, then factor the updated a11. */
            a00 = cholesky(depth, a00);
            a10 = backsub(depth, a10, a00);
            a11 = mul_and_subT(depth, 1, a10, a10, a11);
            a11 = cholesky(depth, a11);
        }
        a->child[_00] = a00;
        a->child[_10] = a10;
        a->child[_11] = a11;
    }
    return a;
}
/*
 * logarithm - Smallest exponent k >= 0 with (1 << k) >= size, i.e. the
 * base-2 logarithm of size rounded up. Returns 0 for size <= 1.
 */
static int logarithm(int size)
{
    int exponent;
    for (exponent = 0; (1 << exponent) < size; exponent++)
        ;
    return exponent;
}
/*
 * init - Build the random input matrix A.
 *
 * A gets a unit diagonal, plus (nonzeros - n) entries of value 0.1 at
 * random, previously empty positions strictly below the diagonal. The
 * matrix is then padded with an identity block up to the next power of
 * two so that it fills a complete quadtree of the computed depth.
 */
void init()
{
    depth = logarithm(n);

    /* Unit diagonal of the n x n problem. */
    for (int d = 0; d < n; d++)
        A = set_matrix(depth, A, d, d, 1.0);

    /* Off-diagonal entries: draw (row, col) pairs until one lands on a
     * free position below the diagonal. */
    for (int placed = 0; placed < nonzeros - n; placed++) {
        int row, col;
        for (;;) {
            row = rand() % n;
            col = rand() % n;
            if (row > col && get_matrix(depth, A, row, col) == 0.0)
                break;
        }
        A = set_matrix(depth, A, row, col, 0.1);
    }

    /* Identity padding up to the power-of-two dimension. */
    const int padded = 1 << depth;
    for (int d = n; d < padded; d++)
        A = set_matrix(depth, A, d, d, 1.0);
}
/*
 * prep - Reset the working matrix: release the previous R (a NULL R is
 * accepted by free_matrix) and replace it with a fresh deep copy of A.
 */
void prep()
{
    free_matrix(depth, R);
    R = copy_matrix(depth, A);
}
/*
 * test - The measured kernel: factor the working matrix R in place.
 */
void test()
{
    R = cholesky(depth, R);
}
/*
 * verify - Check the factorization and release both matrices.
 *
 * Unless BENCHMARK is defined, A is overwritten with the lower triangle
 * of A - R * Transpose(R) and the sum of squares of that residual is
 * compared against a fixed tolerance.
 *
 * Returns non-zero when the residual exceeds the tolerance, 0 otherwise
 * (always 0 when BENCHMARK is defined).
 */
int verify()
{
    int fail = 0;
#ifndef BENCHMARK
    /* test - make sure R * Transpose(R) == A */
    /* compute || A - R * Transpose(R) || */
    A = mul_and_subT(depth, 1, R, R, A);
    Real error = mag(depth, A);
    fail = (error > 0.00001);
#endif
    free_matrix(depth, A);
    free_matrix(depth, R);
    /*
     * Do not leave dangling roots behind: prep() frees R before copying,
     * and init() writes through A, so a later call would otherwise
     * double-free or touch released memory.
     */
    A = NULL;
    R = NULL;
    return fail;
}
# Cilk Plus flavour of the fork/join benchmark suite.
#
# Every benchmark is built from the shared source one directory up, with
# FIBRIL_CILKPLUS defined, and linked against the Cilk runtime and libdl.

# -fcilkplus stays in CMAKE_CXX_FLAGS on purpose: these flags are passed to
# the compiler driver for compiling as well as for linking.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcilkplus")

# Locate the Cilk runtime. The scratch directory is only an additional
# search location; pass -DCILKRTS_LIB=/path/to/libcilkrts.so to override it.
find_library(CILKRTS_LIB
  NAMES cilkrts
  PATHS /srv/scratch/uh15efil/intel-cilk-runtime/build/lib)
find_library(DL_LIB NAMES dl)

# Fail with a readable message instead of CMake's generic NOTFOUND error.
if(NOT CILKRTS_LIB OR NOT DL_LIB)
  message(FATAL_ERROR
    "cilkplus benchmarks: required library not found "
    "(CILKRTS_LIB='${CILKRTS_LIB}', DL_LIB='${DL_LIB}')")
endif()

foreach(benchmark IN ITEMS
    cholesky
    fft
    fib
    heat
    integrate
    knapsack
    lu
    matmul
    nqueens
    quicksort
    rectmul
    strassen)
  add_executable(${benchmark}_cilkplus ../${benchmark}.cpp)
  target_compile_definitions(${benchmark}_cilkplus PRIVATE FIBRIL_CILKPLUS)
  target_link_libraries(${benchmark}_cilkplus
    PRIVATE "${CILKRTS_LIB}" "${DL_LIB}")
endforeach()
#ifndef CILKPLUS_H
#define CILKPLUS_H
#include <thread>
#include <stdio.h>
#include <cilk/cilk.h>
#include <cilk/cilk_api.h>
/*
 * Mapping of the fibril fork/join interface onto Cilk Plus keywords.
 */
/* Function annotation; expands to nothing for Cilk Plus. */
#define fibril
/* The frame handle carries no state here: a dummy int marked unused. */
#define fibril_t __attribute__((unused)) int
#define fibril_init(fp)
/* Join is cilk_sync; the handle argument is ignored. */
#define fibril_join(fp) cilk_sync
/* Fork without / with a result. ag is the parenthesized argument list and
 * rt a pointer to the location that receives the result. */
#define fibril_fork_nrt(fp, fn, ag) cilk_spawn fn ag
#define fibril_fork_wrt(fp, rt, fn, ag) *rt = cilk_spawn fn ag
/* Worker count to use: _n if it is positive and smaller than
 * std::thread::hardware_concurrency(), otherwise the latter. */
#define _nthreads(_n) [](int n) -> int { \
int nprocs = std::thread::hardware_concurrency(); \
if (n > 0 && n < nprocs) \
return n; \
return nprocs; \
}(_n)
/* Configure the Cilk runtime: worker count and the "stack size" parameter.
 * NOTE(review): the expansion already ends in ';', so the macro is not
 * safe as the body of an unbraced if/else. */
#define fibril_rt_init(n) do { \
char nprocs[32]; \
snprintf(nprocs, 32, "%d", _nthreads(n)); \
__cilkrts_set_param("nworkers", nprocs); \
__cilkrts_set_param("stack size", "0x800000"); \
} while (0);
#define fibril_rt_exit() (__cilkrts_end_cilk())
#define fibril_rt_nprocs() (__cilkrts_get_nworkers())
#endif /* end of include guard: CILKPLUS_H */
# Continuation-stealing EMPER flavour of the fork/join benchmark suite.
#
# Every benchmark is built from the shared source one directory up, with
# FIBRIL_EMPER_CONTINUATION defined, linked against EMPER, and registered
# with CTest under the plain benchmark name.
foreach(benchmark IN ITEMS
    cholesky
    fft
    fib
    heat
    integrate
    knapsack
    lu
    matmul
    nqueens
    quicksort
    rectmul
    strassen)
  add_executable(${benchmark}_emper_continuation ../${benchmark}.cpp)
  target_compile_definitions(${benchmark}_emper_continuation
    PRIVATE FIBRIL_EMPER_CONTINUATION)
  target_link_libraries(${benchmark}_emper_continuation
    PRIVATE Threads::Threads emper)
  # The NAME/COMMAND signature resolves the target name to the built
  # executable; the legacy add_test(<name> <command>) form does not
  # support target names.
  add_test(NAME ${benchmark} COMMAND ${benchmark}_emper_continuation)
endforeach()
#ifndef EMPER_CONTINUATION_H
#define EMPER_CONTINUATION_H
#include <thread>
//#include "fork.h"
#include "emper.hpp"
/*
 * Disabled (#if 0) earlier variants of the fork/join mapping, kept for
 * reference only; nothing in this section is compiled. It wraps the
 * Fibril object in a StackFibril that placement-constructs it inside a raw
 * char buffer and offers two alternative sets of fork macros (selected by
 * the nested #if 1 / #else).
 */
#if 0
class StackFibril {
private:
//Fibril *f;
//char memory[sizeof(Fibril) + alignof(Fibril)];
char memory[sizeof(Fibril)];
public:
__attribute__((always_inline))
inline StackFibril() {
//char *addr = (char*) ((uintptr_t) (memory + alignof(Fibril) - 1) & ~(alignof(Fibril) - 1));
//f = new (addr) Fibril();
new (memory) Fibril();
}
__attribute__((always_inline))
inline ~StackFibril() {
//f->~Fibril();
((Fibril*) memory)->~Fibril();
}
__attribute__((always_inline))
inline Fibril* operator->() const noexcept {
//return f;
return (Fibril*) memory;
}
__attribute__((always_inline))
inline Fibril& operator*() const {
//return *f;
return *((Fibril*) memory);
}
};
#define fibril_t StackFibril
#define fibril_init(fp)
#define fibril_join(fp) (*fp)->join();
#if 1
#include "fork.h"
#define fibril_fork_nrt(fp, fn, ag) do { \
auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f) __attribute__((noinline, hot, optimize(3))) { \
(*f)->cont.ip = __builtin_return_address(0); \
Runtime* runtime = Runtime::getRuntime(); \
runtime->pushBottom(**f); \
fn(_fibril_args ag); \
if (!runtime->popBottom()) { \
(*f)->resume(); \
} \
}; \
membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \
} while (0);
#define fibril_fork_wrt(fp, rt, fn, ag) do { \
auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f, __typeof__(rt) p) __attribute__((noinline, hot, optimize(3))) { \
(*f)->cont.ip = __builtin_return_address(0); \
Runtime* runtime = Runtime::getRuntime(); \
runtime->pushBottom(**f); \
*p = fn(_fibril_args ag); \
if (!runtime->popBottom()) { \
(*f)->resume(); \
} \
}; \
membar(_fibril_##fn##_fork(_fibril_expand ag fp, rt)); \
} while (0);
#else
#define _fibril_expand(...) \
_fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_expand_(n, ...) \
_fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
#define _fibril_expand_16(...) __VA_ARGS__
#define _fibril_expand_15(...) __VA_ARGS__
#define _fibril_expand_14(...) __VA_ARGS__
#define _fibril_expand_13(...) __VA_ARGS__
#define _fibril_expand_12(...) __VA_ARGS__
#define _fibril_expand_11(...) __VA_ARGS__
#define _fibril_expand_10(...) __VA_ARGS__
#define _fibril_expand_9( ...) __VA_ARGS__
#define _fibril_expand_8( ...) __VA_ARGS__
#define _fibril_expand_7( ...) __VA_ARGS__
#define _fibril_expand_6( ...) __VA_ARGS__
#define _fibril_expand_5( ...) __VA_ARGS__
#define _fibril_expand_4( ...) __VA_ARGS__
#define _fibril_expand_3( ...) __VA_ARGS__
#define _fibril_expand_2( ...) __VA_ARGS__
#define _fibril_expand_1( ...) __VA_ARGS__
#define _fibril_expand_0()
#define fibril_fork_nrt(fp, fn, ag) (*fp)->fork(fn, _fibril_expand ag)
#define fibril_fork_wrt(fp, rt, fn, ag) (*fp)->fork(rt, fn, _fibril_expand ag)
#endif
#endif
/*
 * Active mapping of the fibril fork/join interface onto EMPER's Fibril.
 * _fibril_nth, _fibril_concat and membar are not defined in this header or
 * in fork.h; they have to be provided by the including file.
 */
#define fibril_t Fibril
#define fibril_init(fp)
/* Wait for all forks issued on *fp. The expansion already ends in ';'. */
#define fibril_join(fp) (fp)->join();
#include "fork.h"
/*
 * fibril_fork_nrt - fork fn with argument list ag, discarding its result.
 *
 * The call is wrapped in a noinline lambda that receives copies of the
 * arguments (typed via _fibril_defs) plus the Fibril pointer. Inside the
 * lambda:
 *   1. the lambda's return address is stored in f->cont.ip,
 *   2. the Fibril is pushed with Runtime::pushBottom(),
 *   3. fn is called,
 *   4. if popBottom() reports failure, f->resume() is called --
 *      presumably because the continuation was taken by another worker in
 *      the meantime; TODO confirm against the Runtime implementation.
 */
#define fibril_fork_nrt(fp, fn, ag) do { \
auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f) __attribute__((noinline, hot, optimize(3))) { \
(f)->cont.ip = __builtin_return_address(0); \
Runtime* runtime = Runtime::getRuntime(); \
runtime->pushBottom(*f); \
fn(_fibril_args ag); \
if (!runtime->popBottom()) { \
(f)->resume(); \
} \
}; \
membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \
} while (0);
/*
 * fibril_fork_wrt - same as fibril_fork_nrt, but the result of fn is
 * stored through rt, which is handed to the lambda as its last parameter.
 */
#define fibril_fork_wrt(fp, rt, fn, ag) do { \
auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f, __typeof__(rt) p) __attribute__((noinline, hot, optimize(3))) { \
(f)->cont.ip = __builtin_return_address(0); \
Runtime* runtime = Runtime::getRuntime(); \
runtime->pushBottom(*f); \
*p = fn(_fibril_args ag); \
if (!runtime->popBottom()) { \
(f)->resume(); \
} \
}; \
membar(_fibril_##fn##_fork(_fibril_expand ag fp, rt)); \
} while (0);
/* Worker count to use: _n if it is positive and smaller than
 * std::thread::hardware_concurrency(), otherwise the latter. */
#define _nthreads(_n) [](int n) -> int { \
int nprocs = std::thread::hardware_concurrency(); \
if (n > 0 && n < nprocs) \
return n; \
return nprocs; \
}(_n)
/*
 * fibril_rt_init / fibril_rt_exit form a bracket: init declares a local
 * Runtime named `runtime` and opens a lambda passed to executeAndWait();
 * exit closes that lambda and the call. The two must therefore be used as
 * a pair in the same scope, with the parallel code in between.
 */
#define fibril_rt_init(n) Runtime runtime(_nthreads(n)); runtime.executeAndWait([&] () {
#define fibril_rt_exit() });
/* Refers to the `runtime` variable declared by fibril_rt_init. */
#define fibril_rt_nprocs() runtime.getWorkerCount()
#endif /* end of include guard: EMPER_CONTINUATION_H */
#ifndef FIBRIL_FORK_H
#define FIBRIL_FORK_H
/*
 * Helper macro families used to forward a parenthesized argument list
 * (up to 16 arguments) through a wrapper function.
 *
 * All three dispatch on an argument count obtained from _fibril_nth and
 * glue the name together with _fibril_concat; neither helper is defined
 * in this header -- presumably _fibril_nth yields the number of
 * arguments; TODO confirm at the definition.
 *
 * For an argument list (x, y) the families expand to:
 *   _fibril_defs   ->  __typeof__(x) a2, __typeof__(y) a1,
 *   _fibril_args   ->  a2, a1
 *   _fibril_expand ->  x, y,
 */
/*
 * _fibril_defs - parameter declarations, one per argument, named aN..a1
 * and typed after the corresponding argument expression. A non-empty
 * result ends in a comma so that further parameters can follow directly.
 */
#define _fibril_defs(...) \
_fibril_defs_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_defs_(n, ...) \
_fibril_concat(_fibril_defs_, n)(__VA_ARGS__)
#define _fibril_defs_16(a,...) __typeof__(a) a16,_fibril_defs_15(__VA_ARGS__)
#define _fibril_defs_15(a,...) __typeof__(a) a15,_fibril_defs_14(__VA_ARGS__)
#define _fibril_defs_14(a,...) __typeof__(a) a14,_fibril_defs_13(__VA_ARGS__)
#define _fibril_defs_13(a,...) __typeof__(a) a13,_fibril_defs_12(__VA_ARGS__)
#define _fibril_defs_12(a,...) __typeof__(a) a12,_fibril_defs_11(__VA_ARGS__)
#define _fibril_defs_11(a,...) __typeof__(a) a11,_fibril_defs_10(__VA_ARGS__)
#define _fibril_defs_10(a,...) __typeof__(a) a10,_fibril_defs_9 (__VA_ARGS__)
#define _fibril_defs_9(a, ...) __typeof__(a) a9, _fibril_defs_8 (__VA_ARGS__)
#define _fibril_defs_8(a, ...) __typeof__(a) a8, _fibril_defs_7 (__VA_ARGS__)
#define _fibril_defs_7(a, ...) __typeof__(a) a7, _fibril_defs_6 (__VA_ARGS__)
#define _fibril_defs_6(a, ...) __typeof__(a) a6, _fibril_defs_5 (__VA_ARGS__)
#define _fibril_defs_5(a, ...) __typeof__(a) a5, _fibril_defs_4 (__VA_ARGS__)
#define _fibril_defs_4(a, ...) __typeof__(a) a4, _fibril_defs_3 (__VA_ARGS__)
#define _fibril_defs_3(a, ...) __typeof__(a) a3, _fibril_defs_2 (__VA_ARGS__)
#define _fibril_defs_2(a, ...) __typeof__(a) a2, _fibril_defs_1 (__VA_ARGS__)
#define _fibril_defs_1(a) __typeof__(a) a1,
#define _fibril_defs_0()
/*
 * _fibril_args - the parameter names declared by _fibril_defs, in the
 * same order, separated by commas and without a trailing comma.
 */
#define _fibril_args(...) \
_fibril_args_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_args_(n, ...) \
_fibril_concat(_fibril_args_, n)(__VA_ARGS__)
#define _fibril_args_16(a,...) a16,_fibril_args_15(__VA_ARGS__)
#define _fibril_args_15(a,...) a15,_fibril_args_14(__VA_ARGS__)
#define _fibril_args_14(a,...) a14,_fibril_args_13(__VA_ARGS__)
#define _fibril_args_13(a,...) a13,_fibril_args_12(__VA_ARGS__)
#define _fibril_args_12(a,...) a12,_fibril_args_11(__VA_ARGS__)
#define _fibril_args_11(a,...) a11,_fibril_args_10(__VA_ARGS__)
#define _fibril_args_10(a,...) a10,_fibril_args_9 (__VA_ARGS__)
#define _fibril_args_9(a, ...) a9, _fibril_args_8 (__VA_ARGS__)
#define _fibril_args_8(a, ...) a8, _fibril_args_7 (__VA_ARGS__)
#define _fibril_args_7(a, ...) a7, _fibril_args_6 (__VA_ARGS__)
#define _fibril_args_6(a, ...) a6, _fibril_args_5 (__VA_ARGS__)
#define _fibril_args_5(a, ...) a5, _fibril_args_4 (__VA_ARGS__)
#define _fibril_args_4(a, ...) a4, _fibril_args_3 (__VA_ARGS__)
#define _fibril_args_3(a, ...) a3, _fibril_args_2 (__VA_ARGS__)
#define _fibril_args_2(a, ...) a2, _fibril_args_1 (__VA_ARGS__)
#define _fibril_args_1(a) a1
#define _fibril_args_0()
/*
 * _fibril_expand - the original argument expressions with the enclosing
 * parentheses removed. A non-empty result ends in a comma so that further
 * call arguments can follow directly; an empty list expands to nothing.
 */
#define _fibril_expand(...) \
_fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_expand_(n, ...) \
_fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
#define _fibril_expand_16(...) __VA_ARGS__,
#define _fibril_expand_15(...) __VA_ARGS__,
#define _fibril_expand_14(...) __VA_ARGS__,
#define _fibril_expand_13(...) __VA_ARGS__,
#define _fibril_expand_12(...) __VA_ARGS__,
#define _fibril_expand_11(...) __VA_ARGS__,
#define _fibril_expand_10(...) __VA_ARGS__,
#define _fibril_expand_9( ...) __VA_ARGS__,
#define _fibril_expand_8( ...) __VA_ARGS__,
#define _fibril_expand_7( ...) __VA_ARGS__,
#define _fibril_expand_6( ...) __VA_ARGS__,
#define _fibril_expand_5( ...) __VA_ARGS__,
#define _fibril_expand_4( ...) __VA_ARGS__,
#define _fibril_expand_3( ...) __VA_ARGS__,
#define _fibril_expand_2( ...) __VA_ARGS__,
#define _fibril_expand_1( ...) __VA_ARGS__,
#define _fibril_expand_0()
#endif /* end of include guard: FIBRIL_FORK_H */
# Fiber-based EMPER flavour of the fork/join benchmark suite.
#
# Every benchmark is built from the shared source one directory up, with
# FIBRIL_EMPER_FIBER defined, and linked against EMPER.
foreach(benchmark IN ITEMS
    cholesky
    fft
    fib
    heat
    integrate
    knapsack
    lu
    matmul
    nqueens
    quicksort
    rectmul
    strassen)
  add_executable(${benchmark}_emper_fiber ../${benchmark}.cpp)
  target_compile_definitions(${benchmark}_emper_fiber
    PRIVATE FIBRIL_EMPER_FIBER)
  target_link_libraries(${benchmark}_emper_fiber
    PRIVATE Threads::Threads emper)
endforeach()
#ifndef EMPER_FIBER_H
#define EMPER_FIBER_H
#include <thread>
#include "emper.hpp"
/*
 * Mapping of the fibril fork/join interface onto EMPER fibers: every fork
 * becomes a separately scheduled Fiber, and the CPS object serves as the
 * join counter.
 */
#define fibril_t CPS
#define fibril_init(fp)
/* Block until every fork registered on *fp has signalled. The expansion
 * already ends in ';'. */
#define fibril_join(fp) (*fp).wait();
/*
 * fibril_fork_nrt - run `fn ag` in a new fiber, discarding its result.
 * The counter is incremented before the fiber is scheduled; the fiber
 * calls signalAndExit() once fn has returned. The lambda captures by
 * copy, so the argument expressions in ag and the pointer fp are copied
 * into the fiber.
 */
#define fibril_fork_nrt(fp, fn, ag) do { \
(*fp).incrementCounterByOne(); \
__typeof__(fp) fpp = fp; \
Runtime::getRuntime()->schedule(*Fiber::from([=] () {fn ag; (*fpp).signalAndExit(); })); \
} while (0);
/*
 * fibril_fork_wrt - same as fibril_fork_nrt, but the result of fn is
 * stored through rt. The target of rt must stay valid until the matching
 * fibril_join.
 */
#define fibril_fork_wrt(fp, rt, fn, ag) do { \
(*fp).incrementCounterByOne(); \
__typeof__(fp) fpp = fp; \
__typeof__(rt) rtp = rt; \
Runtime::getRuntime()->schedule(*Fiber::from([=] () { *rtp = fn ag; (*fpp).signalAndExit(); })); \
} while (0);
/* Worker count to use: _n if it is positive and smaller than
 * std::thread::hardware_concurrency(), otherwise the latter. */
#define _nthreads(_n) [](int n) -> int { \
int nprocs = std::thread::hardware_concurrency(); \
if (n > 0 && n < nprocs) \
return n; \
return nprocs; \
}(_n)
/*
 * fibril_rt_init / fibril_rt_exit form a bracket: init declares a local
 * Runtime named `runtime` and opens a lambda passed to executeAndWait();
 * exit closes that lambda and the call.
 */
#define fibril_rt_init(n) Runtime runtime(_nthreads(n)); runtime.executeAndWait([&] () {
#define fibril_rt_exit() });
/* Refers to the `runtime` variable declared by fibril_rt_init. */
#define fibril_rt_nprocs() runtime.getWorkerCount()
#endif /* end of include guard: EMPER_FIBER_H */
/*
* Copyright (c) 2000 Massachusetts Institute of Technology
* Copyright (c) 2000 Matteo Frigo
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include "test.h"
#include "fft.h"
/* log2 of the transform length: init() computes size = 1 << n.  The larger
 * value is selected when BENCHMARK is defined. */
#ifdef BENCHMARK
int n = 26;
#else
int n = 12;
#endif
/* Number of complex points in the transform; assigned by init(). */
static int size;
/* in:  input data, filled with drand48() values by init()
 * out: result buffer passed to fft() by test()
 * cp:  copy of `in` made by prep() and passed to fft() as its input
 * W:   coefficient table (size + 1 entries) filled by
 *      compute_w_coefficients() */
static COMPLEX *in, *out, *cp, *W;
static const REAL pi = 3.1415926535897932384626434;
/*
 * Fill W[a..b] and the mirrored entries W[n-b..n-a] with the n-th roots of
 * unity: W[k] = cos(2*pi*k/n) - i*sin(2*pi*k/n), and W[n-k] is its complex
 * conjugate.  Index ranges of 128 or more entries are halved and the lower
 * half is forked so both halves can run in parallel.
 */
fibril static void compute_w_coefficients(int n, int a, int b, COMPLEX * W)
{
  if (b - a >= 128) {
    /* large range: fork the lower half, compute the upper half inline */
    int mid = (a + b) / 2;
    fibril_t fr;

    fibril_init(&fr);
    fibril_fork(&fr, compute_w_coefficients, (n, a, mid, W));
    compute_w_coefficients(n, mid + 1, b, W);
    fibril_join(&fr);
  } else {
    /* small range: compute the coefficients sequentially */
    double step = 2.0 * pi / n;
    int idx;

    for (idx = a; idx <= b; ++idx) {
      REAL cosine = cos(step * idx);
      REAL sine;

      c_re(W[n - idx]) = cosine;
      c_re(W[idx]) = cosine;

      sine = sin(step * idx);
      c_im(W[idx]) = -sine;
      c_im(W[n - idx]) = sine;
    }
  }
}
/*
 * Choose the radix for the next FFT pass over a transform of length n.
 *
 * Returns 1 for n < 2.  A fixed set of lengths (64, 128, 256, 1024, 2048,
 * 4096) is special-cased to radix 8.  Otherwise the largest of 16, 8, 4, 2
 * that divides n is returned, and for odd n the smallest prime factor
 * (n itself when n is prime).
 *
 * Radix 32 is deliberately not offered: the original authors considered it
 * too big for the available registers.
 */
static int factor(int n)
{
  int r;

  if (n < 2)
    return 1;

  if (n == 64 || n == 128 || n == 256 || n == 1024 || n == 2048
      || n == 4096)
    return 8;

  if ((n & 15) == 0)
    return 16;
  if ((n & 7) == 0)
    return 8;
  if ((n & 3) == 0)
    return 4;
  if ((n & 1) == 0)
    return 2;

  /*
   * n is odd here.  Try odd divisors up to sqrt(n); a composite n must have
   * a factor in that range.  The bound is written as r <= n / r so that no
   * square root is computed and r * r cannot overflow.
   */
  for (r = 3; r <= n / r; r += 2)
    if (n % r == 0)
      return r;

  /* n is prime */
  return n;
}
/*
 * Generic-radix unshuffle for rows a..b-1: row i consists of the r
 * consecutive input elements starting at in[i * r]; its j-th element is
 * written to out[i + j * m].  Ranges of 16 or more rows are halved and the
 * lower half is forked.
 */
fibril static void unshuffle(int a, int b,
                             COMPLEX * in, COMPLEX * out, int r, int m)
{
  /* r rounded down to a multiple of four, for the unrolled copy loop */
  int unrolled = r & (~0x3);

  if (b - a >= 16) {
    int mid = (a + b) / 2;
    fibril_t fr;

    fibril_init(&fr);
    fibril_fork(&fr, unshuffle, (a, mid, in, out, r, m));
    unshuffle(mid, b, in, out, r, m);
    fibril_join(&fr);
  } else {
    /* the source pointer runs contiguously across all rows of the range */
    const COMPLEX *src = in + a * r;
    int row;

    for (row = a; row < b; ++row) {
      COMPLEX *dst = out + row;
      int col = 0;

      /* four elements per step */
      while (col < unrolled) {
        dst[0] = src[0];
        dst[m] = src[1];
        dst[2 * m] = src[2];
        dst[3 * m] = src[3];
        dst += 4 * m;
        src += 4;
        col += 4;
      }

      /* remaining r % 4 elements */
      while (col < r) {
        *dst = *src;
        ++src;
        dst += m;
        ++col;
      }
    }
  }
}
/*
 * Recursive complex FFT (basic Cooley-Tukey with specialised kernels for
 * small powers of two).  The result is placed in `out`; n is arbitrary.
 * Running time is O(n * (r1 + ... + rk)) where r1 * ... * rk = n are the
 * entries of `factors`.
 *
 * n:       length of this (sub-)transform
 * in:      input array; also used as scratch by the recursion
 * out:     output array
 * factors: precomputed factor list; *factors is the radix of this level
 * W:       twiddle factors
 * nW:      size of W, i.e. the length of the original transform
 */
fibril static void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors,
                           COMPLEX * W, int nW)
{
  int radix, len;

  /* hand-written kernels for the smallest transforms */
  switch (n) {
    case 32: fft_base_32(in, out); return;
    case 16: fft_base_16(in, out); return;
    case 8:  fft_base_8(in, out);  return;
    case 4:  fft_base_4(in, out);  return;
    case 2:  fft_base_2(in, out);  return;
    default: break;
  }
  /* the cases n == 3, n == 5, and maybe 7 should be implemented as well */

  radix = *factors;
  len = n / radix;

  if (radix < n) {
    /* split the DFT of length n into `radix` DFTs of length `len` ... */
    switch (radix) {
      case 32: fft_unshuffle_32(0, len, in, out, len); break;
      case 16: fft_unshuffle_16(0, len, in, out, len); break;
      case 8:  fft_unshuffle_8(0, len, in, out, len);  break;
      case 4:  fft_unshuffle_4(0, len, in, out, len);  break;
      case 2:  fft_unshuffle_2(0, len, in, out, len);  break;
      default: unshuffle(0, len, in, out, radix, len); break;
    }

    /* ... and transform every sub-sequence in its own fork */
    fibril_t fr;
    int off;

    fibril_init(&fr);
    for (off = 0; off < n; off += len) {
      fibril_fork(&fr, fft_aux, (len, out + off, in + off, factors + 1, W, nW));
    }
    fibril_join(&fr);
  }

  /* multiply by the twiddle factors and perform `len` FFTs of length `radix` */
  switch (radix) {
    case 2:  fft_twiddle_2(0, len, in, out, W, nW, nW / n, len);  break;
    case 4:  fft_twiddle_4(0, len, in, out, W, nW, nW / n, len);  break;
    case 8:  fft_twiddle_8(0, len, in, out, W, nW, nW / n, len);  break;
    case 16: fft_twiddle_16(0, len, in, out, W, nW, nW / n, len); break;
    case 32: fft_twiddle_32(0, len, in, out, W, nW, nW / n, len); break;
    default:
      fft_twiddle_gen(0, len, in, out, W, nW, nW / n, radix, len);
      break;
  }
}
/*
 * Entry point of the transform: fills the coefficient table W, factorises n
 * with factor(), and runs fft_aux() on the whole input.
 */
static void fft(int n, COMPLEX * in, COMPLEX * out)
{
  int factors[40]; /* allows FFTs up to at least 3^40 */
  int count = 0;
  int rest = n;

  compute_w_coefficients(n, 0, n / 2, W);

  /* peel off one factor at a time, in the order factor() proposes them */
  do {
    int f = factor(rest);

    factors[count] = f;
    ++count;
    rest /= f;
  } while (rest > 1);

  fft_aux(n, in, out, factors, W, n);
}
/****************************************************************
* END OF FFT ALGORITHM
****************************************************************/
/* tests */
/*
 * Reference O(n^2) DFT used by verify():
 *   out[j] = sum_i in[i] * (cos(2*pi*i*j/n) - i*sin(2*pi*i*j/n)).
 *
 * The product i * j is formed in 64-bit arithmetic; as an int it would
 * overflow (undefined behaviour) as soon as n exceeds 46341.
 */
static void fft_alt(int n, COMPLEX * in, COMPLEX * out)
{
  int i, j;
  COMPLEX sum;
  COMPLEX w;

  /* self-reference keeps builds that never call this function warning-free */
  (void) fft_alt;

  for (j = 0; j < n; ++j) {
    c_re(sum) = c_im(sum) = 0.0;
    for (i = 0; i < n; ++i) {
      /* exponent of the root of unity, reduced modulo n */
      const long long k = ((long long) i * j) % n;
      const double angle = (2.0 * pi * k) / n;

      c_re(w) = cos(angle);
      c_im(w) = -sin(angle);
      c_re(sum) += c_re(in[i]) * c_re(w) - c_im(in[i]) * c_im(w);
      c_im(sum) += c_im(in[i]) * c_re(w) + c_re(in[i]) * c_im(w);
    }
    out[j] = sum;
  }
}
void init()
{
size = (1 << n);
out = (COMPLEX*) malloc(sizeof(COMPLEX) * size);
in = (COMPLEX*) malloc(sizeof(COMPLEX) * size);
W = (COMPLEX*) malloc(sizeof(COMPLEX) * (size + 1));
int i;
for (i = 0; i < size; ++i) {
c_re(in[i]) = drand48();
c_im(in[i]) = drand48();
}
}
/*
 * Per-run preparation: (re)fill the working copy `cp` from the pristine
 * input `in`.  The copy buffer is allocated on first use and then reused;
 * the process is terminated with a diagnostic when that allocation fails.
 */
void prep()
{
  if (cp == NULL) {
    cp = (COMPLEX*) malloc(sizeof(COMPLEX) * size);
    if (cp == NULL) {
      fprintf(stderr, "fft: cannot allocate input copy for size=%d\n", size);
      exit(EXIT_FAILURE);
    }
  }
  memcpy(cp, in, sizeof(COMPLEX) * size);
}
/* Benchmark body: transform the working copy `cp` (filled by prep()) into
 * `out`.  Presumably called by the harness declared in test.h -- confirm. */
void test()
{
fft(size, cp, out);
}
#ifdef BENCHMARK
int verify(void) { return 0; }
#else
int verify(void)
{
COMPLEX * expect = (COMPLEX*) malloc(sizeof(COMPLEX) * size);
fft_alt(size, in, expect);
/* compute the relative error */
double error = 0.0;
int i;
for (i = 0; i < size; ++i) {
double a = sqrt(
(c_re(out[i]) - c_re(expect[i])) * (c_re(out[i]) - c_re(expect[i])) +
(c_im(out[i]) - c_im(expect[i])) * (c_im(out[i]) - c_im(expect[i])));
double d = sqrt(
c_re(expect[i]) * c_re(expect[i]) + c_im(expect[i]) * c_im(expect[i]));
if (d < -1.0e-10 || d > 1.0e-10) a /= d;
if (a > error) error = a;
}
if (error > 1e-3) {
printf("size=%d error=%e\n", size, error);
return 1;
} else {
return 0;
}
}
#endif
This diff is collapsed.
#include <stdio.h>
#include "test.h"
/* Index of the Fibonacci number computed by test(). */
int n = 42;
/* Result of the last test() run; compared against fib_fast(n) in verify(). */
int m;
/*
 * Iterative Fibonacci used as the reference result in verify().
 * Values below 2 are returned unchanged.
 */
static int fib_fast(int n)
{
  int prev = 0;
  int cur = 1;
  int step;

  if (n < 2) return n;

  /* advance the (prev, cur) pair from (F0, F1) up to (F(n-1), F(n)) */
  for (step = 2; step <= n; ++step) {
    int next = prev + cur;

    prev = cur;
    cur = next;
  }
  return cur;
}
/*
 * Naive recursive Fibonacci in fork/join form: fib(n - 1) is forked and
 * writes its result through a pointer, fib(n - 2) runs inline, and the two
 * results are added after the join.
 */
fibril int fib(int n)
{
  if (n < 2) return n;

  fibril_t fr;
  int forked;   /* written by the forked call; valid only after the join */

  fibril_init(&fr);
  fibril_fork(&fr, &forked, fib, (n - 1));

  int inlined = fib(n - 2);

  fibril_join(&fr);
  return forked + inlined;
}
/*
 * Check the parallel result m against the iterative reference.
 * Returns 0 on a match; otherwise prints both values and returns 1.
 */
int verify()
{
  const int expect = fib_fast(n);

  if (expect == m)
    return 0;

  printf("fib(%d)=%d (expected %d)\n", n, m, expect);
  return 1;
}
/* Nothing to set up: the only input is the global n. */
void init() {}
/* No per-run preparation is needed. */
void prep() {}
/* Benchmark body: compute fib(n) and keep the result in m for verify(). */
void test() {
m = fib(n);
}
/*
 * Front-end header of the fork/join benchmark API.  It selects exactly one
 * backend header based on the FIBRIL_* macro passed on the command line and
 * defines the arity-dispatching fibril_fork() macro on top of the backend's
 * fibril_fork_nrt / fibril_fork_wrt.
 */
#ifndef FIBRIL_H
#define FIBRIL_H
/** Status codes. */
#define FIBRIL_SUCCESS 0
#define FIBRIL_FAILURE -1
/**
 * These are special arguments to fibril_rt_init().
 * FIBRIL_NPROCS tells the runtime to fetch the number of processors
 * from the environment variable FIBRIL_NPROCS (getenv(FIBRIL_NPROCS)).
 * FIBRIL_NPROCS_ONLN tells the runtime to use all available processors
 * in the system (sysconf(_SC_NPROCESSORS_ONLN)).
 */
#define FIBRIL_NPROCS 0
#define FIBRIL_NPROCS_ONLN -1
/*
 * Backend selection.  Only FIBRIL_SERIAL is tested with #ifdef; every other
 * selector is tested by VALUE (#elif), so it must be defined to a non-zero
 * value (a plain -DFIBRIL_TBB defines it as 1).  The first matching branch
 * wins; if none matches, no backend header is included.
 */
/** Serial version. */
#ifdef FIBRIL_SERIAL
#include "serial/serial.h"
/** Cilkplus version. */
#elif FIBRIL_CILKPLUS
#include "cilkplus/cilkplus.h"
/** TBB version. */
#elif FIBRIL_TBB
#include "tbb/tbb.h"
/** OpenMP version. */
#elif FIBRIL_OPENMP
#include "openmp/openmp.h"
/** Emper continuation version. */
#elif FIBRIL_EMPER_CONTINUATION
#include "emper_continuation/emper_continuation.h"
/** Emper fiber version. */
#elif FIBRIL_EMPER_FIBER
#include "emper_fiber/emper_fiber.h"
/** Fibril versions: the fibril_lf/ variant is checked before fibril/. */
#elif FIBRIL_FIBRIL_LF
#include "fibril_lf/fibrile.h"
#elif FIBRIL_FIBRIL
#include "fibril/fibrile.h"
#endif
/**
 * fibril_fork has two versions: one with return value and one without.
 * The argument count is computed by _fibril_nth and pasted onto
 * _fibril_fork_ to pick the implementation.
 */
#define fibril_fork(...) _fibril_fork_(_fibril_nth(__VA_ARGS__), __VA_ARGS__)
#define _fibril_fork_(n, ...) _fibril_concat(_fibril_fork_, n)(__VA_ARGS__)
/** If nargs is 3, use the no-return-value version. */
#define _fibril_fork_3(...) fibril_fork_nrt(__VA_ARGS__)
/** If nargs is 4, use the with-return-value version. */
#define _fibril_fork_4(...) fibril_fork_wrt(__VA_ARGS__)
/**
 * Helper macros to count number of arguments (up to 16).
 * The argument list is passed twice, so every argument shifts the table
 * below by two slots; that is why each count appears twice in the table.
 * `, ## __VA_ARGS__` is the GNU comma-swallowing extension.
 */
#define _fibril_nth(...) _fibril_nth_(__VA_ARGS__, ## __VA_ARGS__, \
16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, \
8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 0)
/** Selects its 33rd argument, which is the count entry of the table. */
#define _fibril_nth_(_1, _1_, _2, _2_, _3, _3_, _4, _4_, _5, _5_, \
_6, _6_, _7, _7_, _8, _8_, _9, _9_, _10, _10_, _11, _11_, _12, _12_, \
_13, _13_, _14, _14_, _15, _15_, _16, _16_, N, ...) N
/** Token pasting; note that the operands are pasted without prior expansion. */
#define _fibril_concat(left, right) left##right
#endif /* end of include guard: FIBRIL_H */
# Benchmarks built against the fibril runtime library.
#
# FIBRIL_ROOT is the prefix that contains lib/libfibril.  It defaults to the
# location that used to be hard-coded here, so existing setups keep working;
# on any other machine pass -DFIBRIL_ROOT=<prefix> (or -DFIBRIL_LIB=<file>).
set(FIBRIL_ROOT "/home/nicolas/uni/ma/fibril/build" CACHE PATH
    "Prefix that contains lib/libfibril for the fibril benchmark backend")
find_library(FIBRIL_LIB NAMES fibril PATHS "${FIBRIL_ROOT}/lib")
if(NOT FIBRIL_LIB)
  message(FATAL_ERROR
    "libfibril not found; set FIBRIL_ROOT to the fibril build prefix "
    "(searched ${FIBRIL_ROOT}/lib and the default library paths)")
endif()

# Every benchmark <name> is built from ../<name>.cpp into <name>_fibril.
# FIBRIL_FIBRIL selects the fibril backend in fibril.h; it is attached to the
# targets instead of the whole directory so it cannot leak to other targets.
foreach(benchmark IN ITEMS
    cholesky
    fft
    fib
    heat
    integrate
    knapsack
    lu
    matmul
    nqueens
    quicksort
    rectmul
    strassen)
  add_executable(${benchmark}_fibril ../${benchmark}.cpp)
  target_compile_definitions(${benchmark}_fibril PRIVATE FIBRIL_FIBRIL)
  target_link_libraries(${benchmark}_fibril PRIVATE "${FIBRIL_LIB}")
endforeach()
/*
 * Public interface of the fibril backend: the `fibril` function attribute,
 * the fibril_t handle, inline init/join, the fork macros (separate C++ and
 * C implementations) and the runtime entry points.
 */
#ifndef FIBRILE_H
#define FIBRILE_H
#ifdef __cplusplus
extern "C" {
#endif
#include "fibrili.h"
/** Function attribute for forking functions: keeps the frame pointer, which
 * fibril_init() reads through rbp. */
#define fibril __attribute__((optimize("no-omit-frame-pointer")))
/** Fork/join handle; the layout is struct _fibril_t from fibrili.h. */
typedef struct _fibril_t fibril_t;
/**
 * Initialise a handle in the frame of the calling function.
 * Clears lock and unmapped, sets count to -1, and records the caller's
 * current rbp / rsp as stack.btm / stack.top.  always_inline is required so
 * that the two registers are those of the caller's frame.
 */
__attribute__((always_inline)) extern inline
void fibril_init(fibril_t * frptr)
{
/* local variables pinned to the frame- and stack-pointer registers */
register void * rbp asm ("rbp");
register void * rsp asm ("rsp");
frptr->lock = 0;
frptr->unmapped = 0;
frptr->count = -1;
frptr->stack.btm = rbp;
frptr->stack.top = rsp;
}
/**
 * Join on a handle.  The runtime's fibrili_join() is only entered when
 * count is above its initial value of -1 (presumably meaning that a forked
 * frame was taken over by another worker -- confirm against the runtime).
 */
__attribute__((always_inline)) extern inline
void fibril_join(fibril_t * frptr)
{
if (frptr->count > -1) {
fibrili_membar(fibrili_join(frptr));
}
}
#include "fork.h"
/*
 * Fork macros.  Both variants define a local, non-inlined trampoline named
 * _fibril_<fn>_fork that takes the fork arguments plus the handle, pushes
 * the handle with fibrili_push(), calls fn, and calls fibrili_resume() when
 * fibrili_pop() returns 0.  The call to the trampoline is wrapped in
 * fibrili_membar().  _fibril_defs / _fibril_args / _fibril_expand come from
 * fork.h and turn the parenthesised argument list `ag` into parameter
 * declarations, parameter names and call arguments respectively.
 */
#ifdef __cplusplus
/* C++: the trampoline is a capture-less lambda. */
/** _fibril_fork_nrt. */
#define fibril_fork_nrt(fp, fn, ag) do { \
auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f) __attribute__((noinline, hot, optimize(3))) { \
fibrili_push(f); \
fn(_fibril_args ag); \
if (!fibrili_pop()) fibrili_resume(f); \
}; \
fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \
} while (0)
/** _fibril_fork_wrt: as above, the result of fn is stored through rtp. */
#define fibril_fork_wrt(fp, rtp, fn, ag) do { \
auto _fibril_##fn##_fork = [](_fibril_defs ag fibril_t * f, __typeof__(rtp) p) __attribute__((noinline, hot, optimize(3))) { \
fibrili_push(f); \
*p = fn(_fibril_args ag); \
if (!fibrili_pop()) fibrili_resume(f); \
}; \
fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp, rtp)); \
} while (0)
#else
/* C: the trampoline is a function defined inside the block (GNU C nested
 * function). */
/** _fibril_fork_nrt. */
#define fibril_fork_nrt(fp, fn, ag) do { \
__attribute__((noinline, hot, optimize(3))) \
void _fibril_##fn##_fork(_fibril_defs ag fibril_t * f) { \
fibrili_push(f); \
fn(_fibril_args ag); \
if (!fibrili_pop()) fibrili_resume(f); \
} \
fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \
} while (0)
/** _fibril_fork_wrt: as above, the result of fn is stored through rtp. */
#define fibril_fork_wrt(fp, rtp, fn, ag) do { \
__attribute__((noinline, hot, optimize(3))) \
void _fibril_##fn##_fork(_fibril_defs ag fibril_t * f, __typeof__(rtp) p) { \
fibrili_push(f); \
*p = fn(_fibril_args ag); \
if (!fibrili_pop()) fibrili_resume(f); \
} \
fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp, rtp)); \
} while (0)
#endif
/* Runtime entry points, implemented in the fibril library. */
extern int fibril_rt_init(int nprocs);
extern int fibril_rt_exit();
extern int fibril_rt_nprocs();
#ifdef __cplusplus
}
#endif
#endif /* end of include guard: FIBRILE_H */
#ifndef FIBRILI_H
#define FIBRILI_H
/*
 * Fork/join handle.  fibril_init() (fibrile.h) sets lock and unmapped to 0,
 * count to -1 and stack.btm / stack.top to the caller's rbp / rsp;
 * fibrili_push() stores the trampoline's return address in pc.
 * stack.ptr is not written anywhere in the headers -- presumably owned by
 * the runtime library (confirm there).
 */
struct _fibril_t {
char lock;
char unmapped;
int count;
struct {
void * btm;
void * top;
void * ptr;
} stack;
void * pc;
};
/*
 * Per-thread deque of pushed handles.  The owning thread appends at `tail`
 * (fibrili_push) and removes at `tail` (fibrili_pop); `head` is compared
 * against `tail` in fibrili_pop() and is presumably advanced by other
 * workers under `lock` -- confirm against the runtime.
 * NOTE(review): buff holds 1000 entries and fibrili_push() performs no
 * bounds check.
 */
extern __thread struct _fibrili_deque_t {
char lock;
int head;
int tail;
void * stack;
void * buff[1000];
} fibrili_deq;
/*
 * Fence and spin-lock primitives.
 *
 *   fibrili_fence()   full (sequentially consistent) memory fence
 *   fibrili_lock(l)   spin until the byte-sized flag l has been acquired
 *   fibrili_unlock(l) release the flag
 *
 * Compilers reporting GCC 4.8 or newer use the __atomic builtins.  The
 * version test compares major and minor together: testing
 * `__GNUC_MINOR__ > 7` on its own would misclassify newer majors with a
 * small minor number (e.g. 9.3) as "too old".  Older compilers fall back to
 * the legacy __sync builtins, which is only provided for x86-64.
 */
#if defined(__GNUC__) && \
    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 7))
#define fibrili_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#define fibrili_lock(l) do { \
    __asm__ ( "pause" : : : "memory" ); \
} while (__atomic_test_and_set(&(l), __ATOMIC_ACQUIRE))
#define fibrili_unlock(l) __atomic_clear(&(l), __ATOMIC_RELEASE)
#else
#if defined(__x86_64__) || defined(_M_X64_)
#define fibrili_fence() __sync_synchronize()
#define fibrili_lock(l) do { \
    __asm__ ( "pause" ::: "memory" ); \
} while (__sync_lock_test_and_set(&(l), 1))
#define fibrili_unlock(l) __sync_lock_release(&(l))
#endif
#endif
/* Runtime slow path of a join; implemented in the fibril library. */
__attribute__((noinline)) extern
void fibrili_join(struct _fibril_t * frptr);
/* Declared noreturn: control does not come back to the caller.  Called by
 * the fork trampolines when fibrili_pop() returns 0. */
__attribute__((noreturn)) extern
void fibrili_resume(struct _fibril_t * frptr);
/*
 * Record the return address of the enclosing function in the handle and
 * append the handle to this thread's deque.  Must be expanded inside the
 * fork trampoline, because __builtin_return_address(0) refers to the
 * function the macro is expanded in.  No bounds check is performed on buff.
 */
#define fibrili_push(frptr) do { \
(frptr)->pc = __builtin_return_address(0); \
fibrili_deq.buff[fibrili_deq.tail++] = (frptr); \
} while (0)
/*
 * Remove the most recently pushed entry from this thread's deque.
 *
 * Returns 1 when the entry was still present and 0 when the deque was empty
 * or head had already moved past the entry (presumably because another
 * worker took it -- confirm against the runtime).  In the latter case head
 * and tail are both reset to 0.
 */
__attribute__((hot)) static
int fibrili_pop(void)
{
int tail = fibrili_deq.tail;
if (tail == 0) return 0;
/* publish the decremented tail before reading head */
fibrili_deq.tail = --tail;
fibrili_fence();
if (fibrili_deq.head > tail) {
/* possible conflict: undo the decrement and re-check under the lock */
fibrili_deq.tail = tail + 1;
fibrili_lock(fibrili_deq.lock);
if (fibrili_deq.head > tail) {
/* the entry is gone: reset the deque to empty */
fibrili_deq.head = 0;
fibrili_deq.tail = 0;
fibrili_unlock(fibrili_deq.lock);
return 0;
}
/* the entry is still ours: commit the decrement */
fibrili_deq.tail = tail;
fibrili_unlock(fibrili_deq.lock);
}
return 1;
}
/*
 * Execute `call`, then a one-instruction asm statement that declares rbx,
 * r12-r15 and memory as clobbered.  The compiler therefore cannot keep any
 * value cached in those callee-saved registers, or assume memory contents,
 * across the call.  Presumably needed because execution after a fork/join
 * may continue in a different context -- confirm against the runtime.
 */
#define fibrili_membar(call) do { \
call; \
__asm__ ( "nop" : : : "rbx", "r12", "r13", "r14", "r15", "memory" ); \
} while (0)
#endif /* end of include guard: FIBRILI_H */
/*
 * Helper macros used by the fork macros to process the parenthesised
 * argument list `ag` of fibril_fork_nrt / fibril_fork_wrt (up to 16
 * arguments).  Each family counts the arguments with _fibril_nth and then
 * pastes the count onto its prefix with _fibril_concat; neither helper is
 * defined in this header, so both must already be visible where these
 * macros are expanded.
 */
#ifndef FIBRIL_FORK_H
#define FIBRIL_FORK_H
/*
 * _fibril_defs(args...): parameter declarations for the trampoline.
 * For n arguments it yields `__typeof__(arg) aN, ..., __typeof__(arg) a1,`
 * -- note the trailing comma, so that further parameters can follow
 * directly.  Expands to nothing for zero arguments.
 */
#define _fibril_defs(...) \
_fibril_defs_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_defs_(n, ...) \
_fibril_concat(_fibril_defs_, n)(__VA_ARGS__)
#define _fibril_defs_16(a,...) __typeof__(a) a16,_fibril_defs_15(__VA_ARGS__)
#define _fibril_defs_15(a,...) __typeof__(a) a15,_fibril_defs_14(__VA_ARGS__)
#define _fibril_defs_14(a,...) __typeof__(a) a14,_fibril_defs_13(__VA_ARGS__)
#define _fibril_defs_13(a,...) __typeof__(a) a13,_fibril_defs_12(__VA_ARGS__)
#define _fibril_defs_12(a,...) __typeof__(a) a12,_fibril_defs_11(__VA_ARGS__)
#define _fibril_defs_11(a,...) __typeof__(a) a11,_fibril_defs_10(__VA_ARGS__)
#define _fibril_defs_10(a,...) __typeof__(a) a10,_fibril_defs_9 (__VA_ARGS__)
#define _fibril_defs_9(a, ...) __typeof__(a) a9, _fibril_defs_8 (__VA_ARGS__)
#define _fibril_defs_8(a, ...) __typeof__(a) a8, _fibril_defs_7 (__VA_ARGS__)
#define _fibril_defs_7(a, ...) __typeof__(a) a7, _fibril_defs_6 (__VA_ARGS__)
#define _fibril_defs_6(a, ...) __typeof__(a) a6, _fibril_defs_5 (__VA_ARGS__)
#define _fibril_defs_5(a, ...) __typeof__(a) a5, _fibril_defs_4 (__VA_ARGS__)
#define _fibril_defs_4(a, ...) __typeof__(a) a4, _fibril_defs_3 (__VA_ARGS__)
#define _fibril_defs_3(a, ...) __typeof__(a) a3, _fibril_defs_2 (__VA_ARGS__)
#define _fibril_defs_2(a, ...) __typeof__(a) a2, _fibril_defs_1 (__VA_ARGS__)
#define _fibril_defs_1(a) __typeof__(a) a1,
#define _fibril_defs_0()
/*
 * _fibril_args(args...): the parameter names declared by _fibril_defs, in
 * the same order (`aN, ..., a1`, no trailing comma), for forwarding them to
 * the forked function inside the trampoline.
 */
#define _fibril_args(...) \
_fibril_args_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_args_(n, ...) \
_fibril_concat(_fibril_args_, n)(__VA_ARGS__)
#define _fibril_args_16(a,...) a16,_fibril_args_15(__VA_ARGS__)
#define _fibril_args_15(a,...) a15,_fibril_args_14(__VA_ARGS__)
#define _fibril_args_14(a,...) a14,_fibril_args_13(__VA_ARGS__)
#define _fibril_args_13(a,...) a13,_fibril_args_12(__VA_ARGS__)
#define _fibril_args_12(a,...) a12,_fibril_args_11(__VA_ARGS__)
#define _fibril_args_11(a,...) a11,_fibril_args_10(__VA_ARGS__)
#define _fibril_args_10(a,...) a10,_fibril_args_9 (__VA_ARGS__)
#define _fibril_args_9(a, ...) a9, _fibril_args_8 (__VA_ARGS__)
#define _fibril_args_8(a, ...) a8, _fibril_args_7 (__VA_ARGS__)
#define _fibril_args_7(a, ...) a7, _fibril_args_6 (__VA_ARGS__)
#define _fibril_args_6(a, ...) a6, _fibril_args_5 (__VA_ARGS__)
#define _fibril_args_5(a, ...) a5, _fibril_args_4 (__VA_ARGS__)
#define _fibril_args_4(a, ...) a4, _fibril_args_3 (__VA_ARGS__)
#define _fibril_args_3(a, ...) a3, _fibril_args_2 (__VA_ARGS__)
#define _fibril_args_2(a, ...) a2, _fibril_args_1 (__VA_ARGS__)
#define _fibril_args_1(a) a1
#define _fibril_args_0()
/*
 * _fibril_expand(args...): the original argument expressions followed by a
 * trailing comma (nothing for zero arguments), used at the call site of the
 * trampoline where the handle pointer follows the arguments.
 */
#define _fibril_expand(...) \
_fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_expand_(n, ...) \
_fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
#define _fibril_expand_16(...) __VA_ARGS__,
#define _fibril_expand_15(...) __VA_ARGS__,
#define _fibril_expand_14(...) __VA_ARGS__,
#define _fibril_expand_13(...) __VA_ARGS__,
#define _fibril_expand_12(...) __VA_ARGS__,
#define _fibril_expand_11(...) __VA_ARGS__,
#define _fibril_expand_10(...) __VA_ARGS__,
#define _fibril_expand_9( ...) __VA_ARGS__,
#define _fibril_expand_8( ...) __VA_ARGS__,
#define _fibril_expand_7( ...) __VA_ARGS__,
#define _fibril_expand_6( ...) __VA_ARGS__,
#define _fibril_expand_5( ...) __VA_ARGS__,
#define _fibril_expand_4( ...) __VA_ARGS__,
#define _fibril_expand_3( ...) __VA_ARGS__,
#define _fibril_expand_2( ...) __VA_ARGS__,
#define _fibril_expand_1( ...) __VA_ARGS__,
#define _fibril_expand_0()
#endif /* end of include guard: FIBRIL_FORK_H */
# Benchmarks built against the fibril_lf variant of the fibril runtime.
#
# FIBRIL_LF_ROOT is the prefix that contains lib/libfibril of that variant.
# It defaults to the location that used to be hard-coded here, so existing
# setups keep working; on any other machine pass -DFIBRIL_LF_ROOT=<prefix>
# (or -DFIBRIL_LF_LIB=<file>).
set(FIBRIL_LF_ROOT "/home/nicolas/uni/ma/fibril_wf/build" CACHE PATH
    "Prefix that contains lib/libfibril for the fibril_lf benchmark backend")
find_library(FIBRIL_LF_LIB NAMES fibril PATHS "${FIBRIL_LF_ROOT}/lib")
if(NOT FIBRIL_LF_LIB)
  message(FATAL_ERROR
    "libfibril (fibril_lf variant) not found; set FIBRIL_LF_ROOT to its "
    "build prefix (searched ${FIBRIL_LF_ROOT}/lib and the default library "
    "paths)")
endif()

# Every benchmark <name> is built from ../<name>.cpp into <name>_fibril_lf.
# FIBRIL_FIBRIL_LF selects the fibril_lf backend in fibril.h; it is attached
# to the targets instead of the whole directory so it cannot leak to other
# targets.
foreach(benchmark IN ITEMS
    cholesky
    fft
    fib
    heat
    integrate
    knapsack
    lu
    matmul
    nqueens
    quicksort
    rectmul
    strassen)
  add_executable(${benchmark}_fibril_lf ../${benchmark}.cpp)
  target_compile_definitions(${benchmark}_fibril_lf PRIVATE FIBRIL_FIBRIL_LF)
  target_link_libraries(${benchmark}_fibril_lf PRIVATE "${FIBRIL_LF_LIB}")
endforeach()