diff --git a/.clang-tidy b/.clang-tidy
index 15acd04ff8c95e54886330ea3567c1a76eed7bae..1772a66ce7f81b29d4e6d7b2a0823acd49f6f33b 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -9,6 +9,7 @@ Checks: >
   -cert-err58-cpp,
   -clang-diagnostic-empty-translation-unit,
   -readability-braces-around-statements,
+  -readability-function-cognitive-complexity,
   -readability-implicit-bool-conversion,
   -readability-isolate-declaration,
   -readability-magic-numbers,
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d3ea04bd84519bf33a2ab059d9bae93ccc87d8d8..734b3b27aee08b9781ea4833747a39c8247475a9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -72,7 +72,7 @@ clang-tidy:
 .test:
   stage: test
   script:
-    - make test
+    - make test-all
 
 .gcc:
   variables:
diff --git a/Makefile b/Makefile
index 828923d6154e5f235a8f6d169ae164c77b6b8b7c..5dd0dcef4ba6e7e30905f3b7a6042b4bd90ee188 100644
--- a/Makefile
+++ b/Makefile
@@ -41,10 +41,9 @@ clang:
 fast-static-analysis: all check-format check-license
 
 STATIC_ANALYSIS_NINJA_TARGETS += iwyu
-STATIC_ANALYSIS_NINJA_TARGETS += clang-tidy
 
 .PHONY: static-analysis
-static-analysis: fast-static-analysis
+static-analysis: fast-static-analysis tidy
 	$(NINJA) -C build $(STATIC_ANALYSIS_NINJA_TARGETS)
 
 .PHONY: smoke-test-suite
@@ -56,12 +55,12 @@ smoke-test: smoke-test-suite static-analysis
 
 .PHONY: test
 test: all
-	$(NINJA) -C build test
-
-.PHONY: test-mazstab
-test-mazstab: all
 	cd build && meson test --suite MazStab
 
+.PHONY: test-all
+test-all: all
+	$(NINJA) -C build test
+
 .PHONY: check
 check: smoke-test test
 
@@ -77,6 +76,8 @@ check-license:
 format: all
 	$(NINJA) -C build clang-format
 
+# Note that we do not use meson's clang-tidy target, as it would also
+# run on subprojects.
 .PHONY: tidy
 tidy: compile_commands_wo_subprojects/compile_commands.json
 	./tools/run-clang-tidy
diff --git a/README.md b/README.md
index b3a94d55d3cac8abc480f9163ecf8776ac9cf908..43ffa2e291a9e437d6e707b27bd805124a8fafca 100644
--- a/README.md
+++ b/README.md
@@ -9,3 +9,72 @@ mzs_spawn_nrt
 mzs_spawn_wrt
 
 mzs_sync
+
+Benchmark API
+-------------
+
+```
+void mzs_benchmark_init(const mazstab::ProblemSize& problem_size);
+```
+
+Initializes the benchmark *once* prior the benchmark runs.
+
+```
+void mzs_benchmark_prepare();
+#pragma weak mzs_benchmark_prepare
+```
+
+Prepares the benchmarks before *every* benchmark run. Typically used
+to reset the input data to a default value.
+
+```
+void mzs_benchmark_run();
+```
+
+Triggers exactly one benchmark run. This is the method around the
+non-functional data of the benchmark run, like execution time and
+memory consumption, is collected.
+
+```
+auto mzs_benchmark_verify() -> int;
+#pragma weak mzs_benchmark_verify
+```
+
+Verifies the result of the benchmark run. Invoked after every
+`mzs_benchmark_run()`. Typically used to compare the results of the
+current run with a valid result. Must return a non-zero value if
+verification failed. This is a weak function and hence does not need
+to be provided.
+
+Runtime System API
+------------------
+
+```
+void mzs_rts_enter(const std::function<void()> &function)
+```
+
+Some concurrency platforms require their runtime system to be entered prior executing concurrency platform specific code.
+
+```
+auto mzs_rts_init(int nthreads) -> int
+```
+
+```
+auto mzs_rts_exit() -> int;
+#pragma weak mzs_rts_exit
+```
+
+```
+void mzs_rts_dump_config(int fd);
+#pragma weak mzs_rts_dump_config
+```
+
+```
+void mzs_rts_reset_stats();
+#pragma weak mzs_rts_reset_stats
+```
+
+```
+void mzs_rts_dump_stats(int fd);
+#pragma weak mzs_rts_dump_stats
+```
diff --git a/benchmark-driver.cc b/benchmark-driver.cc
index 100c35c7301d358821ddb752206da7e30605437d..d81b65bc5f3f2d3391c3ffde7cc32b67fbe6ab9a 100644
--- a/benchmark-driver.cc
+++ b/benchmark-driver.cc
@@ -3,18 +3,23 @@
 #include <sys/resource.h>
 #include <sys/sysinfo.h>
 
+#include <boost/program_options.hpp>
 #include <chrono>
 #include <cinttypes>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <functional>
+#include <iostream>
 #include <ratio>
 #include <string>
 #include <vector>
 
-#include "common.h"
 #include "common.hpp"
+#include "mazstab.hpp"
+
+namespace mzs = mazstab;
+namespace po = boost::program_options;
 
 void mzs_rts_enter(const std::function<void()> &function);
 
@@ -32,28 +37,33 @@ void mzs_rts_reset_stats();
 void mzs_rts_dump_stats(int fd);
 #pragma weak mzs_rts_dump_stats
 
-void mzs_benchmark_init();
-#pragma weak mzs_benchmark_init
+void mzs_benchmark_init(const mzs::ProblemSize &problem_size);
 
 void mzs_benchmark_prepare();
 #pragma weak mzs_benchmark_prepare
 
 extern void mzs_benchmark_run();
 
-auto mzs_benchmark_verify() -> int;
+auto mzs_benchmark_verify() -> mzs::VerificationResult;
 #pragma weak mzs_benchmark_verify
 
 const int MZS_BENCHMARK_VERIFICATION_FAILED = 2;
 
+// Benchmark configuration
+static mzs::ProblemSize problem_size = mzs::ProblemSize::m;
+static unsigned int warmup_iterations = 1;
+static unsigned int iterations = 1;
+
+static FILE *log_sink;
+static FILE *csv_sink;
+
+// Benchmark results
 struct stats {
 	struct rusage rusage_before;
 	struct rusage rusage_after;
 	std::chrono::high_resolution_clock::duration duration;
 };
 
-static FILE *log_sink;
-static FILE *csv_sink;
-
 static std::vector<stats> warmup_iteration_stats;
 static std::vector<stats> iteration_stats;
 
@@ -76,8 +86,9 @@ auto do_single_benchmark_run(const std::string &iteration_identifier) -> struct
 	safe_getrusage(&rusage_after);
 
 	if (&mzs_benchmark_verify) {
-		int res = mzs_benchmark_verify();
-		if (res) {
+		mzs::VerificationResult res = mzs_benchmark_verify();
+		// TODO: Record verification result in output data.
+		if (res == mzs::VerificationResult::failed) {
 			exit(MZS_BENCHMARK_VERIFICATION_FAILED);
 		}
 	}
@@ -98,8 +109,8 @@ auto do_single_benchmark_run(const std::string &iteration_identifier) -> struct
 	return s;
 }
 
-void perform_benchmark(unsigned int warmup_iterations, unsigned int iterations) {
-	if (&mzs_benchmark_init) mzs_benchmark_init();
+void perform_benchmark() {
+	mzs_benchmark_init(problem_size);
 
 	for (unsigned int iteration = 0; iteration < warmup_iterations; ++iteration) {
 		std::string iteration_identifier("warmup-" + std::to_string(iteration));
@@ -114,15 +125,31 @@ void perform_benchmark(unsigned int warmup_iterations, unsigned int iterations)
 	}
 }
 
-auto main(UNUSED int argc, UNUSED const char *argv[]) -> int {
-	const int nprocs = get_nprocs();
-
-	unsigned int warmup_iterations = 1;
-	unsigned int iterations = 1;
+auto main(int argc, const char *argv[]) -> int {
+	int nthreads = -1;
+	po::options_description desc("Allow options");
+	// clang-format off
+	desc.add_options()
+		("help", "Show help")
+		("problem-size", po::value<mzs::ProblemSize>(&problem_size), "The problem size: xs or m")
+		("nthreads", po::value<int>(&nthreads), "The number of threads")
+		;
+	// clang-format on
+
+	po::variables_map vm;
+	auto parse_result = po::parse_command_line(argc, argv, desc);
+	po::store(parse_result, vm);
+	po::notify(vm);
+
+	if (vm.count("help")) {
+		std::cout << desc << "\n";
+		return EXIT_SUCCESS;
+	}
 
-	int nthreads = nprocs;
-	char *env = getenv("BENCHMARK_NPROCS");
-	if (env) nthreads = std::stoi(env);
+	if (nthreads < 0) {
+		const int nprocs = get_nprocs();
+		nthreads = nprocs;
+	}
 
 	log_sink = stdout;
 	csv_sink = stdout;
@@ -132,7 +159,5 @@ auto main(UNUSED int argc, UNUSED const char *argv[]) -> int {
 		DIE_MSG("Runtime-system (RTS) initialization failed");
 	}
 
-	// TODO:  would require special wrapping in case of OpenMP
-	// _omp_init(n, [&]() { mzs_benchmark_run() });
-	mzs_rts_enter([&] { perform_benchmark(warmup_iterations, iterations); });
+	mzs_rts_enter([&] { perform_benchmark(); });
 }
diff --git a/benchmarks/fib/fib.cc b/benchmarks/fib/fib.cc
index 14d13f67fa324ed590d1411e9d721765513e240a..3ef1fb84284ce4c9538aad97475b94b52af05f6f 100644
--- a/benchmarks/fib/fib.cc
+++ b/benchmarks/fib/fib.cc
@@ -1,9 +1,12 @@
+#include <cassert>
 #include <cstdio>
 
 #include "mazstab-backend.hpp"
+#include "mazstab.hpp"
 
-// TODO: Change this back to 42.
-static int n = 13;
+namespace mzs = mazstab;
+
+static int n = -1;
 int m;
 
 static auto fib_fast(int n) -> int {
@@ -35,15 +38,29 @@ mzs_sf static auto fib(int n) -> int {
 	return x + y;
 }
 
-auto mzs_benchmark_verify() -> int {
+auto mzs_benchmark_verify() -> mzs::VerificationResult {
 	int expect = fib_fast(n);
 
 	if (expect != m) {
 		fprintf(stderr, "fib(%d)=%d (expected %d)\n", n, m, expect);
-		return 1;
+		return mzs::VerificationResult::failed;
 	}
 
-	return 0;
+	return mzs::VerificationResult::successful;
+}
+
+void mzs_benchmark_run() {
+	assert(n > 0);
+	m = fib(n);
 }
 
-void mzs_benchmark_run() { m = fib(n); }
+void mzs_benchmark_init(const mzs::ProblemSize &problem_size) {
+	switch (problem_size) {
+		case mzs::ProblemSize::xs:
+			n = 13;
+			break;
+		case mzs::ProblemSize::m:
+			n = 42;
+			break;
+	}
+}
diff --git a/benchmarks/lu/lu.cc b/benchmarks/lu/lu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d293899c7213f3359819663557de0ef96e9e900
--- /dev/null
+++ b/benchmarks/lu/lu.cc
@@ -0,0 +1,396 @@
+/****************************************************************************\
+ * LU decomposition
+ * Robert Blumofe
+ *
+ * Copyright (c) 1996, Robert Blumofe.  All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+\****************************************************************************/
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#include "mazstab-backend.hpp"
+#include "mazstab.hpp"
+
+namespace mzs = mazstab;
+
+/* Define the size of a block. */
+#define LU_BLOCK_SIZE 16
+
+/* A block is a 2D array of doubles. */
+// NOLINTNEXTLINE(modernize-use-using, modernize-avoid-c-arrays)
+typedef double Block[LU_BLOCK_SIZE][LU_BLOCK_SIZE];
+// NOLINTNEXTLINE(bugprone-macro-parentheses)
+#define BLOCK(B, I, J) (B[I][J])
+
+/* A matrix is a 1D array of blocks. */
+// NOLINTNEXTLINE(modernize-use-using)
+typedef Block *Matrix;
+#define MATRIX(M, I, J) ((M)[(I)*nBlocks + (J)])
+
+/** Matrix size. */
+int n = -1;
+
+/** The global matrix and a copy of the matrix. */
+static Matrix M, Msave;
+
+/* Matrix size in blocks. */
+static int nBlocks;
+
+/****************************************************************************\
+ * Utility routines.
+ \****************************************************************************/
+
+/*
+ * init_matrix - Fill in matrix M with random values.
+ */
+static void init_matrix(Matrix M, int nb) {
+	int I, J, K, i, j, k;
+
+	/* Initialize random number generator. */
+	srand(1);
+
+	/* For each element of each block, fill in random value. */
+	for (I = 0; I < nb; I++)
+		for (J = 0; J < nb; J++)
+			for (i = 0; i < LU_BLOCK_SIZE; i++)
+				for (j = 0; j < LU_BLOCK_SIZE; j++)
+					BLOCK(MATRIX(M, I, J), i, j) = ((double)rand()) / (double)RAND_MAX;
+
+	/* Inflate diagonal entries. */
+	for (K = 0; K < nb; K++)
+		for (k = 0; k < LU_BLOCK_SIZE; k++) BLOCK(MATRIX(M, K, K), k, k) *= 10.0;
+}
+
+/*
+ * test_result - Check that matrix LU contains LU decomposition of M.
+ */
+static auto test_result(Matrix LU, Matrix M, int nb) -> bool {
+	int I, J, K, i, j, k;
+	double diff, max_diff;
+	double v;
+
+	/* Initialize test. */
+	max_diff = 0.0;
+
+	/* Find maximum difference between any element of LU and M. */
+	for (i = 0; i < nb * LU_BLOCK_SIZE; i++)
+		for (j = 0; j < nb * LU_BLOCK_SIZE; j++) {
+			I = i / LU_BLOCK_SIZE;
+			J = j / LU_BLOCK_SIZE;
+			v = 0.0;
+			for (k = 0; k < i && k <= j; k++) {
+				K = k / LU_BLOCK_SIZE;
+				v += BLOCK(MATRIX(LU, I, K), i % LU_BLOCK_SIZE, k % LU_BLOCK_SIZE) *
+						 BLOCK(MATRIX(LU, K, J), k % LU_BLOCK_SIZE, j % LU_BLOCK_SIZE);
+			}
+			if (k == i && k <= j) {
+				K = k / LU_BLOCK_SIZE;
+				v += BLOCK(MATRIX(LU, K, J), k % LU_BLOCK_SIZE, j % LU_BLOCK_SIZE);
+			}
+			diff = fabs(BLOCK(MATRIX(M, I, J), i % LU_BLOCK_SIZE, j % LU_BLOCK_SIZE) - v);
+			if (diff > max_diff) max_diff = diff;
+		}
+
+	/* Check maximum difference against threshold. */
+	return (max_diff > 0.00001);
+}
+
+/****************************************************************************\
+ * Element operations.
+ \****************************************************************************/
+/*
+ * elem_daxmy - Compute y' = y - ax where a is a double and x and y are
+ * vectors of doubles.
+ */
+static void elem_daxmy(double a, const double *x, double *y, int n) {
+	for (n--; n >= 0; n--) y[n] -= a * x[n];
+}
+
+/****************************************************************************\
+ * Block operations.
+ \****************************************************************************/
+
+/*
+ * block_lu - Factor block B.
+ */
+static void block_lu(Block B) {
+	int i, k;
+
+	/* Factor block. */
+	for (k = 0; k < LU_BLOCK_SIZE; k++)
+		for (i = k + 1; i < LU_BLOCK_SIZE; i++) {
+			BLOCK(B, i, k) /= BLOCK(B, k, k);
+			elem_daxmy(BLOCK(B, i, k), &BLOCK(B, k, k + 1), &BLOCK(B, i, k + 1), LU_BLOCK_SIZE - k - 1);
+		}
+}
+
+/*
+ * block_lower_solve - Perform forward substitution to solve for B' in
+ * LB' = B.
+ */
+static void block_lower_solve(Block B, Block L) {
+	int i, k;
+
+	/* Perform forward substitution. */
+	for (i = 1; i < LU_BLOCK_SIZE; i++)
+		for (k = 0; k < i; k++)
+			elem_daxmy(BLOCK(L, i, k), &BLOCK(B, k, 0), &BLOCK(B, i, 0), LU_BLOCK_SIZE);
+}
+
+/*
+ * block_upper_solve - Perform forward substitution to solve for B' in
+ * B'U = B.
+ */
+static void block_upper_solve(Block B, Block U) {
+	int i, k;
+
+	/* Perform forward substitution. */
+	for (i = 0; i < LU_BLOCK_SIZE; i++)
+		for (k = 0; k < LU_BLOCK_SIZE; k++) {
+			BLOCK(B, i, k) /= BLOCK(U, k, k);
+			elem_daxmy(BLOCK(B, i, k), &BLOCK(U, k, k + 1), &BLOCK(B, i, k + 1), LU_BLOCK_SIZE - k - 1);
+		}
+}
+
+/*
+ * block_schur - Compute Schur complement B' = B - AC.
+ */
+static void block_schur(Block B, Block A, Block C) {
+	int i, k;
+
+	/* Compute Schur complement. */
+	for (i = 0; i < LU_BLOCK_SIZE; i++)
+		for (k = 0; k < LU_BLOCK_SIZE; k++)
+			elem_daxmy(BLOCK(A, i, k), &BLOCK(C, k, 0), &BLOCK(B, i, 0), LU_BLOCK_SIZE);
+}
+
+/****************************************************************************\
+ * Divide-and-conquer matrix LU decomposition.
+ \****************************************************************************/
+
+/**
+ * schur - Compute M' = M - VW.
+ */
+mzs_sf static void schur(Matrix M, Matrix V, Matrix W, int nb) {
+	Matrix M00, M01, M10, M11;
+	Matrix V00, V01, V10, V11;
+	Matrix W00, W01, W10, W11;
+	int hnb;
+
+	/* Check base case. */
+	if (nb == 1) {
+		block_schur(*M, *V, *W);
+		return;
+	}
+
+	/* Break matrices into 4 pieces. */
+	hnb = nb / 2;
+	M00 = &MATRIX(M, 0, 0);
+	M01 = &MATRIX(M, 0, hnb);
+	M10 = &MATRIX(M, hnb, 0);
+	M11 = &MATRIX(M, hnb, hnb);
+	V00 = &MATRIX(V, 0, 0);
+	V01 = &MATRIX(V, 0, hnb);
+	V10 = &MATRIX(V, hnb, 0);
+	V11 = &MATRIX(V, hnb, hnb);
+	W00 = &MATRIX(W, 0, 0);
+	W01 = &MATRIX(W, 0, hnb);
+	W10 = &MATRIX(W, hnb, 0);
+	W11 = &MATRIX(W, hnb, hnb);
+
+	/* Form Schur complement with recursive calls. */
+	mzs_sf_state sf_state;
+	mzs_init(&sf_state);
+
+	mzs_spawn(&sf_state, schur, (M00, V00, W00, hnb));
+	mzs_spawn(&sf_state, schur, (M01, V00, W01, hnb));
+	mzs_spawn(&sf_state, schur, (M10, V10, W00, hnb));
+	schur(M11, V10, W01, hnb);
+	mzs_sync(&sf_state);
+
+	mzs_spawn(&sf_state, schur, (M00, V01, W10, hnb));
+	mzs_spawn(&sf_state, schur, (M01, V01, W11, hnb));
+	mzs_spawn(&sf_state, schur, (M10, V11, W10, hnb));
+	schur(M11, V11, W11, hnb);
+	mzs_sync(&sf_state);
+}
+
+/*
+ * lower_solve - Compute M' where LM' = M.
+ */
+mzs_sf static void lower_solve(Matrix M, Matrix L, int nb);
+
+static void aux_lower_solve(Matrix Ma, Matrix Mb, Matrix L, int nb) {
+	Matrix L00, L10, L11;
+
+	/* Break L matrix into 4 pieces. */
+	L00 = &MATRIX(L, 0, 0);
+	L10 = &MATRIX(L, nb, 0);
+	L11 = &MATRIX(L, nb, nb);
+
+	/* Solve with recursive calls. */
+	lower_solve(Ma, L00, nb);
+	schur(Mb, L10, Ma, nb);
+	lower_solve(Mb, L11, nb);
+}
+
+mzs_sf static void lower_solve(Matrix M, Matrix L, int nb) {
+	Matrix M00, M01, M10, M11;
+	int hnb;
+
+	/* Check base case. */
+	if (nb == 1) {
+		block_lower_solve(*M, *L);
+		return;
+	}
+
+	/* Break matrices into 4 pieces. */
+	hnb = nb / 2;
+	M00 = &MATRIX(M, 0, 0);
+	M01 = &MATRIX(M, 0, hnb);
+	M10 = &MATRIX(M, hnb, 0);
+	M11 = &MATRIX(M, hnb, hnb);
+
+	/* Solve with recursive calls. */
+	mzs_sf_state sf_state;
+	mzs_init(&sf_state);
+
+	mzs_spawn(&sf_state, aux_lower_solve, (M00, M10, L, hnb));
+	aux_lower_solve(M01, M11, L, hnb);
+
+	mzs_sync(&sf_state);
+}
+
+/*
+ * upper_solve - Compute M' where M'U = M.
+ */
+mzs_sf static void upper_solve(Matrix M, Matrix U, int nb);
+
+static void aux_upper_solve(Matrix Ma, Matrix Mb, Matrix U, int nb) {
+	Matrix U00, U01, U11;
+
+	/* Break U matrix into 4 pieces. */
+	U00 = &MATRIX(U, 0, 0);
+	U01 = &MATRIX(U, 0, nb);
+	U11 = &MATRIX(U, nb, nb);
+
+	/* Solve with recursive calls. */
+	upper_solve(Ma, U00, nb);
+	schur(Mb, Ma, U01, nb);
+	upper_solve(Mb, U11, nb);
+}
+
+mzs_sf static void upper_solve(Matrix M, Matrix U, int nb) {
+	Matrix M00, M01, M10, M11;
+	int hnb;
+
+	/* Check base case. */
+	if (nb == 1) {
+		block_upper_solve(*M, *U);
+		return;
+	}
+
+	/* Break matrices into 4 pieces. */
+	hnb = nb / 2;
+	M00 = &MATRIX(M, 0, 0);
+	M01 = &MATRIX(M, 0, hnb);
+	M10 = &MATRIX(M, hnb, 0);
+	M11 = &MATRIX(M, hnb, hnb);
+
+	/* Solve with recursive calls. */
+	mzs_sf_state sf_state;
+	mzs_init(&sf_state);
+
+	mzs_spawn(&sf_state, aux_upper_solve, (M00, M01, U, hnb));
+	aux_upper_solve(M10, M11, U, hnb);
+
+	mzs_sync(&sf_state);
+}
+
+/*
+ * lu - Perform LU decomposition of matrix M.
+ */
+mzs_sf void lu(Matrix M, int nb) {
+	Matrix M00, M01, M10, M11;
+	int hnb;
+
+	/* Check base case. */
+	if (nb == 1) {
+		block_lu(*M);
+		return;
+	}
+
+	/* Break matrix into 4 pieces. */
+	hnb = nb / 2;
+	M00 = &MATRIX(M, 0, 0);
+	M01 = &MATRIX(M, 0, hnb);
+	M10 = &MATRIX(M, hnb, 0);
+	M11 = &MATRIX(M, hnb, hnb);
+
+	/* Decompose upper left. */
+	lu(M00, hnb);
+
+	/* Solve for upper right and lower left. */
+	mzs_sf_state sf_state;
+	mzs_init(&sf_state);
+
+	mzs_spawn(&sf_state, lower_solve, (M01, M00, hnb));
+	upper_solve(M10, M00, hnb);
+
+	mzs_sync(&sf_state);
+
+	/* Compute Schur complement of lower right. */
+	schur(M11, M10, M01, hnb);
+
+	/* Decompose lower right. */
+	lu(M11, hnb);
+}
+
+void mzs_benchmark_init(const mzs::ProblemSize &problem_size) {
+	switch (problem_size) {
+		case mzs::ProblemSize::xs:
+			n = 256;
+			break;
+		case mzs::ProblemSize::m:
+			n = 4096;
+			break;
+	}
+
+	nBlocks = n / LU_BLOCK_SIZE;
+	// NOLINTNEXTLINE(clang-analyzer-unix.MallocSizeof)
+	M = (Matrix)malloc(n * n * sizeof(double));
+	// NOLINTNEXTLINE(clang-analyzer-unix.MallocSizeof)
+	Msave = (Matrix)malloc(n * n * sizeof(double));
+	init_matrix(Msave, nBlocks);
+}
+
+void mzs_benchmark_prepare() { memcpy((void *)M, (void *)Msave, n * n * sizeof(double)); }
+
+void mzs_benchmark_run() {
+	assert(n > 0);
+	assert(n % LU_BLOCK_SIZE == 0);
+
+	lu(M, nBlocks);
+}
+
+auto mzs_benchmark_verify() -> mzs::VerificationResult {
+	auto failed = test_result(M, Msave, nBlocks);
+	if (failed) return mzs::VerificationResult::failed;
+
+	return mzs::VerificationResult::successful;
+}
diff --git a/benchmarks/lu/meson.build b/benchmarks/lu/meson.build
new file mode 100644
index 0000000000000000000000000000000000000000..626ede5e8aee7f1797092a254146c32160654ba3
--- /dev/null
+++ b/benchmarks/lu/meson.build
@@ -0,0 +1,4 @@
+benchmarks += {
+			   'name': 'lu',
+			   'source_files': ['lu.cc'],
+			 }
diff --git a/benchmarks/meson.build b/benchmarks/meson.build
index 37b9e016d21ec7abcc5259ba41bee0ccbc1d7cfe..dbce7403ca4d61343c229444ed53120a91356869 100644
--- a/benchmarks/meson.build
+++ b/benchmarks/meson.build
@@ -1,2 +1,3 @@
 subdir('fib')
+subdir('lu')
 subdir('nqueens')
diff --git a/benchmarks/nqueens/nqueens.cc b/benchmarks/nqueens/nqueens.cc
index 947f910e66c02c90392c55d554d490f6e32e3c68..34f54c413d34e702078aa516124601861464797d 100644
--- a/benchmarks/nqueens/nqueens.cc
+++ b/benchmarks/nqueens/nqueens.cc
@@ -1,8 +1,12 @@
+#include <cassert>
 #include <cstdio>
 
 #include "mazstab-backend.hpp"
+#include "mazstab.hpp"
 
-int n = 2;
+namespace mzs = mazstab;
+
+int n = -1;
 int m;
 
 mzs_sf static auto nqueens(const int* a, int n, int d, int i) -> int {
@@ -44,17 +48,34 @@ mzs_sf static auto nqueens(const int* a, int n, int d, int i) -> int {
 	return sum;
 }
 
-void mzs_benchmark_run() { m = nqueens(nullptr, n, -1, 0); }
+void mzs_benchmark_run() {
+	assert(n > 0);
+	m = nqueens(nullptr, n, -1, 0);
+}
+
+auto mzs_benchmark_verify() -> mzs::VerificationResult {
+	const int KNOWN_GOOD_RESULT_COUNT = 16;
+	if (n >= KNOWN_GOOD_RESULT_COUNT) return mzs::VerificationResult::impossible;
 
-auto mzs_benchmark_verify() -> int {
 	// NOLINTNEXTLINE(modernize-avoid-c-arrays)
-	static int res[16] = {1,	 0,		0,		2,		 10,		4,			40,			 92,
-												352, 724, 2680, 14200, 73712, 365596, 2279184, 14772512};
+	static int res[KNOWN_GOOD_RESULT_COUNT] = {
+			1, 0, 0, 2, 10, 4, 40, 92, 352, 724, 2680, 14200, 73712, 365596, 2279184, 14772512};
 
 	if (m != res[n - 1]) {
 		fprintf(stderr, "nqueens(%d)=%d (expected %d)\n", n, m, res[n - 1]);
-		return 1;
+		return mzs::VerificationResult::failed;
 	}
 
-	return 0;
+	return mzs::VerificationResult::successful;
+}
+
+void mzs_benchmark_init(const mzs::ProblemSize& problem_size) {
+	switch (problem_size) {
+		case mzs::ProblemSize::xs:
+			n = 4;
+			break;
+		case mzs::ProblemSize::m:
+			n = 14;
+			break;
+	}
 }
diff --git a/commonlib/common.cc b/commonlib/common.cc
index c942f6685f0b23108dd56dade1575e42d449dff0..7614c4b058f6b46cf6fd0ebf7cd04ecbb91c8fe3 100644
--- a/commonlib/common.cc
+++ b/commonlib/common.cc
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 // Copyright © 2021 Florian Schmaus
-// SPDX-License-Identifier: GPL-3.0-or-later
-// Copyright © 2021 Florian Schmaus
 #include "common.hpp"
 
 #include <cstdio>
diff --git a/commonlib/mazstab.cc b/commonlib/mazstab.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07d0ce9f5a19fd61c0ac8ec0efbc74528da02bfc
--- /dev/null
+++ b/commonlib/mazstab.cc
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+// Copyright © 2021 Florian Schmaus
+#include "mazstab.hpp"
+
+#include <string>
+
+namespace mazstab {
+
+auto operator<<(std::ostream& out, const ProblemSize& problem_size) -> std::ostream& {
+	switch (problem_size) {
+		case ProblemSize::xs:
+			out << "xs";
+			break;
+		case ProblemSize::m:
+			out << "m";
+			break;
+	}
+	return out;
+}
+
+auto operator>>(std::istream& in, ProblemSize& problem_size) -> std::istream& {
+	std::string token;
+
+	in >> token;
+	if (token == "xs") {
+		problem_size = ProblemSize::xs;
+	} else if (token == "m") {
+		problem_size = ProblemSize::m;
+	} else {
+		in.setstate(std::ios_base::failbit);
+	}
+
+	return in;
+}
+
+}	 // namespace mazstab
diff --git a/commonlib/mazstab.hpp b/commonlib/mazstab.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7748a726217f1b8b576f00c14ddde827b45a84f0
--- /dev/null
+++ b/commonlib/mazstab.hpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+// Copyright © 2021 Florian Schmaus
+#pragma once
+
+#include <istream>
+
+namespace mazstab {
+
+enum class VerificationResult {
+	successful,
+	impossible,
+	failed,
+};
+
+enum class ProblemSize {
+	xs,
+	m,
+};
+
+auto operator<<(std::ostream& out, const ProblemSize& problem_size) -> std::ostream&;
+auto operator>>(std::istream& in, ProblemSize& problem_size) -> std::istream&;
+
+}	 // namespace mazstab
diff --git a/commonlib/meson.build b/commonlib/meson.build
index 81a71426ac829047e0aa0e911a1daf0f8898fe5a..f951f9e4652b8fdc865d5acb80d40c28233d8541 100644
--- a/commonlib/meson.build
+++ b/commonlib/meson.build
@@ -1,6 +1,7 @@
 common_lib = static_library(
   'mzs-common-lib',
   'common.cc',
+  'mazstab.cc',
   include_directories: include_directories('.'),
 )
 
diff --git a/iwyu-mappings.imp b/iwyu-mappings.imp
index 176c3ccfe2f82390a8c6786167491f8d2cb482f9..07eecd438cd44cad85933b73ece879a8e1aae237 100644
--- a/iwyu-mappings.imp
+++ b/iwyu-mappings.imp
@@ -3,4 +3,13 @@
 	{ include: ["\"concrete-mazstab-backend.hpp\"", "private", "\"mazstab-backend.hpp\"", "public"] },
 	{ include: ["\"fibril.hpp\"", "private", "\"mazstab-backend.hpp\"", "public"] },
 	{ include: ["<bits/types/struct_rusage.h>", "private", "<sys/resource.h>", "public"] },
+	{ include: ["<boost/detail/basic_pointerbuf.hpp>", "private", "<boost/program_options.hpp>", "public"], },
+	{ include: ["<boost/lexical_cast/bad_lexical_cast.hpp>", "private", "<boost/program_options.hpp>", "public"], },
+	{ include: ["<boost/program_options/detail/parsers.hpp>", "private", "<boost/program_options.hpp>", "public"], },
+	{ include: ["<boost/program_options/detail/value_semantic.hpp>", "private", "<boost/program_options.hpp>", "public"], },
+	{ include: ["<boost/program_options/errors.hpp>", "private", "<boost/program_options.hpp>", "public"], },
+	{ include: ["<boost/program_options/options_description.hpp>", "private", "<boost/program_options.hpp>", "public"], },
+	{ include: ["<boost/program_options/value_semantic.hpp>", "private", "<boost/program_options.hpp>", "public"], },
+	{ include: ["<boost/program_options/variables_map.hpp>", "private", "<boost/program_options.hpp>", "public"], },
+	{ include: ["<boost/type_index/type_index_facade.hpp>", "private", "<boost/program_options.hpp>", "public"], },
 ]
diff --git a/meson.build b/meson.build
index 153e90f10992c423f382300eb3e443a41928dafe..b21dea03af268f1b98d81de1f49441f0c212ad37 100644
--- a/meson.build
+++ b/meson.build
@@ -32,13 +32,18 @@ subdir('benchmarks')
 backends = []
 subdir('backends')
 
+boost_program_options_dep = dependency('boost', modules: ['program_options'])
+
 benchmark_driver_lib = static_library(
   'benchmark-driver',
   'benchmark-driver.cc',
-  dependencies: common_dep,
+  dependencies: [common_dep, boost_program_options_dep],
 )
 
-benchmark_driver_dep = declare_dependency(link_with: benchmark_driver_lib)
+benchmark_driver_dep = declare_dependency(
+  link_with: benchmark_driver_lib,
+  dependencies: [common_dep],
+)
 
 foreach backend : backends
   backend_name = backend.get('name')
@@ -90,6 +95,7 @@ foreach backend : backends
 	test(
 	  executable_name,
 	  executable,
+	  args: '--problem-size=xs',
 	  suite: test_suites,
 	)
   endforeach