Commit 30470c81 authored by Florian Fischer

add dj's workloads

parent 4cd6e015
@@ -28,9 +28,11 @@ BENCH_OBJECTS = $(notdir $(BENCH_CC_SOURCES:.cc=.o)) $(notdir $(BENCH_C_SOURCES:
BENCH_OBJPRE = $(addprefix $(OBJDIR)/,$(BENCH_OBJECTS))
MAKEFILE_LIST = Makefile
BENCH_TARGETS = $(BENCH_OBJPRE:.o=) $(BENCH_OBJPRE:.o=-glibc-notc)
BENCH_TARGETS = $(BENCH_OBJPRE:.o=) $(OBJDIR)/trace_run
all: $(BENCH_TARGETS) $(OBJDIR)/chattymalloc.so $(OBJDIR)/print_status_on_exit.so
NOTC_TARGETS = $(BENCH_TARGETS:=-glibc-notc)
all: $(BENCH_TARGETS) $(NOTC_TARGETS) $(OBJDIR)/chattymalloc.so $(OBJDIR)/print_status_on_exit.so
$(OBJDIR)/print_status_on_exit.so: print_status_on_exit.c $(MAKEFILE_LIST)
$(CC) -shared $(CFLAGS) -o $@ $< -ldl
@@ -38,6 +40,14 @@ $(OBJDIR)/print_status_on_exit.so: print_status_on_exit.c $(MAKEFILE_LIST)
$(OBJDIR)/chattymalloc.so: chattymalloc.c $(MAKEFILE_LIST)
$(CC) -shared $(CFLAGS) -o $@ $< -ldl
$(OBJDIR)/trace_run: trace_run.c $(MAKEFILE_LIST)
$(CC) -pthread $(CFLAGS) -o $@ $<
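# trace_run-glibc-notc is a copy of trace_run whose ELF interpreter and rpath
# are rewritten with patchelf so that it runs against the glibc build located
# in $(GLIBC_NOTC) (presumably a glibc built without the per-thread malloc
# cache) instead of the system glibc.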
$(OBJDIR)/trace_run-glibc-notc: $(OBJDIR)/trace_run $(MAKEFILE_LIST)
cp $< $@
patchelf --set-interpreter $(GLIBC_NOTC)/ld-linux-x86-64.so.2 $@
patchelf --set-rpath $(GLIBC_NOTC) $@
$(OBJDIR)/cache-thrash: $(OBJDIR)/cache-thrash.o
$(CXX) -pthread -o $@ $^
......
@@ -9,6 +9,7 @@ from falsesharing import falsesharing
from loop import loop
# from bench_conprod import conprod
from mysql import mysql
from dj_trace import dj_trace
parser = argparse.ArgumentParser(description="benchmark memory allocators")
parser.add_argument("-s", "--save", help="save benchmark results to disk", action='store_true')
@@ -21,7 +22,7 @@ parser.add_argument("-sd", "--summarydir", help="directory where all plots and t
parser.add_argument("-a", "--analyse", help="collect allocation sizes", action='store_true')
benchmarks = [loop, mysql, falsesharing]
benchmarks = [loop, mysql, falsesharing, dj_trace]
def main():
args = parser.parse_args()
......
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import os
from urllib.request import urlretrieve
import sys
import re
import shutil
from benchmark import Benchmark
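# trace_run prints a summary whose numbers use "," as thousands separator
# (e.g. "12,345 Kb Max RSS"). The regexes below extract the max RSS, the
# ideal RSS and the average per-call times from that output.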
comma_sep_number_re = r"(?:\d*(?:,\d*)?)*"
rss_re = "(?P<rss>" + comma_sep_number_re + ")"
time_re = "(?P<time>" + comma_sep_number_re + ")"
calls_re = "(?P<calls>" + comma_sep_number_re + ")"
max_rss_re = re.compile("^{} Kb Max RSS".format(rss_re))
ideal_rss_re = re.compile("^{} Kb Max Ideal RSS".format(rss_re))
malloc_re = re.compile(r"^Avg malloc time:\s*{} in\s*{} calls$".format(time_re, calls_re))
calloc_re = re.compile(r"^Avg calloc time:\s*{} in\s*{} calls$".format(time_re, calls_re))
realloc_re = re.compile(r"^Avg realloc time:\s*{} in\s*{} calls$".format(time_re, calls_re))
free_re = re.compile(r"^Avg free time:\s*{} in\s*{} calls$".format(time_re, calls_re))
class Benchmark_DJ_Trace(Benchmark):
def __init__(self):
self.name = "dj_trace"
self.description = """This benchmark uses the workload simulator written
by DJ Delorie to simulate workloads provided by him
at https://delorie.com/malloc. Those workloads
are generated from traces of real applications and are
also used by DJ Delorie to measure improvements in the
glibc allocator."""
self.cmd = "build/trace_run{binary_suffix} dj_workloads/{workload}.wl"
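# {workload} is replaced with each entry of self.args below; {binary_suffix}
# presumably selects between the plain binary and the patched
# trace_run-glibc-notc variant (the substitution is done by the Benchmark
# framework), e.g. "build/trace_run dj_workloads/dj2.wl".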
self.args = {
"workload" : [
"389-ds-2",
"dj",
"dj2",
"mt_test_one_alloc",
"oocalc",
"qemu-virtio",
"qemu-win7",
"proprietary-1",
"proprietary-2",
]
}
self.requirements = ["build/trace_run"]
super().__init__()
def prepare(self, verbose=False):
super().prepare(verbose=verbose)
def reporthook(blocknum, blocksize, totalsize):
readsofar = blocknum * blocksize
if totalsize > 0:
percent = readsofar * 1e2 / totalsize
s = "\r%5.1f%% %*d / %d" % (
percent, len(str(totalsize)), readsofar, totalsize)
sys.stderr.write(s)
else: # total size is unknown
sys.stderr.write("\rdownloaded %d" % (readsofar,))
if not os.path.isdir("dj_workloads"):
os.mkdir("dj_workloads")
for wl in self.args["workload"]:
file_name = wl + ".wl"
file_path = os.path.join("dj_workloads", file_name)
if not os.path.isfile(file_path):
if input("want to download " + wl + " [Y/n] ") in ["", "Y", "y"]:
url = "http://www.delorie.com/malloc/" + file_name
urlretrieve(url, file_path, reporthook)
sys.stderr.write("\n")
return True
def process_stdout(self, result, stdout, verbose):
def to_int(s):
return int(s.replace(',', ""))
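# trace_run prints its summary in a fixed order, so the interesting
# values are picked out by their line number in stdout.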
for i, l in enumerate(stdout.splitlines()):
if i == 3:
result["Max_RSS"] = to_int(max_rss_re.match(l).group("rss"))
elif i == 4:
result["Ideal_RSS"] = to_int(ideal_rss_re.match(l).group("rss"))
elif i == 7:
result["avg_malloc"] = to_int(malloc_re.match(l).group("time"))
elif i == 8:
result["avg_calloc"] = to_int(calloc_re.match(l).group("time"))
elif i == 9:
result["avg_realloc"] = to_int(realloc_re.match(l).group("time"))
elif i == 10:
result["avg_free"] = to_int(free_re.match(l).group("time"))
def summary(self, sd=None):
args = self.results["args"]
targets = self.results["targets"]
sd = sd or ""
# Total times
for perm in self.iterate_args():
for i, target in enumerate(targets):
d = [float(x["task-clock"]) for x in self.results[target][perm]]
y_val = np.mean(d)
plt.bar([i], y_val, label=target, color=targets[target]["color"])
plt.legend(loc="lower right")
plt.ylabel("Time in ms")
plt.title("Runtime of " + perm.workload + ":")
plt.savefig(os.path.join(sd, ".".join([self.name, perm.workload, "runtime", "png"])))
plt.clf()
# Function Times
for perm in self.iterate_args():
for i, target in enumerate(targets):
x_vals = [x-i/8 for x in range(0,4)]
y_vals = [0] * 4
y_vals[0] = np.mean([x["avg_malloc"] for x in self.results[target][perm]])
y_vals[1] = np.mean([x["avg_calloc"] for x in self.results[target][perm]])
y_vals[2] = np.mean([x["avg_realloc"] for x in self.results[target][perm]])
y_vals[3] = np.mean([x["avg_free"] for x in self.results[target][perm]])
plt.bar(x_vals, y_vals, width=0.2, align="center",
label=target, color=targets[target]["color"])
plt.legend(loc="best")
plt.xticks(range(0,4), ["malloc", "calloc", "realloc", "free"])
plt.ylabel("Avg time in ms")
plt.title("Avg API call times " + perm.workload + ":")
plt.savefig(os.path.join(sd, ".".join([self.name, perm.workload, "apitimes", "png"])))
plt.clf()
# Memusage
for perm in self.iterate_args():
for i, target in enumerate(targets):
d = [x["Max_RSS"] for x in self.results[target][perm]]
y_val = np.mean(d)
plt.bar([i], y_val, label=target, color=targets[target]["color"])
# add ideal rss
y_val = self.results[list(targets.keys())[0]][perm][0]["Ideal_RSS"]
plt.bar([len(targets)], y_val, label="Ideal RSS")
plt.legend(loc="best")
plt.ylabel("Max RSS in Kb")
plt.title("Max RSS " + perm.workload + ":")
plt.savefig(os.path.join(sd, ".".join([self.name, perm.workload, "rss", "png"])))
plt.clf()
dj_trace = Benchmark_DJ_Trace()
#define _LARGEFILE64_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <pthread.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <unistd.h>
// #include "malloc.h"
#include <malloc.h>
// #include "mtrace.h"
/* Codes for the simulator/workload programs. Copied from mtrace.h. */
#define C_NOP 0
#define C_DONE 1
#define C_MALLOC 2
#define C_CALLOC 3
#define C_REALLOC 4
#define C_FREE 5
#define C_SYNC_W 6
#define C_SYNC_R 7
#define C_ALLOC_PTRS 8
#define C_ALLOC_SYNCS 9
#define C_NTHREADS 10
#define C_START_THREAD 11
#define C_MEMALIGN 12
#define C_VALLOC 13
#define C_PVALLOC 14
#define C_POSIX_MEMALIGN 15
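/* A workload file is a byte stream of these opcodes, each followed by its
   arguments (pointer slot index, size, alignment, ...) encoded with
   get_int() further below. */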
#if UINTPTR_MAX == 0xffffffffffffffff
#define ticks_t int64_t
/* Setting quick_run to 1 allows the simulator to model
only the allocation and deallocation accounting via
atomic_rss. The actual allocations are skipped. This
mode is useful to verify the workload file. */
#define quick_run 0
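/* rdtsc_s()/rdtsc_e() read the time stamp counter at the start and at the
   end of a timed allocator call.  cpuid acts as a serializing barrier so
   the measured call cannot be reordered around the counter reads. */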
static __inline__ ticks_t rdtsc_s(void)
{
unsigned a, d;
asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
asm volatile("rdtscp" : "=a" (a), "=d" (d));
return ((unsigned long long)a) | (((unsigned long long)d) << 32);
}
static __inline__ ticks_t rdtsc_e(void)
{
unsigned a, d;
asm volatile("rdtscp" : "=a" (a), "=d" (d));
asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
return ((unsigned long long)a) | (((unsigned long long)d) << 32);
}
#else
#define ticks_t int32_t
static __inline__ ticks_t rdtsc_s(void)
{
unsigned a, d;
asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx");
asm volatile("rdtsc" : "=a" (a), "=d" (d));
return ((unsigned long)a) | (((unsigned long)d) << 16);
}
static __inline__ ticks_t rdtsc_e(void)
{
unsigned a, d;
asm volatile("rdtscp" : "=a" (a), "=d" (d));
asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx");
return ((unsigned long)a) | (((unsigned long)d) << 16);
}
#endif
static ticks_t diff_timeval (struct timeval e, struct timeval s)
{
ticks_t usec;
if (e.tv_usec < s.tv_usec)
usec = (e.tv_usec + 1000000 - s.tv_usec) + (e.tv_sec-1 - s.tv_sec)*1000000;
else
usec = (e.tv_usec - s.tv_usec) + (e.tv_sec - s.tv_sec)*1000000;
return usec;
}
#if 1
#define Q1
#define Q2
#else
pthread_mutex_t genmutex = PTHREAD_MUTEX_INITIALIZER;
#define Q1 pthread_mutex_lock(&genmutex)
#define Q2 pthread_mutex_unlock(&genmutex)
#endif
pthread_mutex_t cmutex = PTHREAD_MUTEX_INITIALIZER;
#define NCBUF 10
static char cbuf[NCBUF][30];
static int ci = 0;
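/* Format a tick value with thousands separators.  Results live in a small
   ring of static buffers (cbuf) so several formatted numbers can be used in
   one printf call; the ring index is advanced under cmutex. */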
char *comma(ticks_t x)
{
char buf[30], *bs, *bd;
int l, i, idx;
pthread_mutex_lock(&cmutex);
ci = (ci + 1) % NCBUF;
idx = ci;
pthread_mutex_unlock(&cmutex);
bs = buf;
bd = cbuf[idx];
sprintf(buf, "%lld", (long long int)x);
l = strlen(buf);
i = l;
while (*bs)
{
*bd++ = *bs++;
i--;
if (i % 3 == 0 && *bs)
*bd++ = ',';
}
*bd = 0;
return cbuf[idx];
}
static volatile void **ptrs;
static volatile size_t *sizes;
static size_t n_ptrs;
static volatile char *syncs;
static pthread_mutex_t *mutexes;
static pthread_cond_t *conds;
static size_t n_syncs;
static pthread_mutex_t stat_mutex = PTHREAD_MUTEX_INITIALIZER;
ticks_t malloc_time = 0, malloc_count = 0;
ticks_t calloc_time = 0, calloc_count = 0;
ticks_t realloc_time = 0, realloc_count = 0;
ticks_t free_time = 0, free_count = 0;
size_t ideal_rss = 0;
size_t max_ideal_rss = 0;
static pthread_mutex_t rss_mutex = PTHREAD_MUTEX_INITIALIZER;
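/* ideal_rss tracks the sum of all allocation sizes currently live according
   to the workload; max_ideal_rss is its high-water mark and is reported as
   "Max Ideal RSS" at the end of the run. */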
void atomic_rss (ssize_t delta)
{
pthread_mutex_lock (&rss_mutex);
ideal_rss += delta;
if (max_ideal_rss < ideal_rss)
max_ideal_rss = ideal_rss;
pthread_mutex_unlock (&rss_mutex);
}
pthread_mutex_t stop_mutex = PTHREAD_MUTEX_INITIALIZER;
int threads_done = 0;
//#define dprintf printf
#define dprintf(...) (void)1
//#define mprintf printf
//#define MDEBUG 1
#define mprintf(...) (void)1
#define myabort() my_abort_2(thrc, __LINE__)
void
my_abort_2 (pthread_t thrc, int line)
{
fprintf(stderr, "Abort thread %p at line %d\n", (void *)thrc, line);
abort();
}
/*------------------------------------------------------------*/
/* Wrapper around I/O routines */
int io_fd;
#define IOSIZE 65536
#define IOMIN 4096
static pthread_mutex_t io_mutex = PTHREAD_MUTEX_INITIALIZER;
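/* Every thread parses its slice of the workload file through its own
   IOPerThreadType buffer.  Only buffer refills (lseek64 + read on the shared
   io_fd) take io_mutex; decoding happens on the thread-private buffer. */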
typedef struct {
unsigned char buf[IOSIZE];
size_t incr;
size_t max_incr;
size_t buf_base;
size_t buf_idx;
int saw_eof;
} IOPerThreadType;
IOPerThreadType main_io;
IOPerThreadType *thread_io;
void
io_init (IOPerThreadType *io, size_t file_offset, int incr)
{
if (incr > IOSIZE)
incr = IOSIZE;
if (incr < IOMIN)
incr = IOMIN;
io->buf_base = file_offset;
io->buf_idx = 0;
io->incr = incr;
pthread_mutex_lock (&io_mutex);
lseek64 (io_fd, io->buf_base, SEEK_SET);
// short read OK, the eof is just to prevent runaways from bad data.
if (read (io_fd, io->buf, incr) < 0)
io->saw_eof = 1;
else
io->saw_eof = 0;
pthread_mutex_unlock (&io_mutex);
}
unsigned char
io_read (IOPerThreadType *io)
{
if (io->buf_idx >= io->incr)
io_init (io, io->buf_base + io->buf_idx, io->incr);
if (io->saw_eof)
return 0xff;
return io->buf [io->buf_idx++];
}
unsigned char
io_peek (IOPerThreadType *io)
{
if (io->buf_idx >= io->incr)
io_init (io, io->buf_base + io->buf_idx, io->incr);
if (io->saw_eof)
return 0xff;
return io->buf [io->buf_idx];
}
size_t
io_pos (IOPerThreadType *io)
{
return io->buf_base + io->buf_idx;
}
/*------------------------------------------------------------*/
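/* Touch every byte of a freshly allocated block so the measured RSS covers
   memory that is actually written, not just reserved address space. */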
static void
wmem (volatile void *ptr, int count)
{
char *p = (char *)ptr;
int i;
if (!p)
return;
for (i=0; i<count; i++)
p[i] = 0x11;
}
#define xwmem(a,b)
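/* Workload integers use a variable-length encoding: each byte holds 7
   payload bits, and a set high bit means another byte follows.  Example:
   the bytes 0x82 0x2c decode to (0x02 << 7) | 0x2c = 300. */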
static size_t get_int (IOPerThreadType *io)
{
size_t rv = 0;
while (1)
{
unsigned char c = io_read (io);
rv |= (c & 0x7f);
if (c & 0x80)
rv <<= 7;
else
return rv;
}
}
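/* Scribble over a block right before it is freed, presumably to mimic an
   application touching its data and to make accidental reuse of stale
   pointers easier to notice. */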
static void free_wipe (size_t idx)
{
char *cp = (char *)ptrs[idx];
if (cp == NULL)
return;
size_t sz = sizes[idx];
size_t i;
for (i=0; i<sz; i++)
{
if (i % 8 == 1)
cp[i] = i / 8;
else
cp[i] = 0x22;
}
}
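/* Worker loop: replay opcodes from this thread's part of the workload,
   timing each malloc/calloc/realloc/free with rdtsc and merging the
   per-thread statistics into the globals under stat_mutex on C_DONE. */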
static void *
thread_common (void *my_data_v)
{
pthread_t thrc = pthread_self ();
size_t p1, p2, sz, sz2;
IOPerThreadType *io = (IOPerThreadType *)my_data_v;
ticks_t my_malloc_time = 0, my_malloc_count = 0;
ticks_t my_calloc_time = 0, my_calloc_count = 0;
ticks_t my_realloc_time = 0, my_realloc_count = 0;
ticks_t my_free_time = 0, my_free_count = 0;
ticks_t stime, etime;
int thread_idx = io - thread_io;
#ifdef MDEBUG
volatile void *tmp;
#endif
while (1)
{
unsigned char this_op = io_peek (io);
if (io->saw_eof)
myabort();
dprintf("op %p:%ld is %d\n", (void *)thrc, io_pos (io), io_peek (io));
switch (io_read (io))
{
case C_NOP:
break;
case C_DONE:
dprintf("op %p:%ld DONE\n", (void *)thrc, io_pos (io));
pthread_mutex_lock (&stat_mutex);
malloc_time += my_malloc_time;
calloc_time += my_calloc_time;
realloc_time += my_realloc_time;
free_time += my_free_time;
malloc_count += my_malloc_count;
calloc_count += my_calloc_count;
realloc_count += my_realloc_count;
free_count += my_free_count;
threads_done ++;
pthread_mutex_unlock (&stat_mutex);
pthread_mutex_lock(&stop_mutex);
pthread_mutex_unlock(&stop_mutex);
return NULL;
case C_MEMALIGN:
p2 = get_int (io);
sz2 = get_int (io);
sz = get_int (io);
dprintf("op %p:%ld %ld = MEMALIGN %ld %ld\n", (void *)thrc, io_pos (io), p2, sz2, sz);
/* we can't force memalign to return NULL (fail), so just skip it. */
if (p2 == 0)
break;
if (p2 > n_ptrs)
myabort();
stime = rdtsc_s();
Q1;
if (ptrs[p2])
{
if (!quick_run)
free ((void *)ptrs[p2]);
atomic_rss (-sizes[p2]);
}
if (!quick_run)
ptrs[p2] = memalign (sz2, sz);
else
ptrs[p2] = (void *)p2;
/* Verify the alignment matches what is expected. */
if (((size_t)ptrs[p2] & (sz2 - 1)) != 0)
myabort ();
sizes[p2] = sz;
mprintf("%p = memalign(%lx, %lx)\n", ptrs[p2], sz2, sz);
Q2;
etime = rdtsc_e();
if (ptrs[p2] != NULL)
atomic_rss (sz);
if (etime < stime)
{
printf("s: %llx e:%llx d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime));
}
my_malloc_time += etime - stime;
my_malloc_count ++;
if (!quick_run)
wmem(ptrs[p2], sz);
break;
case C_MALLOC:
p2 = get_int (io);
sz = get_int (io);
dprintf("op %p:%ld %ld = MALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz);
/* we can't force malloc to return NULL (fail), so just skip it. */
if (p2 == 0)
break;
if (p2 > n_ptrs)
myabort();
stime = rdtsc_s();
Q1;
if (ptrs[p2])
{
if (!quick_run)
free ((void *)ptrs[p2]);
atomic_rss (-sizes[p2]);
}
if (!quick_run)
ptrs[p2] = malloc (sz);
else
ptrs[p2] = (void *)p2;
sizes[p2] = sz;
mprintf("%p = malloc(%lx)\n", ptrs[p2], sz);
Q2;
etime = rdtsc_e();
if (ptrs[p2] != NULL)
atomic_rss (sz);
if (etime < stime)
{
printf("s: %llx e:%llx d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime));
}
my_malloc_time += etime - stime;
my_malloc_count ++;
if (!quick_run)
wmem(ptrs[p2], sz);
break;
case C_CALLOC:
p2 = get_int (io);
sz = get_int (io);
dprintf("op %p:%ld %ld = CALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz);