From 30470c81dac7dfe8fc51f4c58fbae9b3dce4a516 Mon Sep 17 00:00:00 2001
From: Florian Fischer <florian.fl.fischer@fau.de>
Date: Thu, 6 Sep 2018 01:34:31 +0200
Subject: [PATCH] add dj's workloads

---
 Makefile    |  14 +-
 bench.py    |   3 +-
 dj_trace.py | 154 +++++++++++
 trace_run.c | 750 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 918 insertions(+), 3 deletions(-)
 create mode 100644 dj_trace.py
 create mode 100644 trace_run.c

diff --git a/Makefile b/Makefile
index 6b2e859..5d838dc 100644
--- a/Makefile
+++ b/Makefile
@@ -28,9 +28,11 @@ BENCH_OBJECTS = $(notdir $(BENCH_CC_SOURCES:.cc=.o)) $(notdir $(BENCH_C_SOURCES:
 BENCH_OBJPRE = $(addprefix $(OBJDIR)/,$(BENCH_OBJECTS))
 MAKEFILE_LIST = Makefile
 
-BENCH_TARGETS = $(BENCH_OBJPRE:.o=) $(BENCH_OBJPRE:.o=-glibc-notc)
+BENCH_TARGETS = $(BENCH_OBJPRE:.o=) $(OBJDIR)/trace_run
 
-all: $(BENCH_TARGETS) $(OBJDIR)/chattymalloc.so $(OBJDIR)/print_status_on_exit.so
+NOTC_TARGETS = $(BENCH_TARGETS:=-glibc-notc)
+
+all: $(BENCH_TARGETS) $(NOTC_TARGETS) $(OBJDIR)/chattymalloc.so $(OBJDIR)/print_status_on_exit.so
 
 $(OBJDIR)/print_status_on_exit.so: print_status_on_exit.c $(MAKEFILE_LIST)
 	$(CC) -shared $(CFLAGS) -o $@ $< -ldl
@@ -38,6 +40,14 @@ $(OBJDIR)/print_status_on_exit.so: print_status_on_exit.c $(MAKEFILE_LIST)
 $(OBJDIR)/chattymalloc.so: chattymalloc.c $(MAKEFILE_LIST)
 	$(CC) -shared $(CFLAGS) -o $@ $< -ldl
 
+$(OBJDIR)/trace_run: trace_run.c $(MAKEFILE_LIST)
+	$(CC) -pthread $(CFLAGS) -o $@ $<
+
+$(OBJDIR)/trace_run-glibc-notc: $(OBJDIR)/trace_run $(MAKEFILE_LIST)
+	cp $< $@
+	patchelf --set-interpreter $(GLIBC_NOTC)/ld-linux-x86-64.so.2 $@
+	patchelf --set-rpath $(GLIBC_NOTC) $@
+
 $(OBJDIR)/cache-thrash: $(OBJDIR)/cache-thrash.o
 	$(CXX) -pthread -o $@ $^
 
diff --git a/bench.py b/bench.py
index e877052..ee1c412 100755
--- a/bench.py
+++ b/bench.py
@@ -9,6 +9,7 @@ from falsesharing import falsesharing
 from loop import loop
 # from bench_conprod import conprod
 from mysql import mysql
+from dj_trace import dj_trace
 
 parser = argparse.ArgumentParser(description="benchmark memory allocators")
 parser.add_argument("-s", "--save", help="save benchmark results to disk", action='store_true')
@@ -21,7 +22,7 @@ parser.add_argument("-sd", "--summarydir", help="directory where all plots and t
 parser.add_argument("-a", "--analyse", help="collect allocation sizes", action='store_true')
 
 
-benchmarks = [loop, mysql, falsesharing]
+benchmarks = [loop, mysql, falsesharing, dj_trace]
 
 def main():
     args = parser.parse_args()
diff --git a/dj_trace.py b/dj_trace.py
new file mode 100644
index 0000000..300e0a2
--- /dev/null
+++ b/dj_trace.py
@@ -0,0 +1,154 @@
+import matplotlib.pyplot as plt
+import multiprocessing
+import numpy as np
+import os
+from urllib.request import urlretrieve
+import sys
+import re
+import shutil
+
+from benchmark import Benchmark
+
+comma_sep_number_re = "(?:\d*(?:,\d*)?)*"
+rss_re = "(?P<rss>" + comma_sep_number_re + ")"
+time_re = "(?P<time>" + comma_sep_number_re + ")"
+calls_re = "(?P<calls>" + comma_sep_number_re + ")"
+
+max_rss_re = re.compile("^{} Kb Max RSS".format(rss_re))
+ideal_rss_re = re.compile("^{} Kb Max Ideal RSS".format(rss_re))
+
+malloc_re = re.compile("^Avg malloc time:\s*{} in\s*{} calls$".format(time_re, calls_re))
+calloc_re = re.compile("^Avg calloc time:\s*{} in\s*{} calls$".format(time_re, calls_re))
+realloc_re = re.compile("^Avg realloc time:\s*{} in\s*{} calls$".format(time_re, calls_re))
+free_re = re.compile("^Avg free time:\s*{} in\s*{} calls$".format(time_re, calls_re))
+
+class Benchmark_DJ_Trace( Benchmark ):
+    def __init__(self):
+        self.name = "dj_trace"
+        self.descrition = """This benchmark uses the workload simulator written
+                             by DJ Delorie to simulate workloads provided by him
+                             under https://delorie.com/malloc. Those workloads
+                             are generated from traces of real applications and are
+                             also used by Delorie to measure improvements in the
+                             glibc allocator."""
+
+        self.cmd = "build/trace_run{binary_suffix} dj_workloads/{workload}.wl"
+
+        self.args = {
+                        "workload" : [
+                                        "389-ds-2",
+                                        "dj",
+                                        "dj2",
+                                        "mt_test_one_alloc",
+                                        "oocalc",
+                                        "qemu-virtio",
+                                        "qemu-win7",
+                                        "proprietary-1",
+                                        "proprietary-2",
+                                      ]
+                    }
+
+        self.requirements = ["build/trace_run"]
+        super().__init__()
+
+    def prepare(self, verbose=False):
+        super().prepare(verbose=verbose)
+
+        def reporthook(blocknum, blocksize, totalsize):
+            readsofar = blocknum * blocksize
+            if totalsize > 0:
+                percent = readsofar * 1e2 / totalsize
+                s = "\r%5.1f%% %*d / %d" % (
+                percent, len(str(totalsize)), readsofar, totalsize)
+                sys.stderr.write(s)
+            else: # total size is unknown
+                sys.stderr.write("\rdownloaded %d" % (readsofar,))
+
+        if not os.path.isdir("dj_workloads"):
+            os.mkdir("dj_workloads")
+
+        for wl in self.args["workload"]:
+            file_name = wl + ".wl"
+            file_path = os.path.join("dj_workloads", file_name)
+            if not os.path.isfile(file_path):
+                if input("want to download " + wl + " [Y/n] ") in ["", "Y", "y"]:
+                    url = "http://www.delorie.com/malloc/" + file_name
+                    urlretrieve(url, file_path, reporthook)
+                    sys.stderr.write("\n")
+        return True
+
+    def process_stdout(self, result, stdout, verbose):
+        def to_int(s):
+            return int(s.replace(',', ""))
+
+        for i, l in enumerate(stdout.splitlines()):
+            if i == 3:
+                result["Max_RSS"] = to_int(max_rss_re.match(l).group("rss"))
+            elif i == 4:
+                result["Ideal_RSS"] = to_int(ideal_rss_re.match(l).group("rss"))
+            elif i == 7:
+                result["avg_malloc"] = to_int(malloc_re.match(l).group("time"))
+            elif i == 8:
+                result["avg_calloc"] = to_int(calloc_re.match(l).group("time"))
+            elif i == 9:
+                result["avg_realloc"] = to_int(realloc_re.match(l).group("time"))
+            elif i == 10:
+                result["avg_free"] = to_int(free_re.match(l).group("time"))
+        
+
+    def summary(self, sd=None):
+        args = self.results["args"]
+        targets = self.results["targets"]
+
+        sd = sd or ""
+
+        # Total times
+        for perm in self.iterate_args():
+            for i, target in enumerate(targets):
+                d = [float(x["task-clock"]) for x in self.results[target][perm]]
+                y_val = np.mean(d)
+                plt.bar([i], y_val, label=target, color=targets[target]["color"])
+
+            plt.legend(loc="lower right")
+            plt.ylabel("Time in ms")
+            plt.title("Runtime of " + perm.workload + ":")
+            plt.savefig(os.path.join(sd, ".".join([self.name, perm.workload, "runtime", "png"])))
+            plt.clf()
+
+        # Function Times
+        for perm in self.iterate_args():
+            for i, target in enumerate(targets):
+                x_vals = [x-i/8 for x in range(0,4)]
+                y_vals = [0] * 4
+                y_vals[0] = np.mean([x["avg_malloc"] for x in self.results[target][perm]])
+                y_vals[1] = np.mean([x["avg_calloc"] for x in self.results[target][perm]])
+                y_vals[2] = np.mean([x["avg_realloc"] for x in self.results[target][perm]])
+                y_vals[3] = np.mean([x["avg_free"] for x in self.results[target][perm]])
+                plt.bar(x_vals, y_vals, width=0.2, align="center",
+                        label=target, color=targets[target]["color"])
+
+            plt.legend(loc="best")
+            plt.xticks(range(0,4), ["malloc", "calloc", "realloc", "free"])
+            plt.ylabel("Avg time in ms")
+            plt.title("Avg API call times " + perm.workload + ":")
+            plt.savefig(os.path.join(sd, ".".join([self.name, perm.workload, "apitimes", "png"])))
+            plt.clf()
+
+        # Memusage
+        for perm in self.iterate_args():
+            for i, target in enumerate(targets):
+                d = [x["Max_RSS"] for x in self.results[target][perm]]
+                y_val = np.mean(d)
+                plt.bar([i], y_val, label=target, color=targets[target]["color"])
+
+            # add ideal rss
+            y_val = self.results[list(targets.keys())[0]][perm][0]["Ideal_RSS"]
+            plt.bar([len(targets)], y_val, label="Ideal RSS")
+
+            plt.legend(loc="best")
+            plt.ylabel("Max RSS in Kb")
+            plt.title("Max RSS " + perm.workload + ":")
+            plt.savefig(os.path.join(sd, ".".join([self.name, perm.workload, "rss", "png"])))
+            plt.clf()
+
+dj_trace = Benchmark_DJ_Trace()
diff --git a/trace_run.c b/trace_run.c
new file mode 100644
index 0000000..604d01e
--- /dev/null
+++ b/trace_run.c
@@ -0,0 +1,750 @@
+#define _LARGEFILE64_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+// #include "malloc.h"
+#include <malloc.h>
+
+// #include "mtrace.h"
+/* Codes for the simulator/workload programs. Copied from mtrace.h. */
+#define C_NOP 0
+#define C_DONE 1
+#define C_MALLOC 2
+#define C_CALLOC 3
+#define C_REALLOC 4
+#define C_FREE 5
+#define C_SYNC_W 6
+#define C_SYNC_R 7
+#define C_ALLOC_PTRS 8
+#define C_ALLOC_SYNCS 9
+#define C_NTHREADS 10
+#define C_START_THREAD 11
+#define C_MEMALIGN 12
+#define C_VALLOC 13
+#define C_PVALLOC 14
+#define C_POSIX_MEMALIGN 15
+
+#if UINTPTR_MAX == 0xffffffffffffffff
+
+#define ticks_t int64_t
+/* Setting quick_run to 1 allows the simulator to model
+   only the allocation and deallocation accounting via
+   atomic_rss. The actual allocations are skipped.  This
+   mode is useful to verify the workload file.  */
+#define quick_run 0
+
+static __inline__ ticks_t rdtsc_s(void)
+{
+  unsigned a, d;
+  asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
+  asm volatile("rdtscp" : "=a" (a), "=d" (d));
+  return ((unsigned long long)a) | (((unsigned long long)d) << 32);
+}
+
+static __inline__ ticks_t rdtsc_e(void)
+{
+  unsigned a, d;
+  asm volatile("rdtscp" : "=a" (a), "=d" (d));
+  asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
+  return ((unsigned long long)a) | (((unsigned long long)d) << 32);
+}
+
+#else
+
+#define ticks_t int32_t
+
+static __inline__ ticks_t rdtsc_s(void)
+{
+  unsigned a, d;
+  asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx");
+  asm volatile("rdtsc" : "=a" (a), "=d" (d));
+  return ((unsigned long)a) | (((unsigned long)d) << 16);
+}
+
+static __inline__ ticks_t rdtsc_e(void)
+{
+  unsigned a, d;
+  asm volatile("rdtscp" : "=a" (a), "=d" (d));
+  asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx");
+  return ((unsigned long)a) | (((unsigned long)d) << 16);
+}
+
+#endif
+
+static ticks_t diff_timeval (struct timeval e, struct timeval s)
+{
+  ticks_t usec;
+  if (e.tv_usec < s.tv_usec)
+    usec = (e.tv_usec + 1000000 - s.tv_usec) + (e.tv_sec-1 - s.tv_sec)*1000000;
+  else
+    usec = (e.tv_usec - s.tv_usec) + (e.tv_sec - s.tv_sec)*1000000;
+  return usec;
+}
+
+#if 1
+#define Q1
+#define Q2
+#else
+pthread_mutex_t genmutex = PTHREAD_MUTEX_INITIALIZER;
+#define Q1   pthread_mutex_lock(&genmutex)
+#define Q2   pthread_mutex_unlock(&genmutex)
+#endif
+
+pthread_mutex_t cmutex = PTHREAD_MUTEX_INITIALIZER;
+#define NCBUF 10
+static char cbuf[NCBUF][30];
+static int ci = 0;
+
+char *comma(ticks_t x)
+{
+  char buf[30], *bs, *bd;
+  int l, i, idx;
+
+  pthread_mutex_lock(&cmutex);
+  ci = (ci + 1) % NCBUF;
+  idx = ci;
+  pthread_mutex_unlock(&cmutex);
+  bs = buf;
+  bd = cbuf[idx];
+
+  sprintf(buf, "%lld", (long long int)x);
+  l = strlen(buf);
+  i = l;
+  while (*bs)
+    {
+      *bd++ = *bs++;
+      i--;
+      if (i % 3 == 0 && *bs)
+	*bd++ = ',';
+    }
+  *bd = 0;
+  return cbuf[idx];
+}
+
+static volatile void **ptrs;
+static volatile size_t *sizes;
+static size_t n_ptrs;
+static volatile char *syncs;
+static pthread_mutex_t *mutexes;
+static pthread_cond_t *conds;
+static size_t n_syncs;
+
+static pthread_mutex_t stat_mutex = PTHREAD_MUTEX_INITIALIZER;
+ticks_t malloc_time = 0, malloc_count = 0;
+ticks_t calloc_time = 0, calloc_count = 0;
+ticks_t realloc_time = 0, realloc_count = 0;
+ticks_t free_time = 0, free_count = 0;
+
+size_t ideal_rss = 0;
+size_t max_ideal_rss = 0;
+static pthread_mutex_t rss_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+void atomic_rss (ssize_t delta)
+{
+  pthread_mutex_lock (&rss_mutex);
+  ideal_rss += delta;
+  if (max_ideal_rss < ideal_rss)
+    max_ideal_rss = ideal_rss;
+  pthread_mutex_unlock (&rss_mutex);
+}
+
+pthread_mutex_t stop_mutex = PTHREAD_MUTEX_INITIALIZER;
+int threads_done = 0;
+
+//#define dprintf printf
+#define dprintf(...) (void)1
+
+//#define mprintf printf
+//#define MDEBUG 1
+#define mprintf(...) (void)1
+
+#define myabort() my_abort_2(thrc, __LINE__)
+void
+my_abort_2 (pthread_t thrc, int line)
+{
+  fprintf(stderr, "Abort thread %p at line %d\n", (void *)thrc, line);
+  abort();
+}
+
+/*------------------------------------------------------------*/
+/* Wrapper around I/O routines */
+
+int io_fd;
+
+#define IOSIZE 65536
+#define IOMIN 4096
+
+static pthread_mutex_t io_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+typedef struct {
+  unsigned char buf[IOSIZE];
+  size_t incr;
+  size_t max_incr;
+  size_t buf_base;
+  size_t buf_idx;
+  int saw_eof;
+} IOPerThreadType;
+
+IOPerThreadType main_io;
+IOPerThreadType *thread_io;
+
+void
+io_init (IOPerThreadType *io, size_t file_offset, int incr)
+{
+  if (incr > IOSIZE)
+    incr = IOSIZE;
+  if (incr < IOMIN)
+    incr = IOMIN;
+
+  io->buf_base = file_offset;
+  io->buf_idx = 0;
+  io->incr = incr;
+
+  pthread_mutex_lock (&io_mutex);
+  lseek64 (io_fd, io->buf_base, SEEK_SET);
+  // short read OK, the eof is just to prevent runaways from bad data.
+  if (read (io_fd, io->buf, incr) < 0)
+    io->saw_eof = 1;
+  else
+    io->saw_eof = 0;
+  pthread_mutex_unlock (&io_mutex);
+}
+
+unsigned char
+io_read (IOPerThreadType *io)
+{
+  if (io->buf_idx >= io->incr)
+    io_init (io, io->buf_base + io->buf_idx, io->incr);
+  if (io->saw_eof)
+    return 0xff;
+  return io->buf [io->buf_idx++];
+}
+
+unsigned char
+io_peek (IOPerThreadType *io)
+{
+  if (io->buf_idx >= io->incr)
+    io_init (io, io->buf_base + io->buf_idx, io->incr);
+  if (io->saw_eof)
+    return 0xff;
+  return io->buf [io->buf_idx];
+}
+
+size_t
+io_pos (IOPerThreadType *io)
+{
+  return io->buf_base + io->buf_idx;
+}
+
+/*------------------------------------------------------------*/
+
+static void
+wmem (volatile void *ptr, int count)
+{
+  char *p = (char *)ptr;
+  int i;
+
+  if (!p)
+    return;
+
+  for (i=0; i<count; i++)
+    p[i] = 0x11;
+}
+#define xwmem(a,b)
+
+static size_t get_int (IOPerThreadType *io)
+{
+  size_t rv = 0;
+  while (1)
+  {
+    unsigned char c = io_read (io);
+    rv |= (c & 0x7f);
+    if (c & 0x80)
+      rv <<= 7;
+    else
+      return rv;
+  }
+}
+
+static void free_wipe (size_t idx)
+{
+  char *cp = (char *)ptrs[idx];
+  if (cp == NULL)
+    return;
+  size_t sz = sizes[idx];
+  size_t i;
+  for (i=0; i<sz; i++)
+    {
+      if (i % 8 == 1)
+	cp[i] = i / 8;
+      else
+	cp[i] = 0x22;
+    }
+}
+
+static void *
+thread_common (void *my_data_v)
+{
+  pthread_t thrc = pthread_self ();
+  size_t p1, p2, sz, sz2;
+  IOPerThreadType *io = (IOPerThreadType *)my_data_v;
+  ticks_t my_malloc_time = 0, my_malloc_count = 0;
+  ticks_t my_calloc_time = 0, my_calloc_count = 0;
+  ticks_t my_realloc_time = 0, my_realloc_count = 0;
+  ticks_t my_free_time = 0, my_free_count = 0;
+  ticks_t stime, etime;
+  int thread_idx = io - thread_io;
+#ifdef MDEBUG
+  volatile void *tmp;
+#endif
+
+  while (1)
+    {
+      unsigned char this_op = io_peek (io);
+      if (io->saw_eof)
+	myabort();
+      dprintf("op %p:%ld is %d\n", (void *)thrc, io_pos (io),  io_peek (io));
+      switch (io_read (io))
+	{
+	case C_NOP:
+	  break;
+
+	case C_DONE:
+	  dprintf("op %p:%ld DONE\n", (void *)thrc, io_pos (io));
+	  pthread_mutex_lock (&stat_mutex);
+	  malloc_time += my_malloc_time;
+	  calloc_time += my_calloc_time;
+	  realloc_time += my_realloc_time;
+	  free_time += my_free_time;
+	  malloc_count += my_malloc_count;
+	  calloc_count += my_calloc_count;
+	  realloc_count += my_realloc_count;
+	  free_count += my_free_count;
+	  threads_done ++;
+	  pthread_mutex_unlock (&stat_mutex);
+	  pthread_mutex_lock(&stop_mutex);
+	  pthread_mutex_unlock(&stop_mutex);
+	  return NULL;
+
+	case C_MEMALIGN:
+	  p2 = get_int (io);
+	  sz2 = get_int (io);
+	  sz = get_int (io);
+	  dprintf("op %p:%ld %ld = MEMALIGN %ld %ld\n", (void *)thrc, io_pos (io), p2, sz2, sz);
+	  /* we can't force memalign to return NULL (fail), so just skip it.  */
+	  if (p2 == 0)
+	    break;
+	  if (p2 > n_ptrs)
+	    myabort();
+	  stime = rdtsc_s();
+	  Q1;
+	  if (ptrs[p2])
+	    {
+	      if (!quick_run)
+		free ((void *)ptrs[p2]);
+	      atomic_rss (-sizes[p2]);
+	    }
+	  if (!quick_run)
+	    ptrs[p2] = memalign (sz2, sz);
+	  else
+	    ptrs[p2] = (void *)p2;
+	  /* Verify the alignment matches what is expected.  */
+	  if (((size_t)ptrs[p2] & (sz2 - 1)) != 0)
+	    myabort ();
+	  sizes[p2] = sz;
+	  mprintf("%p = memalign(%lx, %lx)\n", ptrs[p2], sz2, sz);
+	  Q2;
+	  etime = rdtsc_e();
+	  if (ptrs[p2] != NULL)
+	    atomic_rss (sz);
+	  if (etime < stime)
+	    {
+	      printf("s: %llx e:%llx  d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime));
+	    }
+	  my_malloc_time += etime - stime;
+	  my_malloc_count ++;
+	  if (!quick_run)
+	    wmem(ptrs[p2], sz);
+	  break;
+
+	case C_MALLOC:
+	  p2 = get_int (io);
+	  sz = get_int (io);
+	  dprintf("op %p:%ld %ld = MALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz);
+	  /* we can't force malloc to return NULL (fail), so just skip it.  */
+	  if (p2 == 0)
+	    break;
+	  if (p2 > n_ptrs)
+	    myabort();
+	  stime = rdtsc_s();
+	  Q1;
+	  if (ptrs[p2])
+	    {
+	      if (!quick_run)
+		free ((void *)ptrs[p2]);
+	      atomic_rss (-sizes[p2]);
+	    }
+	  if (!quick_run)
+	    ptrs[p2] = malloc (sz);
+	  else
+	    ptrs[p2] = (void *)p2;
+	  sizes[p2] = sz;
+	  mprintf("%p = malloc(%lx)\n", ptrs[p2], sz);
+	  Q2;
+	  etime = rdtsc_e();
+	  if (ptrs[p2] != NULL)
+	    atomic_rss (sz);
+	  if (etime < stime)
+	    {
+	      printf("s: %llx e:%llx  d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime));
+	    }
+	  my_malloc_time += etime - stime;
+	  my_malloc_count ++;
+	  if (!quick_run)
+	    wmem(ptrs[p2], sz);
+	  break;
+
+	case C_CALLOC:
+	  p2 = get_int (io);
+	  sz = get_int (io);
+	  dprintf("op %p:%ld %ld = CALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz);
+	  /* we can't force calloc to return NULL (fail), so just skip it.  */
+	  if (p2 == 0)
+	    break;
+	  if (p2 > n_ptrs)
+	    myabort();
+	  if (ptrs[p2])
+	    {
+	      if (!quick_run)
+		free ((void *)ptrs[p2]);
+	      atomic_rss (-sizes[p2]);
+	    }
+	  stime = rdtsc_s();
+	  Q1;
+	  if (!quick_run)
+	    ptrs[p2] = calloc (sz, 1);
+	  else
+	    ptrs[p2] = (void *)p2;
+	  sizes[p2] = sz;
+	  mprintf("%p = calloc(%lx)\n", ptrs[p2], sz);
+	  Q2;
+	  if (ptrs[p2])
+	    atomic_rss (sz);
+	  my_calloc_time += rdtsc_e() - stime;
+	  my_calloc_count ++;
+	  if (!quick_run)
+	    wmem(ptrs[p2], sz);
+	  break;
+
+	case C_REALLOC:
+	  p2 = get_int (io);
+	  p1 = get_int (io);
+	  sz = get_int (io);
+	  dprintf("op %p:%ld %ld = REALLOC %ld %ld\n", (void *)thrc, io_pos (io), p2, p1, sz);
+	  if (p1 > n_ptrs)
+	    myabort();
+	  if (p2 > n_ptrs)
+	    myabort();
+	  /* we can't force realloc to return NULL (fail), so just skip it.  */
+	  if (p2 == 0)
+	    break;
+
+	  if (ptrs[p1])
+	    atomic_rss (-sizes[p1]);
+	  if (!quick_run)
+	    free_wipe(p1);
+	  stime = rdtsc_s();
+	  Q1;
+#ifdef MDEBUG
+	  tmp = ptrs[p1];
+#endif
+	  if (!quick_run)
+	    ptrs[p2] = realloc ((void *)ptrs[p1], sz);
+	  else
+	    ptrs[p2] = (void *)p2;
+	  sizes[p2] = sz;
+	  mprintf("%p = realloc(%p,%lx)\n", ptrs[p2], tmp,sz);
+	  Q2;
+	  my_realloc_time += rdtsc_e() - stime;
+	  my_realloc_count ++;
+	  if (!quick_run)
+	    wmem(ptrs[p2], sz);
+	  if (p1 != p2)
+	    ptrs[p1] = 0;
+	  if (ptrs[p2])
+	    atomic_rss (sizes[p2]);
+	  break;
+
+	case C_FREE:
+	  p1 = get_int (io);
+	  if (p1 > n_ptrs)
+	    myabort();
+	  dprintf("op %p:%ld FREE %ld\n", (void *)thrc, io_pos (io), p1);
+	  if (!quick_run)
+	    free_wipe (p1);
+	  if (ptrs[p1])
+	    atomic_rss (-sizes[p1]);
+	  stime = rdtsc_s();
+	  Q1;
+	  mprintf("free(%p)\n", ptrs[p1]);
+	  if (!quick_run)
+	    free ((void *)ptrs[p1]);
+	  Q2;
+	  my_free_time += rdtsc_e() - stime;
+	  my_free_count ++;
+	  ptrs[p1] = 0;
+	  break;
+
+	case C_SYNC_W:
+	  p1 = get_int(io);
+	  dprintf("op %p:%ld SYNC_W %ld\n", (void *)thrc, io_pos (io), p1);
+	  if (p1 > n_syncs)
+	    myabort();
+	  pthread_mutex_lock (&mutexes[p1]);
+	  syncs[p1] = 1;
+	  pthread_cond_signal (&conds[p1]);
+	  __sync_synchronize ();
+	  pthread_mutex_unlock (&mutexes[p1]);
+	  break;
+
+	case C_SYNC_R:
+	  p1 = get_int(io);
+	  dprintf("op %p:%ld SYNC_R %ld\n", (void *)thrc, io_pos (io), p1);
+	  if (p1 > n_syncs)
+	    myabort();
+	  pthread_mutex_lock (&mutexes[p1]);
+	  while (syncs[p1] != 1)
+	    {
+	      pthread_cond_wait (&conds[p1], &mutexes[p1]);
+	      __sync_synchronize ();
+	    }
+	  pthread_mutex_unlock (&mutexes[p1]);
+	  break;
+
+	default:
+	  printf("op %d - unsupported, thread %d addr %lu\n",
+		 this_op, thread_idx, (long unsigned int)io_pos (io));
+	  myabort();
+	}
+    }
+}
+
+static void *alloc_mem (size_t amt)
+{
+  void *rv = mmap (NULL, amt, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  mlock (rv, amt);
+  memset (rv, 0, amt);
+  return rv;
+}
+
+static pthread_t *thread_ids;
+
+void *
+my_malloc (const char *msg, int size, IOPerThreadType *io, size_t *psz, size_t count)
+{
+  void *rv;
+  if (psz)
+    count = *psz = get_int (io);
+  dprintf ("my_malloc for %s size %d * %ld\n", msg, size, count);
+  rv = alloc_mem(size * count);
+  if (!rv)
+    {
+      fprintf(stderr, "calloc(%lu,%lu) failed\n", (long unsigned)size, (long unsigned)*psz);
+      exit(1);
+    }
+  mlock (rv, size * count);
+  return rv;
+}
+
+static const char * const scan_names[] = {
+  "UNUSED",
+  "ARENA",
+  "HEAP",
+  "CHUNK_USED",
+  "CHUNK_FREE",
+  "FASTBIN_FREE",
+  "UNSORTED",
+  "TOP",
+  "TCACHE",
+  "USED"
+};
+
+void
+malloc_scan_callback (void *ptr, size_t length, int type)
+{
+  printf("%s: ptr %p length %llx\n", scan_names[type], ptr, (long long)length);
+}
+
+#define MY_ALLOC(T, psz)				\
+  (typeof (T)) my_malloc (#T, sizeof(*T), &main_io, psz, 0)
+#define MY_ALLOCN(T, count)				\
+  (typeof (T)) my_malloc (#T, sizeof(*T), &main_io, NULL, count)
+
+int
+main(int argc, char **argv)
+{
+  ticks_t start=0;
+  ticks_t end;
+  ticks_t usec;
+  struct timeval tv_s, tv_e;
+  int thread_idx = 0;
+  int i;
+  size_t n_threads = 0;
+  size_t idx;
+  struct rusage res_start, res_end;
+  int done;
+  size_t guessed_io_size = 4096;
+  struct stat statb;
+
+  if (argc < 2)
+    {
+      fprintf(stderr, "Usage: %s <trace2dat.outfile>\n", argv[0]);
+      exit(1);
+    }
+  io_fd = open(argv[1], O_RDONLY);
+  if (io_fd < 0)
+    {
+      fprintf(stderr, "Unable to open %s for reading\n", argv[1]);
+      perror("The error was");
+      exit(1);
+    }
+  fstat (io_fd, &statb);
+
+  io_init (&main_io, 0, IOMIN);
+
+  pthread_mutex_lock(&stop_mutex);
+
+  done = 0;
+  while (!done)
+    {
+      switch (io_read (&main_io))
+	{
+	case C_NOP:
+	  break;
+	case C_ALLOC_PTRS:
+	  ptrs = MY_ALLOC (ptrs, &n_ptrs);
+	  sizes = alloc_mem(sizeof(sizes[0]) * n_ptrs);
+	  ptrs[0] = 0;
+	  break;
+	case C_ALLOC_SYNCS:
+	  n_syncs = get_int(&main_io);
+	  syncs = MY_ALLOCN (syncs, n_syncs);
+	  conds = MY_ALLOCN (conds, n_syncs);
+	  mutexes = MY_ALLOCN (mutexes, n_syncs);
+	  for (idx=0; idx<n_syncs; idx++)
+	    {
+	      pthread_mutex_init (&mutexes[idx], NULL);
+	      pthread_cond_init (&conds[idx], NULL);
+	    }
+	  break;
+	case C_NTHREADS:
+	  thread_ids = MY_ALLOC (thread_ids, &n_threads);
+	  thread_io = MY_ALLOCN (thread_io, n_threads);
+	  guessed_io_size = ((statb.st_size / n_threads) < (1024*1024)) ? 65536 : 4096;
+	  /* The next thing in the workscript is thread creation */
+	  getrusage (RUSAGE_SELF, &res_start);
+	  gettimeofday (&tv_s, NULL);
+	  start = rdtsc_s();
+	  break;
+	case C_START_THREAD:
+	  idx = get_int (&main_io);
+	  io_init (& thread_io[thread_idx], idx, guessed_io_size);
+	  pthread_create (&thread_ids[thread_idx], NULL, thread_common, thread_io + thread_idx);
+	  dprintf("Starting thread %lld at offset %lu %lx\n", (long long)thread_ids[thread_idx], (unsigned long)idx, (unsigned long)idx);
+	  thread_idx ++;
+	  break;
+	case C_DONE:
+	  do
+	    {
+	      pthread_mutex_lock (&stat_mutex);
+	      i = threads_done;
+	      pthread_mutex_unlock (&stat_mutex);
+	    } while (i < thread_idx);
+	  done = 1;
+	  break;
+	}
+    }
+  if (!quick_run)
+    {
+      end = rdtsc_e();
+      gettimeofday (&tv_e, NULL);
+      getrusage (RUSAGE_SELF, &res_end);
+
+      printf("%s cycles\n", comma(end - start));
+      usec = diff_timeval (tv_e, tv_s);
+      printf("%s usec wall time\n", comma(usec));
+
+      usec = diff_timeval (res_end.ru_utime, res_start.ru_utime);
+      printf("%s usec across %d thread%s\n",
+	     comma(usec), (int)n_threads, n_threads == 1 ? "" : "s");
+      printf("%s Kb Max RSS (%s -> %s)\n",
+	     comma(res_end.ru_maxrss - res_start.ru_maxrss),
+	     comma(res_start.ru_maxrss), comma(res_end.ru_maxrss));
+    }
+  printf("%s Kb Max Ideal RSS\n", comma (max_ideal_rss / 1024));
+
+  if (malloc_count == 0) malloc_count ++;
+  if (calloc_count == 0) calloc_count ++;
+  if (realloc_count == 0) realloc_count ++;
+  if (free_count == 0) free_count ++;
+
+  if (!quick_run)
+    {
+      printf("\n");
+      printf("sizeof ticks_t is %lu\n", sizeof(ticks_t));
+      printf("Avg malloc time: %6s in %10s calls\n", comma(malloc_time/malloc_count), comma(malloc_count));
+      printf("Avg calloc time: %6s in %10s calls\n", comma(calloc_time/calloc_count), comma(calloc_count));
+      printf("Avg realloc time: %5s in %10s calls\n", comma(realloc_time/realloc_count), comma(realloc_count));
+      printf("Avg free time: %8s in %10s calls\n", comma(free_time/free_count), comma(free_count));
+      printf("Total call time: %s cycles\n", comma(malloc_time+calloc_time+realloc_time+free_time));
+      printf("\n");
+    }
+
+#if 0
+  /* Free any still-held chunks of memory.  */
+  for (idx=0; idx<n_ptrs; idx++)
+    if (ptrs[idx])
+      {
+	free((void *)ptrs[idx]);
+	ptrs[idx] = 0;
+      }
+#endif
+
+#if 0
+  /* This will fail (crash) for system glibc but that's OK.  */
+  __malloc_scan_chunks(malloc_scan_callback);
+
+  malloc_info (0, stdout);
+#endif
+
+#if 0
+  /* ...or report them as used.  */
+  for (idx=0; idx<n_ptrs; idx++)
+    if (ptrs[idx])
+      {
+	char *p = (char *)ptrs[idx] - 2*sizeof(size_t);
+	size_t *sp = (size_t *)p;
+	size_t size = sp[1] & ~7;
+	malloc_scan_callback (sp, size, 9);
+      }
+#endif
+
+  /* Now that we've scanned all the per-thread caches, it's safe to
+     let them exit and clean up.  */
+  pthread_mutex_unlock(&stop_mutex);
+
+  for (i=0; i<thread_idx; i++)
+    pthread_join (thread_ids[i], NULL);
+
+  return 0;
+}
-- 
GitLab