diff --git a/speedymalloc.c b/speedymalloc.c
index eee7250bc5b5afacf0d8a99cbd4e390dc447023c..1656f25aad1a19862ddf07244635cdacbde57359 100644
--- a/speedymalloc.c
+++ b/speedymalloc.c
@@ -164,6 +164,11 @@ static inline size_t alignup(size_t size, size_t alignment) {
 	return (size + mask) & ~mask;
 }
 
+static inline size_t aligndown(size_t size, size_t alignment) {
+	// round size down to a multiple of alignment (exact only for power-of-two alignments)
+	return size & ~(alignment - 1);
+}
+
 static size_t MEMSIZE = 0;
 
 static int grow_bump_region() {
@@ -175,13 +180,18 @@ static int grow_bump_region() {
 		exit(errno);
 	}
 
+	tls_t* ltls = tls;
+
 	// init tls
-	if (!tls)
-		tls = (tls_t*)mem;
+	if (!tls) {
+		tls = (tls_t*) mem;
+		ltls = tls;
+		ltls->end = ((uintptr_t) ltls) + sizeof(tls_t);
+	} else {
+		ltls->end = (uintptr_t) mem;
+	}
 
-	tls_t* ltls = tls;
-	ltls->ptr = ((uintptr_t)ltls) + sizeof(tls_t);
-	ltls->end = ((uintptr_t)ltls) + MEMSIZE;
+	ltls->ptr = ltls->end + MEMSIZE;
 #ifdef MADVISE_WILLNEED
 	ltls->next_willneed = ltls->ptr;
 #endif
@@ -212,33 +222,31 @@ static void* bump_alloc(size_t size, size_t alignment) {
 	// expensive thread-local storage operations
 	tls_t* ltls = tls;
 
-	// allocate size header
-	ltls->ptr += sizeof(size_t);
+	uintptr_t new_ptr = aligndown(ltls->ptr - size, alignment) - sizeof(size_t);
 
 	// regrow bump region
-	if (unlikely((ltls->ptr + size + alignment) > ltls->end)) {
+	if (unlikely(new_ptr < ltls->end)) {
 		if (grow_bump_region() < 0)
 			return NULL;
 
-		ltls = tls;
-		ltls->ptr = tls->ptr + sizeof(size_t);
+		// recalculate new_ptr
+		new_ptr = aligndown(ltls->ptr - size, alignment) - sizeof(size_t);
+		assert((uintptr_t) new_ptr > ltls->ptr);
 	}
 
-	// align ptr
-	ltls->ptr = alignup(ltls->ptr, alignment);
-
 #ifdef MADVISE_WILLNEED
-	if(unlikely(ltls->ptr >= ltls->next_willneed)) {
-		madvise((void*)ltls->next_willneed, WILLNEED_SIZE, MADV_WILLNEED);
-		ltls->next_willneed += WILLNEED_SIZE;
+	if (unlikely(ltls->ptr <= ltls->next_willneed)) {
+		ltls->next_willneed -= WILLNEED_SIZE;
+		madvise((void*) ltls->next_willneed, WILLNEED_SIZE, MADV_WILLNEED);
 	}
 #endif
 
-	void* ptr = (void*)ltls->ptr;
-	ptr2chunk(ptr)->size = size;
-	ltls->ptr += size;
+	chunk_t* chunk = (chunk_t*) new_ptr;
+	assert((uintptr_t) chunk2ptr(chunk) % alignment == 0);
+	chunk->size = size;
+	ltls->ptr = new_ptr;
 
-	return ptr;
+	return chunk2ptr(chunk);
 }
 
 static void* prepare_chunk(chunk_t* chunk, size_t bin_id) {