diff --git a/asm/ALU.s b/asm/ALU.s
index 080a9db8118cda17cdaa6c523b6177c669ed0dc9..a2556fd9c8b05502eeb0475abc7fdf4fd1081663 100644
--- a/asm/ALU.s
+++ b/asm/ALU.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # 
 
-	.word   116;
+	.word   128;
 	addi	r1 = r0, 255;  # first instruction not executed 0
 	addi	r1 = r0, 2; #1 r1 = 2
 	addi	r2 = r0, 3; #2 r2 = 3
@@ -33,5 +33,6 @@
 	addi    r3 = r0, 1;
 #	rr	r2 = r2, r3;
 	halt; 
-
-
+	nop;
+	nop;
+	nop;
diff --git a/asm/ALUi.s b/asm/ALUi.s
index 009ef9ad0d18a963af8c11d1041b65698235860f..dca125cb1bd24c3b32ddb0cbb39c828852fa5e87 100644
--- a/asm/ALUi.s
+++ b/asm/ALUi.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 #
 
-	.word   44;
+	.word   56;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 15; # r1 = 15
 	subi	r1 = r1, 5; # r1 = 10
@@ -13,3 +13,6 @@
 	andi	r1 = r1, 3; # r1 = 3
 	addi    r2 = r0, 24;# init r2
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/ALUl.s b/asm/ALUl.s
index 55010d4c09ef09aa4d1196daf61e1a98a1da63d6..079386a3dc0b3867eaab12a2b22d7251e880da0a 100644
--- a/asm/ALUl.s
+++ b/asm/ALUl.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # long immediate instructions
 
-	.word   168;
+	.word   180;
 	addi	r1 = r0, 255;  # first instruction not executed 0
 	addi	r1 = r0, 2; # r1 = 2
 	add     r1  = r1, 65536; # r1 = 65538
@@ -29,5 +29,6 @@
 #	rr	r10 = r10, 2; # r10(31) = 1
 	sra	r10 = r10, 5; # fills in 5 upper bits with 1
 	halt; 
-
-
+	nop;
+	nop;
+	nop;
diff --git a/asm/basic.s b/asm/basic.s
index 3fe76c8812f45791f898f5ef821f39b12a3167e3..78966a0608ac4da21815d19729ffda1aff0d135a 100644
--- a/asm/basic.s
+++ b/asm/basic.s
@@ -2,7 +2,7 @@
 # Just a few basic instructions to watch the pipeline going in ModelSim
 #
 
-	.word   32;
+	.word   40;
 	addi	r1 = r0, 255;
 
 	addi	r1 = r0, 15;
@@ -11,3 +11,6 @@
 	addi	r3 = r0, 3;
 	add	r4 = r2, r3;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/blink.s b/asm/blink.s
index abcfa676e68d942151555426d3ec173fc4a1938e..8872d2e72b2730ef8c14f3514c6869620d29c8f1 100644
--- a/asm/blink.s
+++ b/asm/blink.s
@@ -5,7 +5,7 @@
 # Toggle LED with input from UART.
 #
 
-		.word   124;
+		.word   136;
 		addi	r0 = r0, 0;  # first instruction not executed
 
 		addi	r7 = r0, 16;
@@ -43,3 +43,6 @@
                 addi    r0  = r0 , 0;
                 addi    r0  = r0 , 0;
 		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/boot_loader.s b/asm/boot_loader.s
deleted file mode 100644
index 835aea76c474288edea7edf66fb2c3dc2fc644f1..0000000000000000000000000000000000000000
--- a/asm/boot_loader.s
+++ /dev/null
@@ -1,78 +0,0 @@
-#
-# Expected Result: ...
-# this echos wrong characters
-# MS: what does this program? Looks very out of date: UART at wrong address, bne,...
-# SA: should I continue with assembly boot loader? 
-# MS: I think a boot loader shall be done in C if possible
-# and we shall drop unused out-of-date code
-		.word	264;
-		addi    r16  = r16 , 64;
-		addi    r7 = r7 , 511;
-		addi	r1   = r0 , 2;
-		lwm     r10  = [r5 + 0];
-                nop;
-                and     r11  = r10 , r1;
-		bne     r1 != r11 , 4;
-		addi	r0  = r0 , 1;
-                addi    r0  = r0 , 1;
-                lwm     r15  = [r5 + 1];
-                lwm     r15  = [r5 + 1];
-		addi    r17  = r17 , 24;
-		sl	r15 = r15 , r17;
-		lwm     r10  = [r5 + 0];
-		addi    r0  = r0 , 1;
-                and     r11  = r10 , r1;
-		bne     r1 != r11 , 4;
-		addi	r0  = r0 , 1;
-                addi    r0  = r0 , 1;
-                lwm     r18  = [r5 + 1];
-                lwm     r18  = [r5 + 1];
-		addi	r19 = r19 , 16;
-		sl      r18 = r18 , r19;
-		or	r15 = r15 , r18;
-		lwm     r10  = [r5 + 0];
-		nop;
-                and     r11  = r10 , r1;
-		bne     r1 != r11 , 4;
-		addi	r0  = r0 , 1;
-                addi    r0  = r0 , 1;
-                lwm     r20  = [r5 + 1];
-                lwm     r20  = [r5 + 1];
-		addi	r21 = r21 , 8;
-		sl      r20 = r20 , r21;
-		or	r15 = r15 , r20;
-		lwm     r10  = [r5 + 0];
-		nop;
-                and     r11  = r10 , r1;
-		bne     r1 != r11 , 4;
-		addi	r0  = r0 , 1;
-                addi    r0  = r0 , 1;
-                lwm     r22  = [r5 + 1];
-                lwm     r22  = [r5 + 1];
-		addi    r0  = r0 , 1;
-		or	r15 = r15 , r22;
-                swm     [r7 + 1] = r15; 
-		addi    r27 = r27 , 1;
-		andi	r0 = r0 , 0;
-		andi	r1 = r1 , 0;
-		andi    r5 = r5 , 0;
-		andi    r10 = r10 , 0;
-		andi    r11 = r11 , 0;
-		andi	r15 = r15 , 0;
-		andi    r17 = r17 , 0;
-		andi    r18 = r18 , 0;
-		andi    r19 = r19 , 0;
-		andi    r20 = r20 , 0;
-		andi    r21 = r21 , 0;
-		andi    r22 = r22 , 0;
-		addi    r7 = r7 , 1;
-		bne	r27 != r16 , 59;
-		addi    r9 = r9 , 1;
-		andi	r9 = r9 , 0;
-		andi    r27 = r27 , 0;
-		andi    r7 = r7 , 0;
-		andi    r16 = r16 , 0;
-		andi    r0 = r0 , 0;
-		andi    r0 = r0 , 0;
-		andi    r0 = r0 , 0;
-                halt;
diff --git a/asm/branch.s b/asm/branch.s
index 68bc38753792417ef5ccac17fa69b53d1afe2c3d..6e2bbeadb4e4621cdc846e014cc02e48f2dbd50c 100644
--- a/asm/branch.s
+++ b/asm/branch.s
@@ -1,7 +1,7 @@
 #
 # Test branch
 #
-	.word   104;
+	.word   116;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi	r1 = r0, 1;
 	addi	r1 = r0, 2;
@@ -31,3 +31,6 @@ end:	addi	r1 = r0, 13;
 	addi	r1 = r0, 15;
 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/bug_inverse_predicate_branch.s b/asm/bug_inverse_predicate_branch.s
index f953e32668fdda9f969a0921170ca040e64d747d..0477e0bff3adee01dc225f06d27e2ba4f1632e4f 100644
--- a/asm/bug_inverse_predicate_branch.s
+++ b/asm/bug_inverse_predicate_branch.s
@@ -4,7 +4,7 @@
 # Expected Result: '0'
 # Current output: '1'
 
-	.word	80;
+	.word	84;
 x0:		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r5 = r0, 15;
 		sli	r5 = r5, 28;
diff --git a/asm/call.s b/asm/call.s
index 082eb7d8fa1227461dae7c4bc908da455b8b3eca..e72a2e8dff929e93a0565ae1b7f89ad205ad810c 100644
--- a/asm/call.s
+++ b/asm/call.s
@@ -20,7 +20,7 @@
 	addi	r1 = r0, 0;
 	addi	r1 = r0, 0;
 
-	.word 100; # This looks like not working at all....
+	.word 96;
 start:	addi	r1 = r1, 1;
 	addi	r30 = r0, start;
 	call	foo;
@@ -34,7 +34,7 @@ start:	addi	r1 = r1, 1;
 	br	end;
 	addi	r0 = r0, 0;
 	addi	r0 = r0, 0;
-	.word 20; # this shall be the length - which unit, assume bytes?
+	.word 24;
 foo:	addi	r6 = r0, 6;
 	addi	r7 = r0, 7;
 	ret	r30, r31;   # r32 offset to method base in r30
@@ -44,3 +44,6 @@ foo:	addi	r6 = r0, 6;
 end:	addi	r8 = r0, 8;
 	addi 	r9 = r0, 9;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/callr.s b/asm/callr.s
index e829880b28627794c450cc9855db8cdcd2a07b00..14c29ebea490242754f41a43c420eeb9229edfe7 100644
--- a/asm/callr.s
+++ b/asm/callr.s
@@ -26,7 +26,7 @@ start:	addi	r1 = r1, 1;
 	br	end;
 	addi	r0 = r0, 0;
 	addi	r0 = r0, 0;
-	.word 20; # this shall be the length - which unit, assume bytes?
+	.word 24;
 foo:	addi	r6 = r0, 6;
 	addi	r7 = r0, 7;
 	ret	r30, r31;   # r32 offset to method base in r30
@@ -36,3 +36,6 @@ foo:	addi	r6 = r0, 6;
 end:	addi	r8 = r0, 8;
 	addi 	r9 = r0, 9;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/compare.s b/asm/compare.s
index cc4febff1d8aad258e9a50e9500934f54f5840d3..792c86e86f5ceba778f19a2f5c1c1737ab4e352e 100644
--- a/asm/compare.s
+++ b/asm/compare.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # 
 
-	.word   232;
+	.word   244;
 	addi	r1 = r0, 255;  # first instruction not executed 0
 	addi	r1 = r0, 2; #1 r1 = 2
 x0:	addi	r2 = r0, 2; #2 r2 = 2
@@ -64,5 +64,6 @@ x8:	cmplt   p2  = r1, r2;
 	addi	r23 = r0, 1;  	
 
 	halt; 
-
-
+	nop;
+	nop;
+	nop;
diff --git a/asm/dual_even_odd_address.s b/asm/dual_even_odd_address.s
index 38090e65a6eb51e5decc0f6085a5bd5c7901a147..b4654360f4dcdbbbff7c0d02ad698e6978d4e3bf 100644
--- a/asm/dual_even_odd_address.s
+++ b/asm/dual_even_odd_address.s
@@ -3,7 +3,7 @@
 #
 # Expected Result: echo entered characters
 #
-		.word   164;
+		.word   176;
 		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r1 = r0, 1;
 		add	r2   = r0 , 65536; # dual issue from odd
@@ -35,4 +35,7 @@ x1:		addi    r5 = r5, 1;
 		nop;
 		nop;
 		addi    r13 = r13, 1;
-                halt;
+		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/dual_forwarding.s b/asm/dual_forwarding.s
index ee684353fad6b4f783eca892f6034b439ddc748d..d8f87bba6f1c2f089b57e7d8ad9ef48cbf031044 100644
--- a/asm/dual_forwarding.s
+++ b/asm/dual_forwarding.s
@@ -4,7 +4,7 @@
 #
 # Expected Result: 
 #
-		.word   140;
+		.word   152;
 		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r1 = r0, 1;
 		add	r2   = r0 , 65536; 
@@ -38,4 +38,7 @@
 		add	r15 = r10, r11;
 		add	r16 = r10, r11;
 		add	r17 = r10, r11;
-                halt;
+		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/echo.s b/asm/echo.s
index 133d818b2a6d6b58bf2667bb91af53b782504481..0dbfae3cb4b1a6646b3c825f363afad1124128c5 100644
--- a/asm/echo.s
+++ b/asm/echo.s
@@ -4,7 +4,7 @@
 # Expected Result: echo entered characters
 # SA: this is the working version of echo.
 
-		.word   104;
+		.word   116;
 # Set up IO address
 x0:		addi	r0 = r0, 0;
 		add 	r5 = r0, 0xf0000800;
@@ -40,3 +40,6 @@ x2:		lwl     r10  = [r5 + 0];
 
 # Never reached
 		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/echo_final.s b/asm/echo_final.s
deleted file mode 100644
index f2745e5e981eb434e323709be395bf081a3cffb1..0000000000000000000000000000000000000000
--- a/asm/echo_final.s
+++ /dev/null
@@ -1,42 +0,0 @@
-#
-# This is a simple echo program on the UART
-#
-# Expected Result: echo entered characters
-#
-# MS: is this roughly the same as echo.s? 
-# SA: This is a deprecated version of echo, it is no longer valid.
-
-		.word   104;
-		addi	r0 = r0, 0;  # first instruction not executed
-		addi	r5 = r0, 15;
-		sli	r5 = r5, 28;
-
-		addi	r1   = r0 , 2;
-		lwl     r10  = [r5 + 0];
-		addi	r0 = r0, 0;
-                and     r11  = r10 , r1;
-		cmpneq  p1 = r1, r11;
-	(p1)	bc	4;
-                addi    r0  = r0 , 0;
-                addi    r0  = r0 , 0;		
-		addi	r5 = r5, 1;
-
-                lwl     r15  = [r5 + 0];
-
-		subi	r5 = r5, 1;
-		addi	r3 = r0, 1;
-		lwl     r10  = [r5 + 0];
-		addi	r0 = r0, 0;
-		and     r11 = r3 , r10;
-		cmpneq  p1 = r3, r11;
-	(p1)	bc	12;
-                addi    r0  = r0 , 0;
-                addi    r0  = r0 , 0;
-
-		addi    r5 = r5, 1;
-		swl	[r5 + 0] = r15;
-		bc	0;
-                addi    r0  = r0 , 0;
-                addi    r0  = r0 , 0;
-                halt;
-
diff --git a/asm/echo_scratchpad1.s b/asm/echo_scratchpad1.s
index b8a4bb68cf9c5dc075fa79212d1d7ead29c17f41..365d79ac9e3c52ad5c59d42092ec3594af8015b7 100644
--- a/asm/echo_scratchpad1.s
+++ b/asm/echo_scratchpad1.s
@@ -4,7 +4,7 @@
 # Expected Result: just checking if echo works along with other instructions.
 #
 
-		.word   124;
+		.word   136;
 x0:		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r5 = r0, 15;
 		sli	r5 = r5, 28;
@@ -41,4 +41,6 @@ x2:		lwl     r10  = [r5 + 0];
                 addi    r0  = r0 , 0;
                 addi    r0  = r0 , 0;
                 halt;
-
+		nop;
+		nop;
+		nop;
diff --git a/asm/fetch_double.s b/asm/fetch_double.s
index ab0acf91f7f29fa0c03209afd8ad7f90eb25ac4a..b4572510d9a488c333cae9e59d42e7cce4688112 100644
--- a/asm/fetch_double.s
+++ b/asm/fetch_double.s
@@ -2,7 +2,7 @@
 # Test dual issue feature by using long constants with a double word fetch
 #
 
-	.word   56;
+	.word   68;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi	r1 = r0, 1;
 	add	r2 = r0, 65536;
@@ -12,3 +12,6 @@
 	add	r6 = r0, 200000;
 	add	r7 = r0, 300000; # add for a long immediate is strange
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/forward_issue.s b/asm/forward_issue.s
index 9bd32cf9af3033cc02815820278724465020911b..5f8485c33118034c2f61a1b5a49c40fd1452201c 100644
--- a/asm/forward_issue.s
+++ b/asm/forward_issue.s
@@ -2,7 +2,7 @@
 # This small test bench showed a former forwarding issue. Is fixed quite some time.
 #
 
-		.word   100;
+		.word   112;
 		addi	r0 = r0, 0;  # first instruction not executed
 
 		addi	r1 = r0, 1;
@@ -32,3 +32,6 @@
 		add	r17 = r10, r11;
 
 		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/gm_test.s b/asm/gm_test.s
index a74c9f118e5bd59aa292d0de1ecb4a0610ae4f5f..905fdbeb0ddf5a4771fbf9d69012c1e1c128b796 100644
--- a/asm/gm_test.s
+++ b/asm/gm_test.s
@@ -1,4 +1,4 @@
-	.word   88;
+	.word   100;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r20 = r0, 15;
 	sli	r20 = r20, 28;
@@ -49,3 +49,6 @@
 # 		swm	[r20 + 1] = r5;
 
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/hello.s b/asm/hello.s
index 25bd540dc487bc74beb32d28d39d007265a4fe28..232062e904f2e416764d59db13eb98752656017c 100644
--- a/asm/hello.s
+++ b/asm/hello.s
@@ -5,7 +5,6 @@
 #
 
 	.word   56;
-	addi	r0 = r0, 0;  # first instruction maybe not executed
 
         add     r7  = r0, 0xF0000900;
 	addi	r8 = r0, 1;
diff --git a/asm/inst_tests/ALU.s b/asm/inst_tests/ALU.s
index 9381c00a2a887693a959e8cf504e936859e8abb1..8ed825653b63f8bb40702f6908f706c5d06d03ff 100644
--- a/asm/inst_tests/ALU.s
+++ b/asm/inst_tests/ALU.s
@@ -1,5 +1,5 @@
 # This test case  tests the different instructions of the ALU
-	.word	116;
+	.word	128;
 	addi	r1 = r0, 10;
 	addi	r1 = r0, 10;
 	add 	r2 = r1, r0;
@@ -23,3 +23,6 @@
 	shadd	r8 = r1, 1; # r8 = 11
 	shadd2	r8 = r8, 1; # r8 = 23
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/branch.s b/asm/inst_tests/branch.s
index 35418620063e8ee99f084cc09628bc9d0f22b29e..8b52ad470f5e924130a315b727fbbf9dfc0f9698 100644
--- a/asm/inst_tests/branch.s
+++ b/asm/inst_tests/branch.s
@@ -1,5 +1,5 @@
 # This test case tests the branching
-	.word	48;
+	.word	60;
 	addi 	r1 = r0, 2;
 	addi 	r1 = r0, 2;
 	addi	r2 = r0, 2;
@@ -9,4 +9,7 @@ x2:	(p1) br x1;
 	add 	r2 = r1, r1;
 x1:	cmpeq 	p2 = r2, r1;
 	(p2) br x2;
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/datacache_load_store.s b/asm/inst_tests/datacache_load_store.s
index 16a01b21215f933d88fbe10d1818a79629834777..0a4bfedfad8650ed088b457e87f190bfeec2d36d 100644
--- a/asm/inst_tests/datacache_load_store.s
+++ b/asm/inst_tests/datacache_load_store.s
@@ -1,6 +1,6 @@
 # Test case for data cache load and store
 # Initialization begin
-	.word	224;
+	.word	236;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -57,4 +57,7 @@
 	sbc		[r0 + 2] = r1;
 	lhuc	r5 = [r0 + 2];
 	lbuc	r6 = [r0 + 2];
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/datacache_load_store2.s b/asm/inst_tests/datacache_load_store2.s
index bb0b4ef2dcb336b85029b766049f23eaf5533dff..57337fd2e06630a885f9775951ef4a19e1a90886 100644
--- a/asm/inst_tests/datacache_load_store2.s
+++ b/asm/inst_tests/datacache_load_store2.s
@@ -1,5 +1,5 @@
 # Test case for data cache load and store
-	.word	240;
+	.word	252;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -56,4 +56,7 @@
 	lbuc	r13 = [r2 + 3];
 	lhuc	r14 = [r2 + 0];
 	lhuc	r15 = [r2 + 1];
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/delay_slots.s b/asm/inst_tests/delay_slots.s
new file mode 100644
index 0000000000000000000000000000000000000000..b091f9dd55cc736872680389f3235f0d72afbd4a
--- /dev/null
+++ b/asm/inst_tests/delay_slots.s
@@ -0,0 +1,22 @@
+# This test case tests the delay slots of branches and loads
+# r1 holds the constant 8; r2 is cleared and then grows by r1 for each `add r2 = r2, r1` that executes.
+	.word	76;	# length header as in the sibling tests: 19 instruction words * 4 (unit assumed to be bytes - TODO confirm)
+	addi 	r1 = r0, 8;
+	add 	r2 = r0, r0;
+	br 		xA;
+	br 		xB;	# directly follows `br xA` - presumably sits in its delay slot (TODO confirm slot count)
+	br 		xC;	# second instruction after `br xA` - presumably also a delay slot
+xC:	add		r2 = r2, r1;
+xB:	add 	r2 = r2, r1;
+xA:	add 	r2 = r2, r1;
+	br 		xD;
+	add 	r2 = r0, r0;	# the three instructions between `br xD` and xD: which of them run depends on the branch delay
+	add 	r2 = r2, r1;
+	add 	r2 = r2, r1;
+xD: lwc		r4 = [r1+0];
+	add 	r5 = r4, r1;	# uses r4 immediately after the load - presumably the load delay slot
+	add 	r5 = r4, r1;	# same computation one instruction later
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/globalmem_load_store.s b/asm/inst_tests/globalmem_load_store.s
index f2d9ce7184601439d3ae9fc8be277d11abc25eb1..19157038513027ec68ab263867e93366ac47d8bb 100644
--- a/asm/inst_tests/globalmem_load_store.s
+++ b/asm/inst_tests/globalmem_load_store.s
@@ -1,6 +1,6 @@
 # Test case for global data memory load and store
 # Initialization begin
-	.word	224;
+	.word	236;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -57,4 +57,7 @@
 	sbm		[r0 + 2] = r1;
 	lhum	r5 = [r0 + 2];
 	lbum	r6 = [r0 + 2];
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/globalmem_load_store2.s b/asm/inst_tests/globalmem_load_store2.s
index 917adcc66bce291a89b014a0845c9ff73fda2c4a..f6ba600db0532e9b5b785d084582c75f849c5bba 100644
--- a/asm/inst_tests/globalmem_load_store2.s
+++ b/asm/inst_tests/globalmem_load_store2.s
@@ -1,5 +1,5 @@
 # Test case for global data memory load and store
-	.word	240;
+	.word	252;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -56,4 +56,7 @@
 	lbum	r13 = [r2 + 3];
 	lhum	r14 = [r2 + 0];
 	lhum	r15 = [r2 + 1];
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/localmem_load_store.s b/asm/inst_tests/localmem_load_store.s
index 0162332f66aeb39e1a4e0873e20b597b3f0bc026..02f9eb64e5e350f78cb021af1cd618a3bf28f62a 100644
--- a/asm/inst_tests/localmem_load_store.s
+++ b/asm/inst_tests/localmem_load_store.s
@@ -1,6 +1,6 @@
 # Test case for local memory load and store
 # Initialization begin
-	.word	224;
+	.word	236;
 	add 	r1 = r0, 0xE0000000;
 	add 	r1 = r0, 0xE0000000;
 	addi	r2 = r0, 4;
@@ -58,3 +58,6 @@
 	lhul	r5 = [r0 + 2];
 	lbul	r6 = [r0 + 2];
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/localmem_load_store2.s b/asm/inst_tests/localmem_load_store2.s
index b0b97b69a2f2a47085f100ccd11f5bc539db7c51..b5f80aeff3098c3de0fa06e1df49644b44adf913 100644
--- a/asm/inst_tests/localmem_load_store2.s
+++ b/asm/inst_tests/localmem_load_store2.s
@@ -1,5 +1,5 @@
 # Test case for local memory load and store
-	.word	240;
+	.word	252;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -56,4 +56,7 @@
 	lbul	r13 = [r2 + 3];
 	lhul	r14 = [r2 + 0];
 	lhul	r15 = [r2 + 1];
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/stackcache_load_store.s b/asm/inst_tests/stackcache_load_store.s
index f67aee2f4dccbfb33bf5ca31345174db5e44c96f..0bf78c321092bb2dd27559a9825732ce71393349 100644
--- a/asm/inst_tests/stackcache_load_store.s
+++ b/asm/inst_tests/stackcache_load_store.s
@@ -1,6 +1,6 @@
 # Test case for stack cache load and store
 # Initialization begin
-	.word	236;
+	.word	248;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -61,3 +61,6 @@
 	lhus	r5 = [r0 + 2];
 	lbus	r6 = [r0 + 2];
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/stackcache_load_store2.s b/asm/inst_tests/stackcache_load_store2.s
index c469453fcbb04ea92caec91b5bf0dbb7a04215e0..ae5c5faa90553f346525e80010dcf49dc46a7c0a 100644
--- a/asm/inst_tests/stackcache_load_store2.s
+++ b/asm/inst_tests/stackcache_load_store2.s
@@ -1,5 +1,5 @@
 # Test case for stack cache load and store
-	.word	252;
+	.word	264;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -60,3 +60,6 @@
 	lhus	r14 = [r2 + 0];
 	lhus	r15 = [r2 + 1];
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests_failing/delay_slots.s b/asm/inst_tests_failing/delay_slots.s
deleted file mode 100644
index 09c3798b28625e897c2e09ae4a0f1fe163805093..0000000000000000000000000000000000000000
--- a/asm/inst_tests_failing/delay_slots.s
+++ /dev/null
@@ -1,18 +0,0 @@
-# This test case tests the delay slots of branches and loads
-	addi 	r1 = r0, 5;
-	addi 	r1 = r0, 5;
-	addi	r2 = r0, r0;
-	br 		x1;
-	br 		x2;
-	br 		x3;
-x3:	add		r2 = r2, r1;
-x2:	add 	r2 = r2, r1;
-x1:	add 	r2 = r2, r1;
-	br 		x4;
-	add 	r2 = r0, r0;
-	add 	r2 = r2, r1;
-	add 	r2 = r2, r1;
-x4: lws		r4 = r1, 0;
-	add 	r5 = r4, r1;
-	add 	r5 = r4, r1;
-	halt;
diff --git a/asm/ld_st_test.s b/asm/ld_st_test.s
index 719f7900eae6574459c60bc024aa4ba8b22a220d..081cf58006a9f2e15b871435a71a7c4b4507e365 100644
--- a/asm/ld_st_test.s
+++ b/asm/ld_st_test.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # test if memory works fine
 
-	.word   60;
+	.word   72;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 256; # r1 = 256
 	addi    r29 = r0, 10;
@@ -18,3 +18,6 @@ x1:	swm	[r1 + 1] = r2;
 	cmpneq  p1 = r0, r29;
 (p1)	br	x1; #
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/ldst.s b/asm/ldst.s
index 2c18aacfb9374251c10efa59edc3305fe2777b53..c7fc4daa532431c7d3aab66eb68fc95dc431032d 100644
--- a/asm/ldst.s
+++ b/asm/ldst.s
@@ -1,7 +1,7 @@
 #
 # Basic load/store tests
 
-	.word   100;
+	.word   112;
 	addi	r0 = r0, 0;
 	addi	r1 = r0, 4;
 	add	r2 = r0, 0xabcd1234;
@@ -25,3 +25,6 @@
 	sbl	[r1 + 3] = r2;
 	lwl	r3 = [r1 + 0];
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/load_store_data_cache.s b/asm/load_store_data_cache.s
index 6bf0b67986b6e47abac2a01a6dacf456c3fa8e68..809efa16aebecced289d935fc64f75a9d9e1b810 100644
--- a/asm/load_store_data_cache.s
+++ b/asm/load_store_data_cache.s
@@ -4,7 +4,7 @@
 # MS: is SPM accessed with lxc/sxc? I thought it is via lxl/sxl.
 # SA: this test case is for data cache with lxc/sxc which is mapped to scratchpad at the moment
 
-	.word   188;
+	.word   200;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 256; # r1 = 256
 	addi	r2 = r0, 5;
@@ -55,3 +55,6 @@ x1:	sl	r31 = r31, r30;
 	cmpneq  p1 = r31, r29;
 (p1)	br	x1; #r21 equals to all upper bits 1 
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/load_store_scratchpad.s b/asm/load_store_scratchpad.s
index 63ab402fb3a4a171f455dab7cba3dbecbd357221..bf4f8aa83db9028701687b431838e4fd91d30b17 100644
--- a/asm/load_store_scratchpad.s
+++ b/asm/load_store_scratchpad.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # different ld/st from/to scratchpad memory
 
-	.word   184;
+	.word   196;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 256; # r1 = 256
 	addi	r2 = r0, 5;
@@ -57,3 +57,6 @@ x1:	shl	[r1 + 4] = r5; #
 	cmpneq  p1 = r31, r29;
 (p1)	br	x1; #r20 equals to all upper bits 1 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/load_store_scratchpad_new.s b/asm/load_store_scratchpad_new.s
index 0657665a5b04942bf529ba8566b1d22845b55f6d..f118cc8ad6d319230985a0fc0b1ce9cd4bb53b44 100644
--- a/asm/load_store_scratchpad_new.s
+++ b/asm/load_store_scratchpad_new.s
@@ -1,7 +1,7 @@
 #
 # Basic instructions test
 # different ld/st from/to scratchpad memory
-	.word   116;
+	.word   128;
 	addi    r1 = r0, 256;
 	sbl     [r1 + 4] = r0;
 	sbl     [r1 + 5] = r0;
@@ -32,3 +32,6 @@ x1:	sl	r31 = r31, r30;
 	cmpneq  p1 = r31, r29;
 (p1)	br	x1; #r20 equals to all upper bits 1 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/load_store_scratchpad_new2.s b/asm/load_store_scratchpad_new2.s
index 1dba98d391fbf9b065ab10b0379b951470a76b13..d2e1ef8811b9bde53bb6a675a948da2bdb6eb852 100644
--- a/asm/load_store_scratchpad_new2.s
+++ b/asm/load_store_scratchpad_new2.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # different ld/st from/to scratchpad memory
 
-	.word   228;
+	.word   240;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 256; # r1 = 256
 	addi	r2 = r0, 5;
@@ -65,3 +65,6 @@ x1:	shl	[r1 + 4] = r5; #
 	cmpneq  p1 = r31, r29;
 (p1)	br	x1; #r20 equals to all upper bits 1 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/load_store_stackcache.s b/asm/load_store_stackcache.s
index d14c872e4b1250862ee7403771fcb44a9e0e6f9d..ca44e93e1854540a9d0534463b71a443bcd6e66b 100644
--- a/asm/load_store_stackcache.s
+++ b/asm/load_store_stackcache.s
@@ -1,7 +1,9 @@
 #
 # Expected Result: 
 #
-		.word   48;
+		.word   68;
+		addi    r3 = r0, 0x100;
+		mts     s6 = r3;
 		addi	r5 = r0, 5;                
 #		lwm     r1  = [r31 + 0];
                 sres     4;
@@ -16,4 +18,7 @@
 #                lhs     r3  = [r0 + 0]  ||     lbs     r4  = [r0 + 1];
 #               lhus    r4  = [r0 + 0]  ||     lbus    r6  = [r0 + 1];
 #                sfree   1;
-                halt;
+		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/load_use.s b/asm/load_use.s
index ee4ad2bf05c9a11046ebbf7d135243d23db40b51..10d55fab9ea6e77dfefb2d3f9ffd1444de00e6b9 100644
--- a/asm/load_use.s
+++ b/asm/load_use.s
@@ -5,7 +5,9 @@
 #   first load (in the load use delay slot) will get old value
 #   second load the correct value from memory
 #
-	.word   52;
+	.word   72;
+	addi    r3 = r0, 0x100;
+	mts     s6 = r3;
 	sres	10;
 	addi	r1 = r0, 4;
 	addi	r2 = r0, 2;
@@ -15,10 +17,10 @@
 	sws	[r1+4] = r2;
 	lws	r3 = [r1+4];
 	addi    r0 = r0, 0;	# This is the delay slot
-# The following behaves different in HW and the simulator.
-# We have not yet defined the semantics of using the value
-# in the delay slot.
-#	add	r4 = r0, r3;	# that one is in the delay slot and will add 3
+	add	r4 = r0, r3;	# that one is in the delay slot and will add 3
 	add	r5 = r0, r3;	# that one shall add 2
 	add	r1 = r0, r5;	# that one shall now be 2
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/mfsmts.s b/asm/mfsmts.s
index 4a7236d8dae8ff617cab34ec93adb0c35c8709a6..d8cfe726c127302d7d5e7fa8d27007acdfaca1b9 100644
--- a/asm/mfsmts.s
+++ b/asm/mfsmts.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 #
 
-	.word   44;
+	.word   56;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 15; # r1 = 15
         mts     s6  = r1;
@@ -13,3 +13,6 @@
 	mts	s6 = r7;
 	mfs	r10 = s6;
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/minimal.s b/asm/minimal.s
index 97bb8eadf783ef3ef7ab3492bfe86828ea478f38..8f2c6067cb944e9e0eaee26576c26171122392ff 100644
--- a/asm/minimal.s
+++ b/asm/minimal.s
@@ -2,8 +2,12 @@
 # A short as possible assembler example
 #
 
-	.word   16;
+	.word   36;
 	addi	r1 = r0, 255;  # first instruction maybe not executed
 	addi	r2 = r0, 1;
 	addi	r3 = r0, 2;
 	add	r4 = r2, r3;
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/mulpipe.s b/asm/mulpipe.s
index efc745498ca3d344ddc55fad136a965ae71c1f62..dd37a21b4eda3887df63553329dc11e64abe3165 100644
--- a/asm/mulpipe.s
+++ b/asm/mulpipe.s
@@ -2,7 +2,7 @@
 # Test the multiplication pipeline
 #
 
-	.word   88;
+	.word   96;
 	addi	r1 = r0, 1;
 	addi	r2 = r0, 2;
 	addi	r3 = r0, 3;
@@ -19,3 +19,6 @@
 	mul r1, r7 || mfs r13 = s2;
 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/pred_issue.s b/asm/pred_issue.s
index 52707cfba455c2cf289b6db80f8ae4bd3ffc0590..c8cf44b65b357f1bc239245891ebd7138edbb735 100644
--- a/asm/pred_issue.s
+++ b/asm/pred_issue.s
@@ -1,7 +1,7 @@
 
 # Try to extract the issue Sahar observed,
 # but this works - al variations of r1/r2 lt, eq, gt tried
-	.word   36;
+	.word   48;
 	addi	r1 = r0, 2;
 	addi	r2 = r0, 1;
 	cmple   p4 = r1, r2;
@@ -10,3 +10,6 @@
 	xor     r16 = r15, r16; # r16 = 1
 (!p4)   nor     r16 = r16, r15;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/predicate.s b/asm/predicate.s
index 0a1749f6f1c5115b4035c5f266b18ff2204fb3ce..85f29412615705b6e0c4a9be08716e0a3db32871 100644
--- a/asm/predicate.s
+++ b/asm/predicate.s
@@ -1,7 +1,7 @@
 #
 # Test predicates and branch
 #
-	.word   588;
+	.word   600;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi	r1 = r0, 2;
 	addi	r2 = r0, 2;
@@ -50,6 +50,10 @@
 	cmplt   p6 = r9, r3;
 (p6)	add     r15 = r9, r2;
 	halt;
+	nop;
+	nop;
+	nop;
+
 # ALU instructions
 	addi    r3 = r0, 3;
 	addi    r1 = r0, 5;
diff --git a/asm/predicated_echo.s b/asm/predicated_echo.s
index 4d60da02365302c7d8db8f04e5f1c50f09cec545..206a154e0ccd1be79da3a49ab144f505eb400844 100644
--- a/asm/predicated_echo.s
+++ b/asm/predicated_echo.s
@@ -5,7 +5,7 @@
 # SA: this tests predicated ld/st which was a bug
 #
 
-		.word   116;
+		.word   128;
 x0:		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r5 = r0, 15;
 		sli	r5 = r5, 28;
@@ -41,4 +41,7 @@ x2:		lwl     r10  = [r5 + 0];
                 addi    r0  = r0 , 0;
                 addi    r0  = r0 , 0;
                 halt;
+		nop;
+		nop;
+		nop;
 
diff --git a/asm/predicated_echo_reverse.s b/asm/predicated_echo_reverse.s
index 20fba41141c880dcbdb71c0cc5cf39955830f567..d2a833206c94c2980d31d0026508eed683453b82 100644
--- a/asm/predicated_echo_reverse.s
+++ b/asm/predicated_echo_reverse.s
@@ -5,7 +5,7 @@
 # SA: this test predicated ld/st which was a bug
 #
 
-		.word   224;
+		.word   236;
 		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r22 = r0, 0;
 		addi    r8 = r0, 0;
@@ -67,5 +67,7 @@ x5:		lwl     r10  = [r5 + 0];
 		br      x0;
 		addi	r8 = r0, 0;
 		addi    r22 = r0, 0;		
-                halt;
-
+		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/predicated_predicate.s b/asm/predicated_predicate.s
index 96d0c92ec2210a07d22c815d090b3cc834b0bd64..62c750e5e20a40a831676d9f58b4732f614eb7cb 100644
--- a/asm/predicated_predicate.s
+++ b/asm/predicated_predicate.s
@@ -1,7 +1,7 @@
 #
 # Test predicates and branch
 #
-	.word   168;
+	.word   180;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi	r1 = r0, 2;
 	addi	r2 = r0, 2;
@@ -50,3 +50,6 @@
 	cmplt   p6 = r9, r3;
 (p6)	add     r15 = r9, r2;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/predicates.s b/asm/predicates.s
deleted file mode 100644
index bf0db0eb2ecfe3882c3fd3a7506c7b60d1e41463..0000000000000000000000000000000000000000
--- a/asm/predicates.s
+++ /dev/null
@@ -1,16 +0,0 @@
-#
-# Test predicates and branch
-#
-	.word   44;
-x0:	addi	r0 = r0, 0;  # first instruction not executed
-
-	addi	r1 = r0, 2;
-	addi	r2 = r0, 2;
-
-	cmpeq   p1  = r1, r2;
-	(p1) br	x0;
-	addi	r3 = r0, 3; # somewhere here R1 gets set to 4 !!!
-	addi	r4 = r0, 4;
-	addi	r5 = r0, 5;
-	addi	r6 = r0, 6;
-	halt;
diff --git a/asm/predication.s b/asm/predication.s
index 4b8066ba6d488c7a9c84fd95b1f5371dbd67faff..035e4757a2e5e1a6cb12a85ff1e9f624d03232e8 100644
--- a/asm/predication.s
+++ b/asm/predication.s
@@ -1,7 +1,7 @@
 #
 # Test predicates and branch
 #
-	.word   256;
+	.word   268;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi	r1 = r0, 2;
 	addi	r2 = r0, 2;
@@ -88,4 +88,6 @@ pb4:		subi	r1 = r1, 1;
 # Done
 #############
 	halt;
-
+	nop;
+	nop;
+	nop;
diff --git a/asm/scratchpad.s b/asm/scratchpad.s
index c8386ac40aba9f0b35b87bb684a65f7be53f036f..cf8c3c9cd73180d2a267ff2a75a827059a10e82f 100644
--- a/asm/scratchpad.s
+++ b/asm/scratchpad.s
@@ -2,7 +2,7 @@
 # Simple test of a SPM
 #
 
-	.word   76;
+	.word   88;
 	addi	r1 = r0, 255;  # first instruction maybe not executed
 	addi	r1 = r0, 32;
 	addi	r2 = r0, 5;
@@ -20,3 +20,6 @@
 	lwl	r6  = [r1 + 4];
 	lwl	r7  = [r1 + 8];
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/scratchpad_store.s b/asm/scratchpad_store.s
deleted file mode 100644
index 4ec785333f60d3c578e704181590e4cf16b97a68..0000000000000000000000000000000000000000
--- a/asm/scratchpad_store.s
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# Basic instructions test
-# different ld/st from/to scratchpad memory
-# this is a deprecated version there are other tests on scratchpad
-
-	.word   32;
-	addi	r1 = r0, 255;  # first instruction not executed
-	addi	r1 = r0, 255; # r1 = 255
-	addi	r2 = r0, 5;
-	addi    r4 = r0, 4;
-	swl	[r1 + 1] = r2; # memory address 259 (255 + (1 sl 2)) = 5 
-#	lwl	r10  = [r1 + 1]; # register(10) = 5
-	addi	r2 = r0, 10;
-	shl	[r1 + 3] = r2; # memory address 261 = 10
-#	lhl	r11  = [r1 + 3]; # register(11) = 10
-	sbl	[r1 + 3] = r4; # memory address 258 = 4
-#	lbl
-#	lhul
-#	lbul	
-#	halt; 
diff --git a/asm/simple.s b/asm/simple.s
index 91f6978817ae16eef8303d5653e59411c4bc35e2..5275b6a3d49c498f82f1835c8aa6b5fc9488a257 100644
--- a/asm/simple.s
+++ b/asm/simple.s
@@ -2,7 +2,7 @@
 # Very simple code to get stuff running on the Chisel pipeline.
 #
 
-	.word   76;
+	.word   88;
 	addi	r0 = r0, 0;  # first instruction not executed
 
 	addi	r1 = r0, 1;
@@ -29,3 +29,6 @@
 #	sl	r4 = r1, 3;
 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/spill.s b/asm/spill.s
index fd4f71f42f12a86e1acabe9ac65fc4e18de78cd4..148560b8da23672aa52771ad8af658fe838cbbee 100644
--- a/asm/spill.s
+++ b/asm/spill.s
@@ -1,7 +1,7 @@
 #
 # Expected Result: 
 #
-		.word   168;
+		.word   180;
 		addi	r5 = r0, 0;                
 		addi    r1 = r0, 0;
 		addi	r10 = r0, 64;
@@ -42,4 +42,7 @@ l2:		sws	[r16 + 0] = r14;
 		sens    64;
 		addi    r11 = r0, 1;# check if stall works in case of spill
 		addi    r12 = r0, 2;# check if stall works in case of spill
-                halt;
+		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/stackcache.s b/asm/stackcache.s
index a9aa467e56e4d04b8c71aa4066238bf68b6ec8a8..d66550e9c0c75c1ab3c37a883d9fc32b6c0596c5 100644
--- a/asm/stackcache.s
+++ b/asm/stackcache.s
@@ -2,7 +2,7 @@
 # Just a few basic instructions to watch the pipeline going in ModelSim
 #
 
-	.word   44;
+	.word   56;
 	addi	r1 = r0, 255;  # first instruction not executed
 	sres     4; # do we reserve to store? so there should be the same number of stores after sres?
         sws     [r0 + 0] = r1;
@@ -13,3 +13,6 @@
 	addi    r15 = r0, 1;#just to have some instruction to check stall
 	addi    r16 = r0, 0;#just to have some instruction to check stall
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/stall.s b/asm/stall.s
deleted file mode 100644
index 93b0225554a4bf45ada2cb575bb05a7330f37696..0000000000000000000000000000000000000000
--- a/asm/stall.s
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# Basic instructions test
-#
-
-	.word   24;
-	addi	r1 = r0, 255;  # first instruction not executed
-	waitm;
-	addi	r1 = r1, 5;
-	addi	r1 = r1, 10;
-	halt; 
diff --git a/asm/test.s b/asm/test.s
index 20664b19e8253e6c8db8aff4f3c770bb334be7ed..b57137894ab7a072b320a5cfc5b174354f97bbfe 100644
--- a/asm/test.s
+++ b/asm/test.s
@@ -6,10 +6,13 @@
 
 # TODO: looks like the UART is in memory address 0....
 
-	.word   32;
+	.word   44;
 	addi	r0 = r0, 0;  # first instruction not executed
 	add	r1 = r0, 0xf0000800;
 	addi	r2 = r0, 42; # '*'
 x1:	swl	[r1 + 1] = r2;
 	br	x1;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/test_asm.s b/asm/test_asm.s
index c16760e79181c717edf47ecf46f94cd066094b89..c606a0bb59bcfcc2bc34eb774508cbaf3baa78b7 100644
--- a/asm/test_asm.s
+++ b/asm/test_asm.s
@@ -2,7 +2,7 @@
 # Minimum program to test the new assembler
 #
 
-	.word   28;
+	.word   48;
 	addi	r1 = r0, 255;  # first instruction not executed
 
 label2:	addi	r1 = r0, 15;
@@ -12,3 +12,6 @@ label2:	addi	r1 = r0, 15;
 	add	r4 = r2, r3;
 	nor 	r6 = r1, r4		||	and 	r7 = r1, r5;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/test_branch.s b/asm/test_branch.s
deleted file mode 100644
index 994ff2e3fe0828ef9bc3ef907ea3cd897a1a0c85..0000000000000000000000000000000000000000
--- a/asm/test_branch.s
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# This is a simple output of a single character on the UART
-#
-# TODO: maybe this should just switch a LED to see the result.
-#
-
-# TODO: looks like the UART is in memory address 0....
-# update or drop
-
-	.word   68;
-	addi	r0 = r0, 0;  # first instruction not executed
-	addi	r1 = r0, 0;
-	addi	r2 = r0, 42; # '*'
-#x2:	swm	[r1 + 1] = r2;
-	addi	r1   = r0 , 2;
-	addi	r3 = r19 , 1;
-x1:	lwm     r10  = [r5 + 0];
-	nop;
-        and     r11  = r10 , r3;
-	cmpneq  p1 = r11 , r1;
-	nop;
-	nop;
-	nop;
-	(p1) 	bc x1;
-	nop;
-	nop;
-	swm	[r7 + 1] = r2;
-#	br	x2;
-	halt
diff --git a/asm/test_case_plan.s b/asm/test_case_plan.s
index 70bb845f314041e8f321fa3c3ba87111128deb59..584c018e5ab831375c60130d200e5460d285cbc4 100644
--- a/asm/test_case_plan.s
+++ b/asm/test_case_plan.s
@@ -44,7 +44,7 @@
 # Function calls
 # 
 
-		.word   120;
+		.word   132;
 		addi	r0 = r0, 0;
 
 add:	addi	r1 = r0, 5; 
@@ -402,3 +402,6 @@ cmpneq: cmpneq	p2 = r1, r3;
 # br exit; # Should jump to correct exit label, using halt or loop
 
 exit: 	halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/test_mfs.s b/asm/test_mfs.s
index 4d576e03bc2d6c15b641195736ca8631b18b56d1..d0abea57b33318dde8abf0c2b469318f7e134ccc 100644
--- a/asm/test_mfs.s
+++ b/asm/test_mfs.s
@@ -2,7 +2,7 @@
 # Testing hazard with mfs
 #
 
-	.word   72;
+	.word   84;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi    r1 = r0, 4;
 	addi    r2 = r0, 6;
@@ -24,3 +24,6 @@
 	mfs     r7 = s2;     # r7 should be 8
 	addi    r8 = r7, 0;  # r8 should be 8
 	halt;		      
+	nop;
+	nop;
+	nop;
diff --git a/asm/test_mts.s b/asm/test_mts.s
index dcd05f47366f94ab19dfe916bba1ebad2e4946fc..a0fdbc20eddd42f6d32dd35d837c0cabd5f03891 100644
--- a/asm/test_mts.s
+++ b/asm/test_mts.s
@@ -2,7 +2,7 @@
 # Testing hazard with mts
 #
 
-	.word   60;
+	.word   72;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi    r1 = r0, 1;
 	addi    r2 = r0, 5;
@@ -17,3 +17,6 @@
 	(!p1) swc [r2 + 2] = r1; # should not be executed, will cause unaligned access
 	mts     s0 = r1;       # sets p1 = false
 	halt;		       # r3 should be 7, r4 should be 9
+	nop;
+	nop;
+	nop;
diff --git a/asm/test_old.s b/asm/test_old.s
deleted file mode 100644
index 54542dab71f1e2ec1b645d2e57b2f6652b0c4f24..0000000000000000000000000000000000000000
--- a/asm/test_old.s
+++ /dev/null
@@ -1,16 +0,0 @@
-label:
-	p1 br lab1
-	addi r1, 10 -> r13 // hhj
-	p1 addi r1, 10 -> r13 // hhj
-	sub r2, r3 -> r2
-	p5 sub r2, r3 -> r31
-	br label
-lab1:
-	br lab1
-	p1 cmp r1 != r2 -> p3
-
-
-// That's not Patmos, but Leros assembler....
-// Register definitions
-
-
diff --git a/asm/test_sdram.s b/asm/test_sdram.s
index 977ae966521773681861575c34fd026ebd5bdbea..0460a7deb9c9b3b6977e0ab96ced66f6b9e8b12c 100644
--- a/asm/test_sdram.s
+++ b/asm/test_sdram.s
@@ -13,7 +13,7 @@
 # r11==read/write test bound (address limit)
 # r12==memory read error count
 
-	.word	176;
+	.word	188;
 
 	addi	r0 = r0, 0;  # first instruction not executed         	#0
 begin:	addi	r12 = r0, 0; # r12==error count                       	#1
@@ -35,9 +35,9 @@ poll_stdin: lwl     r1 = [r5 + 0];                                     	#7
 
 	lwl     r1 = [r5 + 1];                                     	#14
 
-	addi    r21 = r0, 65; 'A'                                  	#15
+	addi    r21 = r0, 65; # 'A'                                  	#15
 	addi    r22 = r6, 0;                                       	#16
-	addi	r24 = r0, 80; 16 chars from 'A'                       	#17
+	addi	r24 = r0, 80; # 16 chars from 'A'                       	#17
 
 write_word: swl	[r22 + 0] = r21;                                       	#18
 	cmpneq	p1 = r21, r24;                                      	#19
@@ -71,4 +71,7 @@ poll_stdout: lwl     r1 = [r5 + 0];                                     	#27
 		addi r0 = r0, 0;                                          	#41
 		addi r0 = r0, 0;                                          	#42
 
-halt;                                                       	#43
+        halt;                                                       #43
+        nop;                                                       	#44
+        nop;                                                       	#45
+        nop;                                                       	#46
\ No newline at end of file
diff --git a/asm/test_sdram2.s b/asm/test_sdram2.s
index 1c0a02f6c8133055bdd9e386cd43b22c3b846103..801d34a9b479e46ac8a5e8de40e90feb7bbb8455 100644
--- a/asm/test_sdram2.s
+++ b/asm/test_sdram2.s
@@ -14,7 +14,7 @@
 # r11==read/write test bound (address limit)
 # r12==memory read error count
 
-	.word	336;
+	.word	348;
 
 	addi	r0 = r0, 0;  # first instruction not executed         	#0
 begin:	addi	r5 = r0, 15;                                          	#1
@@ -52,7 +52,7 @@ poll_stdin: lwl     r1 = [r5 + 0];                                     	#6
 	addi	r11 = r0, 1;	# r11 == test_limit                      	#14
 	sli	r11 = r11, 20;                                         	#15
 	addi	r12 = r0, 0; # r12==error count                       	#16
-	addi	r1= r0, 87; 'W'                                       	#17
+	addi	r1= r0, 87; # 'W'                                       	#17
 	swl     [r5 + 1] = r1;                                     	#18
 
 #	Set the value in the cache line (we don't care if it is loaded as we just use one word)
@@ -80,7 +80,7 @@ poll_sdram_ready: lwl     r1  = [r6 + 17]; # sdram.status                    	#2
                 addi    r0  = r0 , 0;                       	#34
                 addi    r0  = r0 , 0;                       	#35
 
-#read_init: addi	r1= r0, 82; 'R'                                       	#36
+#read_init: addi	r1= r0, 82; # 'R'                                       	#36
 	swl     [r5 + 1] = r1;                                     	#37
 	addi	r10 = r0, 0;	# addr_cnt <= 0                          	#38
 #	 r10 should have the value from before
@@ -102,7 +102,7 @@ poll_sdram_ready2: lwl     r1  = [r6 + 17]; # sdram.status                    	#
 	addi    r1  = r10 , 1;  # val = addr+1 (use the same mod as during write)	#49
 	cmpeq  p1 = r1, r2; # should be the same                   	#50
 	(p1)	br no_error; #l:+3                                         	#51
-        (!p1)   addi    r1  = r0 , 69;  'E'                 	#52
+        (!p1)   addi    r1  = r0 , 69; # 'E'                 	#52
 	(!p1)	addi    r12 = r12, 1; #error_cnt++                   	#53
 		swl     [r5 + 1] = r1; # we write to UART without pooling for ready here,	#54
 
@@ -148,4 +148,7 @@ poll_stdout2: lwl     r1 = [r5 + 0];                                     	#71
 	br begin;                                                      	#91
 		addi    r0 = r0, 0;                                       	#92
 		addi    r0 = r0, 0;                                       	#93
-halt;                                                       	#94
+	halt;                                                       	#94
+	nop;
+	nop;
+	nop;
diff --git a/asm/test_sdram3.s b/asm/test_sdram3.s
index ac21422958e7fb6a8528185d061257eaf9ebe08b..6f379b13dfd933230646d63d24bdcd8289cffd08 100644
--- a/asm/test_sdram3.s
+++ b/asm/test_sdram3.s
@@ -17,7 +17,7 @@
 # r11==read/write test bound (address limit)
 # r12==memory read error count
 
-	.word	408;
+	.word	420;
 
 	addi	r0 = r0, 0;  # first instruction not executed         	#0
 begin: addi	r5 = r0, 15;                                          	#1
@@ -56,7 +56,7 @@ poll_stdin: lwl     r1 = [r5 + 0];                                     	#6
 	addi	r11 = r0, 1;	# r11 == test_limit                      	#14
 	sli	r11 = r11, 26;                                          	#15
 	addi	r12 = r0, 0; # r12==error count                       	#16
-	addi	r1= r0, 87; 'W'                                       	#17
+	addi	r1= r0, 87; # 'W'                                       	#17
 	swl     [r5 + 1] = r1;                                     	#18
 	subi	r9 = r0, 64;	# r9 == mask (not 63)                    	#19
 
@@ -97,7 +97,7 @@ skip_store: addi	r10 = r10, 4;                                         	#40
 
 
 #read_init:
-	addi	r1= r0, 82; 'R'                                       	#45
+	addi	r1= r0, 82; # 'R'                                       	#45
 	swl     [r5 + 1] = r1;                                     	#46
 	addi	r10 = r0, 0;	# addr_cnt <= 0                          	#47
 
@@ -128,7 +128,7 @@ skip_load: andi	r2  = r10, 63;  # offset                              	#61
 
 	cmpeq	p1 = r1, r2; # should be the same                    	#65
 	(p1)	br no_error;                                         	#66
-        (!p1)   addi    r1  = r0 , 69;  'E'                 	#67
+        (!p1)   addi    r1  = r0 , 69; # 'E'                 	#67
 	(!p1)	addi    r12 = r12, 1; #error_cnt++                   	#68
 	swl     [r5 + 1] = r1; # we write to UART without pooling for ready here,	#69
 
@@ -173,4 +173,7 @@ poll_stdout3: lwl     r1 = [r5 + 0];                                     	#86
 	br begin;                                                      	#97
 		addi    r0 = r0, 0;                                       	#98
 		addi    r0 = r0, 0;                                       	#99
-halt;                                                       	#100
+	halt;                                                       	#100
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/ALU_forwarding.s b/asm/vliw_tests/ALU_forwarding.s
index 5f7d7465364a65b6bb4b88cee3ceb3774f2600e1..59e0c0ce808b69c23a8d36ee539e1c721a9d3876 100644
--- a/asm/vliw_tests/ALU_forwarding.s
+++ b/asm/vliw_tests/ALU_forwarding.s
@@ -1,5 +1,5 @@
 # Test case for forwarding of ALU instructions
-	.word	124;
+	.word	136;
 	addi	r4 = r0, 0;
 	addi	r4 = r0, 0;
 	addi	r1 = r0, 2		||	addi	r1 = r0, 5;
@@ -18,3 +18,6 @@
 	sl		r16 = r10, r1	||	sr 		r17 = r10, r1;
 	sra		r19 = r12, r1	||	srai 	r18 = r12, 5;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/ALU_forwarding2.s b/asm/vliw_tests/ALU_forwarding2.s
index 1565b5976bc99da61d25f742041eb9f48dab40f1..3004ea003b9c734423c5a48f9b9ee790958e7687 100644
--- a/asm/vliw_tests/ALU_forwarding2.s
+++ b/asm/vliw_tests/ALU_forwarding2.s
@@ -1,5 +1,5 @@
 # Test case for forwarding of ALU instructions
-	.word	68;
+	.word	80;
 	addi	r4 = r0, 0;
 	addi	r4 = r0, 0;
 	addi	r1 = r0, 2		||	addi	r1 = r0, 5;
@@ -16,3 +16,6 @@
 	shadd	r14 = r12, r1	||	shadd2	r15 = r12, r1;
 #	shaddi	r14 = r12, r1;	||	shadd2i	r15 = r12, r1;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/add.s b/asm/vliw_tests/add.s
index a75b5322104eae76682e75736d8a01edec669c3e..3b315e275de83bc69f610894357f7ae1c649a7c9 100644
--- a/asm/vliw_tests/add.s
+++ b/asm/vliw_tests/add.s
@@ -1,7 +1,10 @@
 # Test case for add instruction
-	.word	40;
+	.word	52;
 	addi	r1 = r0, 2		||	addi	r1 = r0, 5;
 	add 	r2 = r1, r1		||	add 	r3 = r1, r1;
 	add 	r4 = r1, r1		||	add 	r5 = r1, r1;
 	add 	r6 = r1, r1		||	add 	r7 = r1, r1;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/addi.s b/asm/vliw_tests/addi.s
index 545093f651dfc4e8428c8160a461fb8a420d1dca..3060b8058c893a1781621abde42b0d2d7885270e 100644
--- a/asm/vliw_tests/addi.s
+++ b/asm/vliw_tests/addi.s
@@ -1,7 +1,10 @@
 # Test case for addi instruction
-	.word	40;
+	.word	52;
 	addi	r1 = r0, 2	||	addi	r1 = r0, 5;
 	addi	r2 = r1, 3	||	addi	r3 = r1, 3;
 	addi	r4 = r1, 4	||	addi	r5 = r1, 4;
 	addi	r6 = r1, 5	||	addi	r7 = r1, 5;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/immediate.s b/asm/vliw_tests/immediate.s
index 28859fd231de3a7620602dddb1fff962da2449ce..593f71240eb309cc49907e88aa32a3692fb06bc4 100644
--- a/asm/vliw_tests/immediate.s
+++ b/asm/vliw_tests/immediate.s
@@ -1,5 +1,5 @@
 # Test case for add instruction
-	.word	52;
+	.word	64;
 	addi	r0 = r0, 0;
 	addi	r1 = r0, 2		||	addi	r1 = r0, 5;
 	subi 	r2 = r1, 2		||	addi 	r3 = r1, 347;
@@ -7,3 +7,6 @@
 	nor 	r6 = r1, r4		||	and 	r7 = r1, r5;
 	add 	r7 = r0, 1234567;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/predicate_forwarding.s b/asm/vliw_tests/predicate_forwarding.s
index 81be1fc5e64843e1f383209001ecf65ea6c65fb9..37e450e719b774440110a6d209767337038ce329 100644
--- a/asm/vliw_tests/predicate_forwarding.s
+++ b/asm/vliw_tests/predicate_forwarding.s
@@ -1,5 +1,5 @@
 # Test case for predicate forwarding.
-	.word	84;
+	.word	96;
 	addi	r0 = r0, 0;
 	addi	r1 = r0, 5		||	addi	r2 = r0, 10;
 	por		p1 = p0, p0 	||	pxor	p2 = p0, p0;
@@ -12,3 +12,6 @@
 	pxor	p3 = p5, p6 	||	por	p4 = p5, p6;
 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/wr_ispm.s b/asm/wr_ispm.s
index 493d3ce49b254f26b8dfde831ab6f9149b03fc50..6a4d3b8d99abfa41db84a6a40759252bbb887b54 100644
--- a/asm/wr_ispm.s
+++ b/asm/wr_ispm.s
@@ -1,7 +1,7 @@
 #
 # Test write of instruction scratchpad (ISPM)
 #
-# ISPM is currently mapped to 0x10000000
+# ISPM is currently mapped to 0x10000
 #
 # Author: Martin Schoeberl (martin@jopdesign.com)
 #
@@ -12,9 +12,9 @@
 #140123
 #160abc
 
-	.word   56;
+	.word   68;
 	addi	r0 = r0, 0;
-	add	r1 = r0, 0x10000000;
+	add	r1 = r0, 0x10000;
 	add	r2 = r0, 0x140123;
 	swl	[r1 + 0] = r2;
 	add	r2 = r0, 0x160abc;
@@ -22,4 +22,7 @@
 	add	r2 = r0, 0xabcd1234;
 	swl	[r1 + 2] = r2;
 
-	halt; 
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/york_loader.s b/asm/york_loader.s
index 8098a8a6b0377c6ef9ca5080b6eb210ea1997d30..9b8f890088a5532f0902c967dda17e53dbeef53a 100644
--- a/asm/york_loader.s
+++ b/asm/york_loader.s
@@ -1,7 +1,7 @@
 # This is a very simple loader. It just jumps to 0x8000, where
 # the actual code is located
     .word 40;
-top: add r30 = r0, 0x0;
+top: addi r30 = r0, 0x0;
     add r16 = r0, 0x40004;
     nop;
     nop;
@@ -9,5 +9,3 @@ top: add r30 = r0, 0x0;
     nop;
     nop;
     nop;
-    nop;
-    nop;
\ No newline at end of file
diff --git a/testsuite/run.sh b/testsuite/run.sh
index ebd8c8f210179c2d4a8f7ac740740eb52cd8011f..a69914d4402c53732ca17576894e8a2e0e4e7437 100755
--- a/testsuite/run.sh
+++ b/testsuite/run.sh
@@ -19,7 +19,7 @@ for td in ${test_dirs}; do
 done
 cd ..
 
-tests="basic simple test ldst load_store_stackcache ALU ALUi ALUl dual_forwarding scratchpad dual_even_odd_address forward_issue load_store_data_cache load_store_scratchpad load_store_scratchpad_new load_store_scratchpad_new2 predication fetch_double  branch predicate predicated_predicate call callr mulpipe"
+tests="basic minimal simple ALU ALUi ALUl compare dual_forwarding fetch_double dual_even_odd_address forward_issue ldst load_use load_store_stackcache spill load_store_data_cache load_store_scratchpad load_store_scratchpad_new load_store_scratchpad_new2 scratchpad predication branch predicate predicated_predicate pred_issue call callr mulpipe mfsmts test test_asm test_case_plan test_mfs test_mts"
 tests+=${test_disc}
 
 not_working_chsl="none"