From 9ab692c8ad54b08fc904fbecc1551a0da620e22a Mon Sep 17 00:00:00 2001
From: Wolfgang Puffitsch <hausen@gmx.at>
Date: Mon, 3 Mar 2014 19:56:22 +0100
Subject: [PATCH] asm: Adapt assembly test cases for new assembler, delete
 deprecated assembly files, add some test cases to test suite.

---
 asm/ALU.s                               |  7 ++-
 asm/ALUi.s                              |  5 +-
 asm/ALUl.s                              |  7 ++-
 asm/basic.s                             |  5 +-
 asm/blink.s                             |  5 +-
 asm/boot_loader.s                       | 78 -------------------------
 asm/branch.s                            |  5 +-
 asm/bug_inverse_predicate_branch.s      |  2 +-
 asm/call.s                              |  7 ++-
 asm/callr.s                             |  5 +-
 asm/compare.s                           |  7 ++-
 asm/dual_even_odd_address.s             |  7 ++-
 asm/dual_forwarding.s                   |  7 ++-
 asm/echo.s                              |  5 +-
 asm/echo_final.s                        | 42 -------------
 asm/echo_scratchpad1.s                  |  6 +-
 asm/fetch_double.s                      |  5 +-
 asm/forward_issue.s                     |  5 +-
 asm/gm_test.s                           |  5 +-
 asm/hello.s                             |  1 -
 asm/inst_tests/ALU.s                    |  5 +-
 asm/inst_tests/branch.s                 |  7 ++-
 asm/inst_tests/datacache_load_store.s   |  7 ++-
 asm/inst_tests/datacache_load_store2.s  |  7 ++-
 asm/inst_tests/delay_slots.s            | 17 ++++++
 asm/inst_tests/globalmem_load_store.s   |  7 ++-
 asm/inst_tests/globalmem_load_store2.s  |  7 ++-
 asm/inst_tests/localmem_load_store.s    |  5 +-
 asm/inst_tests/localmem_load_store2.s   |  7 ++-
 asm/inst_tests/stackcache_load_store.s  |  5 +-
 asm/inst_tests/stackcache_load_store2.s |  5 +-
 asm/inst_tests_failing/delay_slots.s    | 18 ------
 asm/ld_st_test.s                        |  5 +-
 asm/ldst.s                              |  5 +-
 asm/load_store_data_cache.s             |  5 +-
 asm/load_store_scratchpad.s             |  5 +-
 asm/load_store_scratchpad_new.s         |  5 +-
 asm/load_store_scratchpad_new2.s        |  5 +-
 asm/load_store_stackcache.s             |  9 ++-
 asm/load_use.s                          | 12 ++--
 asm/mfsmts.s                            |  5 +-
 asm/minimal.s                           |  6 +-
 asm/mulpipe.s                           |  5 +-
 asm/pred_issue.s                        |  5 +-
 asm/predicate.s                         |  6 +-
 asm/predicated_echo.s                   |  5 +-
 asm/predicated_echo_reverse.s           |  8 ++-
 asm/predicated_predicate.s              |  5 +-
 asm/predicates.s                        | 16 -----
 asm/predication.s                       |  6 +-
 asm/scratchpad.s                        |  5 +-
 asm/scratchpad_store.s                  | 20 -------
 asm/simple.s                            |  5 +-
 asm/spill.s                             |  7 ++-
 asm/stackcache.s                        |  5 +-
 asm/stall.s                             | 10 ----
 asm/test.s                              |  5 +-
 asm/test_asm.s                          |  5 +-
 asm/test_branch.s                       | 29 ---------
 asm/test_case_plan.s                    |  5 +-
 asm/test_mfs.s                          |  5 +-
 asm/test_mts.s                          |  5 +-
 asm/test_old.s                          | 16 -----
 asm/test_sdram.s                        | 11 ++--
 asm/test_sdram2.s                       | 13 +++--
 asm/test_sdram3.s                       | 13 +++--
 asm/vliw_tests/ALU_forwarding.s         |  5 +-
 asm/vliw_tests/ALU_forwarding2.s        |  5 +-
 asm/vliw_tests/add.s                    |  5 +-
 asm/vliw_tests/addi.s                   |  5 +-
 asm/vliw_tests/immediate.s              |  5 +-
 asm/vliw_tests/predicate_forwarding.s   |  5 +-
 asm/wr_ispm.s                           | 11 ++--
 asm/york_loader.s                       |  4 +-
 testsuite/run.sh                        |  2 +-
 75 files changed, 301 insertions(+), 336 deletions(-)
 delete mode 100644 asm/boot_loader.s
 delete mode 100644 asm/echo_final.s
 create mode 100644 asm/inst_tests/delay_slots.s
 delete mode 100644 asm/inst_tests_failing/delay_slots.s
 delete mode 100644 asm/predicates.s
 delete mode 100644 asm/scratchpad_store.s
 delete mode 100644 asm/stall.s
 delete mode 100644 asm/test_branch.s
 delete mode 100644 asm/test_old.s

diff --git a/asm/ALU.s b/asm/ALU.s
index 080a9db8..a2556fd9 100644
--- a/asm/ALU.s
+++ b/asm/ALU.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # 
 
-	.word   116;
+	.word   128;
 	addi	r1 = r0, 255;  # first instruction not executed 0
 	addi	r1 = r0, 2; #1 r1 = 2
 	addi	r2 = r0, 3; #2 r2 = 3
@@ -33,5 +33,6 @@
 	addi    r3 = r0, 1;
 #	rr	r2 = r2, r3;
 	halt; 
-
-
+	nop;
+	nop;
+	nop;
diff --git a/asm/ALUi.s b/asm/ALUi.s
index 009ef9ad..dca125cb 100644
--- a/asm/ALUi.s
+++ b/asm/ALUi.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 #
 
-	.word   44;
+	.word   56;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 15; # r1 = 15
 	subi	r1 = r1, 5; # r1 = 10
@@ -13,3 +13,6 @@
 	andi	r1 = r1, 3; # r1 = 3
 	addi    r2 = r0, 24;# init r2
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/ALUl.s b/asm/ALUl.s
index 55010d4c..079386a3 100644
--- a/asm/ALUl.s
+++ b/asm/ALUl.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # long immediate instructions
 
-	.word   168;
+	.word   180;
 	addi	r1 = r0, 255;  # first instruction not executed 0
 	addi	r1 = r0, 2; # r1 = 2
 	add     r1  = r1, 65536; # r1 = 65538
@@ -29,5 +29,6 @@
 #	rr	r10 = r10, 2; # r10(31) = 1
 	sra	r10 = r10, 5; # fills in 5 upper bits with 1
 	halt; 
-
-
+	nop;
+	nop;
+	nop;
diff --git a/asm/basic.s b/asm/basic.s
index 3fe76c88..78966a06 100644
--- a/asm/basic.s
+++ b/asm/basic.s
@@ -2,7 +2,7 @@
 # Just a few basic instructions to watch the pipeline going in ModelSim
 #
 
-	.word   32;
+	.word   40;
 	addi	r1 = r0, 255;
 
 	addi	r1 = r0, 15;
@@ -11,3 +11,6 @@
 	addi	r3 = r0, 3;
 	add	r4 = r2, r3;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/blink.s b/asm/blink.s
index abcfa676..8872d2e7 100644
--- a/asm/blink.s
+++ b/asm/blink.s
@@ -5,7 +5,7 @@
 # Toggle LED with input from UART.
 #
 
-		.word   124;
+		.word   136;
 		addi	r0 = r0, 0;  # first instruction not executed
 
 		addi	r7 = r0, 16;
@@ -43,3 +43,6 @@
                 addi    r0  = r0 , 0;
                 addi    r0  = r0 , 0;
 		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/boot_loader.s b/asm/boot_loader.s
deleted file mode 100644
index 835aea76..00000000
--- a/asm/boot_loader.s
+++ /dev/null
@@ -1,78 +0,0 @@
-#
-# Expected Result: ...
-# this echos wrong characters
-# MS: what does this program? Looks very out of date: UART at wrong address, bne,...
-# SA: should I continue with assembly boot loader? 
-# MS: I think a boot loader shall be done in C if possible
-# and we shall drop unused out-of-date code
-		.word	264;
-		addi    r16  = r16 , 64;
-		addi    r7 = r7 , 511;
-		addi	r1   = r0 , 2;
-		lwm     r10  = [r5 + 0];
-                nop;
-                and     r11  = r10 , r1;
-		bne     r1 != r11 , 4;
-		addi	r0  = r0 , 1;
-                addi    r0  = r0 , 1;
-                lwm     r15  = [r5 + 1];
-                lwm     r15  = [r5 + 1];
-		addi    r17  = r17 , 24;
-		sl	r15 = r15 , r17;
-		lwm     r10  = [r5 + 0];
-		addi    r0  = r0 , 1;
-                and     r11  = r10 , r1;
-		bne     r1 != r11 , 4;
-		addi	r0  = r0 , 1;
-                addi    r0  = r0 , 1;
-                lwm     r18  = [r5 + 1];
-                lwm     r18  = [r5 + 1];
-		addi	r19 = r19 , 16;
-		sl      r18 = r18 , r19;
-		or	r15 = r15 , r18;
-		lwm     r10  = [r5 + 0];
-		nop;
-                and     r11  = r10 , r1;
-		bne     r1 != r11 , 4;
-		addi	r0  = r0 , 1;
-                addi    r0  = r0 , 1;
-                lwm     r20  = [r5 + 1];
-                lwm     r20  = [r5 + 1];
-		addi	r21 = r21 , 8;
-		sl      r20 = r20 , r21;
-		or	r15 = r15 , r20;
-		lwm     r10  = [r5 + 0];
-		nop;
-                and     r11  = r10 , r1;
-		bne     r1 != r11 , 4;
-		addi	r0  = r0 , 1;
-                addi    r0  = r0 , 1;
-                lwm     r22  = [r5 + 1];
-                lwm     r22  = [r5 + 1];
-		addi    r0  = r0 , 1;
-		or	r15 = r15 , r22;
-                swm     [r7 + 1] = r15; 
-		addi    r27 = r27 , 1;
-		andi	r0 = r0 , 0;
-		andi	r1 = r1 , 0;
-		andi    r5 = r5 , 0;
-		andi    r10 = r10 , 0;
-		andi    r11 = r11 , 0;
-		andi	r15 = r15 , 0;
-		andi    r17 = r17 , 0;
-		andi    r18 = r18 , 0;
-		andi    r19 = r19 , 0;
-		andi    r20 = r20 , 0;
-		andi    r21 = r21 , 0;
-		andi    r22 = r22 , 0;
-		addi    r7 = r7 , 1;
-		bne	r27 != r16 , 59;
-		addi    r9 = r9 , 1;
-		andi	r9 = r9 , 0;
-		andi    r27 = r27 , 0;
-		andi    r7 = r7 , 0;
-		andi    r16 = r16 , 0;
-		andi    r0 = r0 , 0;
-		andi    r0 = r0 , 0;
-		andi    r0 = r0 , 0;
-                halt;
diff --git a/asm/branch.s b/asm/branch.s
index 68bc3875..6e2bbead 100644
--- a/asm/branch.s
+++ b/asm/branch.s
@@ -1,7 +1,7 @@
 #
 # Test branch
 #
-	.word   104;
+	.word   116;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi	r1 = r0, 1;
 	addi	r1 = r0, 2;
@@ -31,3 +31,6 @@ end:	addi	r1 = r0, 13;
 	addi	r1 = r0, 15;
 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/bug_inverse_predicate_branch.s b/asm/bug_inverse_predicate_branch.s
index f953e326..0477e0bf 100644
--- a/asm/bug_inverse_predicate_branch.s
+++ b/asm/bug_inverse_predicate_branch.s
@@ -4,7 +4,7 @@
 # Expected Result: '0'
 # Current output: '1'
 
-	.word	80;
+	.word	84;
 x0:		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r5 = r0, 15;
 		sli	r5 = r5, 28;
diff --git a/asm/call.s b/asm/call.s
index 082eb7d8..e72a2e8d 100644
--- a/asm/call.s
+++ b/asm/call.s
@@ -20,7 +20,7 @@
 	addi	r1 = r0, 0;
 	addi	r1 = r0, 0;
 
-	.word 100; # This looks like not working at all....
+	.word 96;
 start:	addi	r1 = r1, 1;
 	addi	r30 = r0, start;
 	call	foo;
@@ -34,7 +34,7 @@ start:	addi	r1 = r1, 1;
 	br	end;
 	addi	r0 = r0, 0;
 	addi	r0 = r0, 0;
-	.word 20; # this shall be the length - which unit, assume bytes?
+	.word 24;
 foo:	addi	r6 = r0, 6;
 	addi	r7 = r0, 7;
 	ret	r30, r31;   # r32 offset to method base in r30
@@ -44,3 +44,6 @@ foo:	addi	r6 = r0, 6;
 end:	addi	r8 = r0, 8;
 	addi 	r9 = r0, 9;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/callr.s b/asm/callr.s
index e829880b..14c29ebe 100644
--- a/asm/callr.s
+++ b/asm/callr.s
@@ -26,7 +26,7 @@ start:	addi	r1 = r1, 1;
 	br	end;
 	addi	r0 = r0, 0;
 	addi	r0 = r0, 0;
-	.word 20; # this shall be the length - which unit, assume bytes?
+	.word 24;
 foo:	addi	r6 = r0, 6;
 	addi	r7 = r0, 7;
 	ret	r30, r31;   # r32 offset to method base in r30
@@ -36,3 +36,6 @@ foo:	addi	r6 = r0, 6;
 end:	addi	r8 = r0, 8;
 	addi 	r9 = r0, 9;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/compare.s b/asm/compare.s
index cc4febff..792c86e8 100644
--- a/asm/compare.s
+++ b/asm/compare.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # 
 
-	.word   232;
+	.word   244;
 	addi	r1 = r0, 255;  # first instruction not executed 0
 	addi	r1 = r0, 2; #1 r1 = 2
 x0:	addi	r2 = r0, 2; #2 r2 = 2
@@ -64,5 +64,6 @@ x8:	cmplt   p2  = r1, r2;
 	addi	r23 = r0, 1;  	
 
 	halt; 
-
-
+	nop;
+	nop;
+	nop;
diff --git a/asm/dual_even_odd_address.s b/asm/dual_even_odd_address.s
index 38090e65..b4654360 100644
--- a/asm/dual_even_odd_address.s
+++ b/asm/dual_even_odd_address.s
@@ -3,7 +3,7 @@
 #
 # Expected Result: echo entered characters
 #
-		.word   164;
+		.word   176;
 		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r1 = r0, 1;
 		add	r2   = r0 , 65536; # dual issue from odd
@@ -35,4 +35,7 @@ x1:		addi    r5 = r5, 1;
 		nop;
 		nop;
 		addi    r13 = r13, 1;
-                halt;
+		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/dual_forwarding.s b/asm/dual_forwarding.s
index ee684353..d8f87bba 100644
--- a/asm/dual_forwarding.s
+++ b/asm/dual_forwarding.s
@@ -4,7 +4,7 @@
 #
 # Expected Result: 
 #
-		.word   140;
+		.word   152;
 		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r1 = r0, 1;
 		add	r2   = r0 , 65536; 
@@ -38,4 +38,7 @@
 		add	r15 = r10, r11;
 		add	r16 = r10, r11;
 		add	r17 = r10, r11;
-                halt;
+		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/echo.s b/asm/echo.s
index 133d818b..0dbfae3c 100644
--- a/asm/echo.s
+++ b/asm/echo.s
@@ -4,7 +4,7 @@
 # Expected Result: echo entered characters
 # SA: this is the working version of echo.
 
-		.word   104;
+		.word   116;
 # Set up IO address
 x0:		addi	r0 = r0, 0;
 		add 	r5 = r0, 0xf0000800;
@@ -40,3 +40,6 @@ x2:		lwl     r10  = [r5 + 0];
 
 # Never reached
 		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/echo_final.s b/asm/echo_final.s
deleted file mode 100644
index f2745e5e..00000000
--- a/asm/echo_final.s
+++ /dev/null
@@ -1,42 +0,0 @@
-#
-# This is a simple echo program on the UART
-#
-# Expected Result: echo entered characters
-#
-# MS: is this roughly the same as echo.s? 
-# SA: This is a deprecated version of echo, it is no longer valid.
-
-		.word   104;
-		addi	r0 = r0, 0;  # first instruction not executed
-		addi	r5 = r0, 15;
-		sli	r5 = r5, 28;
-
-		addi	r1   = r0 , 2;
-		lwl     r10  = [r5 + 0];
-		addi	r0 = r0, 0;
-                and     r11  = r10 , r1;
-		cmpneq  p1 = r1, r11;
-	(p1)	bc	4;
-                addi    r0  = r0 , 0;
-                addi    r0  = r0 , 0;		
-		addi	r5 = r5, 1;
-
-                lwl     r15  = [r5 + 0];
-
-		subi	r5 = r5, 1;
-		addi	r3 = r0, 1;
-		lwl     r10  = [r5 + 0];
-		addi	r0 = r0, 0;
-		and     r11 = r3 , r10;
-		cmpneq  p1 = r3, r11;
-	(p1)	bc	12;
-                addi    r0  = r0 , 0;
-                addi    r0  = r0 , 0;
-
-		addi    r5 = r5, 1;
-		swl	[r5 + 0] = r15;
-		bc	0;
-                addi    r0  = r0 , 0;
-                addi    r0  = r0 , 0;
-                halt;
-
diff --git a/asm/echo_scratchpad1.s b/asm/echo_scratchpad1.s
index b8a4bb68..365d79ac 100644
--- a/asm/echo_scratchpad1.s
+++ b/asm/echo_scratchpad1.s
@@ -4,7 +4,7 @@
 # Expected Result: just checking if echo works along with other instructions.
 #
 
-		.word   124;
+		.word   136;
 x0:		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r5 = r0, 15;
 		sli	r5 = r5, 28;
@@ -41,4 +41,6 @@ x2:		lwl     r10  = [r5 + 0];
                 addi    r0  = r0 , 0;
                 addi    r0  = r0 , 0;
                 halt;
-
+		nop;
+		nop;
+		nop;
diff --git a/asm/fetch_double.s b/asm/fetch_double.s
index ab0acf91..b4572510 100644
--- a/asm/fetch_double.s
+++ b/asm/fetch_double.s
@@ -2,7 +2,7 @@
 # Test dual issue feature by using long constants with a double word fetch
 #
 
-	.word   56;
+	.word   68;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi	r1 = r0, 1;
 	add	r2 = r0, 65536;
@@ -12,3 +12,6 @@
 	add	r6 = r0, 200000;
 	add	r7 = r0, 300000; # add for a long immediate is strange
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/forward_issue.s b/asm/forward_issue.s
index 9bd32cf9..5f8485c3 100644
--- a/asm/forward_issue.s
+++ b/asm/forward_issue.s
@@ -2,7 +2,7 @@
 # This small test bench showed a former forwarding issue. Is fixed quite some time.
 #
 
-		.word   100;
+		.word   112;
 		addi	r0 = r0, 0;  # first instruction not executed
 
 		addi	r1 = r0, 1;
@@ -32,3 +32,6 @@
 		add	r17 = r10, r11;
 
 		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/gm_test.s b/asm/gm_test.s
index a74c9f11..905fdbeb 100644
--- a/asm/gm_test.s
+++ b/asm/gm_test.s
@@ -1,4 +1,4 @@
-	.word   88;
+	.word   100;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r20 = r0, 15;
 	sli	r20 = r20, 28;
@@ -49,3 +49,6 @@
 # 		swm	[r20 + 1] = r5;
 
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/hello.s b/asm/hello.s
index 25bd540d..232062e9 100644
--- a/asm/hello.s
+++ b/asm/hello.s
@@ -5,7 +5,6 @@
 #
 
 	.word   56;
-	addi	r0 = r0, 0;  # first instruction maybe not executed
 
         add     r7  = r0, 0xF0000900;
 	addi	r8 = r0, 1;
diff --git a/asm/inst_tests/ALU.s b/asm/inst_tests/ALU.s
index 9381c00a..8ed82565 100644
--- a/asm/inst_tests/ALU.s
+++ b/asm/inst_tests/ALU.s
@@ -1,5 +1,5 @@
 # This test case  tests the different instructions of the ALU
-	.word	116;
+	.word	128;
 	addi	r1 = r0, 10;
 	addi	r1 = r0, 10;
 	add 	r2 = r1, r0;
@@ -23,3 +23,6 @@
 	shadd	r8 = r1, 1; # r8 = 11
 	shadd2	r8 = r8, 1; # r8 = 23
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/branch.s b/asm/inst_tests/branch.s
index 35418620..8b52ad47 100644
--- a/asm/inst_tests/branch.s
+++ b/asm/inst_tests/branch.s
@@ -1,5 +1,5 @@
 # This test case tests the branching
-	.word	48;
+	.word	60;
 	addi 	r1 = r0, 2;
 	addi 	r1 = r0, 2;
 	addi	r2 = r0, 2;
@@ -9,4 +9,7 @@ x2:	(p1) br x1;
 	add 	r2 = r1, r1;
 x1:	cmpeq 	p2 = r2, r1;
 	(p2) br x2;
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/datacache_load_store.s b/asm/inst_tests/datacache_load_store.s
index 16a01b21..0a4bfedf 100644
--- a/asm/inst_tests/datacache_load_store.s
+++ b/asm/inst_tests/datacache_load_store.s
@@ -1,6 +1,6 @@
 # Test case for data cache load and store
 # Initialization begin
-	.word	224;
+	.word	236;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -57,4 +57,7 @@
 	sbc		[r0 + 2] = r1;
 	lhuc	r5 = [r0 + 2];
 	lbuc	r6 = [r0 + 2];
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/datacache_load_store2.s b/asm/inst_tests/datacache_load_store2.s
index bb0b4ef2..57337fd2 100644
--- a/asm/inst_tests/datacache_load_store2.s
+++ b/asm/inst_tests/datacache_load_store2.s
@@ -1,5 +1,5 @@
 # Test case for data cache load and store
-	.word	240;
+	.word	252;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -56,4 +56,7 @@
 	lbuc	r13 = [r2 + 3];
 	lhuc	r14 = [r2 + 0];
 	lhuc	r15 = [r2 + 1];
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/delay_slots.s b/asm/inst_tests/delay_slots.s
new file mode 100644
index 00000000..b091f9dd
--- /dev/null
+++ b/asm/inst_tests/delay_slots.s
@@ -0,0 +1,17 @@
+# This test case tests the delay slots of branches and loads
+	addi 	r1 = r0, 8;
+	add 	r2 = r0, r0;
+	br 		xA;
+	br 		xB;
+	br 		xC;
+xC:	add		r2 = r2, r1;
+xB:	add 	r2 = r2, r1;
+xA:	add 	r2 = r2, r1;
+	br 		xD;
+	add 	r2 = r0, r0;
+	add 	r2 = r2, r1;
+	add 	r2 = r2, r1;
+xD: lwc		r4 = [r1+0];
+	add 	r5 = r4, r1;
+	add 	r5 = r4, r1;
+	halt;
diff --git a/asm/inst_tests/globalmem_load_store.s b/asm/inst_tests/globalmem_load_store.s
index f2d9ce71..19157038 100644
--- a/asm/inst_tests/globalmem_load_store.s
+++ b/asm/inst_tests/globalmem_load_store.s
@@ -1,6 +1,6 @@
 # Test case for global data memory load and store
 # Initialization begin
-	.word	224;
+	.word	236;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -57,4 +57,7 @@
 	sbm		[r0 + 2] = r1;
 	lhum	r5 = [r0 + 2];
 	lbum	r6 = [r0 + 2];
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/globalmem_load_store2.s b/asm/inst_tests/globalmem_load_store2.s
index 917adcc6..f6ba600d 100644
--- a/asm/inst_tests/globalmem_load_store2.s
+++ b/asm/inst_tests/globalmem_load_store2.s
@@ -1,5 +1,5 @@
 # Test case for global data memory load and store
-	.word	240;
+	.word	252;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -56,4 +56,7 @@
 	lbum	r13 = [r2 + 3];
 	lhum	r14 = [r2 + 0];
 	lhum	r15 = [r2 + 1];
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/localmem_load_store.s b/asm/inst_tests/localmem_load_store.s
index 0162332f..02f9eb64 100644
--- a/asm/inst_tests/localmem_load_store.s
+++ b/asm/inst_tests/localmem_load_store.s
@@ -1,6 +1,6 @@
 # Test case for local memory load and store
 # Initialization begin
-	.word	224;
+	.word	236;
 	add 	r1 = r0, 0xE0000000;
 	add 	r1 = r0, 0xE0000000;
 	addi	r2 = r0, 4;
@@ -58,3 +58,6 @@
 	lhul	r5 = [r0 + 2];
 	lbul	r6 = [r0 + 2];
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/localmem_load_store2.s b/asm/inst_tests/localmem_load_store2.s
index b0b97b69..b5f80aef 100644
--- a/asm/inst_tests/localmem_load_store2.s
+++ b/asm/inst_tests/localmem_load_store2.s
@@ -1,5 +1,5 @@
 # Test case for local memory load and store
-	.word	240;
+	.word	252;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -56,4 +56,7 @@
 	lbul	r13 = [r2 + 3];
 	lhul	r14 = [r2 + 0];
 	lhul	r15 = [r2 + 1];
-	halt;
\ No newline at end of file
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/stackcache_load_store.s b/asm/inst_tests/stackcache_load_store.s
index f67aee2f..0bf78c32 100644
--- a/asm/inst_tests/stackcache_load_store.s
+++ b/asm/inst_tests/stackcache_load_store.s
@@ -1,6 +1,6 @@
 # Test case for stack cache load and store
 # Initialization begin
-	.word	236;
+	.word	248;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -61,3 +61,6 @@
 	lhus	r5 = [r0 + 2];
 	lbus	r6 = [r0 + 2];
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests/stackcache_load_store2.s b/asm/inst_tests/stackcache_load_store2.s
index c469453f..ae5c5faa 100644
--- a/asm/inst_tests/stackcache_load_store2.s
+++ b/asm/inst_tests/stackcache_load_store2.s
@@ -1,5 +1,5 @@
 # Test case for stack cache load and store
-	.word	252;
+	.word	264;
 	add 	r1 = r0, 0xFF0FF000;
 	add 	r1 = r0, 0xFF0FF000;
 	addi	r2 = r0, 4;
@@ -60,3 +60,6 @@
 	lhus	r14 = [r2 + 0];
 	lhus	r15 = [r2 + 1];
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/inst_tests_failing/delay_slots.s b/asm/inst_tests_failing/delay_slots.s
deleted file mode 100644
index 09c3798b..00000000
--- a/asm/inst_tests_failing/delay_slots.s
+++ /dev/null
@@ -1,18 +0,0 @@
-# This test case tests the delay slots of branches and loads
-	addi 	r1 = r0, 5;
-	addi 	r1 = r0, 5;
-	addi	r2 = r0, r0;
-	br 		x1;
-	br 		x2;
-	br 		x3;
-x3:	add		r2 = r2, r1;
-x2:	add 	r2 = r2, r1;
-x1:	add 	r2 = r2, r1;
-	br 		x4;
-	add 	r2 = r0, r0;
-	add 	r2 = r2, r1;
-	add 	r2 = r2, r1;
-x4: lws		r4 = r1, 0;
-	add 	r5 = r4, r1;
-	add 	r5 = r4, r1;
-	halt;
diff --git a/asm/ld_st_test.s b/asm/ld_st_test.s
index 719f7900..081cf580 100644
--- a/asm/ld_st_test.s
+++ b/asm/ld_st_test.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # test if memory works fine
 
-	.word   60;
+	.word   72;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 256; # r1 = 256
 	addi    r29 = r0, 10;
@@ -18,3 +18,6 @@ x1:	swm	[r1 + 1] = r2;
 	cmpneq  p1 = r0, r29;
 (p1)	br	x1; #
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/ldst.s b/asm/ldst.s
index 2c18aacf..c7fc4daa 100644
--- a/asm/ldst.s
+++ b/asm/ldst.s
@@ -1,7 +1,7 @@
 #
 # Basic load/store tests
 
-	.word   100;
+	.word   112;
 	addi	r0 = r0, 0;
 	addi	r1 = r0, 4;
 	add	r2 = r0, 0xabcd1234;
@@ -25,3 +25,6 @@
 	sbl	[r1 + 3] = r2;
 	lwl	r3 = [r1 + 0];
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/load_store_data_cache.s b/asm/load_store_data_cache.s
index 6bf0b679..809efa16 100644
--- a/asm/load_store_data_cache.s
+++ b/asm/load_store_data_cache.s
@@ -4,7 +4,7 @@
 # MS: is SPM accessed with lxc/sxc? I thought it is via lxl/sxl.
 # SA: this test case is for data cache with lxc/sxc which is mapped to scratchpad at the moment
 
-	.word   188;
+	.word   200;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 256; # r1 = 256
 	addi	r2 = r0, 5;
@@ -55,3 +55,6 @@ x1:	sl	r31 = r31, r30;
 	cmpneq  p1 = r31, r29;
 (p1)	br	x1; #r21 equals to all upper bits 1 
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/load_store_scratchpad.s b/asm/load_store_scratchpad.s
index 63ab402f..bf4f8aa8 100644
--- a/asm/load_store_scratchpad.s
+++ b/asm/load_store_scratchpad.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # different ld/st from/to scratchpad memory
 
-	.word   184;
+	.word   196;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 256; # r1 = 256
 	addi	r2 = r0, 5;
@@ -57,3 +57,6 @@ x1:	shl	[r1 + 4] = r5; #
 	cmpneq  p1 = r31, r29;
 (p1)	br	x1; #r20 equals to all upper bits 1 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/load_store_scratchpad_new.s b/asm/load_store_scratchpad_new.s
index 0657665a..f118cc8a 100644
--- a/asm/load_store_scratchpad_new.s
+++ b/asm/load_store_scratchpad_new.s
@@ -1,7 +1,7 @@
 #
 # Basic instructions test
 # different ld/st from/to scratchpad memory
-	.word   116;
+	.word   128;
 	addi    r1 = r0, 256;
 	sbl     [r1 + 4] = r0;
 	sbl     [r1 + 5] = r0;
@@ -32,3 +32,6 @@ x1:	sl	r31 = r31, r30;
 	cmpneq  p1 = r31, r29;
 (p1)	br	x1; #r20 equals to all upper bits 1 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/load_store_scratchpad_new2.s b/asm/load_store_scratchpad_new2.s
index 1dba98d3..d2e1ef88 100644
--- a/asm/load_store_scratchpad_new2.s
+++ b/asm/load_store_scratchpad_new2.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 # different ld/st from/to scratchpad memory
 
-	.word   228;
+	.word   240;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 256; # r1 = 256
 	addi	r2 = r0, 5;
@@ -65,3 +65,6 @@ x1:	shl	[r1 + 4] = r5; #
 	cmpneq  p1 = r31, r29;
 (p1)	br	x1; #r20 equals to all upper bits 1 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/load_store_stackcache.s b/asm/load_store_stackcache.s
index d14c872e..ca44e93e 100644
--- a/asm/load_store_stackcache.s
+++ b/asm/load_store_stackcache.s
@@ -1,7 +1,9 @@
 #
 # Expected Result: 
 #
-		.word   48;
+		.word   68;
+		addi    r3 = r0, 0x100;
+		mts     s6 = r3;
 		addi	r5 = r0, 5;                
 #		lwm     r1  = [r31 + 0];
                 sres     4;
@@ -16,4 +18,7 @@
 #                lhs     r3  = [r0 + 0]  ||     lbs     r4  = [r0 + 1];
 #               lhus    r4  = [r0 + 0]  ||     lbus    r6  = [r0 + 1];
 #                sfree   1;
-                halt;
+		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/load_use.s b/asm/load_use.s
index ee4ad2bf..10d55fab 100644
--- a/asm/load_use.s
+++ b/asm/load_use.s
@@ -5,7 +5,9 @@
 #   first load (in the load use delay slot) will get old value
 #   second load the correct value from memory
 #
-	.word   52;
+	.word   72;
+	addi    r3 = r0, 0x100;
+	mts     s6 = r3;
 	sres	10;
 	addi	r1 = r0, 4;
 	addi	r2 = r0, 2;
@@ -15,10 +17,10 @@
 	sws	[r1+4] = r2;
 	lws	r3 = [r1+4];
 	addi    r0 = r0, 0;	# This is the delay slot
-# The following behaves different in HW and the simulator.
-# We have not yet defined the semantics of using the value
-# in the delay slot.
-#	add	r4 = r0, r3;	# that one is in the delay slot and will add 3
+	add	r4 = r0, r3;	# that one is in the delay slot and will add 3
 	add	r5 = r0, r3;	# that one shall add 2
 	add	r1 = r0, r5;	# that one shall now be 2
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/mfsmts.s b/asm/mfsmts.s
index 4a7236d8..d8cfe726 100644
--- a/asm/mfsmts.s
+++ b/asm/mfsmts.s
@@ -2,7 +2,7 @@
 # Basic instructions test
 #
 
-	.word   44;
+	.word   56;
 	addi	r1 = r0, 255;  # first instruction not executed
 	addi	r1 = r0, 15; # r1 = 15
         mts     s6  = r1;
@@ -13,3 +13,6 @@
 	mts	s6 = r7;
 	mfs	r10 = s6;
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/minimal.s b/asm/minimal.s
index 97bb8ead..8f2c6067 100644
--- a/asm/minimal.s
+++ b/asm/minimal.s
@@ -2,8 +2,12 @@
 # A short as possible assembler example
 #
 
-	.word   16;
+	.word   36;
 	addi	r1 = r0, 255;  # first instruction maybe not executed
 	addi	r2 = r0, 1;
 	addi	r3 = r0, 2;
 	add	r4 = r2, r3;
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/mulpipe.s b/asm/mulpipe.s
index efc74549..dd37a21b 100644
--- a/asm/mulpipe.s
+++ b/asm/mulpipe.s
@@ -2,7 +2,7 @@
 # Test the multiplication pipeline
 #
 
-	.word   88;
+	.word   96;
 	addi	r1 = r0, 1;
 	addi	r2 = r0, 2;
 	addi	r3 = r0, 3;
@@ -19,3 +19,6 @@
 	mul r1, r7 || mfs r13 = s2;
 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/pred_issue.s b/asm/pred_issue.s
index 52707cfb..c8cf44b6 100644
--- a/asm/pred_issue.s
+++ b/asm/pred_issue.s
@@ -1,7 +1,7 @@
 
 # Try to extract the issue Sahar observed,
 # but this works - al variations of r1/r2 lt, eq, gt tried
-	.word   36;
+	.word   48;
 	addi	r1 = r0, 2;
 	addi	r2 = r0, 1;
 	cmple   p4 = r1, r2;
@@ -10,3 +10,6 @@
 	xor     r16 = r15, r16; # r16 = 1
 (!p4)   nor     r16 = r16, r15;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/predicate.s b/asm/predicate.s
index 0a1749f6..85f29412 100644
--- a/asm/predicate.s
+++ b/asm/predicate.s
@@ -1,7 +1,7 @@
 #
 # Test predicates and branch
 #
-	.word   588;
+	.word   600;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi	r1 = r0, 2;
 	addi	r2 = r0, 2;
@@ -50,6 +50,10 @@
 	cmplt   p6 = r9, r3;
 (p6)	add     r15 = r9, r2;
 	halt;
+	nop;
+	nop;
+	nop;
+
 # ALU instructions
 	addi    r3 = r0, 3;
 	addi    r1 = r0, 5;
diff --git a/asm/predicated_echo.s b/asm/predicated_echo.s
index 4d60da02..206a154e 100644
--- a/asm/predicated_echo.s
+++ b/asm/predicated_echo.s
@@ -5,7 +5,7 @@
 # SA: this tests predicated ld/st which was a bug
 #
 
-		.word   116;
+		.word   128;
 x0:		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r5 = r0, 15;
 		sli	r5 = r5, 28;
@@ -41,4 +41,7 @@ x2:		lwl     r10  = [r5 + 0];
                 addi    r0  = r0 , 0;
                 addi    r0  = r0 , 0;
                 halt;
+		nop;
+		nop;
+		nop;
 
diff --git a/asm/predicated_echo_reverse.s b/asm/predicated_echo_reverse.s
index 20fba411..d2a83320 100644
--- a/asm/predicated_echo_reverse.s
+++ b/asm/predicated_echo_reverse.s
@@ -5,7 +5,7 @@
 # SA: this test predicated ld/st which was a bug
 #
 
-		.word   224;
+		.word   236;
 		addi	r0 = r0, 0;  # first instruction not executed
 		addi	r22 = r0, 0;
 		addi    r8 = r0, 0;
@@ -67,5 +67,7 @@ x5:		lwl     r10  = [r5 + 0];
 		br      x0;
 		addi	r8 = r0, 0;
 		addi    r22 = r0, 0;		
-                halt;
-
+		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/predicated_predicate.s b/asm/predicated_predicate.s
index 96d0c92e..62c750e5 100644
--- a/asm/predicated_predicate.s
+++ b/asm/predicated_predicate.s
@@ -1,7 +1,7 @@
 #
 # Test predicates and branch
 #
-	.word   168;
+	.word   180;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi	r1 = r0, 2;
 	addi	r2 = r0, 2;
@@ -50,3 +50,6 @@
 	cmplt   p6 = r9, r3;
 (p6)	add     r15 = r9, r2;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/predicates.s b/asm/predicates.s
deleted file mode 100644
index bf0db0eb..00000000
--- a/asm/predicates.s
+++ /dev/null
@@ -1,16 +0,0 @@
-#
-# Test predicates and branch
-#
-	.word   44;
-x0:	addi	r0 = r0, 0;  # first instruction not executed
-
-	addi	r1 = r0, 2;
-	addi	r2 = r0, 2;
-
-	cmpeq   p1  = r1, r2;
-	(p1) br	x0;
-	addi	r3 = r0, 3; # somewhere here R1 gets set to 4 !!!
-	addi	r4 = r0, 4;
-	addi	r5 = r0, 5;
-	addi	r6 = r0, 6;
-	halt;
diff --git a/asm/predication.s b/asm/predication.s
index 4b8066ba..035e4757 100644
--- a/asm/predication.s
+++ b/asm/predication.s
@@ -1,7 +1,7 @@
 #
 # Test predicates and branch
 #
-	.word   256;
+	.word   268;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi	r1 = r0, 2;
 	addi	r2 = r0, 2;
@@ -88,4 +88,6 @@ pb4:		subi	r1 = r1, 1;
 # Done
 #############
 	halt;
-
+	nop;
+	nop;
+	nop;
diff --git a/asm/scratchpad.s b/asm/scratchpad.s
index c8386ac4..cf8c3c9c 100644
--- a/asm/scratchpad.s
+++ b/asm/scratchpad.s
@@ -2,7 +2,7 @@
 # Simple test of a SPM
 #
 
-	.word   76;
+	.word   88;
 	addi	r1 = r0, 255;  # first instruction maybe not executed
 	addi	r1 = r0, 32;
 	addi	r2 = r0, 5;
@@ -20,3 +20,6 @@
 	lwl	r6  = [r1 + 4];
 	lwl	r7  = [r1 + 8];
 	halt; 
+	nop;
+	nop;
+	nop;
diff --git a/asm/scratchpad_store.s b/asm/scratchpad_store.s
deleted file mode 100644
index 4ec78533..00000000
--- a/asm/scratchpad_store.s
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# Basic instructions test
-# different ld/st from/to scratchpad memory
-# this is a deprecated version there are other tests on scratchpad
-
-	.word   32;
-	addi	r1 = r0, 255;  # first instruction not executed
-	addi	r1 = r0, 255; # r1 = 255
-	addi	r2 = r0, 5;
-	addi    r4 = r0, 4;
-	swl	[r1 + 1] = r2; # memory address 259 (255 + (1 sl 2)) = 5 
-#	lwl	r10  = [r1 + 1]; # register(10) = 5
-	addi	r2 = r0, 10;
-	shl	[r1 + 3] = r2; # memory address 261 = 10
-#	lhl	r11  = [r1 + 3]; # register(11) = 10
-	sbl	[r1 + 3] = r4; # memory address 258 = 4
-#	lbl
-#	lhul
-#	lbul	
-#	halt; 
diff --git a/asm/simple.s b/asm/simple.s
index 91f69788..5275b6a3 100644
--- a/asm/simple.s
+++ b/asm/simple.s
@@ -2,7 +2,7 @@
 # Very simple code to get stuff running on the Chisel pipeline.
 #
 
-	.word   76;
+	.word   88;
 	addi	r0 = r0, 0;  # first instruction not executed
 
 	addi	r1 = r0, 1;
@@ -29,3 +29,6 @@
 #	sl	r4 = r1, 3;
 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/spill.s b/asm/spill.s
index fd4f71f4..148560b8 100644
--- a/asm/spill.s
+++ b/asm/spill.s
@@ -1,7 +1,7 @@
 #
 # Expected Result: 
 #
-		.word   168;
+		.word   180;
 		addi	r5 = r0, 0;                
 		addi    r1 = r0, 0;
 		addi	r10 = r0, 64;
@@ -42,4 +42,7 @@ l2:		sws	[r16 + 0] = r14;
 		sens    64;
 		addi    r11 = r0, 1;# check if stall works in case of spill
 		addi    r12 = r0, 2;# check if stall works in case of spill
-                halt;
+		halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/stackcache.s b/asm/stackcache.s
index a9aa467e..d66550e9 100644
--- a/asm/stackcache.s
+++ b/asm/stackcache.s
@@ -2,7 +2,7 @@
 # Just a few basic instructions to watch the pipeline going in ModelSim
 #
 
-	.word   44;
+	.word   56;
 	addi	r1 = r0, 255;  # first instruction not executed
 	sres     4; # do we reserve to store? so there should be the same number of stores after sres?
         sws     [r0 + 0] = r1;
@@ -13,3 +13,6 @@
 	addi    r15 = r0, 1;#just to have some instruction to check stall
 	addi    r16 = r0, 0;#just to have some instruction to check stall
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/stall.s b/asm/stall.s
deleted file mode 100644
index 93b02255..00000000
--- a/asm/stall.s
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# Basic instructions test
-#
-
-	.word   24;
-	addi	r1 = r0, 255;  # first instruction not executed
-	waitm;
-	addi	r1 = r1, 5;
-	addi	r1 = r1, 10;
-	halt; 
diff --git a/asm/test.s b/asm/test.s
index 20664b19..b5713789 100644
--- a/asm/test.s
+++ b/asm/test.s
@@ -6,10 +6,13 @@
 
 # TODO: looks like the UART is in memory address 0....
 
-	.word   32;
+	.word   44;
 	addi	r0 = r0, 0;  # first instruction not executed
 	add	r1 = r0, 0xf0000800;
 	addi	r2 = r0, 42; # '*'
 x1:	swl	[r1 + 1] = r2;
 	br	x1;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/test_asm.s b/asm/test_asm.s
index c16760e7..c606a0bb 100644
--- a/asm/test_asm.s
+++ b/asm/test_asm.s
@@ -2,7 +2,7 @@
 # Minimum program to test the new assembler
 #
 
-	.word   28;
+	.word   48;
 	addi	r1 = r0, 255;  # first instruction not executed
 
 label2:	addi	r1 = r0, 15;
@@ -12,3 +12,6 @@ label2:	addi	r1 = r0, 15;
 	add	r4 = r2, r3;
 	nor 	r6 = r1, r4		||	and 	r7 = r1, r5;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/test_branch.s b/asm/test_branch.s
deleted file mode 100644
index 994ff2e3..00000000
--- a/asm/test_branch.s
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# This is a simple output of a single character on the UART
-#
-# TODO: maybe this should just switch a LED to see the result.
-#
-
-# TODO: looks like the UART is in memory address 0....
-# update or drop
-
-	.word   68;
-	addi	r0 = r0, 0;  # first instruction not executed
-	addi	r1 = r0, 0;
-	addi	r2 = r0, 42; # '*'
-#x2:	swm	[r1 + 1] = r2;
-	addi	r1   = r0 , 2;
-	addi	r3 = r19 , 1;
-x1:	lwm     r10  = [r5 + 0];
-	nop;
-        and     r11  = r10 , r3;
-	cmpneq  p1 = r11 , r1;
-	nop;
-	nop;
-	nop;
-	(p1) 	bc x1;
-	nop;
-	nop;
-	swm	[r7 + 1] = r2;
-#	br	x2;
-	halt
diff --git a/asm/test_case_plan.s b/asm/test_case_plan.s
index 70bb845f..584c018e 100644
--- a/asm/test_case_plan.s
+++ b/asm/test_case_plan.s
@@ -44,7 +44,7 @@
 # Function calls
 # 
 
-		.word   120;
+		.word   132;
 		addi	r0 = r0, 0;
 
 add:	addi	r1 = r0, 5; 
@@ -402,3 +402,6 @@ cmpneq: cmpneq	p2 = r1, r3;
 # br exit; # Should jump to correct exit label, using halt or loop
 
 exit: 	halt;
+		nop;
+		nop;
+		nop;
diff --git a/asm/test_mfs.s b/asm/test_mfs.s
index 4d576e03..d0abea57 100644
--- a/asm/test_mfs.s
+++ b/asm/test_mfs.s
@@ -2,7 +2,7 @@
 # Testing hazard with mfs
 #
 
-	.word   72;
+	.word   84;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi    r1 = r0, 4;
 	addi    r2 = r0, 6;
@@ -24,3 +24,6 @@
 	mfs     r7 = s2;     # r7 should be 8
 	addi    r8 = r7, 0;  # r8 should be 8
 	halt;		      
+	nop;
+	nop;
+	nop;
diff --git a/asm/test_mts.s b/asm/test_mts.s
index dcd05f47..a0fdbc20 100644
--- a/asm/test_mts.s
+++ b/asm/test_mts.s
@@ -2,7 +2,7 @@
 # Testing hazard with mts
 #
 
-	.word   60;
+	.word   72;
 	addi	r0 = r0, 0;  # first instruction not executed
 	addi    r1 = r0, 1;
 	addi    r2 = r0, 5;
@@ -17,3 +17,6 @@
 	(!p1) swc [r2 + 2] = r1; # should not be executed, will cause unaligned access
 	mts     s0 = r1;       # sets p1 = false
 	halt;		       # r3 should be 7, r4 should be 9
+	nop;
+	nop;
+	nop;
diff --git a/asm/test_old.s b/asm/test_old.s
deleted file mode 100644
index 54542dab..00000000
--- a/asm/test_old.s
+++ /dev/null
@@ -1,16 +0,0 @@
-label:
-	p1 br lab1
-	addi r1, 10 -> r13 // hhj
-	p1 addi r1, 10 -> r13 // hhj
-	sub r2, r3 -> r2
-	p5 sub r2, r3 -> r31
-	br label
-lab1:
-	br lab1
-	p1 cmp r1 != r2 -> p3
-
-
-// That's not Patmos, but Leros assembler....
-// Register definitions
-
-
diff --git a/asm/test_sdram.s b/asm/test_sdram.s
index 977ae966..0460a7de 100644
--- a/asm/test_sdram.s
+++ b/asm/test_sdram.s
@@ -13,7 +13,7 @@
 # r11==read/write test bound (address limit)
 # r12==memory read error count
 
-	.word	176;
+	.word	188;
 
 	addi	r0 = r0, 0;  # first instruction not executed         	#0
 begin:	addi	r12 = r0, 0; # r12==error count                       	#1
@@ -35,9 +35,9 @@ poll_stdin: lwl     r1 = [r5 + 0];                                     	#7
 
 	lwl     r1 = [r5 + 1];                                     	#14
 
-	addi    r21 = r0, 65; 'A'                                  	#15
+	addi    r21 = r0, 65; # 'A'                                  	#15
 	addi    r22 = r6, 0;                                       	#16
-	addi	r24 = r0, 80; 16 chars from 'A'                       	#17
+	addi	r24 = r0, 80; # 16 chars from 'A'                       	#17
 
 write_word: swl	[r22 + 0] = r21;                                       	#18
 	cmpneq	p1 = r21, r24;                                      	#19
@@ -71,4 +71,7 @@ poll_stdout: lwl     r1 = [r5 + 0];                                     	#27
 		addi r0 = r0, 0;                                          	#41
 		addi r0 = r0, 0;                                          	#42
 
-halt;                                                       	#43
+        halt;                                                       #43
+        nop;                                                       	#44
+        nop;                                                       	#45
+        nop;                                                       	#46
\ No newline at end of file
diff --git a/asm/test_sdram2.s b/asm/test_sdram2.s
index 1c0a02f6..801d34a9 100644
--- a/asm/test_sdram2.s
+++ b/asm/test_sdram2.s
@@ -14,7 +14,7 @@
 # r11==read/write test bound (address limit)
 # r12==memory read error count
 
-	.word	336;
+	.word	348;
 
 	addi	r0 = r0, 0;  # first instruction not executed         	#0
 begin:	addi	r5 = r0, 15;                                          	#1
@@ -52,7 +52,7 @@ poll_stdin: lwl     r1 = [r5 + 0];                                     	#6
 	addi	r11 = r0, 1;	# r11 == test_limit                      	#14
 	sli	r11 = r11, 20;                                         	#15
 	addi	r12 = r0, 0; # r12==error count                       	#16
-	addi	r1= r0, 87; 'W'                                       	#17
+	addi	r1= r0, 87; # 'W'                                       	#17
 	swl     [r5 + 1] = r1;                                     	#18
 
 #	Set the value in the cache line (we don't care if it is loaded as we just use one word)
@@ -80,7 +80,7 @@ poll_sdram_ready: lwl     r1  = [r6 + 17]; # sdram.status                    	#2
                 addi    r0  = r0 , 0;                       	#34
                 addi    r0  = r0 , 0;                       	#35
 
-#read_init: addi	r1= r0, 82; 'R'                                       	#36
+#read_init: addi	r1= r0, 82; # 'R'                                       	#36
 	swl     [r5 + 1] = r1;                                     	#37
 	addi	r10 = r0, 0;	# addr_cnt <= 0                          	#38
 #	 r10 should have the value from before
@@ -102,7 +102,7 @@ poll_sdram_ready2: lwl     r1  = [r6 + 17]; # sdram.status                    	#
 	addi    r1  = r10 , 1;  # val = addr+1 (use the same mod as during write)	#49
 	cmpeq  p1 = r1, r2; # should be the same                   	#50
 	(p1)	br no_error; #l:+3                                         	#51
-        (!p1)   addi    r1  = r0 , 69;  'E'                 	#52
+        (!p1)   addi    r1  = r0 , 69; # 'E'                 	#52
 	(!p1)	addi    r12 = r12, 1; #error_cnt++                   	#53
 		swl     [r5 + 1] = r1; # we write to UART without pooling for ready here,	#54
 
@@ -148,4 +148,7 @@ poll_stdout2: lwl     r1 = [r5 + 0];                                     	#71
 	br begin;                                                      	#91
 		addi    r0 = r0, 0;                                       	#92
 		addi    r0 = r0, 0;                                       	#93
-halt;                                                       	#94
+	halt;                                                       	#94
+	nop;
+	nop;
+	nop;
diff --git a/asm/test_sdram3.s b/asm/test_sdram3.s
index ac214229..6f379b13 100644
--- a/asm/test_sdram3.s
+++ b/asm/test_sdram3.s
@@ -17,7 +17,7 @@
 # r11==read/write test bound (address limit)
 # r12==memory read error count
 
-	.word	408;
+	.word	420;
 
 	addi	r0 = r0, 0;  # first instruction not executed         	#0
 begin: addi	r5 = r0, 15;                                          	#1
@@ -56,7 +56,7 @@ poll_stdin: lwl     r1 = [r5 + 0];                                     	#6
 	addi	r11 = r0, 1;	# r11 == test_limit                      	#14
 	sli	r11 = r11, 26;                                          	#15
 	addi	r12 = r0, 0; # r12==error count                       	#16
-	addi	r1= r0, 87; 'W'                                       	#17
+	addi	r1= r0, 87; # 'W'                                       	#17
 	swl     [r5 + 1] = r1;                                     	#18
 	subi	r9 = r0, 64;	# r9 == mask (not 63)                    	#19
 
@@ -97,7 +97,7 @@ skip_store: addi	r10 = r10, 4;                                         	#40
 
 
 #read_init:
-	addi	r1= r0, 82; 'R'                                       	#45
+	addi	r1= r0, 82; # 'R'                                       	#45
 	swl     [r5 + 1] = r1;                                     	#46
 	addi	r10 = r0, 0;	# addr_cnt <= 0                          	#47
 
@@ -128,7 +128,7 @@ skip_load: andi	r2  = r10, 63;  # offset                              	#61
 
 	cmpeq	p1 = r1, r2; # should be the same                    	#65
 	(p1)	br no_error;                                         	#66
-        (!p1)   addi    r1  = r0 , 69;  'E'                 	#67
+        (!p1)   addi    r1  = r0 , 69; # 'E'                 	#67
 	(!p1)	addi    r12 = r12, 1; #error_cnt++                   	#68
 	swl     [r5 + 1] = r1; # we write to UART without pooling for ready here,	#69
 
@@ -173,4 +173,7 @@ poll_stdout3: lwl     r1 = [r5 + 0];                                     	#86
 	br begin;                                                      	#97
 		addi    r0 = r0, 0;                                       	#98
 		addi    r0 = r0, 0;                                       	#99
-halt;                                                       	#100
+	halt;                                                       	#100
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/ALU_forwarding.s b/asm/vliw_tests/ALU_forwarding.s
index 5f7d7465..59e0c0ce 100644
--- a/asm/vliw_tests/ALU_forwarding.s
+++ b/asm/vliw_tests/ALU_forwarding.s
@@ -1,5 +1,5 @@
 # Test case for forwarding of ALU instructions
-	.word	124;
+	.word	136;
 	addi	r4 = r0, 0;
 	addi	r4 = r0, 0;
 	addi	r1 = r0, 2		||	addi	r1 = r0, 5;
@@ -18,3 +18,6 @@
 	sl		r16 = r10, r1	||	sr 		r17 = r10, r1;
 	sra		r19 = r12, r1	||	srai 	r18 = r12, 5;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/ALU_forwarding2.s b/asm/vliw_tests/ALU_forwarding2.s
index 1565b597..3004ea00 100644
--- a/asm/vliw_tests/ALU_forwarding2.s
+++ b/asm/vliw_tests/ALU_forwarding2.s
@@ -1,5 +1,5 @@
 # Test case for forwarding of ALU instructions
-	.word	68;
+	.word	80;
 	addi	r4 = r0, 0;
 	addi	r4 = r0, 0;
 	addi	r1 = r0, 2		||	addi	r1 = r0, 5;
@@ -16,3 +16,6 @@
 	shadd	r14 = r12, r1	||	shadd2	r15 = r12, r1;
 #	shaddi	r14 = r12, r1;	||	shadd2i	r15 = r12, r1;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/add.s b/asm/vliw_tests/add.s
index a75b5322..3b315e27 100644
--- a/asm/vliw_tests/add.s
+++ b/asm/vliw_tests/add.s
@@ -1,7 +1,10 @@
 # Test case for add instruction
-	.word	40;
+	.word	52;
 	addi	r1 = r0, 2		||	addi	r1 = r0, 5;
 	add 	r2 = r1, r1		||	add 	r3 = r1, r1;
 	add 	r4 = r1, r1		||	add 	r5 = r1, r1;
 	add 	r6 = r1, r1		||	add 	r7 = r1, r1;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/addi.s b/asm/vliw_tests/addi.s
index 545093f6..3060b805 100644
--- a/asm/vliw_tests/addi.s
+++ b/asm/vliw_tests/addi.s
@@ -1,7 +1,10 @@
 # Test case for addi instruction
-	.word	40;
+	.word	52;
 	addi	r1 = r0, 2	||	addi	r1 = r0, 5;
 	addi	r2 = r1, 3	||	addi	r3 = r1, 3;
 	addi	r4 = r1, 4	||	addi	r5 = r1, 4;
 	addi	r6 = r1, 5	||	addi	r7 = r1, 5;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/immediate.s b/asm/vliw_tests/immediate.s
index 28859fd2..593f7124 100644
--- a/asm/vliw_tests/immediate.s
+++ b/asm/vliw_tests/immediate.s
@@ -1,5 +1,5 @@
 # Test case for add instruction
-	.word	52;
+	.word	64;
 	addi	r0 = r0, 0;
 	addi	r1 = r0, 2		||	addi	r1 = r0, 5;
 	subi 	r2 = r1, 2		||	addi 	r3 = r1, 347;
@@ -7,3 +7,6 @@
 	nor 	r6 = r1, r4		||	and 	r7 = r1, r5;
 	add 	r7 = r0, 1234567;
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/vliw_tests/predicate_forwarding.s b/asm/vliw_tests/predicate_forwarding.s
index 81be1fc5..37e450e7 100644
--- a/asm/vliw_tests/predicate_forwarding.s
+++ b/asm/vliw_tests/predicate_forwarding.s
@@ -1,5 +1,5 @@
 # Test case for predicate forwarding.
-	.word	84;
+	.word	96;
 	addi	r0 = r0, 0;
 	addi	r1 = r0, 5		||	addi	r2 = r0, 10;
 	por		p1 = p0, p0 	||	pxor	p2 = p0, p0;
@@ -12,3 +12,6 @@
 	pxor	p3 = p5, p6 	||	por	p4 = p5, p6;
 
 	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/wr_ispm.s b/asm/wr_ispm.s
index 493d3ce4..6a4d3b8d 100644
--- a/asm/wr_ispm.s
+++ b/asm/wr_ispm.s
@@ -1,7 +1,7 @@
 #
 # Test write of instruction scratchpad (ISPM)
 #
-# ISPM is currently mapped to 0x10000000
+# ISPM is currently mapped to 0x10000
 #
 # Author: Martin Schoeberl (martin@jopdesign.com)
 #
@@ -12,9 +12,9 @@
 #140123
 #160abc
 
-	.word   56;
+	.word   68;
 	addi	r0 = r0, 0;
-	add	r1 = r0, 0x10000000;
+	add	r1 = r0, 0x10000;
 	add	r2 = r0, 0x140123;
 	swl	[r1 + 0] = r2;
 	add	r2 = r0, 0x160abc;
@@ -22,4 +22,7 @@
 	add	r2 = r0, 0xabcd1234;
 	swl	[r1 + 2] = r2;
 
-	halt; 
+	halt;
+	nop;
+	nop;
+	nop;
diff --git a/asm/york_loader.s b/asm/york_loader.s
index 8098a8a6..9b8f8900 100644
--- a/asm/york_loader.s
+++ b/asm/york_loader.s
@@ -1,7 +1,7 @@
 # This is a very simple loader. It just jumps to 0x8000, where
 # the actual code is located
     .word 40;
-top: add r30 = r0, 0x0;
+top: addi r30 = r0, 0x0;
     add r16 = r0, 0x40004;
     nop;
     nop;
@@ -9,5 +9,3 @@ top: add r30 = r0, 0x0;
     nop;
     nop;
     nop;
-    nop;
-    nop;
\ No newline at end of file
diff --git a/testsuite/run.sh b/testsuite/run.sh
index ebd8c8f2..a69914d4 100755
--- a/testsuite/run.sh
+++ b/testsuite/run.sh
@@ -19,7 +19,7 @@ for td in ${test_dirs}; do
 done
 cd ..
 
-tests="basic simple test ldst load_store_stackcache ALU ALUi ALUl dual_forwarding scratchpad dual_even_odd_address forward_issue load_store_data_cache load_store_scratchpad load_store_scratchpad_new load_store_scratchpad_new2 predication fetch_double  branch predicate predicated_predicate call callr mulpipe"
+tests="basic minimal simple ALU ALUi ALUl compare dual_forwarding fetch_double dual_even_odd_address forward_issue ldst load_use load_store_stackcache spill load_store_data_cache load_store_scratchpad load_store_scratchpad_new load_store_scratchpad_new2 scratchpad predication branch predicate predicated_predicate pred_issue call callr mulpipe mfsmts test test_asm test_case_plan test_mfs test_mts"
 tests+=${test_disc}
 
 not_working_chsl="none"
-- 
GitLab