Commit 9d881764 authored by Mathias Agopian

fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers

When the ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color components.
The reason is that this mode is intended for premultiplied alpha blending; however, if it is used with a
non-premultiplied source, the color components can wrap.

Unfortunately, this costs 6 extra cycles per pixel, however... "correctness" prevails.

This should not impact the UI, since it uses h/w acceleration most of the time, and it also doesn't
impact games, which should be using h/w GL. This change will slow the emulator down a bit.
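
The fix is easiest to see in the per-channel arithmetic. Below is a minimal C model of what the "pixel" macro computes for one 5-bit channel under ONE / ONE_MINUS_SRC_ALPHA (the name blend_red5 is illustrative and not part of the pixelflinger sources):

#include <stdint.h>

/*
 * Illustrative model of the per-channel math in the "pixel" macro:
 * blend an 8-bit source red channel into a 5-bit destination red channel
 * using ONE / ONE_MINUS_SRC_ALPHA.
 */
static uint32_t blend_red5(uint32_t dst5, uint32_t src8, uint32_t srcA)
{
    /* Same destination factor the assembly computes:
     * 0x100 - (sA + (sA >> 7)), roughly 256 * (1 - alpha). */
    uint32_t f = 0x100 - (srcA + (srcA >> 7));
    uint32_t sum = (src8 >> 3) + ((dst5 * f) >> 8);

    /* With a premultiplied source the sum never exceeds 0x1F; with a
     * non-premultiplied source it can, and the stray bit then corrupts a
     * neighboring field once the result is shifted into the 565 word.
     * The patch adds exactly this saturation (cmp + orrhs/orrlo). */
    return sum > 0x1F ? 0x1F : sum;
}

For example, dst5 = 31, src8 = 0xFF, srcA = 0x80 gives f = 127 and an unclamped sum of 31 + 15 = 46, which no longer fits in 5 bits; the clamp keeps it at 31.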
parent 7f5b1a2b
@@ -21,53 +21,80 @@
         .global scanline_t32cb16blend_arm
 
-        // uses r6, r7, lr
-        .macro pixel,   DREG, SRC, FB, OFFSET
+/*
+ * .macro pixel
+ *
+ * \DREG is a 32-bit register containing *two* original destination RGB565
+ * pixels, with the even one in the low-16 bits, and the odd one in the
+ * high 16 bits.
+ *
+ * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
+ *
+ * \FB is a target register that will contain the blended pixel values.
+ *
+ * \ODD is either 0 or 1 and indicates if we're blending the lower or
+ * upper 16-bit pixels in DREG into FB
+ *
+ *
+ * clobbered: r6, r7, lr
+ *
+ */
+        .macro pixel,   DREG, SRC, FB, ODD
 
-        // SRC = AARRGGBB
+        // SRC = 0xAABBGGRR
         mov     r7, \SRC, lsr #24       // sA
         add     r7, r7, r7, lsr #7      // sA + (sA >> 7)
         rsb     r7, r7, #0x100          // sA = 0x100 - (sA+(sA>>7))
 1:
 
-        .if \OFFSET
+        .if \ODD
 
         // red
-        mov     lr, \DREG, lsr #(\OFFSET + 6 + 5)
+        mov     lr, \DREG, lsr #(16 + 11)
         smulbb  lr, r7, lr
         mov     r6, \SRC, lsr #3
         and     r6, r6, #0x1F
         add     lr, r6, lr, lsr #8
-        orr     \FB, lr, lsl #(\OFFSET + 11)
+        cmp     lr, #0x1F
+        orrhs   \FB, \FB, #(0x1F<<(16 + 11))
+        orrlo   \FB, \FB, lr, lsl #(16 + 11)
 
         // green
-        and     r6, \DREG, #(0x3F<<(\OFFSET + 5))
+        and     r6, \DREG, #(0x3F<<(16 + 5))
         smulbt  r6, r7, r6
         mov     lr, \SRC, lsr #(8+2)
         and     lr, lr, #0x3F
         add     r6, lr, r6, lsr #(5+8)
-        orr     \FB, \FB, r6, lsl #(\OFFSET + 5)
+        cmp     r6, #0x3F
+        orrhs   \FB, \FB, #(0x3F<<(16 + 5))
+        orrlo   \FB, \FB, r6, lsl #(16 + 5)
 
         // blue
-        and     lr, \DREG, #(0x1F << \OFFSET)
+        and     lr, \DREG, #(0x1F << 16)
         smulbt  lr, r7, lr
         mov     r6, \SRC, lsr #(8+8+3)
         and     r6, r6, #0x1F
         add     lr, r6, lr, lsr #8
-        orr     \FB, \FB, lr, lsl #\OFFSET
+        cmp     lr, #0x1F
+        orrhs   \FB, \FB, #(0x1F << 16)
+        orrlo   \FB, \FB, lr, lsl #16
 
         .else
 
         // red
-        mov     lr, \DREG, lsr #(6+5)
+        mov     lr, \DREG, lsr #11
         and     lr, lr, #0x1F
         smulbb  lr, r7, lr
         mov     r6, \SRC, lsr #3
         and     r6, r6, #0x1F
         add     lr, r6, lr, lsr #8
-        mov     \FB, lr, lsl #11
+        cmp     lr, #0x1F
+        movhs   \FB, #(0x1F<<11)
+        movlo   \FB, lr, lsl #11
 
         // green
         and     r6, \DREG, #(0x3F<<5)
@@ -75,7 +102,9 @@
         mov     lr, \SRC, lsr #(8+2)
         and     lr, lr, #0x3F
         add     r6, lr, r6, lsr #(5+8)
-        orr     \FB, \FB, r6, lsl #5
+        cmp     r6, #0x3F
+        orrhs   \FB, \FB, #(0x3F<<5)
+        orrlo   \FB, \FB, r6, lsl #5
 
         // blue
         and     lr, \DREG, #0x1F
@@ -83,7 +112,9 @@
         mov     r6, \SRC, lsr #(8+8+3)
         and     r6, r6, #0x1F
         add     lr, r6, lr, lsr #8
-        orr     \FB, \FB, lr
+        cmp     lr, #0x1F
+        orrhs   \FB, \FB, #0x1F
+        orrlo   \FB, \FB, lr
 
         .endif
@@ -128,7 +159,7 @@ aligned:
         subs    r2, r2, #2
         blo     9f
 
-        // The main loop is unrolled twice and process 4 pixels
+        // The main loop is unrolled twice and processes 4 pixels
 8:      ldmia   r1!, {r4, r5}
         // stream the source
         pld     [r1, #32]
@@ -142,7 +173,7 @@ aligned:
         // stream the destination
         pld     [r0, #32]
         pixel   r3, r4, r12, 0
-        pixel   r3, r5, r12, 16
+        pixel   r3, r5, r12, 1
         // effectively, we're getting write-combining by virtue of the
         // cpu's write-back cache.
         str     r12, [r0, #-4]
...
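
As a cross-check against the diff above, here is a hypothetical C equivalent of one invocation of the revised macro for the even pixel (ODD = 0); the odd path is the same math with the fields shifted up by 16 inside the packed two-pixel word. The function blend_over_565 is illustrative only and does not exist in the pixelflinger sources:

#include <stdint.h>

/*
 * Model of the revised "pixel" macro for a single RGB565 destination pixel:
 * blend a 0xAABBGGRR source over it with ONE / ONE_MINUS_SRC_ALPHA,
 * saturating each channel as the new cmp + orrhs/orrlo (and movhs/movlo)
 * sequences do.
 */
static uint16_t blend_over_565(uint16_t dst, uint32_t src)
{
    uint32_t a = src >> 24;               /* sA */
    uint32_t f = 0x100 - (a + (a >> 7));  /* ~256 * (1 - alpha) */

    uint32_t r = ((src >> 3)  & 0x1F) + ((((dst >> 11) & 0x1F) * f) >> 8);
    uint32_t g = ((src >> 10) & 0x3F) + ((((dst >> 5)  & 0x3F) * f) >> 8);
    uint32_t b = ((src >> 19) & 0x1F) + (((dst & 0x1F) * f) >> 8);

    if (r > 0x1F) r = 0x1F;               /* red saturation */
    if (g > 0x3F) g = 0x3F;               /* green saturation */
    if (b > 0x1F) b = 0x1F;               /* blue saturation */

    return (uint16_t)((r << 11) | (g << 5) | b);
}

Without the three clamps this would pack wrapped channel values, which is exactly the artifact described in the commit message.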