From 4c48643ad39e7d7b31ab71c3e7b347145a73df53 Mon Sep 17 00:00:00 2001 From: Jonathan G Rennison Date: Thu, 29 Sep 2022 00:46:31 +0100 Subject: [PATCH] SSE blitters: Correctly set alpha of output in AlphaBlendTwoPixels Match alpha behaviour of ComposeColourRGBA See: https://github.com/OpenTTD/OpenTTD/pull/10060 --- src/blitter/32bpp_anim_sse4.cpp | 9 +++++---- src/blitter/32bpp_sse_func.hpp | 28 ++++++++++++++++++---------- src/blitter/32bpp_sse_type.h | 1 + 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/blitter/32bpp_anim_sse4.cpp b/src/blitter/32bpp_anim_sse4.cpp index 5089dce69f..695fe35e61 100644 --- a/src/blitter/32bpp_anim_sse4.cpp +++ b/src/blitter/32bpp_anim_sse4.cpp @@ -53,6 +53,7 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL const __m128i a_cm = ALPHA_CONTROL_MASK; const __m128i pack_low_cm = PACK_LOW_CONTROL_MASK; const __m128i tr_nom_base = TRANSPARENT_NOM_BASE; + const __m128i a_am = ALPHA_AND_MASK; for (int y = bp->height; y != 0; y--) { Colour *dst = dst_line; @@ -144,7 +145,7 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL /* Blend colours. */ bmno_alpha_blend: - srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm); + srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm, a_am); bmno_full_opacity: _mm_storel_epi64((__m128i *) dst, srcABCD); bmno_full_transparency: @@ -171,7 +172,7 @@ bmno_full_transparency: } else { srcABCD = _mm_cvtsi32_si128(src->data); } - dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm)); + dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm, a_am)); } } break; @@ -255,7 +256,7 @@ bmno_full_transparency: /* Blend colours. */ bmcr_alpha_blend: - srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm); + srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm, a_am); bmcr_full_opacity: _mm_storel_epi64((__m128i *) dst, srcABCD); bmcr_full_transparency: @@ -288,7 +289,7 @@ bmcr_full_transparency: if (src->a < 255) { bmcr_alpha_blend_single: __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm); + srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm, a_am); } dst->data = _mm_cvtsi128_si32(srcABCD); } diff --git a/src/blitter/32bpp_sse_func.hpp b/src/blitter/32bpp_sse_func.hpp index 4d04297e33..4771725dfc 100644 --- a/src/blitter/32bpp_sse_func.hpp +++ b/src/blitter/32bpp_sse_func.hpp @@ -65,27 +65,33 @@ static inline __m128i DistributeAlpha(const __m128i from, const __m128i &mask) { #if (SSE_VERSION == 2) __m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F); // PSHUFLW, put alpha1 in front of each rgb1 - return _mm_shufflehi_epi16(alphaAB, 0x3F); // PSHUFHW, put alpha2 in front of each rgb2 + alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F); // PSHUFHW, put alpha2 in front of each rgb2 + alphaAB = _mm_or_si128(alphaAB, mask); // POR, set alpha fields to all 1 + return _mm_xor_si128(alphaAB, mask); // PXOR, set alpha fields to 0 #else return _mm_shuffle_epi8(from, mask); #endif } GNU_TARGET(SSE_TARGET) -static inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &pack_mask) +static inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &pack_mask, const __m128i &alpha_mask) { __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128()); // PUNPCKLBW, expand each uint8 into uint16 __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); - __m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); // PCMPGTW, if (alpha > 0) a++; - alphaAB = _mm_srli_epi16(alphaAB, 15); - alphaAB = _mm_add_epi16(alphaAB, srcAB); + __m128i alphaMaskAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); // PCMPGTW (alpha > 0) ? 0xFFFF : 0 + __m128i alphaAB = _mm_srli_epi16(alphaMaskAB, 15); + alphaAB = _mm_add_epi16(alphaAB, srcAB); // if (alpha > 0) a++; alphaAB = DistributeAlpha(alphaAB, distribution_mask); srcAB = _mm_sub_epi16(srcAB, dstAB); // PSUBW, (r - Cr) srcAB = _mm_mullo_epi16(srcAB, alphaAB); // PMULLW, a*(r - Cr) srcAB = _mm_srli_epi16(srcAB, 8); // PSRLW, a*(r - Cr)/256 srcAB = _mm_add_epi16(srcAB, dstAB); // PADDW, a*(r - Cr)/256 + Cr + + alphaMaskAB = _mm_and_si128(alphaMaskAB, alpha_mask); // PAND, set non alpha fields to 0 + srcAB = _mm_or_si128(srcAB, alphaMaskAB); // POR, set alpha fields to 0xFFFF is src alpha was > 0 + return PackUnsaturated(srcAB, pack_mask); } @@ -227,9 +233,11 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel const MapValue *src_mv = src_mv_line; /* Load these variables into register before loop. */ + const __m128i alpha_and = ALPHA_AND_MASK; + #define ALPHA_BLEND_PARAM_3 alpha_and #if (SSE_VERSION == 2) const __m128i clear_hi = CLEAR_HIGH_BYTE_MASK; - #define ALPHA_BLEND_PARAM_1 clear_hi + #define ALPHA_BLEND_PARAM_1 alpha_and #define ALPHA_BLEND_PARAM_2 clear_hi #define DARKEN_PARAM_1 tr_nom_base #define DARKEN_PARAM_2 tr_nom_base @@ -275,7 +283,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel for (uint x = (uint) effective_width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2)); + _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3)); src += 2; dst += 2; } @@ -283,7 +291,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2)); + dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3)); } break; @@ -328,7 +336,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel } /* Blend colours. */ - _mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2)); + _mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3)); dst += 2; src += 2; src_mv += 2; @@ -357,7 +365,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel if (src->a < 255) { bmcr_alpha_blend_single: __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2); + srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3); } dst->data = _mm_cvtsi128_si32(srcABCD); } diff --git a/src/blitter/32bpp_sse_type.h b/src/blitter/32bpp_sse_type.h index 33c76d9b4d..fe91b294be 100644 --- a/src/blitter/32bpp_sse_type.h +++ b/src/blitter/32bpp_sse_type.h @@ -51,6 +51,7 @@ typedef union ALIGN(16) um128i { #define OVERBRIGHT_VALUE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0) #define OVERBRIGHT_CONTROL_MASK _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 7, 7, 2, 3, 2, 3, 2, 3, 7, 7) #define TRANSPARENT_NOM_BASE _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256) +#define ALPHA_AND_MASK _mm_setr_epi16( 0, 0, 0, -1, 0, 0, 0, -1) #endif /* WITH_SSE */ #endif /* BLITTER_32BPP_SSE_TYPE_H */