audio: Remove 5.1->X SIMD converters, add SSE mono->stereo.

The 5.1 versions didn't use the new algorithm, and making that new algorithm work took so many permutes that it was significantly slower than just using the scalar versions. However, mono-to-stereo is an extremely common conversion, and it's trivial to accelerate it with plain SSE, so that was added!
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
diff --git a/src/audio/SDL_audiocvt.c b/src/audio/SDL_audiocvt.c
index 586f8a4..85faa4b 100644
--- a/src/audio/SDL_audiocvt.c
+++ b/src/audio/SDL_audiocvt.c
@@ -93,68 +93,34 @@
* 8 channels (7.1) layout: FL+FR+FC+LFE+BL+BR+SL+SR
*/
-
-#if 0 /* !!! FIXME: these need to be updated to match the new scalar code. */
-#if HAVE_AVX_INTRINSICS
-/* MSVC will always accept AVX intrinsics when compiling for x64 */
-#if defined(__clang__) || defined(__GNUC__)
-__attribute__((target("avx")))
-#endif
-/* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
+#if HAVE_SSE3_INTRINSICS
+/* Convert from stereo to mono. Average left and right. */
static void SDLCALL
-SDL_Convert51ToStereo_AVX(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
{
+ const __m128 divby2 = _mm_set1_ps(0.5f);
float *dst = (float *) cvt->buf;
const float *src = dst;
- int i = cvt->len_cvt / (sizeof (float) * 6);
- const float two_fifths_f = 1.0f / 2.5f;
- const __m256 two_fifths_v = _mm256_set1_ps(two_fifths_f);
- const __m256 half = _mm256_set1_ps(0.5f);
+ int i = cvt->len_cvt / 8;
- LOG_DEBUG_CONVERT("5.1", "stereo (using AVX)");
+ LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
SDL_assert(format == AUDIO_F32SYS);
- /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
- while (i >= 4) {
- __m256 in0 = _mm256_loadu_ps(src + 0); /* 0FL 0FR 0FC 0LF 0BL 0BR 1FL 1FR */
- __m256 in1 = _mm256_loadu_ps(src + 8); /* 1FC 1LF 1BL 1BR 2FL 2FR 2FC 2LF */
- __m256 in2 = _mm256_loadu_ps(src + 16); /* 2BL 2BR 3FL 3FR 3FC 3LF 3BL 3BR */
-
- /* 0FL 0FR 0FC 0LF 2FL 2FR 2FC 2LF */
- __m256 temp0 = _mm256_blend_ps(in0, in1, 0xF0);
- /* 1FC 1LF 1BL 1BR 3FC 3LF 3BL 3BR */
- __m256 temp1 = _mm256_blend_ps(in1, in2, 0xF0);
-
- /* 0FC 0FC 1FC 1FC 2FC 2FC 3FC 3FC */
- __m256 fc_distributed = _mm256_mul_ps(half, _mm256_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 2, 2)));
-
- /* 0FL 0FR 1BL 1BR 2FL 2FR 3BL 3BR */
- __m256 permuted0 = _mm256_blend_ps(temp0, temp1, 0xCC);
- /* 0BL 0BR 1FL 1FR 2BL 2BR 3FL 3FR */
- __m256 permuted1 = _mm256_permute2f128_ps(in0, in2, 0x21);
-
- /* 0FL 0FR 1BL 1BR 2FL 2FR 3BL 3BR */
- /* + 0BL 0BR 1FL 1FR 2BL 2BR 3FL 3FR */
- /* = 0L 0R 1L 1R 2L 2R 3L 3R */
- __m256 out = _mm256_add_ps(permuted0, permuted1);
- out = _mm256_add_ps(out, fc_distributed);
- out = _mm256_mul_ps(out, two_fifths_v);
-
- _mm256_storeu_ps(dst, out);
-
- i -= 4; src += 24; dst += 8;
+ /* Do SSE blocks as long as we have 16 bytes available.
+ Just use unaligned load/stores, if the memory at runtime is
+ aligned it'll be just as fast on modern processors */
+ while (i >= 4) { /* 4 * float32 */
+ _mm_storeu_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_loadu_ps(src), _mm_loadu_ps(src+4)), divby2));
+ i -= 4; src += 8; dst += 4;
}
-
/* Finish off any leftovers with scalar operations. */
while (i) {
- const float front_center_distributed = src[2] * 0.5f;
- dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f; /* left */
- dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f; /* right */
- i--; src += 6; dst+=2;
+ *dst = (src[0] + src[1]) * 0.5f;
+ dst++; i--; src += 2;
}
- cvt->len_cvt /= 3;
+ cvt->len_cvt /= 2;
if (cvt->filters[++cvt->filter_index]) {
cvt->filters[cvt->filter_index] (cvt, format);
}
@@ -162,155 +128,38 @@ SDL_Convert51ToStereo_AVX(SDL_AudioCVT * cvt, SDL_AudioFormat format)
#endif
#if HAVE_SSE_INTRINSICS
-/* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
+/* Convert from mono to stereo. Duplicate to stereo left and right. */
static void SDLCALL
-SDL_Convert51ToStereo_SSE(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT * cvt, SDL_AudioFormat format)
{
- float *dst = (float *) cvt->buf;
- const float *src = dst;
- int i = cvt->len_cvt / (sizeof (float) * 6);
- const float two_fifths_f = 1.0f / 2.5f;
- const __m128 two_fifths_v = _mm_set1_ps(two_fifths_f);
- const __m128 half = _mm_set1_ps(0.5f);
+ float *dst = ((float *) (cvt->buf + (cvt->len_cvt * 2))) - 8;
+ const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 4;
+ int i = cvt->len_cvt / sizeof (float);
- LOG_DEBUG_CONVERT("5.1", "stereo (using SSE)");
- SDL_assert(format == AUDIO_F32SYS);
-
- /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
- /* Just use unaligned load/stores, if the memory at runtime is */
- /* aligned it'll be just as fast on modern processors */
- while (i >= 2) {
- /* Two 5.1 samples (12 floats) fit nicely in three 128bit */
- /* registers. Using shuffles they can be rearranged so that */
- /* the conversion math can be vectorized. */
- __m128 in0 = _mm_loadu_ps(src); /* 0FL 0FR 0FC 0LF */
- __m128 in1 = _mm_loadu_ps(src + 4); /* 0BL 0BR 1FL 1FR */
- __m128 in2 = _mm_loadu_ps(src + 8); /* 1FC 1LF 1BL 1BR */
-
- /* 0FC 0FC 1FC 1FC */
- __m128 fc_distributed = _mm_mul_ps(half, _mm_shuffle_ps(in0, in2, _MM_SHUFFLE(0, 0, 2, 2)));
-
- /* 0FL 0FR 1BL 1BR */
- __m128 blended = _mm_shuffle_ps(in0, in2, _MM_SHUFFLE(3, 2, 1, 0));
-
- /* 0FL 0FR 1BL 1BR */
- /* + 0BL 0BR 1FL 1FR */
- /* = 0L 0R 1L 1R */
- __m128 out = _mm_add_ps(blended, in1);
- out = _mm_add_ps(out, fc_distributed);
- out = _mm_mul_ps(out, two_fifths_v);
-
- _mm_storeu_ps(dst, out);
-
- i -= 2; src += 12; dst += 4;
- }
-
-
- /* Finish off any leftovers with scalar operations. */
- while (i) {
- const float front_center_distributed = src[2] * 0.5f;
- dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f; /* left */
- dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f; /* right */
- i--; src += 6; dst+=2;
- }
-
- cvt->len_cvt /= 3;
- if (cvt->filters[++cvt->filter_index]) {
- cvt->filters[cvt->filter_index] (cvt, format);
- }
-}
-#endif
-
-#if HAVE_NEON_INTRINSICS
-/* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
-static void SDLCALL
-SDL_Convert51ToStereo_NEON(SDL_AudioCVT * cvt, SDL_AudioFormat format)
-{
- float *dst = (float *) cvt->buf;
- const float *src = dst;
- int i = cvt->len_cvt / (sizeof (float) * 6);
- const float two_fifths_f = 1.0f / 2.5f;
- const float32x4_t two_fifths_v = vdupq_n_f32(two_fifths_f);
- const float32x4_t half = vdupq_n_f32(0.5f);
-
- LOG_DEBUG_CONVERT("5.1", "stereo (using NEON)");
- SDL_assert(format == AUDIO_F32SYS);
-
- /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
-
- /* Just use unaligned load/stores, it's the same NEON instructions and
- hopefully even unaligned NEON is faster than the scalar fallback. */
- while (i >= 2) {
- /* Two 5.1 samples (12 floats) fit nicely in three 128bit */
- /* registers. Using shuffles they can be rearranged so that */
- /* the conversion math can be vectorized. */
- const float32x4_t in0 = vld1q_f32(src); /* 0FL 0FR 0FC 0LF */
- const float32x4_t in1 = vld1q_f32(src + 4); /* 0BL 0BR 1FL 1FR */
- const float32x4_t in2 = vld1q_f32(src + 8); /* 1FC 1LF 1BL 1BR */
-
- /* 0FC 0FC 1FC 1FC */
- const float32x4_t fc_distributed = vmulq_f32(half, vcombine_f32(vdup_lane_f32(vget_high_f32(in0), 0), vdup_lane_f32(vget_low_f32(in2), 0)));
-
- /* 0FL 0FR 1BL 1BR */
- const float32x4_t blended = vcombine_f32(vget_low_f32(in0), vget_high_f32(in2));
-
- /* 0FL 0FR 1BL 1BR */
- /* + 0BL 0BR 1FL 1FR */
- /* = 0L 0R 1L 1R */
- float32x4_t out = vaddq_f32(blended, in1);
- out = vaddq_f32(out, fc_distributed);
- out = vmulq_f32(out, two_fifths_v);
-
- vst1q_f32(dst, out);
-
- i -= 2; src += 12; dst += 4;
- }
-
- /* Finish off any leftovers with scalar operations. */
- while (i) {
- const float front_center_distributed = src[2] * 0.5f;
- dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f; /* left */
- dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f; /* right */
- i--; src += 6; dst+=2;
- }
-
- cvt->len_cvt /= 3;
- if (cvt->filters[++cvt->filter_index]) {
- cvt->filters[cvt->filter_index] (cvt, format);
- }
-}
-#endif
-#endif
-
-
-#if HAVE_SSE3_INTRINSICS
-/* Convert from stereo to mono. Average left and right. */
-static void SDLCALL
-SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
-{
- const __m128 divby2 = _mm_set1_ps(0.5f);
- float *dst = (float *) cvt->buf;
- const float *src = dst;
- int i = cvt->len_cvt / 8;
-
- LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
+ LOG_DEBUG_CONVERT("mono", "stereo (using SSE)");
SDL_assert(format == AUDIO_F32SYS);
/* Do SSE blocks as long as we have 16 bytes available.
Just use unaligned load/stores, if the memory at runtime is
aligned it'll be just as fast on modern processors */
+ /* convert backwards, since output is growing in-place. */
while (i >= 4) { /* 4 * float32 */
- _mm_storeu_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_loadu_ps(src), _mm_loadu_ps(src+4)), divby2));
- i -= 4; src += 8; dst += 4;
+ const __m128 input = _mm_loadu_ps(src); /* A B C D */
+ _mm_storeu_ps(dst, _mm_unpacklo_ps(input, input)); /* A A B B */
+ _mm_storeu_ps(dst+4, _mm_unpackhi_ps(input, input)); /* C C D D */
+ i -= 4; src -= 4; dst -= 8;
}
/* Finish off any leftovers with scalar operations. */
- while (i) {
- *dst = (src[0] + src[1]) * 0.5f;
- dst++; i--; src += 2;
+ src += 3; dst += 6; /* adjust for smaller buffers. */
+ while (i) { /* convert backwards, since output is growing in-place. */
+ const float srcFC = src[0];
+ dst[1] /* FR */ = srcFC;
+ dst[0] /* FL */ = srcFC;
+ i--; src--; dst -= 2;
}
- cvt->len_cvt /= 2;
+ cvt->len_cvt *= 2;
if (cvt->filters[++cvt->filter_index]) {
cvt->filters[cvt->filter_index] (cvt, format);
}
@@ -833,24 +682,16 @@ SDL_BuildAudioCVT(SDL_AudioCVT * cvt,
return SDL_SetError("Invalid channel combination");
} else if (channel_converter != NULL) {
/* swap in some SIMD versions for a few of these. */
- if (channel_converter == SDL_Convert51ToStereo) {
+ if (channel_converter == SDL_ConvertStereoToMono) {
SDL_AudioFilter filter = NULL;
-#if 0 /* !!! FIXME: these have not been updated for the new formulas */
- #if HAVE_AVX_INTRINSICS
- if (!filter && SDL_HasAVX()) { filter = SDL_Convert51ToStereo_AVX; }
- #endif
- #if HAVE_SSE_INTRINSICS
- if (!filter && SDL_HasSSE()) { filter = SDL_Convert51ToStereo_SSE; }
- #endif
- #if HAVE_NEON_INTRINSICS
- if (!filter && SDL_HasNEON()) { filter = SDL_Convert51ToStereo_NEON; }
+ #if HAVE_SSE3_INTRINSICS
+ if (!filter && SDL_HasSSE3()) { filter = SDL_ConvertStereoToMono_SSE3; }
#endif
-#endif
if (filter) { channel_converter = filter; }
- } else if (channel_converter == SDL_ConvertStereoToMono) {
+ } else if (channel_converter == SDL_ConvertMonoToStereo) {
SDL_AudioFilter filter = NULL;
- #if HAVE_SSE3_INTRINSICS
- if (!filter && SDL_HasSSE3()) { filter = SDL_ConvertStereoToMono_SSE3; }
+ #if HAVE_SSE_INTRINSICS
+ if (!filter && SDL_HasSSE()) { filter = SDL_ConvertMonoToStereo_SSE; }
#endif
if (filter) { channel_converter = filter; }
}