Skip to content

Commit cf88bdb

Browse files
committed
Fix dynamic dispatching for x86
1 parent 5388e65 commit cf88bdb

File tree

1 file changed

+88
-34
lines changed

1 file changed

+88
-34
lines changed

src/ffts_real.c

+88-34
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
44
55
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
66
Copyright (c) 2012, The University of Waikato
7-
Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
7+
Copyright (c) 2015 - 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
88
99
All rights reserved.
1010
@@ -417,8 +417,9 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
417417
out[N + 1] = 0.0f;
418418
}
419419

420+
#ifdef __ARM_NEON__
420421
static void
421-
ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
422+
ffts_execute_1d_real_inv_neon(ffts_plan_t *p, const void *input, void *output)
422423
{
423424
float *const FFTS_RESTRICT in =
424425
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
@@ -429,18 +430,14 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
429430
const float *const FFTS_RESTRICT B =
430431
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
431432
const int N = (const int) p->N;
432-
int i;
433-
434-
#ifdef __ARM_NEON__
435433
float *p_buf0 = in;
436434
float *p_buf1 = in + N - 2;
437435
float *p_out = buf;
438-
#endif
436+
int i;
439437

440438
/* we know this */
441439
FFTS_ASSUME(N/2 > 0);
442440

443-
#ifdef __ARM_NEON__
444441
for (i = 0; i < N/2; i += 2) {
445442
__asm__ __volatile__ (
446443
"vld1.32 {q8}, [%[pa]]!\n\t"
@@ -482,7 +479,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
482479
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
483480
);
484481
}
485-
#elif HAVE_SSE3
482+
483+
p->plans[0]->transform(p->plans[0], buf, output);
484+
}
485+
#endif
486+
487+
#if HAVE_SSE3
488+
static void
489+
ffts_execute_1d_real_inv_sse3(ffts_plan_t *p, const void *input, void *output)
490+
{
491+
float *const FFTS_RESTRICT in =
492+
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
493+
float *const FFTS_RESTRICT buf =
494+
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
495+
const float *const FFTS_RESTRICT A =
496+
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
497+
const float *const FFTS_RESTRICT B =
498+
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
499+
const int N = (const int) p->N;
500+
int i;
501+
502+
/* we know this */
503+
FFTS_ASSUME(N/2 > 0);
504+
486505
if (FFTS_UNLIKELY(N <= 8)) {
487506
__m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
488507
__m128 t1 = _mm_load_ps(in);
@@ -567,7 +586,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
567586
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
568587
}
569588
}
570-
#elif HAVE_SSE
589+
590+
p->plans[0]->transform(p->plans[0], buf, output);
591+
}
592+
#endif
593+
594+
#if HAVE_SSE
595+
static void
596+
ffts_execute_1d_real_inv_sse(ffts_plan_t *p, const void *input, void *output)
597+
{
598+
float *const FFTS_RESTRICT in =
599+
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
600+
float *const FFTS_RESTRICT buf =
601+
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
602+
const float *const FFTS_RESTRICT A =
603+
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
604+
const float *const FFTS_RESTRICT B =
605+
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
606+
const int N = (const int) p->N;
607+
int i;
608+
609+
/* we know this */
610+
FFTS_ASSUME(N/2 > 0);
611+
571612
if (FFTS_UNLIKELY(N <= 8)) {
572613
__m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
573614
__m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
@@ -660,7 +701,28 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
660701
_mm_xor_ps(t4, c0))));
661702
}
662703
}
663-
#else
704+
705+
p->plans[0]->transform(p->plans[0], buf, output);
706+
}
707+
#endif
708+
709+
static void
710+
ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
711+
{
712+
float *const FFTS_RESTRICT in =
713+
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
714+
float *const FFTS_RESTRICT buf =
715+
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
716+
const float *const FFTS_RESTRICT A =
717+
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
718+
const float *const FFTS_RESTRICT B =
719+
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
720+
const int N = (const int) p->N;
721+
int i;
722+
723+
/* we know this */
724+
FFTS_ASSUME(N/2 > 0);
725+
664726
for (i = 0; i < N/2; i++) {
665727
buf[2*i + 0] =
666728
in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] +
@@ -669,45 +731,42 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
669731
in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] -
670732
in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
671733
}
672-
#endif
673734

674735
p->plans[0]->transform(p->plans[0], buf, output);
675736
}
676737

677738
FFTS_API ffts_plan_t*
678739
ffts_init_1d_real(size_t N, int sign)
679740
{
741+
#ifndef __ARM_NEON__
742+
int cpu_flags = ffts_cpu_detect(NULL);
743+
#endif
680744
ffts_plan_t *p;
745+
int invert = 0;
681746

682747
p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans));
683748
if (!p) {
684749
return NULL;
685750
}
686751

687-
if (sign < 0) {
688752
#ifdef __ARM_NEON__
689-
p->transform = &ffts_execute_1d_real_neon;
753+
p->transform = (sign < 0) ? &ffts_execute_1d_real_neon : &ffts_execute_1d_real_inv;
690754
#else
691-
int cpu_flags = ffts_cpu_detect(NULL);
692-
693755
#ifdef HAVE_SSE3
694-
if (cpu_flags & FFTS_CPU_X86_SSE3) {
695-
p->transform = &ffts_execute_1d_real_sse3;
696-
} else
756+
if (cpu_flags & FFTS_CPU_X86_SSE3) {
757+
p->transform = (sign < 0) ? &ffts_execute_1d_real_sse3 : &ffts_execute_1d_real_inv_sse3;
758+
invert = 1;
759+
} else
697760
#endif
698-
699761
#ifdef HAVE_SSE
700-
if (cpu_flags & FFTS_CPU_X86_SSE) {
701-
p->transform = &ffts_execute_1d_real_sse;
702-
} else
762+
if (cpu_flags & FFTS_CPU_X86_SSE) {
763+
p->transform = (sign < 0) ? &ffts_execute_1d_real_sse : &ffts_execute_1d_real_inv_sse;
764+
} else
703765
#endif
704-
{
705-
p->transform = &ffts_execute_1d_real;
706-
}
707-
#endif
708-
} else {
709-
p->transform = &ffts_execute_1d_real_inv;
766+
{
767+
p->transform = (sign < 0) ? &ffts_execute_1d_real : &ffts_execute_1d_real_inv;
710768
}
769+
#endif
711770

712771
p->destroy = &ffts_free_1d_real;
713772
p->N = N;
@@ -734,12 +793,7 @@ ffts_init_1d_real(size_t N, int sign)
734793
goto cleanup;
735794
}
736795

737-
#ifdef HAVE_SSE3
738-
ffts_generate_table_1d_real_32f(p, sign, 1);
739-
#else
740-
ffts_generate_table_1d_real_32f(p, sign, 0);
741-
#endif
742-
796+
ffts_generate_table_1d_real_32f(p, sign, invert);
743797
return p;
744798

745799
cleanup:

0 commit comments

Comments
 (0)