@@ -4,7 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
4
4
5
5
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
6
6
Copyright (c) 2012, The University of Waikato
7
- Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
7
+ Copyright (c) 2015 - 2018 , Jukka Ojanen <jukka.ojanen@kolumbus.fi>
8
8
9
9
All rights reserved.
10
10
@@ -417,8 +417,9 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
417
417
out [N + 1 ] = 0.0f ;
418
418
}
419
419
420
+ #ifdef __ARM_NEON__
420
421
static void
421
- ffts_execute_1d_real_inv (ffts_plan_t * p , const void * input , void * output )
422
+ ffts_execute_1d_real_inv_neon (ffts_plan_t * p , const void * input , void * output )
422
423
{
423
424
float * const FFTS_RESTRICT in =
424
425
(float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_16 (input );
@@ -429,18 +430,14 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
429
430
const float * const FFTS_RESTRICT B =
430
431
(const float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_32 (p -> B );
431
432
const int N = (const int ) p -> N ;
432
- int i ;
433
-
434
- #ifdef __ARM_NEON__
435
433
float * p_buf0 = in ;
436
434
float * p_buf1 = in + N - 2 ;
437
435
float * p_out = buf ;
438
- #endif
436
+ int i ;
439
437
440
438
/* we know this */
441
439
FFTS_ASSUME (N /2 > 0 );
442
440
443
- #ifdef __ARM_NEON__
444
441
for (i = 0 ; i < N /2 ; i += 2 ) {
445
442
__asm__ __volatile__ (
446
443
"vld1.32 {q8}, [%[pa]]!\n\t"
@@ -482,7 +479,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
482
479
: "memory" , "q8" , "q9" , "q10" , "q11" , "q12" , "q13" , "q14" , "q15"
483
480
);
484
481
}
485
- #elif HAVE_SSE3
482
+
483
+ p -> plans [0 ]-> transform (p -> plans [0 ], buf , output );
484
+ }
485
+ #endif
486
+
487
+ #if HAVE_SSE3
488
+ static void
489
+ ffts_execute_1d_real_inv_sse3 (ffts_plan_t * p , const void * input , void * output )
490
+ {
491
+ float * const FFTS_RESTRICT in =
492
+ (float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_16 (input );
493
+ float * const FFTS_RESTRICT buf =
494
+ (float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_32 (p -> buf );
495
+ const float * const FFTS_RESTRICT A =
496
+ (const float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_32 (p -> A );
497
+ const float * const FFTS_RESTRICT B =
498
+ (const float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_32 (p -> B );
499
+ const int N = (const int ) p -> N ;
500
+ int i ;
501
+
502
+ /* we know this */
503
+ FFTS_ASSUME (N /2 > 0 );
504
+
486
505
if (FFTS_UNLIKELY (N <= 8 )) {
487
506
__m128 t0 = _mm_loadl_pi (_mm_setzero_ps (), (const __m64 * ) & in [N ]);
488
507
__m128 t1 = _mm_load_ps (in );
@@ -567,7 +586,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
567
586
_mm_mul_ps (_mm_shuffle_ps (t2 , t0 , _MM_SHUFFLE (2 ,2 ,0 ,0 )), t4 ))));
568
587
}
569
588
}
570
- #elif HAVE_SSE
589
+
590
+ p -> plans [0 ]-> transform (p -> plans [0 ], buf , output );
591
+ }
592
+ #endif
593
+
594
+ #if HAVE_SSE
595
+ static void
596
+ ffts_execute_1d_real_inv_sse (ffts_plan_t * p , const void * input , void * output )
597
+ {
598
+ float * const FFTS_RESTRICT in =
599
+ (float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_16 (input );
600
+ float * const FFTS_RESTRICT buf =
601
+ (float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_32 (p -> buf );
602
+ const float * const FFTS_RESTRICT A =
603
+ (const float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_32 (p -> A );
604
+ const float * const FFTS_RESTRICT B =
605
+ (const float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_32 (p -> B );
606
+ const int N = (const int ) p -> N ;
607
+ int i ;
608
+
609
+ /* we know this */
610
+ FFTS_ASSUME (N /2 > 0 );
611
+
571
612
if (FFTS_UNLIKELY (N <= 8 )) {
572
613
__m128 c0 = _mm_load_ps ((const float * ) sign_mask_odd );
573
614
__m128 t0 = _mm_loadl_pi (_mm_setzero_ps (), (const __m64 * ) & in [N ]);
@@ -660,7 +701,28 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
660
701
_mm_xor_ps (t4 , c0 ))));
661
702
}
662
703
}
663
- #else
704
+
705
+ p -> plans [0 ]-> transform (p -> plans [0 ], buf , output );
706
+ }
707
+ #endif
708
+
709
+ static void
710
+ ffts_execute_1d_real_inv (ffts_plan_t * p , const void * input , void * output )
711
+ {
712
+ float * const FFTS_RESTRICT in =
713
+ (float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_16 (input );
714
+ float * const FFTS_RESTRICT buf =
715
+ (float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_32 (p -> buf );
716
+ const float * const FFTS_RESTRICT A =
717
+ (const float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_32 (p -> A );
718
+ const float * const FFTS_RESTRICT B =
719
+ (const float * const FFTS_RESTRICT ) FFTS_ASSUME_ALIGNED_32 (p -> B );
720
+ const int N = (const int ) p -> N ;
721
+ int i ;
722
+
723
+ /* we know this */
724
+ FFTS_ASSUME (N /2 > 0 );
725
+
664
726
for (i = 0 ; i < N /2 ; i ++ ) {
665
727
buf [2 * i + 0 ] =
666
728
in [ 2 * i + 0 ] * A [2 * i + 0 ] + in [ 2 * i + 1 ] * A [2 * i + 1 ] +
@@ -669,45 +731,42 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
669
731
in [ 2 * i + 1 ] * A [2 * i + 0 ] - in [ 2 * i + 0 ] * A [2 * i + 1 ] -
670
732
in [N - 2 * i + 0 ] * B [2 * i + 1 ] - in [N - 2 * i + 1 ] * B [2 * i + 0 ];
671
733
}
672
- #endif
673
734
674
735
p -> plans [0 ]-> transform (p -> plans [0 ], buf , output );
675
736
}
676
737
677
738
FFTS_API ffts_plan_t *
678
739
ffts_init_1d_real (size_t N , int sign )
679
740
{
741
+ #ifndef __ARM_NEON__
742
+ int cpu_flags = ffts_cpu_detect (NULL );
743
+ #endif
680
744
ffts_plan_t * p ;
745
+ int invert = 0 ;
681
746
682
747
p = (ffts_plan_t * ) calloc (1 , sizeof (* p ) + sizeof (* p -> plans ));
683
748
if (!p ) {
684
749
return NULL ;
685
750
}
686
751
687
- if (sign < 0 ) {
688
752
#ifdef __ARM_NEON__
689
- p -> transform = & ffts_execute_1d_real_neon ;
753
+ p -> transform = ( sign < 0 ) ? & ffts_execute_1d_real_neon : & ffts_execute_1d_real_inv ;
690
754
#else
691
- int cpu_flags = ffts_cpu_detect (NULL );
692
-
693
755
#ifdef HAVE_SSE3
694
- if (cpu_flags & FFTS_CPU_X86_SSE3 ) {
695
- p -> transform = & ffts_execute_1d_real_sse3 ;
696
- } else
756
+ if (cpu_flags & FFTS_CPU_X86_SSE3 ) {
757
+ p -> transform = (sign < 0 ) ? & ffts_execute_1d_real_sse3 : & ffts_execute_1d_real_inv_sse3 ;
758
+ invert = 1 ;
759
+ } else
697
760
#endif
698
-
699
761
#ifdef HAVE_SSE
700
- if (cpu_flags & FFTS_CPU_X86_SSE ) {
701
- p -> transform = & ffts_execute_1d_real_sse ;
702
- } else
762
+ if (cpu_flags & FFTS_CPU_X86_SSE ) {
763
+ p -> transform = ( sign < 0 ) ? & ffts_execute_1d_real_sse : & ffts_execute_1d_real_inv_sse ;
764
+ } else
703
765
#endif
704
- {
705
- p -> transform = & ffts_execute_1d_real ;
706
- }
707
- #endif
708
- } else {
709
- p -> transform = & ffts_execute_1d_real_inv ;
766
+ {
767
+ p -> transform = (sign < 0 ) ? & ffts_execute_1d_real : & ffts_execute_1d_real_inv ;
710
768
}
769
+ #endif
711
770
712
771
p -> destroy = & ffts_free_1d_real ;
713
772
p -> N = N ;
@@ -734,12 +793,7 @@ ffts_init_1d_real(size_t N, int sign)
734
793
goto cleanup ;
735
794
}
736
795
737
- #ifdef HAVE_SSE3
738
- ffts_generate_table_1d_real_32f (p , sign , 1 );
739
- #else
740
- ffts_generate_table_1d_real_32f (p , sign , 0 );
741
- #endif
742
-
796
+ ffts_generate_table_1d_real_32f (p , sign , invert );
743
797
return p ;
744
798
745
799
cleanup :
0 commit comments