1
+ /*
2
+
3
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
4
+
5
+ Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
6
+
7
+ All rights reserved.
8
+
9
+ Redistribution and use in source and binary forms, with or without
10
+ modification, are permitted provided that the following conditions are met:
11
+ * Redistributions of source code must retain the above copyright
12
+ notice, this list of conditions and the following disclaimer.
13
+ * Redistributions in binary form must reproduce the above copyright
14
+ notice, this list of conditions and the following disclaimer in the
15
+ documentation and/or other materials provided with the distribution.
16
+ * Neither the name of the organization nor the
17
+ names of its contributors may be used to endorse or promote products
18
+ derived from this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
21
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
24
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+
31
+ */
32
+
33
+ #include "ffts_cpu.h"
34
+
35
+ #if defined(FFTS_BUILDING_CPU_TEST )
36
+ #include <stdio.h>
37
+ #endif
38
+
39
+ #if defined(_WIN32 )
40
+ #include <intrin.h>
41
+ #include <windows.h>
42
+ #endif
43
+
44
/* TODO: add detection/declaration of these to CMake phase */

/* 64 bit x86: define FFTS_CPU_X64 unless the build already supplied it */
#if !defined(FFTS_CPU_X64) && \
    (defined(_M_AMD64) || defined(__amd64) || defined(__amd64__) || \
     defined(_M_X64) || defined(__x86_64) || defined(__x86_64__))
#define FFTS_CPU_X64
#endif

/* 32 bit x86: only considered when the target was not identified as 64 bit */
#if !defined(FFTS_CPU_X64) && !defined(FFTS_CPU_X86) && \
    (defined(i386) || defined(__i386) || defined(__i386__) || \
     defined(_M_IX86) || defined(__X86__) || defined(_X86_))
#define FFTS_CPU_X86
#endif
58
+
59
+ /* check if build is 32 bit or 64 bit x86 */
60
+ #if defined(FFTS_CPU_X64 ) || defined(FFTS_CPU_X86 )
61
+
62
+ /* Built and tested on
63
+ CentOS 6.8 2.6.32-642.11.1.el6.x86_64 - gcc version 4.4.7 20120313
64
+ Mac OSX 10.9 - Apple Clang 6.0
65
+ Ubuntu 14.04 LTS 4.2.0-42 x86_64 - gcc version 4.8.4
66
+ Windows XP SP3 - Visual Studio 2005 SP1 x86/x64
67
+ Windows Vista SP2 - Visual Studio 2010 SP1 x86/x64
68
+ Windows 7 Ultimate SP1 - Visual Studio 2015 x86/x64
69
+ Windows 7 Ultimate SP1 - gcc version 4.9.2 (i686-posix-dwarf-rev1)
70
+ Windows 7 Ultimate SP1 - gcc version 4.9.2 (x86_64-posix-seh-rev3)
71
+ Windows 10 Pro - Visual Studio 2017 x86/x64
72
+ */
73
+
74
/* Visual Studio 2010 SP1 or newer have _xgetbv intrinsic */
#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219)
#define FFTS_HAVE_XGETBV
#endif

/* BIT(n): unsigned mask with only bit n set; the argument is parenthesized
   so that expressions such as BIT(a | b) expand as intended */
#ifndef BIT
#define BIT(n) (1u << (n))
#endif

/* bit masks */

/* tested against EDX of cpuid leaf 1 */
#define FFTS_CPU_X86_SSE_BITS     (BIT(0) | BIT(15) | BIT(23) | BIT(24) | BIT(25))
#define FFTS_CPU_X86_SSE2_BITS    (BIT(26))

/* tested against ECX of cpuid leaf 1 */
#define FFTS_CPU_X86_SSE3_BITS    (BIT(0))
#define FFTS_CPU_X86_SSSE3_BITS   (BIT(9))
#define FFTS_CPU_X86_SSE4_1_BITS  (BIT(19))
#define FFTS_CPU_X86_SSE4_2_BITS  (BIT(20) | BIT(23))
#define FFTS_CPU_X86_AVX_BITS     (BIT(26) | BIT(27) | BIT(28))

/* XCR0 state bits required for AVX: 128-bit SSE state (bit 1) and 256-bit
   AVX state (bit 2); equals the 0x6 mask applied to XCR0 during detection */
#define FFTS_CPU_X86_XCR0_BITS    (BIT(1) | BIT(2))

/* tested against EBX of cpuid leaf 7, subleaf 0 */
#define FFTS_CPU_X86_AVX2_BITS    (BIT(5))
#define FFTS_CPU_X86_AVX512_BITS  (BIT(16))
94
+
95
/* Visual Studio 2008 or older, 64 bit builds only */
#if defined(FFTS_CPU_X64) && defined(_MSC_VER)
#if _MSC_VER <= 1500
#pragma optimize("", off)
/*
 * Substitute for the __cpuidex intrinsic.
 *
 * The x64 calling convention passes the first four arguments in RCX, RDX,
 * R8 and R9. subleaf is deliberately the first parameter, and optimization
 * is switched off around this function, so that the plain __cpuid call
 * below ends up behaving like __cpuidex. Do not reorder the parameters or
 * re-enable optimization here.
 *
 * regs receives the four result registers of cpuid for the given leaf.
 */
static void __fastcall ffts_cpuidex(int subleaf, int regs[4], int leaf)
{
    /* referenced only to keep the parameter from being reported as unused */
    (void) subleaf;
    __cpuid(regs, leaf);
}
#pragma optimize("", on)
#endif
#endif
109
+
110
+ static FFTS_INLINE void ffts_cpuid (int regs [4 ], int leaf , int subleaf )
111
+ {
112
+ #if defined(_MSC_VER )
113
+ #if defined(FFTS_CPU_X64 )
114
+ /* Visual Studio 2010 or newer */
115
+ #if _MSC_VER > 1500
116
+ __cpuidex (regs , leaf , subleaf );
117
+ #else
118
+ ffts_cpuidex (subleaf , regs , leaf );
119
+ #endif
120
+ #else
121
+ __asm {
122
+ mov eax , leaf
123
+ mov ecx , subleaf
124
+ mov esi , regs
125
+ cpuid
126
+ mov [esi + 0x0 ], eax
127
+ mov [esi + 0x4 ], ebx
128
+ mov [esi + 0x8 ], ecx
129
+ mov [esi + 0xc ], edx
130
+ }
131
+ #endif
132
+ #elif defined(__GNUC__ ) && __GNUC__
133
+ #if defined(FFTS_CPU_X64 )
134
+ __asm__ __volatile__(
135
+ "cpuid\n\t"
136
+ : "=a" (regs [0 ]), "=b" (regs [1 ]), "=c" (regs [2 ]), "=d" (regs [3 ])
137
+ : "a" (leaf ), "c" (subleaf ));
138
+ #elif defined(__PIC__)
139
+ __asm__ __volatile__(
140
+ "xchgl %%ebx, %1\n\t"
141
+ "cpuid \n\t"
142
+ "xchgl %%ebx, %1\n\t"
143
+ : "=a" (regs [0 ]), "=r" (regs [1 ]), "=c" (regs [2 ]), "=d" (regs [3 ])
144
+ : "a" (leaf ), "c" (subleaf ));
145
+ #else
146
+ __asm__ __volatile__(
147
+ "cpuid\n\t"
148
+ : "=a" (regs [0 ]), "=b" (regs [1 ]), "=c" (regs [2 ]), "=d" (regs [3 ])
149
+ : "a" (leaf ), "c" (subleaf ));
150
+ #endif
151
+ #else
152
+ /* unknown compiler for x86 */
153
+ regs [0 ] = regs [1 ] = regs [2 ] = regs [3 ] = 0 ;
154
+ #endif
155
+ }
156
+
157
+ /* at least Visual Studio 2010 generates invalidate optimized _xgetbv */
158
+ #if defined(FFTS_HAVE_XGETBV )
159
+ #pragma optimize("", off)
160
+ #endif
161
+ static FFTS_INLINE unsigned int ffts_get_xcr0 (void )
162
+ {
163
+ #if defined(FFTS_HAVE_XGETBV )
164
+ return (unsigned int ) _xgetbv (0 );
165
+ #elif defined(_MSC_VER )
166
+ #if defined(FFTS_CPU_X64 )
167
+ /* emulate xgetbv(0) on Windows 7 SP1 or newer */
168
+ typedef DWORD64 (WINAPI * PGETENABLEDXSTATEFEATURES )(VOID );
169
+ PGETENABLEDXSTATEFEATURES pfnGetEnabledXStateFeatures =
170
+ (PGETENABLEDXSTATEFEATURES ) GetProcAddress (
171
+ GetModuleHandle (TEXT ("kernel32.dll" )), "GetEnabledXStateFeatures" );
172
+ return pfnGetEnabledXStateFeatures ? (unsigned int ) pfnGetEnabledXStateFeatures () : 0 ;
173
+ #else
174
+ /* note that we have to touch edx register to tell compiler it's used by emited xgetbv */
175
+ unsigned __int32 hi , lo ;
176
+ __asm {
177
+ xor ecx , ecx
178
+ _emit 0x0f
179
+ _emit 0x01
180
+ _emit 0xd0
181
+ mov lo , eax
182
+ mov hi , edx
183
+ }
184
+ return (unsigned int ) lo ;
185
+ #endif
186
+ #elif defined(__GNUC__) && __GNUC__
187
+ unsigned int lo ;
188
+ __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0\n"
189
+ : "=a" (lo )
190
+ : "c" (0 )
191
+ : "edx" );
192
+ return lo ;
193
+ #else
194
+ /* unknown x86 compiler */
195
+ return 0 ;
196
+ #endif
197
+ }
198
+ #if defined(FFTS_HAVE_XGETBV )
199
+ #pragma optimize("", on)
200
+ #endif
201
+
202
+ int
203
+ ffts_cpu_detect (int * extra_flags )
204
+ {
205
+ static int cpu_flags = -1 ;
206
+ static int cpu_extra_flags = -1 ;
207
+ int max_basic_func ;
208
+ int regs [4 ];
209
+ unsigned int xcr0 ;
210
+
211
+ if (cpu_flags >= 0 ) {
212
+ goto exit ;
213
+ }
214
+
215
+ /* initialize */
216
+ cpu_flags = cpu_extra_flags = 0 ;
217
+
218
+ #if defined(FFTS_BUILDING_CPU_TEST )
219
+ printf ("cpuid check: " );
220
+ #endif
221
+ #if defined(FFTS_CPU_X64 )
222
+ /* cpuid is always supported on x64 */
223
+ #if defined(FFTS_BUILDING_CPU_TEST )
224
+ printf ("skipped\n" );
225
+ #endif
226
+ #else
227
+ #if defined(_MSC_VER )
228
+ _asm {
229
+ pushfd
230
+ pop eax
231
+ mov ebx ,eax
232
+ xor eax ,200000 h
233
+ push eax
234
+ popfd
235
+ pushfd
236
+ pop eax
237
+ push ebx
238
+ popfd
239
+ mov regs [0 * TYPE regs ],eax
240
+ mov regs [1 * TYPE regs ],ebx
241
+ }
242
+ #else
243
+ __asm__ (
244
+ "pushfl\n\t"
245
+ "pop %0\n\t"
246
+ "movl %0,%1\n\t"
247
+ "xorl $0x200000,%0\n\t"
248
+ "pushl %0\n\t"
249
+ "popfl\n\t"
250
+ "pushfl\n\t"
251
+ "popl %0\n\t"
252
+ "pushl %1\n\t"
253
+ "popfl\n\t"
254
+ : "=r" (regs [0 ]), "=r" (regs [1 ])
255
+ );
256
+ #endif
257
+ /* check CPUID bit (bit 21) in EFLAGS register can be toggled */
258
+ if (((regs [0 ] ^ regs [1 ]) & 0x200000 ) == 0 ) {
259
+ #if defined(FFTS_BUILDING_CPU_TEST )
260
+ printf ("not supported\n" );
261
+ #endif
262
+ goto exit ;
263
+ }
264
+ #if defined(FFTS_BUILDING_CPU_TEST )
265
+ printf ("supported\n" );
266
+ #endif
267
+ #endif
268
+
269
+ /* get the number of basic functions */
270
+ ffts_cpuid (regs , 0 , 0 );
271
+ max_basic_func = regs [0 ];
272
+ #if defined(FFTS_BUILDING_CPU_TEST )
273
+ printf ("cpuid eax=0, ecx=0: %d\n" , max_basic_func );
274
+ #endif
275
+ if (max_basic_func == 0 )
276
+ goto exit ;
277
+
278
+ /* get feature flags */
279
+ ffts_cpuid (regs , 1 , 0 );
280
+
281
+ #if defined(FFTS_BUILDING_CPU_TEST )
282
+ printf ("cpuid eax=1, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n" , regs [0 ], regs [1 ], regs [2 ], regs [3 ]);
283
+ #endif
284
+
285
+ #if defined(FFTS_CPU_X64 )
286
+ /* minimum for any x64 */
287
+ cpu_flags = FFTS_CPU_X86_SSE | FFTS_CPU_X86_SSE2 ;
288
+ #else
289
+ /* test if SSE is supported */
290
+ if ((regs [3 ] & FFTS_CPU_X86_SSE_BITS ) != FFTS_CPU_X86_SSE_BITS )
291
+ goto exit ;
292
+ cpu_flags = FFTS_CPU_X86_SSE ;
293
+
294
+ /* test if SSE2 is supported */
295
+ if (!(regs [3 ] & FFTS_CPU_X86_SSE2_BITS ))
296
+ goto exit ;
297
+ cpu_flags |= FFTS_CPU_X86_SSE2 ;
298
+ #endif
299
+
300
+ /* test if SSE3 is supported */
301
+ if (!(regs [2 ] & FFTS_CPU_X86_SSE3_BITS ))
302
+ goto exit ;
303
+ cpu_flags |= FFTS_CPU_X86_SSE3 ;
304
+
305
+ /* test if SSSE3 is supported */
306
+ if (!(regs [2 ] & FFTS_CPU_X86_SSSE3_BITS ))
307
+ goto exit ;
308
+ cpu_flags |= FFTS_CPU_X86_SSSE3 ;
309
+
310
+ /* test if SSE4.1 is supported */
311
+ if (!(regs [2 ] & FFTS_CPU_X86_SSE4_1_BITS ))
312
+ goto exit ;
313
+ cpu_flags |= FFTS_CPU_X86_SSE4_1 ;
314
+
315
+ /* test if SSE4.2 is supported */
316
+ if ((regs [2 ] & FFTS_CPU_X86_SSE4_2_BITS ) != FFTS_CPU_X86_SSE4_2_BITS )
317
+ goto exit ;
318
+ cpu_flags |= FFTS_CPU_X86_SSE4_2 ;
319
+
320
+ /* test if AVX is supported */
321
+ if ((regs [2 ] & FFTS_CPU_X86_AVX_BITS ) != FFTS_CPU_X86_AVX_BITS )
322
+ goto exit ;
323
+
324
+ /* test if legaxy x87, 128-bit SSE and 256-bit AVX states are enabled in XCR0 */
325
+ xcr0 = ffts_get_xcr0 ();
326
+ #if defined(FFTS_BUILDING_CPU_TEST )
327
+ printf ("xcr0: %u\n" , xcr0 );
328
+ #endif
329
+ if ((xcr0 & 0x6 ) != 0x6 )
330
+ goto exit ;
331
+
332
+ cpu_flags |= FFTS_CPU_X86_AVX ;
333
+
334
+ /* check that cpuid extended features exist */
335
+ if (max_basic_func < 7 )
336
+ goto exit ;
337
+
338
+ /* get extended features */
339
+ ffts_cpuid (regs , 7 , 0 );
340
+
341
+ #if defined(FFTS_BUILDING_CPU_TEST )
342
+ printf ("cpuid eax=7, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n" , regs [0 ], regs [1 ], regs [2 ], regs [3 ]);
343
+ #endif
344
+
345
+ /* test if AVX2 is supported */
346
+ if ((regs [1 ] & FFTS_CPU_X86_AVX2_BITS ) != FFTS_CPU_X86_AVX2_BITS )
347
+ goto exit ;
348
+ cpu_flags |= FFTS_CPU_X86_AVX2 ;
349
+
350
+ /* test if AVX512 is supported */
351
+ if ((regs [1 ] & FFTS_CPU_X86_AVX512_BITS ) != FFTS_CPU_X86_AVX512_BITS )
352
+ goto exit ;
353
+ cpu_flags |= FFTS_CPU_X86_AVX512 ;
354
+
355
+ exit :
356
+ if (extra_flags ) {
357
+ * extra_flags = cpu_extra_flags ;
358
+ }
359
+ return cpu_flags ;
360
+ }
361
+ #else
362
/*
 * Fallback used when the build target is not x86: no detection is
 * performed.
 *
 * Always returns 0 (no features). extra_flags may be NULL; otherwise it is
 * set to 0 so that callers never read an uninitialized value, matching
 * what the x86 implementation reports.
 */
int
ffts_cpu_detect(int *extra_flags)
{
    /* not implemented */
#if defined(FFTS_BUILDING_CPU_TEST)
    printf("CPU detection not implemented!!\n");
#endif
    if (extra_flags) {
        *extra_flags = 0;
    }
    return 0;
}
371
+ #endif
0 commit comments