Skip to content

Commit 373c0bd

Browse files
committed
Merge commit 'cf88bdb4788163d4267ea8dcc2eedc0ee989b47d'
2 parents bbe0b33 + cf88bdb commit 373c0bd

File tree

6 files changed

+745
-39
lines changed

6 files changed

+745
-39
lines changed

ffts/CMakeLists.txt

+10
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,8 @@ set(FFTS_SOURCES
414414
src/ffts.c
415415
src/ffts_chirp_z.c
416416
src/ffts_chirp_z.h
417+
src/ffts_cpu.c
418+
src/ffts_cpu.h
417419
src/ffts_internal.h
418420
src/ffts_nd.c
419421
src/ffts_nd.h
@@ -530,6 +532,14 @@ if(ENABLE_STATIC OR ENABLE_SHARED)
530532
ffts
531533
${FFTS_EXTRA_LIBRARIES}
532534
)
535+
536+
add_executable(ffts_cpu_test
537+
src/ffts_cpu.c
538+
src/ffts_cpu.h
539+
tests/cpu_test.c
540+
)
541+
542+
set_target_properties(ffts_cpu_test PROPERTIES COMPILE_DEFINITIONS FFTS_BUILDING_CPU_TEST)
533543
endif(ENABLE_STATIC OR ENABLE_SHARED)
534544

535545
# generate packageconfig file

ffts/src/ffts_cpu.c

+371
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,371 @@
1+
/*
2+
3+
This file is part of FFTS -- The Fastest Fourier Transform in the South
4+
5+
Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
6+
7+
All rights reserved.
8+
9+
Redistribution and use in source and binary forms, with or without
10+
modification, are permitted provided that the following conditions are met:
11+
* Redistributions of source code must retain the above copyright
12+
notice, this list of conditions and the following disclaimer.
13+
* Redistributions in binary form must reproduce the above copyright
14+
notice, this list of conditions and the following disclaimer in the
15+
documentation and/or other materials provided with the distribution.
16+
* Neither the name of the organization nor the
17+
names of its contributors may be used to endorse or promote products
18+
derived from this software without specific prior written permission.
19+
20+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
21+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23+
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
24+
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27+
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30+
31+
*/
32+
33+
#include "ffts_cpu.h"
34+
35+
#if defined(FFTS_BUILDING_CPU_TEST)
36+
#include <stdio.h>
37+
#endif
38+
39+
#if defined(_WIN32)
40+
#include <intrin.h>
41+
#include <windows.h>
42+
#endif
43+
44+
/* TODO: add detection/declaration of these to CMake phase */
45+
#if !defined(FFTS_CPU_X64)
46+
#if defined(_M_AMD64) || defined(__amd64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64) || defined(__x86_64__)
47+
/* 64 bit x86 detected */
48+
#define FFTS_CPU_X64
49+
#endif
50+
#endif
51+
52+
#if !defined(FFTS_CPU_X64) && !defined(FFTS_CPU_X86)
53+
#if defined(i386) || defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__X86__) || defined(_X86_)
54+
/* 32 bit x86 detected */
55+
#define FFTS_CPU_X86
56+
#endif
57+
#endif
58+
59+
/* check if build is 32 bit or 64 bit x86 */
60+
#if defined(FFTS_CPU_X64) || defined(FFTS_CPU_X86)
61+
62+
/* Built and tested on
63+
CentOS 6.8 2.6.32-642.11.1.el6.x86_64 - gcc version 4.4.7 20120313
64+
Mac OSX 10.9 - Apple Clang 6.0
65+
Ubuntu 14.04 LTS 4.2.0-42 x86_64 - gcc version 4.8.4
66+
Windows XP SP3 - Visual Studio 2005 SP1 x86/x64
67+
Windows Vista SP2 - Visual Studio 2010 SP1 x86/x64
68+
Windows 7 Ultimate SP1 - Visual Studio 2015 x86/x64
69+
Windows 7 Ultimate SP1 - gcc version 4.9.2 (i686-posix-dwarf-rev1)
70+
Windows 7 Ultimate SP1 - gcc version 4.9.2 (x86_64-posix-seh-rev3)
71+
Windows 10 Pro - Visual Studio 2017 x86/x64
72+
*/
73+
74+
/* Visual Studio 2010 SP1 or newer have _xgetbv intrinsic */
75+
#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219)
76+
#define FFTS_HAVE_XGETBV
77+
#endif
78+
79+
#ifndef BIT
/* Single-bit mask. The argument is parenthesized so that an expression
   argument such as BIT(a | b) is shifted as a whole. */
#define BIT(n) (1u << (n))
#endif

/* bit masks */

/* tested against EDX of CPUID leaf 1: FPU, CMOV, MMX, FXSR and SSE */
#define FFTS_CPU_X86_SSE_BITS    (BIT(0) | BIT(15) | BIT(23) | BIT(24) | BIT(25))
/* tested against EDX of CPUID leaf 1: SSE2 */
#define FFTS_CPU_X86_SSE2_BITS   (BIT(26))
/* tested against ECX of CPUID leaf 1: SSE3 */
#define FFTS_CPU_X86_SSE3_BITS   (BIT(0))
/* tested against ECX of CPUID leaf 1: SSSE3 */
#define FFTS_CPU_X86_SSSE3_BITS  (BIT(9))
/* tested against ECX of CPUID leaf 1: SSE4.1 */
#define FFTS_CPU_X86_SSE4_1_BITS (BIT(19))
/* tested against ECX of CPUID leaf 1: SSE4.2 and POPCNT */
#define FFTS_CPU_X86_SSE4_2_BITS (BIT(20) | BIT(23))
/* tested against ECX of CPUID leaf 1: XSAVE, OSXSAVE and AVX */
#define FFTS_CPU_X86_AVX_BITS    (BIT(26) | BIT(27) | BIT(28))
/* XCR0 state bits the OS must enable before AVX may be used: SSE state
   (bit 1) and AVX state (bit 2); equals the 0x6 mask that
   ffts_cpu_detect tests against XCR0 */
#define FFTS_CPU_X86_XCR0_BITS   (BIT(1) | BIT(2))
/* tested against EBX of CPUID leaf 7, sub-leaf 0: AVX2 */
#define FFTS_CPU_X86_AVX2_BITS   (BIT(5))
/* tested against EBX of CPUID leaf 7, sub-leaf 0: AVX-512 Foundation */
#define FFTS_CPU_X86_AVX512_BITS (BIT(16))
94+
95+
/* Visual Studio 2008 or older */
96+
#if defined(FFTS_CPU_X64) && defined(_MSC_VER) && _MSC_VER <= 1500
97+
#pragma optimize("", off)
98+
static void __fastcall ffts_cpuidex(int subleaf, int regs[4], int leaf)
{
    /* Stand-in for the __cpuidex intrinsic on Visual Studio 2008 or older.
       The x64 calling convention passes the first four arguments in RCX,
       RDX, R8 and R9, so making subleaf the first parameter places it in
       RCX. This function is compiled with optimization disabled so that
       the plain __cpuid intrinsic below runs with that value still in
       place, which yields the behaviour of __cpuidex. Do not reorder the
       parameters. */
    (void) subleaf;
    __cpuid(regs, leaf);
}
107+
#pragma optimize("", on)
108+
#endif
109+
110+
/* Execute the cpuid instruction for the given leaf (EAX input) and
   sub-leaf (ECX input) and store the resulting EAX, EBX, ECX and EDX
   values in regs[0..3]. On a compiler that is neither MSVC nor
   GCC-compatible all four entries are set to zero. */
static FFTS_INLINE void ffts_cpuid(int regs[4], int leaf, int subleaf)
{
#if defined(_MSC_VER) && defined(FFTS_CPU_X64) && _MSC_VER > 1500
    /* Visual Studio 2010 or newer, x64 */
    __cpuidex(regs, leaf, subleaf);
#elif defined(_MSC_VER) && defined(FFTS_CPU_X64)
    /* Visual Studio 2008 or older, x64: no __cpuidex intrinsic */
    ffts_cpuidex(subleaf, regs, leaf);
#elif defined(_MSC_VER)
    /* 32 bit MSVC */
    __asm {
        mov eax, leaf
        mov ecx, subleaf
        mov esi, regs
        cpuid
        mov [esi + 0x0], eax
        mov [esi + 0x4], ebx
        mov [esi + 0x8], ecx
        mov [esi + 0xc], edx
    }
#elif defined(__GNUC__) && __GNUC__ && !defined(FFTS_CPU_X64) && defined(__PIC__)
    /* 32 bit position independent code: ebx is swapped out around
       cpuid and its result is returned through a scratch register */
    __asm__ __volatile__(
        "xchgl %%ebx, %1\n\t"
        "cpuid \n\t"
        "xchgl %%ebx, %1\n\t"
        : "=a"(regs[0]), "=r"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
        : "a"(leaf), "c"(subleaf));
#elif defined(__GNUC__) && __GNUC__
    /* x64, or 32 bit without PIC */
    __asm__ __volatile__(
        "cpuid\n\t"
        : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
        : "a"(leaf), "c"(subleaf));
#else
    /* unknown compiler for x86 */
    regs[0] = 0;
    regs[1] = 0;
    regs[2] = 0;
    regs[3] = 0;
#endif
}
156+
157+
/* at least Visual Studio 2010 generates invalid code for _xgetbv when optimizing */
158+
#if defined(FFTS_HAVE_XGETBV)
159+
#pragma optimize("", off)
160+
#endif
161+
/* Return the low 32 bits of extended control register XCR0, or 0 when no
   way of reading it is available for the compiler in use. */
static FFTS_INLINE unsigned int ffts_get_xcr0(void)
{
#if defined(FFTS_HAVE_XGETBV)
    return (unsigned int) _xgetbv(0);
#elif defined(_MSC_VER) && defined(FFTS_CPU_X64)
    /* emulate xgetbv(0) on Windows 7 SP1 or newer */
    typedef DWORD64 (WINAPI *PGETENABLEDXSTATEFEATURES)(VOID);
    PGETENABLEDXSTATEFEATURES get_enabled_features =
        (PGETENABLEDXSTATEFEATURES) GetProcAddress(
            GetModuleHandle(TEXT("kernel32.dll")), "GetEnabledXStateFeatures");

    /* the entry point was not found in kernel32.dll */
    if (!get_enabled_features) {
        return 0;
    }
    return (unsigned int) get_enabled_features();
#elif defined(_MSC_VER)
    /* note that the edx register has to be touched to tell the compiler
       it is used by the emitted xgetbv */
    unsigned __int32 hi, lo;
    __asm {
        xor ecx, ecx
        _emit 0x0f
        _emit 0x01
        _emit 0xd0
        mov lo, eax
        mov hi, edx
    }
    return (unsigned int) lo;
#elif defined(__GNUC__) && __GNUC__
    /* xgetbv emitted as raw opcode bytes; ecx = 0 selects XCR0 */
    unsigned int xcr0_low;
    __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0\n"
                         : "=a"(xcr0_low)
                         : "c"(0)
                         : "edx");
    return xcr0_low;
#else
    /* unknown x86 compiler */
    return 0;
#endif
}
198+
#if defined(FFTS_HAVE_XGETBV)
199+
#pragma optimize("", on)
200+
#endif
201+
202+
int
203+
ffts_cpu_detect(int *extra_flags)
204+
{
205+
static int cpu_flags = -1;
206+
static int cpu_extra_flags = -1;
207+
int max_basic_func;
208+
int regs[4];
209+
unsigned int xcr0;
210+
211+
if (cpu_flags >= 0) {
212+
goto exit;
213+
}
214+
215+
/* initialize */
216+
cpu_flags = cpu_extra_flags = 0;
217+
218+
#if defined(FFTS_BUILDING_CPU_TEST)
219+
printf("cpuid check: ");
220+
#endif
221+
#if defined(FFTS_CPU_X64)
222+
/* cpuid is always supported on x64 */
223+
#if defined(FFTS_BUILDING_CPU_TEST)
224+
printf("skipped\n");
225+
#endif
226+
#else
227+
#if defined(_MSC_VER)
228+
_asm {
229+
pushfd
230+
pop eax
231+
mov ebx,eax
232+
xor eax,200000h
233+
push eax
234+
popfd
235+
pushfd
236+
pop eax
237+
push ebx
238+
popfd
239+
mov regs[0 * TYPE regs],eax
240+
mov regs[1 * TYPE regs],ebx
241+
}
242+
#else
243+
__asm__ (
244+
"pushfl\n\t"
245+
"pop %0\n\t"
246+
"movl %0,%1\n\t"
247+
"xorl $0x200000,%0\n\t"
248+
"pushl %0\n\t"
249+
"popfl\n\t"
250+
"pushfl\n\t"
251+
"popl %0\n\t"
252+
"pushl %1\n\t"
253+
"popfl\n\t"
254+
: "=r" (regs[0]), "=r" (regs[1])
255+
);
256+
#endif
257+
/* check CPUID bit (bit 21) in EFLAGS register can be toggled */
258+
if (((regs[0] ^ regs[1]) & 0x200000) == 0) {
259+
#if defined(FFTS_BUILDING_CPU_TEST)
260+
printf("not supported\n");
261+
#endif
262+
goto exit;
263+
}
264+
#if defined(FFTS_BUILDING_CPU_TEST)
265+
printf("supported\n");
266+
#endif
267+
#endif
268+
269+
/* get the number of basic functions */
270+
ffts_cpuid(regs, 0, 0);
271+
max_basic_func = regs[0];
272+
#if defined(FFTS_BUILDING_CPU_TEST)
273+
printf("cpuid eax=0, ecx=0: %d\n", max_basic_func);
274+
#endif
275+
if (max_basic_func == 0)
276+
goto exit;
277+
278+
/* get feature flags */
279+
ffts_cpuid(regs, 1, 0);
280+
281+
#if defined(FFTS_BUILDING_CPU_TEST)
282+
printf("cpuid eax=1, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
283+
#endif
284+
285+
#if defined(FFTS_CPU_X64)
286+
/* minimum for any x64 */
287+
cpu_flags = FFTS_CPU_X86_SSE | FFTS_CPU_X86_SSE2;
288+
#else
289+
/* test if SSE is supported */
290+
if ((regs[3] & FFTS_CPU_X86_SSE_BITS) != FFTS_CPU_X86_SSE_BITS)
291+
goto exit;
292+
cpu_flags = FFTS_CPU_X86_SSE;
293+
294+
/* test if SSE2 is supported */
295+
if (!(regs[3] & FFTS_CPU_X86_SSE2_BITS))
296+
goto exit;
297+
cpu_flags |= FFTS_CPU_X86_SSE2;
298+
#endif
299+
300+
/* test if SSE3 is supported */
301+
if (!(regs[2] & FFTS_CPU_X86_SSE3_BITS))
302+
goto exit;
303+
cpu_flags |= FFTS_CPU_X86_SSE3;
304+
305+
/* test if SSSE3 is supported */
306+
if (!(regs[2] & FFTS_CPU_X86_SSSE3_BITS))
307+
goto exit;
308+
cpu_flags |= FFTS_CPU_X86_SSSE3;
309+
310+
/* test if SSE4.1 is supported */
311+
if (!(regs[2] & FFTS_CPU_X86_SSE4_1_BITS))
312+
goto exit;
313+
cpu_flags |= FFTS_CPU_X86_SSE4_1;
314+
315+
/* test if SSE4.2 is supported */
316+
if ((regs[2] & FFTS_CPU_X86_SSE4_2_BITS) != FFTS_CPU_X86_SSE4_2_BITS)
317+
goto exit;
318+
cpu_flags |= FFTS_CPU_X86_SSE4_2;
319+
320+
/* test if AVX is supported */
321+
if ((regs[2] & FFTS_CPU_X86_AVX_BITS) != FFTS_CPU_X86_AVX_BITS)
322+
goto exit;
323+
324+
/* test if legaxy x87, 128-bit SSE and 256-bit AVX states are enabled in XCR0 */
325+
xcr0 = ffts_get_xcr0();
326+
#if defined(FFTS_BUILDING_CPU_TEST)
327+
printf("xcr0: %u\n", xcr0);
328+
#endif
329+
if ((xcr0 & 0x6) != 0x6)
330+
goto exit;
331+
332+
cpu_flags |= FFTS_CPU_X86_AVX;
333+
334+
/* check that cpuid extended features exist */
335+
if (max_basic_func < 7)
336+
goto exit;
337+
338+
/* get extended features */
339+
ffts_cpuid(regs, 7, 0);
340+
341+
#if defined(FFTS_BUILDING_CPU_TEST)
342+
printf("cpuid eax=7, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
343+
#endif
344+
345+
/* test if AVX2 is supported */
346+
if ((regs[1] & FFTS_CPU_X86_AVX2_BITS) != FFTS_CPU_X86_AVX2_BITS)
347+
goto exit;
348+
cpu_flags |= FFTS_CPU_X86_AVX2;
349+
350+
/* test if AVX512 is supported */
351+
if ((regs[1] & FFTS_CPU_X86_AVX512_BITS) != FFTS_CPU_X86_AVX512_BITS)
352+
goto exit;
353+
cpu_flags |= FFTS_CPU_X86_AVX512;
354+
355+
exit:
356+
if (extra_flags) {
357+
*extra_flags = cpu_extra_flags;
358+
}
359+
return cpu_flags;
360+
}
361+
#else
362+
/*
 * Fallback used when the build is not recognized as 32 or 64 bit x86.
 *
 * CPU detection is not implemented here, so no feature flags are ever
 * reported: the function returns 0 and, when extra_flags is not NULL,
 * stores 0 through it so the caller never reads an uninitialized value
 * (the x86 implementation writes *extra_flags as well).
 */
int
ffts_cpu_detect(int *extra_flags)
{
    /* not implemented */
#if defined(FFTS_BUILDING_CPU_TEST)
    printf("CPU detection not implemented!!\n");
#endif
    if (extra_flags) {
        *extra_flags = 0;
    }
    return 0;
}
371+
#endif

0 commit comments

Comments
 (0)