diff --git "a/C:\\Users\\pdaho\\AppData\\Local\\Temp\\TortoiseGit\\rb_mjit_header-0b6d2fd.000.h" "b/C:\\moje\\diff\\rb_mjit_header.h" index 8d2b456..40243e3 100644 --- "a/C:\\Users\\pdaho\\AppData\\Local\\Temp\\TortoiseGit\\rb_mjit_header-0b6d2fd.000.h" +++ "b/C:\\moje\\diff\\rb_mjit_header.h" @@ -4,9 +4,9 @@ #define __STDC_UTF_32__ 1 #define __STDC_HOSTED__ 1 #define __GNUC__ 8 -#define __GNUC_MINOR__ 1 +#define __GNUC_MINOR__ 2 #define __GNUC_PATCHLEVEL__ 0 -#define __VERSION__ "8.1.0" +#define __VERSION__ "8.2.0" #define __ATOMIC_RELAXED 0 #define __ATOMIC_SEQ_CST 5 #define __ATOMIC_ACQUIRE 2 @@ -15,6 +15,7 @@ #define __ATOMIC_CONSUME 1 #define __pic__ 1 #define __PIC__ 1 +#define __OPTIMIZE__ 1 #define __FINITE_MATH_ONLY__ 0 #define __SIZEOF_INT__ 4 #define __SIZEOF_LONG__ 4 @@ -69,7 +70,7 @@ #define __UINTPTR_TYPE__ long long unsigned int #define __has_include(STR) __has_include__(STR) #define __has_include_next(STR) __has_include_next__(STR) -#define __GXX_ABI_VERSION 1012 +#define __GXX_ABI_VERSION 1013 #define __SCHAR_MAX__ 0x7f #define __SHRT_MAX__ 0x7fff #define __INT_MAX__ 0x7fffffff @@ -284,12 +285,10 @@ #define __REGISTER_PREFIX__ #define __USER_LABEL_PREFIX__ #define __GNUC_STDC_INLINE__ 1 -#define __NO_INLINE__ 1 #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1 #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1 -#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1 #define __GCC_ATOMIC_BOOL_LOCK_FREE 2 #define __GCC_ATOMIC_CHAR_LOCK_FREE 2 #define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2 @@ -301,7 +300,6 @@ #define __GCC_ATOMIC_LLONG_LOCK_FREE 2 #define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1 #define __GCC_ATOMIC_POINTER_LOCK_FREE 2 -#define __GCC_HAVE_DWARF2_CFI_ASM 1 #define __PRAGMA_REDEFINE_EXTNAME 1 #define __SIZEOF_INT128__ 16 #define __SIZEOF_WCHAR_T__ 2 @@ -316,21 +314,12 @@ #define __ATOMIC_HLE_ACQUIRE 65536 #define __ATOMIC_HLE_RELEASE 131072 #define __GCC_ASM_FLAG_OUTPUTS__ 1 -#define __amdfam10 1 -#define __amdfam10__ 1 -#define __tune_amdfam10__ 1 +#define __k8 1 +#define __k8__ 1 #define __code_model_medium__ 1 #define __MMX__ 1 -#define __3dNOW__ 1 -#define __3dNOW_A__ 1 #define __SSE__ 1 #define __SSE2__ 1 -#define __SSE3__ 1 -#define __SSE4A__ 1 -#define __ABM__ 1 -#define __LZCNT__ 1 -#define __POPCNT__ 1 -#define __PRFCHW__ 1 #define __FXSR__ 1 #define __SSE_MATH__ 1 #define __SSE2_MATH__ 1 @@ -364,12 +353,14 @@ #define _WIN64 1 #define __declspec(x) __attribute__((x)) #define __DECIMAL_BID_FORMAT__ 1 -#undef _REENTRANT +#define _REENTRANT 1 #define MJIT_HEADER 1 #define _FORTIFY_SOURCE 2 #define RUBY_DEVEL 1 #define RUBY_EXPORT 1 #define CANONICALIZATION_FOR_MATHN 1 +#define _FORTIFY_SOURCE 2 +#define __USE_MINGW_ANSI_STDIO 1 #define FD_SETSIZE 2048 #define _WIN32_WINNT 0x0501 #define __MINGW_USE_VC2005_COMPAT 1 @@ -430,6 +421,8 @@ #define HAVE_SYS_FILE_H 1 #define HAVE_SYS_UTIME_H 1 #define HAVE_TIME_H 1 +#define HAVE_GMP_H 1 +#define HAVE_LIBGMP 1 #define _FILE_OFFSET_BITS 64 #define HAVE_TYPEOF 1 #define HAVE_LONG_LONG 1 @@ -448,7 +441,7 @@ #define SIZEOF_CLOCK_T 4 #define PACKED_STRUCT(x) x __attribute__((packed)) #define USE_UNALIGNED_MEMBER_ACCESS 1 -#define PRI_LL_PREFIX "I64" +#define PRI_LL_PREFIX "ll" #define HAVE_PID_T 1 #define rb_pid_t pid_t #define SIGNEDNESS_OF_PID_T -1 @@ -534,10 +527,13 @@ #define UNREACHABLE __builtin_unreachable() #define RUBY_FUNC_EXPORTED __attribute__ ((__visibility__("default"))) extern #define RUBY_FUNCTION_NAME_STRING __func__ +#define ENUM_OVER_INT 1 #define HAVE_DECL_SYS_NERR 1 #define HAVE_DECL_GETENV 1 #define SIZEOF_SIZE_T 8 #define SIZEOF_PTRDIFF_T 8 +#define PRI_SIZE_PREFIX "z" +#define PRI_PTRDIFF_PREFIX "t" #define HAVE_STRUCT_STAT_ST_RDEV 1 #define HAVE_ST_RDEV 1 #define SIZEOF_STRUCT_STAT_ST_SIZE SIZEOF_OFF_T @@ -774,8 +770,6 @@ #define _inline __inline #define __CRT_INLINE extern inline __attribute__((__gnu_inline__)) #define __MINGW_INTRIN_INLINE extern __inline__ __attribute__((__always_inline__,__gnu_inline__)) -#undef __CRT__NO_INLINE -#define __CRT__NO_INLINE 1 #define __UNUSED_PARAM(x) x __attribute__ ((__unused__)) #define __restrict_arr __restrict #define __MINGW_ATTRIB_NORETURN __attribute__ ((__noreturn__)) @@ -1104,57 +1098,141 @@ extern int __attribute__((__cdecl__)) __mingw_vasprintf(char ** __restrict__ , const char * __restrict__ , va_list) __attribute__ ((__nothrow__)); #undef __MINGW_PRINTF_FORMAT #undef __MINGW_SCANF_FORMAT -#define __MINGW_PRINTF_FORMAT ms_printf -#define __MINGW_SCANF_FORMAT ms_scanf -#undef __builtin_vsnprintf -#undef __builtin_vsprintf - __attribute__((__format__ (ms_printf, 2, 3))) __attribute__ ((__nonnull__ (2))) - int __attribute__((__cdecl__)) fprintf(FILE * __restrict__ _File,const char * __restrict__ _Format,...); - __attribute__((__format__ (ms_printf, 1, 2))) __attribute__ ((__nonnull__ (1))) - int __attribute__((__cdecl__)) printf(const char * __restrict__ _Format,...); - __attribute__((__format__ (ms_printf, 2, 3))) __attribute__ ((__nonnull__ (2))) - int __attribute__((__cdecl__)) sprintf(char * __restrict__ _Dest,const char * __restrict__ _Format,...) ; - __attribute__((__format__ (ms_printf, 2, 0))) __attribute__ ((__nonnull__ (2))) - int __attribute__((__cdecl__)) vfprintf(FILE * __restrict__ _File,const char * __restrict__ _Format,va_list _ArgList); - __attribute__((__format__ (ms_printf, 1, 0))) __attribute__ ((__nonnull__ (1))) - int __attribute__((__cdecl__)) vprintf(const char * __restrict__ _Format,va_list _ArgList); - __attribute__((__format__ (ms_printf, 2, 0))) __attribute__ ((__nonnull__ (2))) - int __attribute__((__cdecl__)) vsprintf(char * __restrict__ _Dest,const char * __restrict__ _Format,va_list _Args) ; - __attribute__((__format__ (ms_scanf, 2, 3))) __attribute__ ((__nonnull__ (2))) - int __attribute__((__cdecl__)) fscanf(FILE * __restrict__ _File,const char * __restrict__ _Format,...) ; - __attribute__((__format__ (ms_scanf, 1, 2))) __attribute__ ((__nonnull__ (1))) - int __attribute__((__cdecl__)) scanf(const char * __restrict__ _Format,...) ; - __attribute__((__format__ (ms_scanf, 2, 3))) __attribute__ ((__nonnull__ (2))) - int __attribute__((__cdecl__)) sscanf(const char * __restrict__ _Src,const char * __restrict__ _Format,...) ; - int __attribute__((__cdecl__)) vasprintf(char ** __restrict__ ret,const char * __restrict__ format,va_list ap) __attribute__ ((format (ms_printf, 2, 0))); - int __attribute__((__cdecl__)) asprintf(char ** __restrict__ ret,const char * __restrict__ format,...) __attribute__ ((format (ms_printf, 2, 3))); +#define __MINGW_PRINTF_FORMAT gnu_printf +#define __MINGW_SCANF_FORMAT gnu_scanf +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__ ((__format__ (gnu_printf, 2, 3))) __attribute__((nonnull (1,2))) +int asprintf(char **__ret, const char *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vasprintf( __ret, __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__ ((__format__ (gnu_printf, 2, 0))) __attribute__((nonnull (1,2))) +int vasprintf(char **__ret, const char *__format, __builtin_va_list __local_argv) +{ + return __mingw_vasprintf( __ret, __format, __local_argv ); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_scanf, 2, 3))) __attribute__ ((__nonnull__ (2))) +int sscanf(const char *__source, const char *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vsscanf( __source, __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_scanf, 1, 2))) __attribute__ ((__nonnull__ (1))) +int scanf(const char *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vfscanf( (__acrt_iob_func(0)), __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_scanf, 2, 3))) __attribute__ ((__nonnull__ (2))) +int fscanf(FILE *__stream, const char *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vfscanf( __stream, __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" - __attribute__((__format__ (ms_scanf, 1, 0))) __attribute__ ((__nonnull__ (1))) - int __attribute__((__cdecl__)) __ms_vscanf(const char * __restrict__ Format, va_list argp); - __attribute__((__format__ (ms_scanf, 2, 0))) __attribute__ ((__nonnull__ (2))) - int __attribute__((__cdecl__)) __ms_vfscanf (FILE * __restrict__ fp, const char * __restrict__ Format,va_list argp); - __attribute__((__format__ (ms_scanf, 2, 0))) __attribute__ ((__nonnull__ (2))) - int __attribute__((__cdecl__)) __ms_vsscanf (const char * __restrict__ _Str,const char * __restrict__ Format,va_list argp); - static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) - __attribute__((__format__ (ms_scanf, 2, 0))) __attribute__ ((__nonnull__ (2))) - int vfscanf (FILE *__stream, const char *__format, __builtin_va_list __local_argv) - { - return __ms_vfscanf (__stream, __format, __local_argv); - } - static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) - __attribute__((__format__ (ms_scanf, 2, 0))) __attribute__ ((__nonnull__ (2))) - int vsscanf (const char * __restrict__ __source, const char * __restrict__ __format, __builtin_va_list __local_argv) - { - return __ms_vsscanf( __source, __format, __local_argv ); - } - static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) - __attribute__((__format__ (ms_scanf, 1, 0))) __attribute__ ((__nonnull__ (1))) - int vscanf(const char *__format, __builtin_va_list __local_argv) - { - return __ms_vscanf (__format, __local_argv); - } +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_scanf, 2, 0))) __attribute__ ((__nonnull__ (2))) +int vsscanf (const char *__source, const char *__format, __builtin_va_list __local_argv) +{ + return __mingw_vsscanf( __source, __format, __local_argv ); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_scanf, 1, 0))) __attribute__ ((__nonnull__ (1))) +int vscanf(const char *__format, __builtin_va_list __local_argv) +{ + return __mingw_vfscanf( (__acrt_iob_func(0)), __format, __local_argv ); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_scanf, 2, 0))) __attribute__ ((__nonnull__ (2))) +int vfscanf (FILE *__stream, const char *__format, __builtin_va_list __local_argv) +{ + return __mingw_vfscanf( __stream, __format, __local_argv ); +} #pragma GCC diagnostic pop +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_printf, 2, 3))) __attribute__ ((__nonnull__ (2))) +int fprintf (FILE *__stream, const char *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vfprintf( __stream, __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_printf, 1, 2))) __attribute__ ((__nonnull__ (1))) +int printf (const char *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vfprintf( (__acrt_iob_func(1)), __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_printf, 2, 3))) __attribute__ ((__nonnull__ (2))) +int sprintf (char *__stream, const char *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vsprintf( __stream, __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_printf, 2, 0))) __attribute__ ((__nonnull__ (2))) +int vfprintf (FILE *__stream, const char *__format, __builtin_va_list __local_argv) +{ + return __mingw_vfprintf( __stream, __format, __local_argv ); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_printf, 1, 0))) __attribute__ ((__nonnull__ (1))) +int vprintf (const char *__format, __builtin_va_list __local_argv) +{ + return __mingw_vfprintf( (__acrt_iob_func(1)), __format, __local_argv ); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_printf, 2, 0))) __attribute__ ((__nonnull__ (2))) +int vsprintf (char *__stream, const char *__format, __builtin_va_list __local_argv) +{ + return __mingw_vsprintf( __stream, __format, __local_argv ); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_printf, 3, 4))) __attribute__ ((__nonnull__ (3))) +int snprintf (char *__stream, size_t __n, const char *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vsnprintf( __stream, __n, __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +__attribute__((__format__ (gnu_printf, 3, 0))) __attribute__ ((__nonnull__ (3))) +int vsnprintf (char *__stream, size_t __n, const char *__format, __builtin_va_list __local_argv) +{ + return __mingw_vsnprintf( __stream, __n, __format, __local_argv ); +} +#define __builtin_vsnprintf __mingw_vsnprintf +#define __builtin_vsprintf __mingw_vsprintf __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _filbuf(FILE *_File); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _flsbuf(int _Ch,FILE *_File); __attribute__ ((__dllimport__)) FILE *__attribute__((__cdecl__)) _fsopen(const char *_Filename,const char *_Mode,int _ShFlag); @@ -1231,36 +1309,6 @@ extern __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _snprintf(char * __restrict__ _Dest,size_t _Count,const char * __restrict__ _Format,...) ; __attribute__((__format__ (ms_printf, 3, 0))) __attribute__ ((__nonnull__ (3))) __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _vsnprintf(char * __restrict__ _Dest,size_t _Count,const char * __restrict__ _Format,va_list _Args) ; -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wshadow" - - -#undef snprintf -#undef vsnprintf - __attribute__((__format__ (ms_printf, 3, 0))) __attribute__ ((__nonnull__ (3))) - int __attribute__((__cdecl__)) __ms_vsnprintf(char * __restrict__ d,size_t n,const char * __restrict__ format,va_list arg) - ; - static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) - __attribute__((__format__ (ms_printf, 3, 0))) __attribute__ ((__nonnull__ (3))) - int vsnprintf (char * __restrict__ __stream, size_t __n, const char * __restrict__ __format, va_list __local_argv) - { - return __ms_vsnprintf (__stream, __n, __format, __local_argv); - } - __attribute__((__format__ (ms_printf, 3, 4))) __attribute__ ((__nonnull__ (3))) - int __attribute__((__cdecl__)) __ms_snprintf(char * __restrict__ s, size_t n, const char * __restrict__ format, ...); -static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) -__attribute__((__format__ (ms_printf, 3, 4))) __attribute__ ((__nonnull__ (3))) -int snprintf (char * __restrict__ __stream, size_t __n, const char * __restrict__ __format, ...) -{ - int __retval; - __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); - __retval = __ms_vsnprintf (__stream, __n, __format, __local_argv); - __builtin_va_end( __local_argv ); - return __retval; -} - - -#pragma GCC diagnostic pop __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _vscprintf(const char * __restrict__ _Format,va_list _ArgList); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _set_printf_count_output(int _Value); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _get_printf_count_output(void); @@ -1293,34 +1341,102 @@ int snprintf (char * __restrict__ __stream, size_t __n, const char * __restrict_ int __attribute__((__cdecl__)) __mingw_swprintf(wchar_t * __restrict__ , const wchar_t * __restrict__ , ...); __attribute__ ((__nonnull__ (2))) int __attribute__((__cdecl__)) __mingw_vswprintf(wchar_t * __restrict__ , const wchar_t * __restrict__ ,va_list); - int __attribute__((__cdecl__)) fwscanf(FILE * __restrict__ _File,const wchar_t * __restrict__ _Format,...) ; - int __attribute__((__cdecl__)) swscanf(const wchar_t * __restrict__ _Src,const wchar_t * __restrict__ _Format,...) ; - int __attribute__((__cdecl__)) wscanf(const wchar_t * __restrict__ _Format,...) ; - int __attribute__((__cdecl__)) __ms_vwscanf (const wchar_t * __restrict__ , va_list); - int __attribute__((__cdecl__)) __ms_vfwscanf (FILE * __restrict__ ,const wchar_t * __restrict__ ,va_list); - int __attribute__((__cdecl__)) __ms_vswscanf (const wchar_t * __restrict__ ,const wchar_t * __restrict__ ,va_list); - static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) - __attribute__ ((__nonnull__ (2))) - int vfwscanf (FILE *__stream, const wchar_t *__format, __builtin_va_list __local_argv) - { - return __ms_vfwscanf (__stream, __format, __local_argv); - } - static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) - __attribute__ ((__nonnull__ (2))) - int vswscanf (const wchar_t * __restrict__ __source, const wchar_t * __restrict__ __format, __builtin_va_list __local_argv) - { - return __ms_vswscanf( __source, __format, __local_argv ); - } - static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) - __attribute__ ((__nonnull__ (1))) - int vwscanf(const wchar_t *__format, __builtin_va_list __local_argv) - { - return __ms_vwscanf (__format, __local_argv); - } - int __attribute__((__cdecl__)) fwprintf(FILE * __restrict__ _File,const wchar_t * __restrict__ _Format,...); - int __attribute__((__cdecl__)) wprintf(const wchar_t * __restrict__ _Format,...); - int __attribute__((__cdecl__)) vfwprintf(FILE * __restrict__ _File,const wchar_t * __restrict__ _Format,va_list _ArgList); - int __attribute__((__cdecl__)) vwprintf(const wchar_t * __restrict__ _Format,va_list _ArgList); +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (2))) +int swscanf(const wchar_t *__source, const wchar_t *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vswscanf( __source, __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (1))) +int wscanf(const wchar_t *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vfwscanf( (__acrt_iob_func(0)), __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (2))) +int fwscanf(FILE *__stream, const wchar_t *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vfwscanf( __stream, __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (2))) +int vswscanf (const wchar_t * __restrict__ __source, const wchar_t * __restrict__ __format, __builtin_va_list __local_argv) +{ + return __mingw_vswscanf( __source, __format, __local_argv ); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (1))) +int vwscanf(const wchar_t *__format, __builtin_va_list __local_argv) +{ + return __mingw_vfwscanf( (__acrt_iob_func(0)), __format, __local_argv ); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (2))) +int vfwscanf (FILE *__stream, const wchar_t *__format, __builtin_va_list __local_argv) +{ + return __mingw_vfwscanf( __stream, __format, __local_argv ); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (2))) +int fwprintf (FILE *__stream, const wchar_t *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vfwprintf( __stream, __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (1))) +int wprintf (const wchar_t *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vfwprintf( (__acrt_iob_func(1)), __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (2))) +int vfwprintf (FILE *__stream, const wchar_t *__format, __builtin_va_list __local_argv) +{ + return __mingw_vfwprintf( __stream, __format, __local_argv ); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (1))) +int vwprintf (const wchar_t *__format, __builtin_va_list __local_argv) +{ + return __mingw_vfwprintf( (__acrt_iob_func(1)), __format, __local_argv ); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (3))) +int snwprintf (wchar_t *__stream, size_t __n, const wchar_t *__format, ...) +{ + int __retval; + __builtin_va_list __local_argv; __builtin_va_start( __local_argv, __format ); + __retval = __mingw_vsnwprintf( __stream, __n, __format, __local_argv ); + __builtin_va_end( __local_argv ); + return __retval; +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + __attribute__ ((__nonnull__ (3))) +int vsnwprintf (wchar_t *__stream, size_t __n, const wchar_t *__format, __builtin_va_list __local_argv) +{ + return __mingw_vsnwprintf( __stream, __n, __format, __local_argv ); +} #define WEOF (wint_t)(0xFFFF) __attribute__ ((__dllimport__)) FILE *__attribute__((__cdecl__)) _wfsopen(const wchar_t *_Filename,const wchar_t *_Mode,int _ShFlag); wint_t __attribute__((__cdecl__)) fgetwc(FILE *_File); @@ -1341,29 +1457,7 @@ int snprintf (char * __restrict__ __stream, size_t __n, const char * __restrict_ __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _vswprintf_c(wchar_t * __restrict__ _DstBuf,size_t _SizeInWords,const wchar_t * __restrict__ _Format,va_list _ArgList); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _snwprintf(wchar_t * __restrict__ _Dest,size_t _Count,const wchar_t * __restrict__ _Format,...) ; __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _vsnwprintf(wchar_t * __restrict__ _Dest,size_t _Count,const wchar_t * __restrict__ _Format,va_list _Args) ; - - -#undef snwprintf -#undef vsnwprintf - int __attribute__((__cdecl__)) __ms_snwprintf (wchar_t * __restrict__ s, size_t n, const wchar_t * __restrict__ format, ...); - int __attribute__((__cdecl__)) __ms_vsnwprintf (wchar_t * __restrict__ , size_t, const wchar_t * __restrict__ , va_list); - static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) - int snwprintf (wchar_t * __restrict__ s, size_t n, const wchar_t * __restrict__ format, ...) - { - int r; - va_list argp; - __builtin_va_start (argp, format); - r = _vsnwprintf (s, n, format, argp); - __builtin_va_end (argp); - return r; - } - static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) - int __attribute__((__cdecl__)) vsnwprintf (wchar_t * __restrict__ s, size_t n, const wchar_t * __restrict__ format, va_list arg) - { - return _vsnwprintf(s,n,format,arg); - } - - + __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _vscwprintf(const wchar_t * __restrict__ _Format,va_list _ArgList); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _swprintf(wchar_t * __restrict__ _Dest,const wchar_t * __restrict__ _Format,...); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _vswprintf(wchar_t * __restrict__ _Dest,const wchar_t * __restrict__ _Format,va_list _Args); #define _INC_SWPRINTF_INL @@ -1389,7 +1483,6 @@ int swprintf (wchar_t *__stream, size_t __count, const wchar_t *__format, ...) return __retval; } __attribute__ ((__dllimport__)) wchar_t *__attribute__((__cdecl__)) _wtempnam(const wchar_t *_Directory,const wchar_t *_FilePrefix); - __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _vscwprintf(const wchar_t * __restrict__ _Format,va_list _ArgList); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _snwscanf(const wchar_t * __restrict__ _Src,size_t _MaxCount,const wchar_t * __restrict__ _Format,...); __attribute__ ((__dllimport__)) FILE *__attribute__((__cdecl__)) _wfdopen(int _FileHandle ,const wchar_t *_Mode); __attribute__ ((__dllimport__)) FILE *__attribute__((__cdecl__)) _wfopen(const wchar_t * __restrict__ _Filename,const wchar_t *__restrict__ _Mode) ; @@ -1708,8 +1801,8 @@ typedef unsigned long long _sigset_t; int __attribute__((__cdecl__)) strnicmp(const char *_Str1,const char *_Str,size_t _MaxCount) ; int __attribute__((__cdecl__)) strncasecmp (const char *, const char *, size_t); int __attribute__((__cdecl__)) strcasecmp (const char *, const char *); -#define strncasecmp _strnicmp -#define strcasecmp _stricmp + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) strncasecmp (const char *__sz1, const char *__sz2, size_t __sizeMaxCompare) { return _strnicmp (__sz1, __sz2, __sizeMaxCompare); } + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) strcasecmp (const char *__sz1, const char *__sz2) { return _stricmp (__sz1, __sz2); } char *__attribute__((__cdecl__)) strnset(char *_Str,int _Val,size_t _MaxCount) ; char *__attribute__((__cdecl__)) strrev(char *_Str) ; char *__attribute__((__cdecl__)) strset(char *_Str,int _Val) ; @@ -1937,9 +2030,41 @@ __attribute__ ((__dllimport__)) char* __attribute__((__cdecl__)) _getcwd (char*, __attribute__ ((__dllimport__)) intptr_t __attribute__((__cdecl__)) _findfirst32i64(const char *_Filename,struct _finddata32i64_t *_FindData); __attribute__ ((__dllimport__)) intptr_t __attribute__((__cdecl__)) _findfirst64(const char *_Filename,struct __finddata64_t *_FindData); intptr_t __attribute__((__cdecl__)) _findfirst64i32(const char *_Filename,struct _finddata64i32_t *_FindData); + extern inline __attribute__((__gnu_inline__)) intptr_t __attribute__((__cdecl__)) _findfirst64i32(const char *_Filename,struct _finddata64i32_t *_FindData) + { + struct __finddata64_t fd; + intptr_t ret = _findfirst64(_Filename,&fd); + if (ret == -1) { + memset(_FindData,0,sizeof(struct _finddata64i32_t)); + return -1; + } + _FindData->attrib=fd.attrib; + _FindData->time_create=fd.time_create; + _FindData->time_access=fd.time_access; + _FindData->time_write=fd.time_write; + _FindData->size=(_fsize_t) fd.size; + strncpy(_FindData->name,fd.name,260); + return ret; + } __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _findnext32i64(intptr_t _FindHandle,struct _finddata32i64_t *_FindData); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _findnext64(intptr_t _FindHandle,struct __finddata64_t *_FindData); int __attribute__((__cdecl__)) _findnext64i32(intptr_t _FindHandle,struct _finddata64i32_t *_FindData); + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) _findnext64i32(intptr_t _FindHandle,struct _finddata64i32_t *_FindData) + { + struct __finddata64_t fd; + int ret = _findnext64(_FindHandle,&fd); + if (ret == -1) { + memset(_FindData,0,sizeof(struct _finddata64i32_t)); + return -1; + } + _FindData->attrib=fd.attrib; + _FindData->time_create=fd.time_create; + _FindData->time_access=fd.time_access; + _FindData->time_write=fd.time_write; + _FindData->size=(_fsize_t) fd.size; + strncpy(_FindData->name,fd.name,260); + return ret; + } __extension__ long long __attribute__((__cdecl__)) _lseeki64(int _FileHandle,long long _Offset,int _Origin); __extension__ long long __attribute__((__cdecl__)) _telli64(int _FileHandle); int __attribute__((__cdecl__)) chdir (const char *) ; @@ -2087,9 +2212,51 @@ __attribute__ ((__dllimport__)) char* __attribute__((__cdecl__)) _getcwd (char*, __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _fstat64(int _FileDes,struct _stat64 *_Stat); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _fstat32i64(int _FileDes,struct _stat32i64 *_Stat); int __attribute__((__cdecl__)) _fstat64i32(int _FileDes,struct _stat64i32 *_Stat); + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) _fstat64i32(int _FileDes,struct _stat64i32 *_Stat) + { + struct _stat64 st; + int ret=_fstat64(_FileDes,&st); + if (ret == -1) { + memset(_Stat,0,sizeof(struct _stat64i32)); + return -1; + } + _Stat->st_dev=st.st_dev; + _Stat->st_ino=st.st_ino; + _Stat->st_mode=st.st_mode; + _Stat->st_nlink=st.st_nlink; + _Stat->st_uid=st.st_uid; + _Stat->st_gid=st.st_gid; + _Stat->st_rdev=st.st_rdev; + _Stat->st_size=(_off_t) st.st_size; + _Stat->st_atime=st.st_atime; + _Stat->st_mtime=st.st_mtime; + _Stat->st_ctime=st.st_ctime; + return ret; + } __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _stat64(const char *_Name,struct _stat64 *_Stat); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _stat32i64(const char *_Name,struct _stat32i64 *_Stat); int __attribute__((__cdecl__)) _stat64i32(const char *_Name,struct _stat64i32 *_Stat); + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) _stat64i32(const char *_Name,struct _stat64i32 *_Stat) + { + struct _stat64 st; + int ret=_stat64(_Name,&st); + if (ret == -1) { + memset(_Stat,0,sizeof(struct _stat64i32)); + return -1; + } + _Stat->st_dev=st.st_dev; + _Stat->st_ino=st.st_ino; + _Stat->st_mode=st.st_mode; + _Stat->st_nlink=st.st_nlink; + _Stat->st_uid=st.st_uid; + _Stat->st_gid=st.st_gid; + _Stat->st_rdev=st.st_rdev; + _Stat->st_size=(_off_t) st.st_size; + _Stat->st_atime=st.st_atime; + _Stat->st_mtime=st.st_mtime; + _Stat->st_ctime=st.st_ctime; + return ret; + } #define _WSTAT_DEFINED __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _wstat32(const wchar_t *_Name,struct _stat32 *_Stat); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _wstat32i64(const wchar_t *_Name,struct _stat32i64 *_Stat); @@ -2129,6 +2296,27 @@ __attribute__ ((__dllimport__)) char* __attribute__((__cdecl__)) _getcwd (char*, int __attribute__((__cdecl__)) fstat(int _Desc,struct stat *_Stat); int __attribute__((__cdecl__)) stat(const char *_Filename,struct stat *_Stat); int __attribute__((__cdecl__)) wstat(const wchar_t *_Filename,struct stat *_Stat); +extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) + fstat(int _Desc,struct stat *_Stat) { + struct _stat64 st; + int ret=_fstat64(_Desc,&st); + if (ret == -1) { + memset(_Stat,0,sizeof(struct stat)); + return -1; + } + _Stat->st_dev=st.st_dev; + _Stat->st_ino=st.st_ino; + _Stat->st_mode=st.st_mode; + _Stat->st_nlink=st.st_nlink; + _Stat->st_uid=st.st_uid; + _Stat->st_gid=st.st_gid; + _Stat->st_rdev=st.st_rdev; + _Stat->st_size=(_off_t) st.st_size; + _Stat->st_atime=st.st_atime; + _Stat->st_mtime=st.st_mtime; + _Stat->st_ctime=st.st_ctime; + return ret; +} #define stat _stat64 #define fstat _fstat64 #pragma pack(pop) @@ -2220,6 +2408,7 @@ int __attribute__((__cdecl__)) wstat(const wchar_t *_Filename,struct stat *_Stat #define LONG_LONG_MAX __LONG_LONG_MAX__ #undef ULONG_LONG_MAX #define ULONG_LONG_MAX (LONG_LONG_MAX * 2ULL + 1ULL) +#define __USE_MINGW_STRTOX 1 #pragma pack(push,_CRT_PACKING) #define EXIT_SUCCESS 0 #define EXIT_FAILURE 1 @@ -2336,6 +2525,8 @@ __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) ___mb_cur_max_fun void __attribute__((__cdecl__)) __attribute__ ((__nothrow__)) exit(int _Code) __attribute__ ((__noreturn__)); void __attribute__((__cdecl__)) __attribute__ ((__nothrow__)) _exit(int _Code) __attribute__ ((__noreturn__)); void __attribute__((__cdecl__)) _Exit(int) __attribute__ ((__noreturn__)); + extern inline __attribute__((__gnu_inline__)) __attribute__ ((__noreturn__)) void __attribute__((__cdecl__)) _Exit(int status) + { _exit(status); } #undef abort void __attribute__((__cdecl__)) __attribute__ ((__noreturn__)) abort(void); @@ -2389,12 +2580,21 @@ __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) ___mb_cur_max_fun int __attribute__((__cdecl__)) rand(void); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _set_error_mode(int _Mode); void __attribute__((__cdecl__)) srand(unsigned int _Seed); - double __attribute__((__cdecl__)) __attribute__ ((__nothrow__)) strtod(const char * __restrict__ _Str,char ** __restrict__ _EndPtr); - float __attribute__((__cdecl__)) __attribute__ ((__nothrow__)) strtof(const char * __restrict__ nptr, char ** __restrict__ endptr); +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +double __attribute__((__cdecl__)) __attribute__ ((__nothrow__)) strtod(const char * __restrict__ _Str,char ** __restrict__ _EndPtr) +{ + double __attribute__((__cdecl__)) __mingw_strtod (const char * __restrict__, char ** __restrict__); + return __mingw_strtod( _Str, _EndPtr); +} +static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) +float __attribute__((__cdecl__)) __attribute__ ((__nothrow__)) strtof(const char * __restrict__ _Str,char ** __restrict__ _EndPtr) +{ + float __attribute__((__cdecl__)) __mingw_strtof (const char * __restrict__, char ** __restrict__); + return __mingw_strtof( _Str, _EndPtr); +} long double __attribute__((__cdecl__)) __attribute__ ((__nothrow__)) strtold(const char * __restrict__ , char ** __restrict__ ); extern double __attribute__((__cdecl__)) __attribute__ ((__nothrow__)) __strtod (const char * __restrict__ , char ** __restrict__); -#define strtod __strtod float __attribute__((__cdecl__)) __mingw_strtof (const char * __restrict__, char ** __restrict__); double __attribute__((__cdecl__)) __mingw_strtod (const char * __restrict__, char ** __restrict__); long double __attribute__((__cdecl__)) __mingw_strtold(const char * __restrict__, char ** __restrict__); @@ -2436,8 +2636,14 @@ __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) ___mb_cur_max_fun double __attribute__((__cdecl__)) __mingw_wcstod(const wchar_t * __restrict__ _Str,wchar_t ** __restrict__ _EndPtr); float __attribute__((__cdecl__)) __mingw_wcstof(const wchar_t * __restrict__ nptr, wchar_t ** __restrict__ endptr); long double __attribute__((__cdecl__)) __mingw_wcstold(const wchar_t * __restrict__, wchar_t ** __restrict__); - double __attribute__((__cdecl__)) wcstod(const wchar_t * __restrict__ _Str,wchar_t ** __restrict__ _EndPtr); - float __attribute__((__cdecl__)) wcstof(const wchar_t * __restrict__ nptr, wchar_t ** __restrict__ endptr); + static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + double __attribute__((__cdecl__)) wcstod(const wchar_t * __restrict__ _Str,wchar_t ** __restrict__ _EndPtr){ + return __mingw_wcstod(_Str,_EndPtr); + } + static __attribute__ ((__unused__)) __inline__ __attribute__((__cdecl__)) + float __attribute__((__cdecl__)) wcstof(const wchar_t * __restrict__ _Str,wchar_t ** __restrict__ _EndPtr){ + return __mingw_wcstof(_Str,_EndPtr); + } long double __attribute__((__cdecl__)) wcstold(const wchar_t * __restrict__, wchar_t ** __restrict__); __attribute__ ((__dllimport__)) double __attribute__((__cdecl__)) _wcstod_l(const wchar_t * __restrict__ _Str,wchar_t ** __restrict__ _EndPtr,_locale_t _Locale); long __attribute__((__cdecl__)) wcstol(const wchar_t * __restrict__ _Str,wchar_t ** __restrict__ _EndPtr,int _Radix); @@ -2524,6 +2730,7 @@ unsigned long __attribute__((__cdecl__)) _lrotr(unsigned long,int); typedef struct { __extension__ long long quot, rem; } lldiv_t; __extension__ lldiv_t __attribute__((__cdecl__)) lldiv(long long, long long); __extension__ long long __attribute__((__cdecl__)) llabs(long long); + __extension__ extern inline __attribute__((__gnu_inline__)) long long __attribute__((__cdecl__)) llabs(long long _j) { return (_j >= 0 ? _j : -_j); } __extension__ long long __attribute__((__cdecl__)) strtoll(const char * __restrict__, char ** __restrict, int); __extension__ unsigned long long __attribute__((__cdecl__)) strtoull(const char * __restrict__, char ** __restrict__, int); __extension__ long long __attribute__((__cdecl__)) atoll (const char *); @@ -2532,6 +2739,12 @@ unsigned long __attribute__((__cdecl__)) _lrotr(unsigned long,int); __extension__ char *__attribute__((__cdecl__)) ulltoa (unsigned long long , char *, int); __extension__ wchar_t *__attribute__((__cdecl__)) lltow (long long, wchar_t *, int); __extension__ wchar_t *__attribute__((__cdecl__)) ulltow (unsigned long long, wchar_t *, int); + __extension__ extern inline __attribute__((__gnu_inline__)) long long __attribute__((__cdecl__)) atoll (const char * _c) { return _atoi64 (_c); } + __extension__ extern inline __attribute__((__gnu_inline__)) char *__attribute__((__cdecl__)) lltoa (long long _n, char * _c, int _i) { return _i64toa (_n, _c, _i); } + __extension__ extern inline __attribute__((__gnu_inline__)) char *__attribute__((__cdecl__)) ulltoa (unsigned long long _n, char * _c, int _i) { return _ui64toa (_n, _c, _i); } + __extension__ extern inline __attribute__((__gnu_inline__)) long long __attribute__((__cdecl__)) wtoll (const wchar_t * _w) { return _wtoi64 (_w); } + __extension__ extern inline __attribute__((__gnu_inline__)) wchar_t *__attribute__((__cdecl__)) lltow (long long _n, wchar_t * _w, int _i) { return _i64tow (_n, _w, _i); } + __extension__ extern inline __attribute__((__gnu_inline__)) wchar_t *__attribute__((__cdecl__)) ulltow (unsigned long long _n, wchar_t * _w, int _i) { return _ui64tow (_n, _w, _i); } #pragma pack(pop) #define _INC_STDLIB_S __attribute__ ((__dllimport__)) errno_t __attribute__((__cdecl__)) _dupenv_s(char **_PBuffer,size_t *_PBufferSizeInBytes,const char *_VarName); @@ -2667,6 +2880,7 @@ void * __mingw_aligned_realloc (void *_Memory, size_t _Size, size_t _Offset); #undef __need_NULL #define offsetof(TYPE,MEMBER) __builtin_offsetof (TYPE, MEMBER) #define _GCC_MAX_ALIGN_T +#define __CLANG_MAX_ALIGN_T_DEFINED typedef struct { long long __max_align_ll __attribute__((__aligned__(__alignof__(long long)))); long double __max_align_ld __attribute__((__aligned__(__alignof__(long double)))); @@ -2938,6 +3152,8 @@ typedef struct { #define SCNuLEAST8 "hhu" #define SCNuFAST8 "hhu" intmax_t __attribute__((__cdecl__)) imaxabs (intmax_t j); +extern inline __attribute__((__gnu_inline__)) intmax_t __attribute__((__cdecl__)) imaxabs (intmax_t j) + {return (j >= 0 ? j : -j);} imaxdiv_t __attribute__((__cdecl__)) imaxdiv (intmax_t numer, intmax_t denom); intmax_t __attribute__((__cdecl__)) strtoimax (const char* __restrict__ nptr, char** __restrict__ endptr, int base); @@ -2947,6 +3163,116 @@ intmax_t __attribute__((__cdecl__)) wcstoimax (const wchar_t* __restrict__ nptr, wchar_t** __restrict__ endptr, int base); uintmax_t __attribute__((__cdecl__)) wcstoumax (const wchar_t* __restrict__ nptr, wchar_t** __restrict__ endptr, int base); +#undef PRId64 +#undef PRIdLEAST64 +#undef PRIdFAST64 +#undef PRIdMAX +#undef PRIi64 +#undef PRIiLEAST64 +#undef PRIiFAST64 +#undef PRIiMAX +#undef PRIo64 +#undef PRIoLEAST64 +#undef PRIoFAST64 +#undef PRIoMAX +#undef PRIu64 +#undef PRIuLEAST64 +#undef PRIuFAST64 +#undef PRIuMAX +#undef PRIx64 +#undef PRIxLEAST64 +#undef PRIxFAST64 +#undef PRIxMAX +#undef PRIX64 +#undef PRIXLEAST64 +#undef PRIXFAST64 +#undef PRIXMAX +#undef SCNd64 +#undef SCNdLEAST64 +#undef SCNdFAST64 +#undef SCNdMAX +#undef SCNi64 +#undef SCNiLEAST64 +#undef SCNiFAST64 +#undef SCNiMAX +#undef SCNo64 +#undef SCNoLEAST64 +#undef SCNoFAST64 +#undef SCNoMAX +#undef SCNx64 +#undef SCNxLEAST64 +#undef SCNxFAST64 +#undef SCNxMAX +#undef SCNu64 +#undef SCNuLEAST64 +#undef SCNuFAST64 +#undef SCNuMAX +#undef PRIdPTR +#undef PRIiPTR +#undef PRIoPTR +#undef PRIuPTR +#undef PRIxPTR +#undef PRIXPTR +#undef SCNdPTR +#undef SCNiPTR +#undef SCNoPTR +#undef SCNxPTR +#undef SCNuPTR +#define PRId64 "lld" +#define PRIdLEAST64 "lld" +#define PRIdFAST64 "lld" +#define PRIdMAX "lld" +#define PRIi64 "lli" +#define PRIiLEAST64 "lli" +#define PRIiFAST64 "lli" +#define PRIiMAX "lli" +#define PRIo64 "llo" +#define PRIoLEAST64 "llo" +#define PRIoFAST64 "llo" +#define PRIoMAX "llo" +#define PRIu64 "llu" +#define PRIuLEAST64 "llu" +#define PRIuFAST64 "llu" +#define PRIuMAX "llu" +#define PRIx64 "llx" +#define PRIxLEAST64 "llx" +#define PRIxFAST64 "llx" +#define PRIxMAX "llx" +#define PRIX64 "llX" +#define PRIXLEAST64 "llX" +#define PRIXFAST64 "llX" +#define PRIXMAX "llX" +#define SCNd64 "lld" +#define SCNdLEAST64 "lld" +#define SCNdFAST64 "lld" +#define SCNdMAX "lld" +#define SCNi64 "lli" +#define SCNiLEAST64 "lli" +#define SCNiFAST64 "lli" +#define SCNiMAX "lli" +#define SCNo64 "llo" +#define SCNoLEAST64 "llo" +#define SCNoFAST64 "llo" +#define SCNoMAX "llo" +#define SCNx64 "llx" +#define SCNxLEAST64 "llx" +#define SCNxFAST64 "llx" +#define SCNxMAX "llx" +#define SCNu64 "llu" +#define SCNuLEAST64 "llu" +#define SCNuFAST64 "llu" +#define SCNuMAX "llu" +#define PRIdPTR "lld" +#define PRIiPTR "lli" +#define PRIoPTR "llo" +#define PRIuPTR "llu" +#define PRIxPTR "llx" +#define PRIXPTR "llX" +#define SCNdPTR "lld" +#define SCNiPTR "lli" +#define SCNoPTR "llo" +#define SCNxPTR "llx" +#define SCNuPTR "llu" #define _STDALIGN_H #define alignas _Alignas #define alignof _Alignof @@ -2976,6 +3302,8 @@ __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _execute_onexit_t __attribute__ ((__dllimport__)) void __attribute__((__cdecl__)) _endthread(void) __attribute__ ((__noreturn__)); __attribute__ ((__dllimport__)) uintptr_t __attribute__((__cdecl__)) _beginthreadex(void *_Security,unsigned _StackSize,unsigned ( *_StartAddress) (void *),void *_ArgList,unsigned _InitFlag,unsigned *_ThrdAddr); __attribute__ ((__dllimport__)) void __attribute__((__cdecl__)) _endthreadex(unsigned _Retval) __attribute__ ((__noreturn__)); + typedef void ( *_tls_callback_type)(void*,unsigned long,void*); + __attribute__ ((__dllimport__)) void __attribute__((__cdecl__)) _register_thread_local_exe_atexit_callback(_tls_callback_type callback); void __attribute__((__cdecl__)) __attribute__ ((__nothrow__)) _cexit(void); void __attribute__((__cdecl__)) __attribute__ ((__nothrow__)) _c_exit(void); __attribute__ ((__dllimport__)) int __attribute__((__cdecl__)) _getpid(void); @@ -3051,6 +3379,10 @@ int ftruncate(int, off32_t); int ftruncate64(int, off64_t); int truncate(const char *, off32_t); int truncate64(const char *, off64_t); +extern inline __attribute__((__gnu_inline__)) int ftruncate(int __fd, off32_t __length) +{ + return _chsize (__fd, __length); +} #define _FILE_OFFSET_BITS_SET_FTRUNCATE #define ftruncate ftruncate64 #define WIN_PTHREADS_UNISTD_H @@ -3224,6 +3556,20 @@ struct _exception; extern float __attribute__((__cdecl__)) fabsf (float x); extern long double __attribute__((__cdecl__)) fabsl (long double); extern double __attribute__((__cdecl__)) fabs (double _X); + extern inline __attribute__((__gnu_inline__)) float __attribute__((__cdecl__)) fabsf (float x) + { + return __builtin_fabsf (x); + } + extern inline __attribute__((__gnu_inline__)) long double __attribute__((__cdecl__)) fabsl (long double x) + { + long double res = 0.0l; + __asm__ __volatile__ ("fabs;" : "=t" (res) : "0" (x)); + return res; + } + extern inline __attribute__((__gnu_inline__)) double __attribute__((__cdecl__)) fabs (double x) + { + return __builtin_fabs (x); + } double __attribute__((__cdecl__)) ldexp(double _X,int _Y); double __attribute__((__cdecl__)) frexp(double _X,int *_Y); double __attribute__((__cdecl__)) modf(double _X,double *_Y); @@ -3290,6 +3636,51 @@ typedef double double_t; extern int __attribute__((__cdecl__)) __fpclassifyl (long double); extern int __attribute__((__cdecl__)) __fpclassifyf (float); extern int __attribute__((__cdecl__)) __fpclassify (double); + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) __fpclassifyl (long double x) { + __mingw_fp_types_t hlp; + unsigned int e; + hlp.ld = &x; + e = hlp.ldt->lh.sign_exponent & 0x7fff; + if (!e) + { + unsigned int h = hlp.ldt->lh.high; + if (!(hlp.ldt->lh.low | h)) + return 0x4000; + else if (!(h & 0x80000000)) + return (0x0400 | 0x4000); + } + else if (e == 0x7fff) + return (((hlp.ldt->lh.high & 0x7fffffff) | hlp.ldt->lh.low) == 0 ? + (0x0100 | 0x0400) : 0x0100); + return 0x0400; + } + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) __fpclassify (double x) { + __mingw_fp_types_t hlp; + unsigned int l, h; + hlp.d = &x; + h = hlp.ldt->lh.high; + l = hlp.ldt->lh.low | (h & 0xfffff); + h &= 0x7ff00000; + if ((h | l) == 0) + return 0x4000; + if (!h) + return (0x0400 | 0x4000); + if (h == 0x7ff00000) + return (l ? 0x0100 : (0x0100 | 0x0400)); + return 0x0400; + } + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) __fpclassifyf (float x) { + __mingw_fp_types_t hlp; + hlp.f = &x; + hlp.ft->val &= 0x7fffffff; + if (hlp.ft->val == 0) + return 0x4000; + if (hlp.ft->val < 0x800000) + return (0x0400 | 0x4000); + if (hlp.ft->val >= 0x7f800000) + return (hlp.ft->val > 0x7f800000 ? 0x0100 : (0x0100 | 0x0400)); + return 0x0400; + } #define __dfp_expansion(__call,__fin,x) __fin #define fpclassify(x) __mingw_choose_expr ( __mingw_types_compatible_p (__typeof__ (x), double), __fpclassify(x), __mingw_choose_expr ( __mingw_types_compatible_p (__typeof__ (x), float), __fpclassifyf(x), __mingw_choose_expr ( __mingw_types_compatible_p (__typeof__ (x), long double), __fpclassifyl(x), __dfp_expansion(__fpclassify,(__builtin_trap(),0),x)))) #define isfinite(x) ((fpclassify(x) & FP_NAN) == 0) @@ -3297,11 +3688,57 @@ typedef double double_t; extern int __attribute__((__cdecl__)) __isnan (double); extern int __attribute__((__cdecl__)) __isnanf (float); extern int __attribute__((__cdecl__)) __isnanl (long double); + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) __isnan (double _x) + { + __mingw_fp_types_t hlp; + int l, h; + hlp.d = &_x; + l = hlp.dt->lh.low; + h = hlp.dt->lh.high & 0x7fffffff; + h |= (unsigned int) (l | -l) >> 31; + h = 0x7ff00000 - h; + return (int) ((unsigned int) h) >> 31; + } + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) __isnanf (float _x) + { + __mingw_fp_types_t hlp; + int i; + hlp.f = &_x; + i = hlp.ft->val & 0x7fffffff; + i = 0x7f800000 - i; + return (int) (((unsigned int) i) >> 31); + } + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) __isnanl (long double _x) + { + __mingw_fp_types_t ld; + int xx, signexp; + ld.ld = &_x; + signexp = (ld.ldt->lh.sign_exponent & 0x7fff) << 1; + xx = (int) (ld.ldt->lh.low | (ld.ldt->lh.high & 0x7fffffffu)); + signexp |= (unsigned int) (xx | (-xx)) >> 31; + signexp = 0xfffe - signexp; + return (int) ((unsigned int) signexp) >> 16; + } #define isnan(x) __mingw_choose_expr ( __mingw_types_compatible_p (__typeof__ (x), double), __isnan(x), __mingw_choose_expr ( __mingw_types_compatible_p (__typeof__ (x), float), __isnanf(x), __mingw_choose_expr ( __mingw_types_compatible_p (__typeof__ (x), long double), __isnanl(x), __dfp_expansion(__isnan,(__builtin_trap(),x),x)))) #define isnormal(x) (fpclassify(x) == FP_NORMAL) extern int __attribute__((__cdecl__)) __signbit (double); extern int __attribute__((__cdecl__)) __signbitf (float); extern int __attribute__((__cdecl__)) __signbitl (long double); + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) __signbit (double x) { + __mingw_fp_types_t hlp; + hlp.d = &x; + return ((hlp.dt->lh.high & 0x80000000) != 0); + } + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) __signbitf (float x) { + __mingw_fp_types_t hlp; + hlp.f = &x; + return ((hlp.ft->val & 0x80000000) != 0); + } + extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) __signbitl (long double x) { + __mingw_fp_types_t ld; + ld.ld = &x; + return ((ld.ldt->lh.sign_exponent & 0x8000) != 0); + } #define signbit(x) __mingw_choose_expr ( __mingw_types_compatible_p (__typeof__ (x), double), __signbit(x), __mingw_choose_expr ( __mingw_types_compatible_p (__typeof__ (x), float), __signbitf(x), __mingw_choose_expr ( __mingw_types_compatible_p (__typeof__ (x), long double), __signbitl(x), __dfp_expansion(__signbit,(__builtin_trap(),x),x)))) extern float __attribute__((__cdecl__)) sinf(float _X); extern long double __attribute__((__cdecl__)) sinl(long double); @@ -3318,10 +3755,13 @@ typedef double double_t; extern float __attribute__((__cdecl__)) atan2f (float, float); extern long double __attribute__((__cdecl__)) atan2l (long double, long double); extern float __attribute__((__cdecl__)) sinhf(float _X); + extern inline __attribute__((__gnu_inline__)) float sinhf(float _X) { return ((float)sinh((double)_X)); } extern long double __attribute__((__cdecl__)) sinhl(long double); extern float __attribute__((__cdecl__)) coshf(float _X); + extern inline __attribute__((__gnu_inline__)) float coshf(float _X) { return ((float)cosh((double)_X)); } extern long double __attribute__((__cdecl__)) coshl(long double); extern float __attribute__((__cdecl__)) tanhf(float _X); + extern inline __attribute__((__gnu_inline__)) float tanhf(float _X) { return ((float)tanh((double)_X)); } extern long double __attribute__((__cdecl__)) tanhl(long double); extern double __attribute__((__cdecl__)) acosh (double); extern float __attribute__((__cdecl__)) acoshf (float); @@ -3333,6 +3773,7 @@ typedef double double_t; extern float __attribute__((__cdecl__)) atanhf (float); extern long double __attribute__((__cdecl__)) atanhl (long double); extern float __attribute__((__cdecl__)) expf(float _X); + extern inline __attribute__((__gnu_inline__)) float expf(float _X) { return ((float)exp((double)_X)); } extern long double __attribute__((__cdecl__)) expl(long double); extern double __attribute__((__cdecl__)) exp2(double); extern float __attribute__((__cdecl__)) exp2f(float); @@ -3341,6 +3782,7 @@ typedef double double_t; extern float __attribute__((__cdecl__)) expm1f(float); extern long double __attribute__((__cdecl__)) expm1l(long double); extern float frexpf(float _X,int *_Y); + extern inline __attribute__((__gnu_inline__)) float frexpf(float _X,int *_Y) { return ((float)frexp((double)_X,_Y)); } extern long double __attribute__((__cdecl__)) frexpl(long double,int *); #define FP_ILOGB0 ((int)0x80000000) #define FP_ILOGBNAN ((int)0x80000000) @@ -3348,6 +3790,7 @@ typedef double double_t; extern int __attribute__((__cdecl__)) ilogbf (float); extern int __attribute__((__cdecl__)) ilogbl (long double); extern float __attribute__((__cdecl__)) ldexpf(float _X,int _Y); + extern inline __attribute__((__gnu_inline__)) float __attribute__((__cdecl__)) ldexpf (float x, int expn) { return (float) ldexp ((double)x, expn); } extern long double __attribute__((__cdecl__)) ldexpl (long double, int); extern float __attribute__((__cdecl__)) logf (float); extern long double __attribute__((__cdecl__)) logl(long double); @@ -3375,8 +3818,10 @@ typedef double double_t; extern long double __attribute__((__cdecl__)) cbrtl (long double); extern double __attribute__((__cdecl__)) hypot (double, double) ; extern float __attribute__((__cdecl__)) hypotf (float x, float y); + extern inline __attribute__((__gnu_inline__)) float __attribute__((__cdecl__)) hypotf (float x, float y) { return (float) hypot ((double)x, (double)y);} extern long double __attribute__((__cdecl__)) hypotl (long double, long double); extern float __attribute__((__cdecl__)) powf(float _X,float _Y); + extern inline __attribute__((__gnu_inline__)) float powf(float _X,float _Y) { return ((float)pow((double)_X,(double)_Y)); } extern long double __attribute__((__cdecl__)) powl (long double, long double); extern float __attribute__((__cdecl__)) sqrtf (float); extern long double sqrtl(long double); @@ -3432,6 +3877,20 @@ __extension__ long long __attribute__((__cdecl__)) llrintl (long double); extern double __attribute__((__cdecl__)) copysign (double, double); extern float __attribute__((__cdecl__)) copysignf (float, float); extern long double __attribute__((__cdecl__)) copysignl (long double, long double); + extern inline __attribute__((__gnu_inline__)) double __attribute__((__cdecl__)) copysign (double x, double y) + { + __mingw_dbl_type_t hx, hy; + hx.x = x; hy.x = y; + hx.lh.high = (hx.lh.high & 0x7fffffff) | (hy.lh.high & 0x80000000); + return hx.x; + } + extern inline __attribute__((__gnu_inline__)) float __attribute__((__cdecl__)) copysignf (float x, float y) + { + __mingw_flt_type_t hx, hy; + hx.x = x; hy.x = y; + hx.val = (hx.val & 0x7fffffff) | (hy.val & 0x80000000); + return hx.x; + } extern double __attribute__((__cdecl__)) nan(const char *tagp); extern float __attribute__((__cdecl__)) nanf(const char *tagp); extern long double __attribute__((__cdecl__)) nanl(const char *tagp); @@ -3628,6 +4087,10 @@ void ruby_xfree(void*); #define NTDDI_WINBLUE 0x06030000 #define NTDDI_WINTHRESHOLD 0x0A000000 #define NTDDI_WIN10 0x0A000000 +#define NTDDI_WIN10_TH2 0x0A000001 +#define NTDDI_WIN10_RS1 0x0A000002 +#define NTDDI_WIN10_RS2 0x0A000003 +#define NTDDI_WIN10_RS3 0x0A000004 #define OSVERSION_MASK 0xFFFF0000U #define SPVERSION_MASK 0x0000FF00 #define SUBVERSION_MASK 0x000000FF @@ -6791,7 +7254,11 @@ enum _mm_hint _MM_HINT_T2 = 1, _MM_HINT_NTA = 0 }; -#define _mm_prefetch(P,I) __builtin_prefetch ((P), ((I & 0x4) >> 2), (I & 0x3)) +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_prefetch (const void *__P, enum _mm_hint __I) +{ + __builtin_prefetch (__P, (__I & 0x4) >> 2, __I & 0x3); +} typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); typedef float __v4sf __attribute__ ((__vector_size__ (16))); @@ -7288,7 +7755,11 @@ _mm_cvtps_pi8(__m128 __A) __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL); } -#define _mm_shuffle_ps(A,B,MASK) ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(MASK))) +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) +{ + return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask); +} extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_ps (__m128 __A, __m128 __B) { @@ -7484,10 +7955,26 @@ _mm_move_ss (__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B); } -#define _mm_extract_pi16(A,N) ((int) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N))) -#define _m_pextrw(A,N) _mm_extract_pi16(A, N) -#define _mm_insert_pi16(A,D,N) ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A), (int)(D), (int)(N))) -#define _m_pinsrw(A,D,N) _mm_insert_pi16(A, D, N) +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_pi16 (__m64 const __A, int const __N) +{ + return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pextrw (__m64 const __A, int const __N) +{ + return _mm_extract_pi16 (__A, __N); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_pi16 (__m64 const __A, int const __D, int const __N) +{ + return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pinsrw (__m64 const __A, int const __D, int const __N) +{ + return _mm_insert_pi16 (__A, __D, __N); +} extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_pi16 (__m64 __A, __m64 __B) { @@ -7548,8 +8035,16 @@ _m_pmulhuw (__m64 __A, __m64 __B) { return _mm_mulhi_pu16 (__A, __B); } -#define _mm_shuffle_pi16(A,N) ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N))) -#define _m_pshufw(A,N) _mm_shuffle_pi16 (A, N) +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi16 (__m64 __A, int const __N) +{ + return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pshufw (__m64 __A, int const __N) +{ + return _mm_shuffle_pi16 (__A, __N); +} extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) { @@ -8290,7 +8785,11 @@ _mm_cvtss_sd (__m128d __A, __m128 __B) { return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B); } -#define _mm_shuffle_pd(A,B,N) ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(N))) +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) +{ + return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask); +} extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpackhi_pd (__m128d __A, __m128d __B) { @@ -8501,10 +9000,26 @@ _mm_srai_epi32 (__m128i __A, int __B) { return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B); } -#define _mm_bsrli_si128(A,N) ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8)) -#define _mm_bslli_si128(A,N) ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8)) -#define _mm_srli_si128(A,N) ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8)) -#define _mm_slli_si128(A,N) ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8)) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bsrli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bslli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8); +} extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srli_epi16 (__m128i __A, int __B) { @@ -8625,8 +9140,16 @@ _mm_cmpgt_epi32 (__m128i __A, __m128i __B) { return (__m128i) ((__v4si)__A > (__v4si)__B); } -#define _mm_extract_epi16(A,N) ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N))) -#define _mm_insert_epi16(A,D,N) ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A), (int)(D), (int)(N))) +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi16 (__m128i const __A, int const __N) +{ + return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi16 (__m128i const __A, int const __D, int const __N) +{ + return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N); +} extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_epi16 (__m128i __A, __m128i __B) { @@ -8657,9 +9180,21 @@ _mm_mulhi_epu16 (__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B); } -#define _mm_shufflehi_epi16(A,N) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), (int)(N))) -#define _mm_shufflelo_epi16(A,N) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), (int)(N))) -#define _mm_shuffle_epi32(A,N) ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), (int)(N))) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shufflehi_epi16 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shufflelo_epi16 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi32 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask); +} extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) { @@ -8766,6 +9301,9 @@ _mm_pause (void) __builtin_ia32_pause (); } #define _PMMINTRIN_H_INCLUDED +#pragma GCC push_options +#pragma GCC target("sse3") +#define __DISABLE_SSE3__ #define _MM_DENORMALS_ZERO_MASK 0x0040 #define _MM_DENORMALS_ZERO_ON 0x0040 #define _MM_DENORMALS_ZERO_OFF 0x0000 @@ -8819,7 +9357,7 @@ _mm_loaddup_pd (double const *__P) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movedup_pd (__m128d __X) { - return ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(__X), (__v2df)(__m128d)(__X), (int)((((0) << 1) | (0))))); + return _mm_shuffle_pd (__X, __X, (((0) << 1) | (0))); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_lddqu_si128 (__m128i const *__P) @@ -8836,6 +9374,8 @@ _mm_mwait (unsigned int __E, unsigned int __H) { __builtin_ia32_mwait (__E, __H); } +#undef __DISABLE_SSE3__ +#pragma GCC pop_options #define _TMMINTRIN_H_INCLUDED #pragma GCC push_options #pragma GCC target("ssse3") @@ -8960,8 +9500,18 @@ _mm_sign_pi32 (__m64 __X, __m64 __Y) { return (__m64) __builtin_ia32_psignd ((__v2si)__X, (__v2si)__Y); } -#define _mm_alignr_epi8(X,Y,N) ((__m128i) __builtin_ia32_palignr128 ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(N) * 8)) -#define _mm_alignr_pi8(X,Y,N) ((__m64) __builtin_ia32_palignr ((__v1di)(__m64)(X), (__v1di)(__m64)(Y), (int)(N) * 8)) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) +{ + return (__m128i) __builtin_ia32_palignr128 ((__v2di)__X, + (__v2di)__Y, __N * 8); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N) +{ + return (__m64) __builtin_ia32_palignr ((__v1di)__X, + (__v1di)__Y, __N * 8); +} extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_abs_epi8 (__m128i __X) { @@ -8995,6 +9545,9 @@ _mm_abs_pi32 (__m64 __X) #undef __DISABLE_SSSE3__ #pragma GCC pop_options #define _AMMINTRIN_H_INCLUDED +#pragma GCC push_options +#pragma GCC target("sse4a") +#define __DISABLE_SSE4A__ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_stream_sd (double * __P, __m128d __Y) { @@ -9010,13 +9563,23 @@ _mm_extract_si64 (__m128i __X, __m128i __Y) { return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y); } -#define _mm_extracti_si64(X,I,L) ((__m128i) __builtin_ia32_extrqi ((__v2di)(__m128i)(X), (unsigned int)(I), (unsigned int)(L))) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extracti_si64 (__m128i __X, unsigned const int __I, unsigned const int __L) +{ + return (__m128i) __builtin_ia32_extrqi ((__v2di) __X, __I, __L); +} extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_insert_si64 (__m128i __X,__m128i __Y) { return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y); } -#define _mm_inserti_si64(X,Y,I,L) ((__m128i) __builtin_ia32_insertqi ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (unsigned int)(I), (unsigned int)(L))) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_inserti_si64(__m128i __X, __m128i __Y, unsigned const int __I, unsigned const int __L) +{ + return (__m128i) __builtin_ia32_insertqi ((__v2di)__X, (__v2di)__Y, __I, __L); +} +#undef __DISABLE_SSE4A__ +#pragma GCC pop_options #define _SMMINTRIN_H_INCLUDED #pragma GCC push_options #pragma GCC target("sse4.1") @@ -9052,10 +9615,30 @@ _mm_testnzc_si128 (__m128i __M, __m128i __V) #define _mm_test_all_zeros(M,V) _mm_testz_si128 ((M), (V)) #define _mm_test_all_ones(V) _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V))) #define _mm_test_mix_ones_zeros(M,V) _mm_testnzc_si128 ((M), (V)) -#define _mm_round_pd(V,M) ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M))) -#define _mm_round_sd(D,V,M) ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D), (__v2df)(__m128d)(V), (int)(M))) -#define _mm_round_ps(V,M) ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M))) -#define _mm_round_ss(D,V,M) ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D), (__v4sf)(__m128)(V), (int)(M))) +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_pd (__m128d __V, const int __M) +{ + return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_sd(__m128d __D, __m128d __V, const int __M) +{ + return (__m128d) __builtin_ia32_roundsd ((__v2df)__D, + (__v2df)__V, + __M); +} +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ps (__m128 __V, const int __M) +{ + return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M); +} +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ss (__m128 __D, __m128 __V, const int __M) +{ + return (__m128) __builtin_ia32_roundss ((__v4sf)__D, + (__v4sf)__V, + __M); +} #define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL) #define _mm_ceil_sd(D,V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL) #define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) @@ -9064,7 +9647,13 @@ _mm_testnzc_si128 (__m128i __M, __m128i __V) #define _mm_ceil_ss(D,V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL) #define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR) #define _mm_floor_ss(D,V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR) -#define _mm_blend_epi16(X,Y,M) ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(M))) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X, + (__v8hi)__Y, + __M); +} extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M) { @@ -9072,7 +9661,13 @@ _mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M) (__v16qi)__Y, (__v16qi)__M); } -#define _mm_blend_ps(X,Y,M) ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(M))) +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_ps (__m128 __X, __m128 __Y, const int __M) +{ + return (__m128) __builtin_ia32_blendps ((__v4sf)__X, + (__v4sf)__Y, + __M); +} extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M) { @@ -9080,7 +9675,13 @@ _mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M) (__v4sf)__Y, (__v4sf)__M); } -#define _mm_blend_pd(X,Y,M) ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(M))) +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_pd (__m128d __X, __m128d __Y, const int __M) +{ + return (__m128d) __builtin_ia32_blendpd ((__v2df)__X, + (__v2df)__Y, + __M); +} extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M) { @@ -9088,8 +9689,20 @@ _mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M) (__v2df)__Y, (__v2df)__M); } -#define _mm_dp_ps(X,Y,M) ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(M))) -#define _mm_dp_pd(X,Y,M) ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(M))) +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dp_ps (__m128 __X, __m128 __Y, const int __M) +{ + return (__m128) __builtin_ia32_dpps ((__v4sf)__X, + (__v4sf)__Y, + __M); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dp_pd (__m128d __X, __m128d __Y, const int __M) +{ + return (__m128d) __builtin_ia32_dppd ((__v2df)__X, + (__v2df)__Y, + __M); +} extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_epi64 (__m128i __X, __m128i __Y) { @@ -9145,17 +9758,56 @@ _mm_mul_epi32 (__m128i __X, __m128i __Y) { return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y); } -#define _mm_insert_ps(D,S,N) ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D), (__v4sf)(__m128)(S), (int)(N))) +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_ps (__m128 __D, __m128 __S, const int __N) +{ + return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D, + (__v4sf)__S, + __N); +} #define _MM_MK_INSERTPS_NDX(S,D,M) (((S) << 6) | ((D) << 4) | (M)) -#define _mm_extract_ps(X,N) (__extension__ ({ union { int i; float f; } __tmp; __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), (int)(N)); __tmp.i; })) +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_ps (__m128 __X, const int __N) +{ + union { int i; float f; } __tmp; + __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N); + return __tmp.i; +} #define _MM_EXTRACT_FLOAT(D,S,N) { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); } #define _MM_PICK_OUT_PS(X,N) _mm_insert_ps (_mm_setzero_ps (), (X), _MM_MK_INSERTPS_NDX ((N), 0, 0x0e)) -#define _mm_insert_epi8(D,S,N) ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D), (int)(S), (int)(N))) -#define _mm_insert_epi32(D,S,N) ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D), (int)(S), (int)(N))) -#define _mm_insert_epi64(D,S,N) ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D), (long long)(S), (int)(N))) -#define _mm_extract_epi8(X,N) ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N))) -#define _mm_extract_epi32(X,N) ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N))) -#define _mm_extract_epi64(X,N) ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N))) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi8 (__m128i __D, int __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D, + __S, __N); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi32 (__m128i __D, int __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D, + __S, __N); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi64 (__m128i __D, long long __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D, + __S, __N); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi8 (__m128i __X, const int __N) +{ + return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi32 (__m128i __X, const int __N) +{ + return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N); +} +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi64 (__m128i __X, const int __N) +{ + return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N); +} extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_minpos_epu16 (__m128i __X) { @@ -9226,7 +9878,12 @@ _mm_packus_epi32 (__m128i __X, __m128i __Y) { return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y); } -#define _mm_mpsadbw_epu8(X,Y,M) ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M))) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X, + (__v16qi)__Y, __M); +} extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_stream_load_si128 (__m128i *__X) { @@ -9251,20 +9908,104 @@ _mm_stream_load_si128 (__m128i *__X) #define _SIDD_MOST_SIGNIFICANT 0x40 #define _SIDD_BIT_MASK 0x00 #define _SIDD_UNIT_MASK 0x40 -#define _mm_cmpistrm(X,Y,M) ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M))) -#define _mm_cmpistri(X,Y,M) ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M))) -#define _mm_cmpestrm(X,LX,Y,LY,M) ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M))) -#define _mm_cmpestri(X,LX,Y,LY,M) ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M))) -#define _mm_cmpistra(X,Y,M) ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M))) -#define _mm_cmpistrc(X,Y,M) ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M))) -#define _mm_cmpistro(X,Y,M) ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M))) -#define _mm_cmpistrs(X,Y,M) ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M))) -#define _mm_cmpistrz(X,Y,M) ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M))) -#define _mm_cmpestra(X,LX,Y,LY,M) ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M))) -#define _mm_cmpestrc(X,LX,Y,LY,M) ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M))) -#define _mm_cmpestro(X,LX,Y,LY,M) ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M))) -#define _mm_cmpestrs(X,LX,Y,LY,M) ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M))) -#define _mm_cmpestrz(X,LX,Y,LY,M) ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M))) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrm (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistri (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistri128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistra (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistria128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrc (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistric128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistro (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistrio128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrs (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistris128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrz (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistriz128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_epi64 (__m128i __X, __m128i __Y) { @@ -9275,6 +10016,9 @@ _mm_cmpgt_epi64 (__m128i __X, __m128i __Y) #undef __DISABLE_SSE4_1__ #pragma GCC pop_options #define _POPCNTINTRIN_H_INCLUDED +#pragma GCC push_options +#pragma GCC target("popcnt") +#define __DISABLE_POPCNT__ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_popcnt_u32 (unsigned int __X) { @@ -9285,6 +10029,8 @@ _mm_popcnt_u64 (unsigned long long __X) { return __builtin_popcountll (__X); } +#undef __DISABLE_POPCNT__ +#pragma GCC pop_options #pragma GCC push_options #pragma GCC target("sse4.1") #define __DISABLE_SSE4_1__ @@ -9345,13 +10091,22 @@ _mm_aesimc_si128 (__m128i __X) { return (__m128i) __builtin_ia32_aesimc128 ((__v2di)__X); } -#define _mm_aeskeygenassist_si128(X,C) ((__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)(__m128i)(X), (int)(C))) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aeskeygenassist_si128 (__m128i __X, const int __C) +{ + return (__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)__X, __C); +} #undef __DISABLE_AES__ #pragma GCC pop_options #pragma GCC push_options #pragma GCC target("pclmul,sse2") #define __DISABLE_PCLMUL__ -#define _mm_clmulepi64_si128(X,Y,I) ((__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(I))) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clmulepi64_si128 (__m128i __X, __m128i __Y, const int __I) +{ + return (__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)__X, + (__v2di)__Y, __I); +} #undef __DISABLE_PCLMUL__ #pragma GCC pop_options #define _IMMINTRIN_H_INCLUDED @@ -9456,8 +10211,20 @@ _mm256_andnot_ps (__m256 __A, __m256 __B) { return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B); } -#define _mm256_blend_pd(X,Y,M) ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(M))) -#define _mm256_blend_ps(X,Y,M) ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(M))) +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M) +{ + return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X, + (__v4df)__Y, + __M); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M) +{ + return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X, + (__v8sf)__Y, + __M); +} extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M) { @@ -9482,7 +10249,13 @@ _mm256_div_ps (__m256 __A, __m256 __B) { return (__m256) ((__v8sf)__A / (__v8sf)__B); } -#define _mm256_dp_ps(X,Y,M) ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(M))) +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M) +{ + return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X, + (__v8sf)__Y, + __M); +} extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_hadd_pd (__m256d __X, __m256d __Y) { @@ -9543,8 +10316,18 @@ _mm256_or_ps (__m256 __A, __m256 __B) { return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B); } -#define _mm256_shuffle_pd(A,B,N) ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(N))) -#define _mm256_shuffle_ps(A,B,N) ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(N))) +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask) +{ + return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B, + __mask); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask) +{ + return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B, + __mask); +} extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_sub_pd (__m256d __A, __m256d __B) { @@ -9565,12 +10348,38 @@ _mm256_xor_ps (__m256 __A, __m256 __B) { return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B); } -#define _mm_cmp_pd(X,Y,P) ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P))) -#define _mm_cmp_ps(X,Y,P) ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P))) -#define _mm256_cmp_pd(X,Y,P) ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(P))) -#define _mm256_cmp_ps(X,Y,P) ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(P))) -#define _mm_cmp_sd(X,Y,P) ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P))) -#define _mm_cmp_ss(X,Y,P) ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P))) +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P) +{ + return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P); +} +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P) +{ + return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P); +} +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P) +{ + return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y, + __P); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P) +{ + return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y, + __P); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P) +{ + return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P); +} +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P) +{ + return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P); +} extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_cvtepi32_pd (__m128i __A) { @@ -9623,13 +10432,45 @@ _mm256_cvtss_f32 (__m256 __A) { return __A[0]; } -#define _mm256_extractf128_pd(X,N) ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), (int)(N))) -#define _mm256_extractf128_ps(X,N) ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), (int)(N))) -#define _mm256_extractf128_si256(X,N) ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), (int)(N))) -#define _mm256_extract_epi32(X,N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); _mm_extract_epi32 (__Y, (N) % 4); })) -#define _mm256_extract_epi16(X,N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); _mm_extract_epi16 (__Y, (N) % 8); })) -#define _mm256_extract_epi8(X,N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); _mm_extract_epi8 (__Y, (N) % 16); })) -#define _mm256_extract_epi64(X,N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); _mm_extract_epi64 (__Y, (N) % 2); })) +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_pd (__m256d __X, const int __N) +{ + return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N); +} +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_ps (__m256 __X, const int __N) +{ + return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_si256 (__m256i __X, const int __N) +{ + return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi32 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); + return _mm_extract_epi32 (__Y, __N % 4); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi16 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); + return _mm_extract_epi16 (__Y, __N % 8); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi8 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); + return _mm_extract_epi8 (__Y, __N % 16); +} +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi64 (__m256i __X, const int __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); + return _mm_extract_epi64 (__Y, __N % 2); +} extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_zeroall (void) { @@ -9664,13 +10505,47 @@ _mm256_permutevar_ps (__m256 __A, __m256i __C) return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A, (__v8si)__C); } -#define _mm_permute_pd(X,C) ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C))) -#define _mm256_permute_pd(X,C) ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C))) -#define _mm_permute_ps(X,C) ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C))) -#define _mm256_permute_ps(X,C) ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C))) -#define _mm256_permute2f128_pd(X,Y,C) ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(C))) -#define _mm256_permute2f128_ps(X,Y,C) ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(C))) -#define _mm256_permute2f128_si256(X,Y,C) ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C))) +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute_pd (__m128d __X, const int __C) +{ + return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C); +} +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute_pd (__m256d __X, const int __C) +{ + return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C); +} +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute_ps (__m128 __X, const int __C) +{ + return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute_ps (__m256 __X, const int __C) +{ + return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C); +} +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C) +{ + return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X, + (__v4df)__Y, + __C); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C) +{ + return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X, + (__v8sf)__Y, + __C); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C) +{ + return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X, + (__v8si)__Y, + __C); +} extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_broadcast_ss (float const *__X) { @@ -9696,13 +10571,55 @@ _mm256_broadcast_ps (__m128 const *__X) { return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X); } -#define _mm256_insertf128_pd(X,Y,O) ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), (__v2df)(__m128d)(Y), (int)(O))) -#define _mm256_insertf128_ps(X,Y,O) ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), (__v4sf)(__m128)(Y), (int)(O))) -#define _mm256_insertf128_si256(X,Y,O) ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), (__v4si)(__m128i)(Y), (int)(O))) -#define _mm256_insert_epi32(X,D,N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); _mm256_insertf128_si256 ((X), __Y, (N) >> 2); })) -#define _mm256_insert_epi16(X,D,N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); _mm256_insertf128_si256 ((X), __Y, (N) >> 3); })) -#define _mm256_insert_epi8(X,D,N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); _mm256_insertf128_si256 ((X), __Y, (N) >> 4); })) -#define _mm256_insert_epi64(X,D,N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); _mm256_insertf128_si256 ((X), __Y, (N) >> 1); })) +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O) +{ + return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X, + (__v2df)__Y, + __O); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O) +{ + return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X, + (__v4sf)__Y, + __O); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O) +{ + return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X, + (__v4si)__Y, + __O); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi32 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); + __Y = _mm_insert_epi32 (__Y, __D, __N % 4); + return _mm256_insertf128_si256 (__X, __Y, __N >> 2); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi16 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); + __Y = _mm_insert_epi16 (__Y, __D, __N % 8); + return _mm256_insertf128_si256 (__X, __Y, __N >> 3); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi8 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); + __Y = _mm_insert_epi8 (__Y, __D, __N % 16); + return _mm256_insertf128_si256 (__X, __Y, __N >> 4); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi64 (__m256i __X, long long __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); + __Y = _mm_insert_epi64 (__Y, __D, __N % 2); + return _mm256_insertf128_si256 (__X, __Y, __N >> 1); +} extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_load_pd (double const *__P) { @@ -9862,8 +10779,16 @@ _mm256_sqrt_ps (__m256 __A) { return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A); } -#define _mm256_round_pd(V,M) ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M))) -#define _mm256_round_ps(V,M) ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M))) +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_round_pd (__m256d __V, const int __M) +{ + return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_round_ps (__m256 __V, const int __M) +{ + return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M); +} #define _mm256_ceil_pd(V) _mm256_round_pd ((V), _MM_FROUND_CEIL) #define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR) #define _mm256_ceil_ps(V) _mm256_round_ps ((V), _MM_FROUND_CEIL) @@ -10212,17 +11137,17 @@ _mm256_castsi128_si256 (__m128i __A) extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_set_m128 ( __m128 __H, __m128 __L) { - return ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(_mm256_castps128_ps256 (__L)), (__v4sf)(__m128)(__H), (int)(1))); + return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1); } extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_set_m128d (__m128d __H, __m128d __L) { - return ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(_mm256_castpd128_pd256 (__L)), (__v2df)(__m128d)(__H), (int)(1))); + return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1); } extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_set_m128i (__m128i __H, __m128i __L) { - return ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(_mm256_castsi128_si256 (__L)), (__v4si)(__m128i)(__H), (int)(1))); + return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1); } extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_setr_m128 (__m128 __L, __m128 __H) @@ -10245,7 +11170,13 @@ _mm256_setr_m128i (__m128i __L, __m128i __H) #pragma GCC push_options #pragma GCC target("avx2") #define __DISABLE_AVX2__ -#define _mm256_mpsadbw_epu8(X,Y,M) ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(M))) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X, + (__v32qi)__Y, __M); +} extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_abs_epi8 (__m256i __A) @@ -10336,7 +11267,14 @@ _mm256_adds_epu16 (__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B); } -#define _mm256_alignr_epi8(A,B,N) ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (int)(N) * 8)) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N) +{ + return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A, + (__v4di)__B, + __N * 8); +} extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_and_si256 (__m256i __A, __m256i __B) @@ -10369,7 +11307,14 @@ _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M) (__v32qi)__Y, (__v32qi)__M); } -#define _mm256_blend_epi16(X,Y,M) ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(M))) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X, + (__v16hi)__Y, + __M); +} extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_cmpeq_epi8 (__m256i __A, __m256i __B) @@ -10684,9 +11629,24 @@ _mm256_shuffle_epi8 (__m256i __X, __m256i __Y) return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X, (__v32qi)__Y); } -#define _mm256_shuffle_epi32(A,N) ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N))) -#define _mm256_shufflehi_epi16(A,N) ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N))) -#define _mm256_shufflelo_epi16(A,N) ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N))) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_epi32 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shufflehi_epi16 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shufflelo_epi16 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask); +} extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_sign_epi8 (__m256i __X, __m256i __Y) @@ -10705,8 +11665,18 @@ _mm256_sign_epi32 (__m256i __X, __m256i __Y) { return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y); } -#define _mm256_bslli_epi128(A,N) ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8)) -#define _mm256_slli_si256(A,N) ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8)) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_bslli_epi128 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_si256 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8); +} extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_slli_epi16 (__m256i __A, int __B) @@ -10767,8 +11737,18 @@ _mm256_sra_epi32 (__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B); } -#define _mm256_bsrli_epi128(A,N) ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8)) -#define _mm256_srli_si256(A,N) ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8)) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_bsrli_epi128 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_si256 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8); +} extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_srli_epi16 (__m256i __A, int __B) @@ -10937,8 +11917,22 @@ _mm256_broadcastsi128_si256 (__m128i __X) { return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X); } -#define _mm_blend_epi32(X,Y,M) ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(M))) -#define _mm256_blend_epi32(X,Y,M) ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(M))) +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X, + (__v4si)__Y, + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X, + (__v8si)__Y, + __M); +} extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_broadcastb_epi8 (__m128i __X) @@ -10993,17 +11987,42 @@ _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y) { return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y); } -#define _mm256_permute4x64_pd(X,M) ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M))) +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute4x64_pd (__m256d __X, const int __M) +{ + return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M); +} extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_permutevar8x32_ps (__m256 __X, __m256i __Y) { return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y); } -#define _mm256_permute4x64_epi64(X,M) ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M))) -#define _mm256_permute2x128_si256(X,Y,M) ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M))) -#define _mm256_extracti128_si256(X,M) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M))) -#define _mm256_inserti128_si256(X,Y,M) ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), (__v2di)(__m128i)(Y), (int)(M))) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute4x64_epi64 (__m256i __X, const int __M) +{ + return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extracti128_si256 (__m256i __X, const int __M) +{ + return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M); +} extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskload_epi32 (int const *__X, __m256i __M ) @@ -11116,38 +12135,382 @@ _mm_srlv_epi64 (__m128i __X, __m128i __Y) { return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y); } -#define _mm_i32gather_pd(BASE,INDEX,SCALE) (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), (double const *)BASE, (__v4si)(__m128i)INDEX, (__v2df)_mm_set1_pd( (double)(long long int) -1), (int)SCALE) -#define _mm_mask_i32gather_pd(SRC,BASE,INDEX,MASK,SCALE) (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, (double const *)BASE, (__v4si)(__m128i)INDEX, (__v2df)(__m128d)MASK, (int)SCALE) -#define _mm256_i32gather_pd(BASE,INDEX,SCALE) (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), (double const *)BASE, (__v4si)(__m128i)INDEX, (__v4df)_mm256_set1_pd( (double)(long long int) -1), (int)SCALE) -#define _mm256_mask_i32gather_pd(SRC,BASE,INDEX,MASK,SCALE) (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, (double const *)BASE, (__v4si)(__m128i)INDEX, (__v4df)(__m256d)MASK, (int)SCALE) -#define _mm_i64gather_pd(BASE,INDEX,SCALE) (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), (double const *)BASE, (__v2di)(__m128i)INDEX, (__v2df)_mm_set1_pd( (double)(long long int) -1), (int)SCALE) -#define _mm_mask_i64gather_pd(SRC,BASE,INDEX,MASK,SCALE) (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, (double const *)BASE, (__v2di)(__m128i)INDEX, (__v2df)(__m128d)MASK, (int)SCALE) -#define _mm256_i64gather_pd(BASE,INDEX,SCALE) (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), (double const *)BASE, (__v4di)(__m256i)INDEX, (__v4df)_mm256_set1_pd( (double)(long long int) -1), (int)SCALE) -#define _mm256_mask_i64gather_pd(SRC,BASE,INDEX,MASK,SCALE) (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, (double const *)BASE, (__v4di)(__m256i)INDEX, (__v4df)(__m256d)MASK, (int)SCALE) -#define _mm_i32gather_ps(BASE,INDEX,SCALE) (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), (float const *)BASE, (__v4si)(__m128i)INDEX, _mm_set1_ps ((float)(int) -1), (int)SCALE) -#define _mm_mask_i32gather_ps(SRC,BASE,INDEX,MASK,SCALE) (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC, (float const *)BASE, (__v4si)(__m128i)INDEX, (__v4sf)(__m128d)MASK, (int)SCALE) -#define _mm256_i32gather_ps(BASE,INDEX,SCALE) (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), (float const *)BASE, (__v8si)(__m256i)INDEX, (__v8sf)_mm256_set1_ps ( (float)(int) -1), (int)SCALE) -#define _mm256_mask_i32gather_ps(SRC,BASE,INDEX,MASK,SCALE) (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, (float const *)BASE, (__v8si)(__m256i)INDEX, (__v8sf)(__m256d)MASK, (int)SCALE) -#define _mm_i64gather_ps(BASE,INDEX,SCALE) (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), (float const *)BASE, (__v2di)(__m128i)INDEX, (__v4sf)_mm_set1_ps ( (float)(int) -1), (int)SCALE) -#define _mm_mask_i64gather_ps(SRC,BASE,INDEX,MASK,SCALE) (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, (float const *)BASE, (__v2di)(__m128i)INDEX, (__v4sf)(__m128d)MASK, (int)SCALE) -#define _mm256_i64gather_ps(BASE,INDEX,SCALE) (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), (float const *)BASE, (__v4di)(__m256i)INDEX, (__v4sf)_mm_set1_ps( (float)(int) -1), (int)SCALE) -#define _mm256_mask_i64gather_ps(SRC,BASE,INDEX,MASK,SCALE) (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, (float const *)BASE, (__v4di)(__m256i)INDEX, (__v4sf)(__m128)MASK, (int)SCALE) -#define _mm_i32gather_epi64(BASE,INDEX,SCALE) (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), (long long const *)BASE, (__v4si)(__m128i)INDEX, (__v2di)_mm_set1_epi64x (-1), (int)SCALE) -#define _mm_mask_i32gather_epi64(SRC,BASE,INDEX,MASK,SCALE) (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, (long long const *)BASE, (__v4si)(__m128i)INDEX, (__v2di)(__m128i)MASK, (int)SCALE) -#define _mm256_i32gather_epi64(BASE,INDEX,SCALE) (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), (long long const *)BASE, (__v4si)(__m128i)INDEX, (__v4di)_mm256_set1_epi64x (-1), (int)SCALE) -#define _mm256_mask_i32gather_epi64(SRC,BASE,INDEX,MASK,SCALE) (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, (long long const *)BASE, (__v4si)(__m128i)INDEX, (__v4di)(__m256i)MASK, (int)SCALE) -#define _mm_i64gather_epi64(BASE,INDEX,SCALE) (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), (long long const *)BASE, (__v2di)(__m128i)INDEX, (__v2di)_mm_set1_epi64x (-1), (int)SCALE) -#define _mm_mask_i64gather_epi64(SRC,BASE,INDEX,MASK,SCALE) (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC, (long long const *)BASE, (__v2di)(__m128i)INDEX, (__v2di)(__m128i)MASK, (int)SCALE) -#define _mm256_i64gather_epi64(BASE,INDEX,SCALE) (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), (long long const *)BASE, (__v4di)(__m256i)INDEX, (__v4di)_mm256_set1_epi64x (-1), (int)SCALE) -#define _mm256_mask_i64gather_epi64(SRC,BASE,INDEX,MASK,SCALE) (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, (long long const *)BASE, (__v4di)(__m256i)INDEX, (__v4di)(__m256i)MASK, (int)SCALE) -#define _mm_i32gather_epi32(BASE,INDEX,SCALE) (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), (int const *)BASE, (__v4si)(__m128i)INDEX, (__v4si)_mm_set1_epi32 (-1), (int)SCALE) -#define _mm_mask_i32gather_epi32(SRC,BASE,INDEX,MASK,SCALE) (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, (int const *)BASE, (__v4si)(__m128i)INDEX, (__v4si)(__m128i)MASK, (int)SCALE) -#define _mm256_i32gather_epi32(BASE,INDEX,SCALE) (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), (int const *)BASE, (__v8si)(__m256i)INDEX, (__v8si)_mm256_set1_epi32 (-1), (int)SCALE) -#define _mm256_mask_i32gather_epi32(SRC,BASE,INDEX,MASK,SCALE) (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, (int const *)BASE, (__v8si)(__m256i)INDEX, (__v8si)(__m256i)MASK, (int)SCALE) -#define _mm_i64gather_epi32(BASE,INDEX,SCALE) (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), (int const *)BASE, (__v2di)(__m128i)INDEX, (__v4si)_mm_set1_epi32 (-1), (int)SCALE) -#define _mm_mask_i64gather_epi32(SRC,BASE,INDEX,MASK,SCALE) (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, (int const *)BASE, (__v2di)(__m128i)INDEX, (__v4si)(__m128i)MASK, (int)SCALE) -#define _mm256_i64gather_epi32(BASE,INDEX,SCALE) (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), (int const *)BASE, (__v4di)(__m256i)INDEX, (__v4si)_mm_set1_epi32(-1), (int)SCALE) -#define _mm256_mask_i64gather_epi32(SRC,BASE,INDEX,MASK,SCALE) (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, (int const *)BASE, (__v4di)(__m256i)INDEX, (__v4si)(__m128i)MASK, (int)SCALE) +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32gather_pd (double const *__base, __m128i __index, const int __scale) +{ + __v2df __zero = _mm_setzero_pd (); + __v2df __mask = _mm_cmpeq_pd (__zero, __zero); + return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (), + __base, + (__v4si)__index, + __mask, + __scale); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index, + __m128d __mask, const int __scale) +{ + return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src, + __base, + (__v4si)__index, + (__v2df)__mask, + __scale); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale) +{ + __v4df __zero = _mm256_setzero_pd (); + __v4df __mask = _mm256_cmp_pd (__zero, __zero, 0x00); + return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (), + __base, + (__v4si)__index, + __mask, + __scale); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_pd (__m256d __src, double const *__base, + __m128i __index, __m256d __mask, const int __scale) +{ + return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src, + __base, + (__v4si)__index, + (__v4df)__mask, + __scale); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_pd (double const *__base, __m128i __index, const int __scale) +{ + __v2df __src = _mm_setzero_pd (); + __v2df __mask = _mm_cmpeq_pd (__src, __src); + return (__m128d) __builtin_ia32_gatherdiv2df (__src, + __base, + (__v2di)__index, + __mask, + __scale); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_pd (__m128d __src, double const *__base, __m128i __index, + __m128d __mask, const int __scale) +{ + return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src, + __base, + (__v2di)__index, + (__v2df)__mask, + __scale); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale) +{ + __v4df __src = _mm256_setzero_pd (); + __v4df __mask = _mm256_cmp_pd (__src, __src, 0x00); + return (__m256d) __builtin_ia32_gatherdiv4df (__src, + __base, + (__v4di)__index, + __mask, + __scale); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_pd (__m256d __src, double const *__base, + __m256i __index, __m256d __mask, const int __scale) +{ + return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src, + __base, + (__v4di)__index, + (__v4df)__mask, + __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32gather_ps (float const *__base, __m128i __index, const int __scale) +{ + __v4sf __src = _mm_setzero_ps (); + __v4sf __mask = _mm_cmpeq_ps (__src, __src); + return (__m128) __builtin_ia32_gathersiv4sf (__src, + __base, + (__v4si)__index, + __mask, + __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index, + __m128 __mask, const int __scale) +{ + return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src, + __base, + (__v4si)__index, + (__v4sf)__mask, + __scale); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale) +{ + __v8sf __src = _mm256_setzero_ps (); + __v8sf __mask = _mm256_cmp_ps (__src, __src, 0x00); + return (__m256) __builtin_ia32_gathersiv8sf (__src, + __base, + (__v8si)__index, + __mask, + __scale); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_ps (__m256 __src, float const *__base, + __m256i __index, __m256 __mask, const int __scale) +{ + return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src, + __base, + (__v8si)__index, + (__v8sf)__mask, + __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_ps (float const *__base, __m128i __index, const int __scale) +{ + __v4sf __src = _mm_setzero_ps (); + __v4sf __mask = _mm_cmpeq_ps (__src, __src); + return (__m128) __builtin_ia32_gatherdiv4sf (__src, + __base, + (__v2di)__index, + __mask, + __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index, + __m128 __mask, const int __scale) +{ + return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src, + __base, + (__v2di)__index, + (__v4sf)__mask, + __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale) +{ + __v4sf __src = _mm_setzero_ps (); + __v4sf __mask = _mm_cmpeq_ps (__src, __src); + return (__m128) __builtin_ia32_gatherdiv4sf256 (__src, + __base, + (__v4di)__index, + __mask, + __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_ps (__m128 __src, float const *__base, + __m256i __index, __m128 __mask, const int __scale) +{ + return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src, + __base, + (__v4di)__index, + (__v4sf)__mask, + __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32gather_epi64 (long long int const *__base, + __m128i __index, const int __scale) +{ + __v2di __src = __extension__ (__v2di){ 0, 0 }; + __v2di __mask = __extension__ (__v2di){ ~0, ~0 }; + return (__m128i) __builtin_ia32_gathersiv2di (__src, + __base, + (__v4si)__index, + __mask, + __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base, + __m128i __index, __m128i __mask, const int __scale) +{ + return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src, + __base, + (__v4si)__index, + (__v2di)__mask, + __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32gather_epi64 (long long int const *__base, + __m128i __index, const int __scale) +{ + __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 }; + __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; + return (__m256i) __builtin_ia32_gathersiv4di (__src, + __base, + (__v4si)__index, + __mask, + __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base, + __m128i __index, __m256i __mask, + const int __scale) +{ + return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src, + __base, + (__v4si)__index, + (__v4di)__mask, + __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_epi64 (long long int const *__base, + __m128i __index, const int __scale) +{ + __v2di __src = __extension__ (__v2di){ 0, 0 }; + __v2di __mask = __extension__ (__v2di){ ~0, ~0 }; + return (__m128i) __builtin_ia32_gatherdiv2di (__src, + __base, + (__v2di)__index, + __mask, + __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base, + __m128i __index, __m128i __mask, const int __scale) +{ + return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src, + __base, + (__v2di)__index, + (__v2di)__mask, + __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_epi64 (long long int const *__base, + __m256i __index, const int __scale) +{ + __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 }; + __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; + return (__m256i) __builtin_ia32_gatherdiv4di (__src, + __base, + (__v4di)__index, + __mask, + __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base, + __m256i __index, __m256i __mask, + const int __scale) +{ + return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src, + __base, + (__v4di)__index, + (__v4di)__mask, + __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale) +{ + __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 }; + __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; + return (__m128i) __builtin_ia32_gathersiv4si (__src, + __base, + (__v4si)__index, + __mask, + __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index, + __m128i __mask, const int __scale) +{ + return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src, + __base, + (__v4si)__index, + (__v4si)__mask, + __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale) +{ + __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 }; + __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 }; + return (__m256i) __builtin_ia32_gathersiv8si (__src, + __base, + (__v8si)__index, + __mask, + __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_epi32 (__m256i __src, int const *__base, + __m256i __index, __m256i __mask, + const int __scale) +{ + return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src, + __base, + (__v8si)__index, + (__v8si)__mask, + __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale) +{ + __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 }; + __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; + return (__m128i) __builtin_ia32_gatherdiv4si (__src, + __base, + (__v2di)__index, + __mask, + __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index, + __m128i __mask, const int __scale) +{ + return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src, + __base, + (__v2di)__index, + (__v4si)__mask, + __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale) +{ + __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 }; + __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; + return (__m128i) __builtin_ia32_gatherdiv4si256 (__src, + __base, + (__v4di)__index, + __mask, + __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_epi32 (__m128i __src, int const *__base, + __m256i __index, __m128i __mask, + const int __scale) +{ + return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src, + __base, + (__v4di)__index, + (__v4si)__mask, + __scale); +} #undef __DISABLE_AVX2__ #pragma GCC pop_options #define _AVX512FINTRIN_H_INCLUDED @@ -11932,9 +13295,33 @@ _mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y) _mm512_setzero_si512 (), __M); } -#define _mm512_slli_epi64(X,C) ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)) -#define _mm512_mask_slli_epi64(W,U,X,C) ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) -#define _mm512_maskz_slli_epi64(U,X,C) ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(U))) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_slli_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_slli_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_slli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_sll_epi64 (__m512i __A, __m128i __B) @@ -11964,9 +13351,33 @@ _mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B) _mm512_setzero_si512 (), (__mmask8) __U); } -#define _mm512_srli_epi64(X,C) ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)) -#define _mm512_mask_srli_epi64(W,U,X,C) ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) -#define _mm512_maskz_srli_epi64(U,X,C) ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(U))) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srli_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srli_epi64 (__m512i __W, __mmask8 __U, + __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_srl_epi64 (__m512i __A, __m128i __B) @@ -11996,9 +13407,33 @@ _mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B) _mm512_setzero_si512 (), (__mmask8) __U); } -#define _mm512_srai_epi64(X,C) ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)) -#define _mm512_mask_srai_epi64(W,U,X,C) ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) -#define _mm512_maskz_srai_epi64(U,X,C) ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(U))) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srai_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srai_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srai_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_sra_epi64 (__m512i __A, __m128i __B) @@ -12028,9 +13463,33 @@ _mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B) _mm512_setzero_si512 (), (__mmask8) __U); } -#define _mm512_slli_epi32(X,C) ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_undefined_epi32 (), (__mmask16)-1)) -#define _mm512_mask_slli_epi32(W,U,X,C) ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) -#define _mm512_maskz_slli_epi32(U,X,C) ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_slli_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_slli_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_slli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_sll_epi32 (__m512i __A, __m128i __B) @@ -12060,9 +13519,33 @@ _mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B) _mm512_setzero_si512 (), (__mmask16) __U); } -#define _mm512_srli_epi32(X,C) ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_undefined_epi32 (), (__mmask16)-1)) -#define _mm512_mask_srli_epi32(W,U,X,C) ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) -#define _mm512_maskz_srli_epi32(U,X,C) ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srli_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srli_epi32 (__m512i __W, __mmask16 __U, + __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_srl_epi32 (__m512i __A, __m128i __B) @@ -12092,9 +13575,33 @@ _mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B) _mm512_setzero_si512 (), (__mmask16) __U); } -#define _mm512_srai_epi32(X,C) ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_undefined_epi32 (), (__mmask16)-1)) -#define _mm512_mask_srai_epi32(W,U,X,C) ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) -#define _mm512_maskz_srai_epi32(U,X,C) ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srai_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srai_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srai_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_sra_epi32 (__m512i __A, __m128i __B) @@ -12124,24 +13631,182 @@ _mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B) _mm512_setzero_si512 (), (__mmask16) __U); } -#define _mm_add_round_sd(A,B,C) (__m128d)__builtin_ia32_addsd_round(A, B, C) -#define _mm_mask_add_round_sd(W,U,A,B,C) (__m128d)__builtin_ia32_addsd_mask_round(A, B, W, U, C) -#define _mm_maskz_add_round_sd(U,A,B,C) (__m128d)__builtin_ia32_addsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) -#define _mm_add_round_ss(A,B,C) (__m128)__builtin_ia32_addss_round(A, B, C) -#define _mm_mask_add_round_ss(W,U,A,B,C) (__m128)__builtin_ia32_addss_mask_round(A, B, W, U, C) -#define _mm_maskz_add_round_ss(U,A,B,C) (__m128)__builtin_ia32_addss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) -#define _mm_sub_round_sd(A,B,C) (__m128d)__builtin_ia32_subsd_round(A, B, C) -#define _mm_mask_sub_round_sd(W,U,A,B,C) (__m128d)__builtin_ia32_subsd_mask_round(A, B, W, U, C) -#define _mm_maskz_sub_round_sd(U,A,B,C) (__m128d)__builtin_ia32_subsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) -#define _mm_sub_round_ss(A,B,C) (__m128)__builtin_ia32_subss_round(A, B, C) -#define _mm_mask_sub_round_ss(W,U,A,B,C) (__m128)__builtin_ia32_subss_mask_round(A, B, W, U, C) -#define _mm_maskz_sub_round_ss(U,A,B,C) (__m128)__builtin_ia32_subss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) -#define _mm512_ternarylogic_epi64(A,B,C,I) ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)-1)) -#define _mm512_mask_ternarylogic_epi64(A,U,B,C,I) ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U))) -#define _mm512_maskz_ternarylogic_epi64(U,A,B,C,I) ((__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U))) -#define _mm512_ternarylogic_epi32(A,B,C,I) ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), (__mmask16)-1)) -#define _mm512_mask_ternarylogic_epi32(A,U,B,C,I) ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), (__mmask16)(U))) -#define _mm512_maskz_ternarylogic_epi32(U,A,B,C,I) ((__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), (__mmask16)(U))) +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_addss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_subss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ternarylogic_epi64 (__m512i __A, __m512i __B, __m512i __C, + const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, __imm, + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ternarylogic_epi64 (__m512i __A, __mmask8 __U, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, __imm, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ternarylogic_epi64 (__mmask8 __U, __m512i __A, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + __imm, (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ternarylogic_epi32 (__m512i __A, __m512i __B, __m512i __C, + const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + __imm, (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ternarylogic_epi32 (__m512i __A, __mmask16 __U, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + __imm, (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ternarylogic_epi32 (__mmask16 __U, __m512i __A, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + __imm, (__mmask16) __U); +} extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_rcp14_pd (__m512d __A) @@ -12346,18 +14011,119 @@ _mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) (__v4sf) _mm_setzero_ps (), (__mmask8) __U); } -#define _mm512_sqrt_round_pd(A,C) (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, C) -#define _mm512_mask_sqrt_round_pd(W,U,A,C) (__m512d)__builtin_ia32_sqrtpd512_mask(A, W, U, C) -#define _mm512_maskz_sqrt_round_pd(U,A,C) (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_setzero_pd(), U, C) -#define _mm512_sqrt_round_ps(A,C) (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_undefined_ps(), -1, C) -#define _mm512_mask_sqrt_round_ps(W,U,A,C) (__m512)__builtin_ia32_sqrtps512_mask(A, W, U, C) -#define _mm512_maskz_sqrt_round_ps(U,A,C) (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) -#define _mm_sqrt_round_sd(A,B,C) (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, (__v2df) _mm_setzero_pd (), -1, C) -#define _mm_mask_sqrt_round_sd(W,U,A,B,C) (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, W, U, C) -#define _mm_maskz_sqrt_round_sd(U,A,B,C) (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, (__v2df) _mm_setzero_pd (), U, C) -#define _mm_sqrt_round_ss(A,B,C) (__m128)__builtin_ia32_sqrtss_mask_round (B, A, (__v4sf) _mm_setzero_ps (), -1, C) -#define _mm_mask_sqrt_round_ss(W,U,A,B,C) (__m128)__builtin_ia32_sqrtss_mask_round (B, A, W, U, C) -#define _mm_maskz_sqrt_round_ss(U,A,B,C) (__m128)__builtin_ia32_sqrtss_mask_round (B, A, (__v4sf) _mm_setzero_ps (), U, C) +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_round_pd (__m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_round_pd (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_round_ps (__m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_round_ps (__m512 __W, __mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_round_ps (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + (__mmask8) __U, __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_round_sd (__mmask8 __U, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U, __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_cvtepi8_epi32 (__m128i __A) @@ -12618,13930 +14384,13412 @@ _mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X) _mm512_setzero_si512 (), (__mmask8) __U); } -#define _mm512_add_round_pd(A,B,C) (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) -#define _mm512_mask_add_round_pd(W,U,A,B,C) (__m512d)__builtin_ia32_addpd512_mask(A, B, W, U, C) -#define _mm512_maskz_add_round_pd(U,A,B,C) (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) -#define _mm512_add_round_ps(A,B,C) (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C) -#define _mm512_mask_add_round_ps(W,U,A,B,C) (__m512)__builtin_ia32_addps512_mask(A, B, W, U, C) -#define _mm512_maskz_add_round_ps(U,A,B,C) (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) -#define _mm512_sub_round_pd(A,B,C) (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) -#define _mm512_mask_sub_round_pd(W,U,A,B,C) (__m512d)__builtin_ia32_subpd512_mask(A, B, W, U, C) -#define _mm512_maskz_sub_round_pd(U,A,B,C) (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) -#define _mm512_sub_round_ps(A,B,C) (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C) -#define _mm512_mask_sub_round_ps(W,U,A,B,C) (__m512)__builtin_ia32_subps512_mask(A, B, W, U, C) -#define _mm512_maskz_sub_round_ps(U,A,B,C) (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) -#define _mm512_mul_round_pd(A,B,C) (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) -#define _mm512_mask_mul_round_pd(W,U,A,B,C) (__m512d)__builtin_ia32_mulpd512_mask(A, B, W, U, C) -#define _mm512_maskz_mul_round_pd(U,A,B,C) (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) -#define _mm512_mul_round_ps(A,B,C) (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C) -#define _mm512_mask_mul_round_ps(W,U,A,B,C) (__m512)__builtin_ia32_mulps512_mask(A, B, W, U, C) -#define _mm512_maskz_mul_round_ps(U,A,B,C) (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) -#define _mm512_div_round_pd(A,B,C) (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) -#define _mm512_mask_div_round_pd(W,U,A,B,C) (__m512d)__builtin_ia32_divpd512_mask(A, B, W, U, C) -#define _mm512_maskz_div_round_pd(U,A,B,C) (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) -#define _mm512_div_round_ps(A,B,C) (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C) -#define _mm512_mask_div_round_ps(W,U,A,B,C) (__m512)__builtin_ia32_divps512_mask(A, B, W, U, C) -#define _mm512_maskz_div_round_ps(U,A,B,C) (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) -#define _mm_mul_round_sd(A,B,C) (__m128d)__builtin_ia32_mulsd_round(A, B, C) -#define _mm_mask_mul_round_sd(W,U,A,B,C) (__m128d)__builtin_ia32_mulsd_mask_round(A, B, W, U, C) -#define _mm_maskz_mul_round_sd(U,A,B,C) (__m128d)__builtin_ia32_mulsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) -#define _mm_mul_round_ss(A,B,C) (__m128)__builtin_ia32_mulss_round(A, B, C) -#define _mm_mask_mul_round_ss(W,U,A,B,C) (__m128)__builtin_ia32_mulss_mask_round(A, B, W, U, C) -#define _mm_maskz_mul_round_ss(U,A,B,C) (__m128)__builtin_ia32_mulss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) -#define _mm_div_round_sd(A,B,C) (__m128d)__builtin_ia32_divsd_round(A, B, C) -#define _mm_mask_div_round_sd(W,U,A,B,C) (__m128d)__builtin_ia32_divsd_mask_round(A, B, W, U, C) -#define _mm_maskz_div_round_sd(U,A,B,C) (__m128d)__builtin_ia32_divsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) -#define _mm_div_round_ss(A,B,C) (__m128)__builtin_ia32_divss_round(A, B, C) -#define _mm_mask_div_round_ss(W,U,A,B,C) (__m128)__builtin_ia32_divss_mask_round(A, B, W, U, C) -#define _mm_maskz_div_round_ss(U,A,B,C) (__m128)__builtin_ia32_divss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) -#define _mm512_max_round_pd(A,B,R) (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R) -#define _mm512_mask_max_round_pd(W,U,A,B,R) (__m512d)__builtin_ia32_maxpd512_mask(A, B, W, U, R) -#define _mm512_maskz_max_round_pd(U,A,B,R) (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R) -#define _mm512_max_round_ps(A,B,R) (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_undefined_pd(), -1, R) -#define _mm512_mask_max_round_ps(W,U,A,B,R) (__m512)__builtin_ia32_maxps512_mask(A, B, W, U, R) -#define _mm512_maskz_max_round_ps(U,A,B,R) (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R) -#define _mm512_min_round_pd(A,B,R) (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R) -#define _mm512_mask_min_round_pd(W,U,A,B,R) (__m512d)__builtin_ia32_minpd512_mask(A, B, W, U, R) -#define _mm512_maskz_min_round_pd(U,A,B,R) (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R) -#define _mm512_min_round_ps(A,B,R) (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, R) -#define _mm512_mask_min_round_ps(W,U,A,B,R) (__m512)__builtin_ia32_minps512_mask(A, B, W, U, R) -#define _mm512_maskz_min_round_ps(U,A,B,R) (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R) -#define _mm512_scalef_round_pd(A,B,C) (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) -#define _mm512_mask_scalef_round_pd(W,U,A,B,C) (__m512d)__builtin_ia32_scalefpd512_mask(A, B, W, U, C) -#define _mm512_maskz_scalef_round_pd(U,A,B,C) (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) -#define _mm512_scalef_round_ps(A,B,C) (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C) -#define _mm512_mask_scalef_round_ps(W,U,A,B,C) (__m512)__builtin_ia32_scalefps512_mask(A, B, W, U, C) -#define _mm512_maskz_scalef_round_ps(U,A,B,C) (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) -#define _mm_scalef_round_sd(A,B,C) (__m128d)__builtin_ia32_scalefsd_mask_round (A, B, (__v2df)_mm_setzero_pd (), -1, C) -#define _mm_scalef_round_ss(A,B,C) (__m128)__builtin_ia32_scalefss_mask_round (A, B, (__v4sf)_mm_setzero_ps (), -1, C) -#define _mm512_fmadd_round_pd(A,B,C,R) (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, -1, R) -#define _mm512_mask_fmadd_round_pd(A,U,B,C,R) (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, U, R) -#define _mm512_mask3_fmadd_round_pd(A,B,C,U,R) (__m512d)__builtin_ia32_vfmaddpd512_mask3(A, B, C, U, R) -#define _mm512_maskz_fmadd_round_pd(U,A,B,C,R) (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, C, U, R) -#define _mm512_fmadd_round_ps(A,B,C,R) (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, -1, R) -#define _mm512_mask_fmadd_round_ps(A,U,B,C,R) (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, U, R) -#define _mm512_mask3_fmadd_round_ps(A,B,C,U,R) (__m512)__builtin_ia32_vfmaddps512_mask3(A, B, C, U, R) -#define _mm512_maskz_fmadd_round_ps(U,A,B,C,R) (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, C, U, R) -#define _mm512_fmsub_round_pd(A,B,C,R) (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, -(C), -1, R) -#define _mm512_mask_fmsub_round_pd(A,U,B,C,R) (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, -(C), U, R) -#define _mm512_mask3_fmsub_round_pd(A,B,C,U,R) (__m512d)__builtin_ia32_vfmsubpd512_mask3(A, B, C, U, R) -#define _mm512_maskz_fmsub_round_pd(U,A,B,C,R) (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, -(C), U, R) -#define _mm512_fmsub_round_ps(A,B,C,R) (__m512)__builtin_ia32_vfmaddps512_mask(A, B, -(C), -1, R) -#define _mm512_mask_fmsub_round_ps(A,U,B,C,R) (__m512)__builtin_ia32_vfmaddps512_mask(A, B, -(C), U, R) -#define _mm512_mask3_fmsub_round_ps(A,B,C,U,R) (__m512)__builtin_ia32_vfmsubps512_mask3(A, B, C, U, R) -#define _mm512_maskz_fmsub_round_ps(U,A,B,C,R) (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, -(C), U, R) -#define _mm512_fmaddsub_round_pd(A,B,C,R) (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, -1, R) -#define _mm512_mask_fmaddsub_round_pd(A,U,B,C,R) (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, U, R) -#define _mm512_mask3_fmaddsub_round_pd(A,B,C,U,R) (__m512d)__builtin_ia32_vfmaddsubpd512_mask3(A, B, C, U, R) -#define _mm512_maskz_fmaddsub_round_pd(U,A,B,C,R) (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, C, U, R) -#define _mm512_fmaddsub_round_ps(A,B,C,R) (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, -1, R) -#define _mm512_mask_fmaddsub_round_ps(A,U,B,C,R) (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, U, R) -#define _mm512_mask3_fmaddsub_round_ps(A,B,C,U,R) (__m512)__builtin_ia32_vfmaddsubps512_mask3(A, B, C, U, R) -#define _mm512_maskz_fmaddsub_round_ps(U,A,B,C,R) (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, C, U, R) -#define _mm512_fmsubadd_round_pd(A,B,C,R) (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), -1, R) -#define _mm512_mask_fmsubadd_round_pd(A,U,B,C,R) (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), U, R) -#define _mm512_mask3_fmsubadd_round_pd(A,B,C,U,R) (__m512d)__builtin_ia32_vfmsubaddpd512_mask3(A, B, C, U, R) -#define _mm512_maskz_fmsubadd_round_pd(U,A,B,C,R) (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, -(C), U, R) -#define _mm512_fmsubadd_round_ps(A,B,C,R) (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), -1, R) -#define _mm512_mask_fmsubadd_round_ps(A,U,B,C,R) (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), U, R) -#define _mm512_mask3_fmsubadd_round_ps(A,B,C,U,R) (__m512)__builtin_ia32_vfmsubaddps512_mask3(A, B, C, U, R) -#define _mm512_maskz_fmsubadd_round_ps(U,A,B,C,R) (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, -(C), U, R) -#define _mm512_fnmadd_round_pd(A,B,C,R) (__m512d)__builtin_ia32_vfmaddpd512_mask(-(A), B, C, -1, R) -#define _mm512_mask_fnmadd_round_pd(A,U,B,C,R) (__m512d)__builtin_ia32_vfnmaddpd512_mask(-(A), B, C, U, R) -#define _mm512_mask3_fnmadd_round_pd(A,B,C,U,R) (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(A), B, C, U, R) -#define _mm512_maskz_fnmadd_round_pd(U,A,B,C,R) (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(A), B, C, U, R) -#define _mm512_fnmadd_round_ps(A,B,C,R) (__m512)__builtin_ia32_vfmaddps512_mask(-(A), B, C, -1, R) -#define _mm512_mask_fnmadd_round_ps(A,U,B,C,R) (__m512)__builtin_ia32_vfnmaddps512_mask(-(A), B, C, U, R) -#define _mm512_mask3_fnmadd_round_ps(A,B,C,U,R) (__m512)__builtin_ia32_vfmaddps512_mask3(-(A), B, C, U, R) -#define _mm512_maskz_fnmadd_round_ps(U,A,B,C,R) (__m512)__builtin_ia32_vfmaddps512_maskz(-(A), B, C, U, R) -#define _mm512_fnmsub_round_pd(A,B,C,R) (__m512d)__builtin_ia32_vfmaddpd512_mask(-(A), B, -(C), -1, R) -#define _mm512_mask_fnmsub_round_pd(A,U,B,C,R) (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, U, R) -#define _mm512_mask3_fnmsub_round_pd(A,B,C,U,R) (__m512d)__builtin_ia32_vfnmsubpd512_mask3(A, B, C, U, R) -#define _mm512_maskz_fnmsub_round_pd(U,A,B,C,R) (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(A), B, -(C), U, R) -#define _mm512_fnmsub_round_ps(A,B,C,R) (__m512)__builtin_ia32_vfmaddps512_mask(-(A), B, -(C), -1, R) -#define _mm512_mask_fnmsub_round_ps(A,U,B,C,R) (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, U, R) -#define _mm512_mask3_fnmsub_round_ps(A,B,C,U,R) (__m512)__builtin_ia32_vfnmsubps512_mask3(A, B, C, U, R) -#define _mm512_maskz_fnmsub_round_ps(U,A,B,C,R) (__m512)__builtin_ia32_vfmaddps512_maskz(-(A), B, -(C), U, R) -extern __inline __m512i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_abs_epi64 (__m512i __A) -{ - return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); -} -extern __inline __m512i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); -} -extern __inline __m512i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); -} -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_abs_epi32 (__m512i __A) +_mm512_add_round_pd (__m512d __A, __m512d __B, const int __R) { - return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +_mm512_mask_add_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { - return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) +_mm512_maskz_add_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { - return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcastss_ps (__m128 __A) +_mm512_add_round_ps (__m512 __A, __m512 __B, const int __R) { - return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) +_mm512_mask_add_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { - return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, - (__v16sf) __O, __M); + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) +_mm512_maskz_add_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) { - return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, - (__v16sf) - _mm512_setzero_ps (), - __M); + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcastsd_pd (__m128d __A) +_mm512_sub_round_pd (__m512d __A, __m512d __B, const int __R) { - return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) +_mm512_mask_sub_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { - return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, - (__v8df) __O, __M); + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) +_mm512_maskz_sub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { - return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, - (__v8df) - _mm512_setzero_pd (), - __M); + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcastd_epi32 (__m128i __A) +_mm512_sub_round_ps (__m512 __A, __m512 __B, const int __R) { - return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) +_mm512_mask_sub_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { - return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, - (__v16si) __O, __M); + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) +_mm512_maskz_sub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) { - return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_set1_epi32 (int __A) +_mm512_mul_round_pd (__m512d __A, __m512d __B, const int __R) { - return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16)(-1)); + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) +_mm512_mask_mul_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { - return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O, - __M); + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_set1_epi32 (__mmask16 __M, int __A) +_mm512_maskz_mul_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { - return (__m512i) - __builtin_ia32_pbroadcastd512_gpr_mask (__A, - (__v16si) _mm512_setzero_si512 (), - __M); + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcastq_epi64 (__m128i __A) +_mm512_mul_round_ps (__m512 __A, __m512 __B, const int __R) { - return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) +_mm512_mask_mul_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { - return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, - (__v8di) __O, __M); + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +_mm512_maskz_mul_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) { - return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_set1_epi64 (long long __A) +_mm512_div_round_pd (__m512d __M, __m512d __V, const int __R) { - return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8)(-1)); + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) +_mm512_mask_div_round_pd (__m512d __W, __mmask8 __U, __m512d __M, + __m512d __V, const int __R) { - return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O, - __M); + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) __W, + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A) +_mm512_maskz_div_round_pd (__mmask8 __U, __m512d __M, __m512d __V, + const int __R) { - return (__m512i) - __builtin_ia32_pbroadcastq512_gpr_mask (__A, - (__v8di) _mm512_setzero_si512 (), - __M); + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_f32x4 (__m128 __A) +_mm512_div_round_ps (__m512 __A, __m512 __B, const int __R) { - return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A) +_mm512_mask_div_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { - return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, - (__v16sf) __O, - __M); + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A) +_mm512_maskz_div_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) { - return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, - (__v16sf) - _mm512_setzero_ps (), - __M); + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } -extern __inline __m512i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_i32x4 (__m128i __A) +_mm_mul_round_sd (__m128d __A, __m128d __B, const int __R) { - return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, + (__v2df) __B, + __R); } -extern __inline __m512i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A) +_mm_mask_mul_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { - return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, - (__v16si) __O, - __M); + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A) +_mm_maskz_mul_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { - return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_f64x4 (__m256d __A) +_mm_mul_round_ss (__m128 __A, __m128 __B, const int __R) { - return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, + (__v4sf) __B, + __R); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A) +_mm_mask_mul_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { - return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, - (__v8df) __O, - __M); + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A) +_mm_maskz_mul_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { - return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, - (__v8df) - _mm512_setzero_pd (), - __M); + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_i64x4 (__m256i __A) +_mm_div_round_sd (__m128d __A, __m128d __B, const int __R) { - return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, + (__v2df) __B, + __R); } -extern __inline __m512i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A) +_mm_mask_div_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { - return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, - (__v8di) __O, - __M); + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A) +_mm_maskz_div_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { - return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } -typedef enum -{ - _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02, - _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05, - _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08, - _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B, - _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E, - _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11, - _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14, - _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17, - _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A, - _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D, - _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20, - _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23, - _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26, - _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29, - _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C, - _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F, - _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32, - _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35, - _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38, - _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B, - _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E, - _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41, - _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44, - _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47, - _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A, - _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D, - _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50, - _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53, - _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56, - _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59, - _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C, - _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F, - _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62, - _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65, - _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68, - _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B, - _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E, - _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71, - _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74, - _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77, - _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A, - _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D, - _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80, - _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83, - _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86, - _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89, - _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C, - _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F, - _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92, - _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95, - _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98, - _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B, - _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E, - _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1, - _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4, - _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7, - _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA, - _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD, - _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0, - _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3, - _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6, - _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9, - _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC, - _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF, - _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2, - _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5, - _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8, - _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB, - _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE, - _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1, - _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4, - _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7, - _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA, - _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD, - _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0, - _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3, - _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6, - _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9, - _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC, - _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF, - _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2, - _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5, - _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8, - _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB, - _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE, - _MM_PERM_DDDD = 0xFF -} _MM_PERM_ENUM; -#define _mm512_shuffle_epi32(X,C) ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_undefined_epi32 (), (__mmask16)-1)) -#define _mm512_mask_shuffle_epi32(W,U,X,C) ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) -#define _mm512_maskz_shuffle_epi32(U,X,C) ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) -#define _mm512_shuffle_i64x2(X,Y,C) ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)) -#define _mm512_mask_shuffle_i64x2(W,U,X,Y,C) ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) -#define _mm512_maskz_shuffle_i64x2(U,X,Y,C) ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(U))) -#define _mm512_shuffle_i32x4(X,Y,C) ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)_mm512_undefined_epi32 (), (__mmask16)-1)) -#define _mm512_mask_shuffle_i32x4(W,U,X,Y,C) ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) -#define _mm512_maskz_shuffle_i32x4(U,X,Y,C) ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) -#define _mm512_shuffle_f64x2(X,Y,C) ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1)) -#define _mm512_mask_shuffle_f64x2(W,U,X,Y,C) ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U))) -#define _mm512_maskz_shuffle_f64x2(U,X,Y,C) ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) -#define _mm512_shuffle_f32x4(X,Y,C) ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1)) -#define _mm512_mask_shuffle_f32x4(W,U,X,Y,C) ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U))) -#define _mm512_maskz_shuffle_f32x4(U,X,Y,C) ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U))) -extern __inline __m512i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_rolv_epi32 (__m512i __A, __m512i __B) +_mm_div_round_ss (__m128 __A, __m128 __B, const int __R) { - return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m128) __builtin_ia32_divss_round ((__v4sf) __A, + (__v4sf) __B, + __R); } -extern __inline __m512i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm_mask_div_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { - return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +_mm_maskz_div_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { - return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_rorv_epi32 (__m512i __A, __m512i __B) +_mm512_max_round_pd (__m512d __A, __m512d __B, const int __R) { - return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_max_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { - return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_maskz_max_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { - return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_rolv_epi64 (__m512i __A, __m512i __B) +_mm512_max_round_ps (__m512 __A, __m512 __B, const int __R) { - return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_mask_max_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { - return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_max_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) { - return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_rorv_epi64 (__m512i __A, __m512i __B) +_mm512_min_round_pd (__m512d __A, __m512d __B, const int __R) { - return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_mask_min_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { - return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_min_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { - return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } -#define _mm512_cvtt_roundpd_epi32(A,B) ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) -#define _mm512_mask_cvtt_roundpd_epi32(W,U,A,B) ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)(W), U, B)) -#define _mm512_maskz_cvtt_roundpd_epi32(U,A,B) ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) -#define _mm512_cvtt_roundpd_epu32(A,B) ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) -#define _mm512_mask_cvtt_roundpd_epu32(W,U,A,B) ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)(W), U, B)) -#define _mm512_maskz_cvtt_roundpd_epu32(U,A,B) ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) -#define _mm512_cvt_roundpd_epi32(A,B) ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) -#define _mm512_mask_cvt_roundpd_epi32(W,U,A,B) ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)(W), U, B)) -#define _mm512_maskz_cvt_roundpd_epi32(U,A,B) ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) -#define _mm512_cvt_roundpd_epu32(A,B) ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) -#define _mm512_mask_cvt_roundpd_epu32(W,U,A,B) ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)(W), U, B)) -#define _mm512_maskz_cvt_roundpd_epu32(U,A,B) ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) -#define _mm512_cvtt_roundps_epi32(A,B) ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) -#define _mm512_mask_cvtt_roundps_epi32(W,U,A,B) ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)(W), U, B)) -#define _mm512_maskz_cvtt_roundps_epi32(U,A,B) ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) -#define _mm512_cvtt_roundps_epu32(A,B) ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) -#define _mm512_mask_cvtt_roundps_epu32(W,U,A,B) ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)(W), U, B)) -#define _mm512_maskz_cvtt_roundps_epu32(U,A,B) ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) -#define _mm512_cvt_roundps_epi32(A,B) ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) -#define _mm512_mask_cvt_roundps_epi32(W,U,A,B) ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)(W), U, B)) -#define _mm512_maskz_cvt_roundps_epi32(U,A,B) ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) -#define _mm512_cvt_roundps_epu32(A,B) ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) -#define _mm512_mask_cvt_roundps_epu32(W,U,A,B) ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)(W), U, B)) -#define _mm512_maskz_cvt_roundps_epu32(U,A,B) ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) -extern __inline __m128d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtu32_sd (__m128d __A, unsigned __B) +_mm512_min_round_ps (__m512 __A, __m512 __B, const int __R) { - return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B); + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } -#define _mm_cvt_roundu64_sd(A,B,C) (__m128d)__builtin_ia32_cvtusi2sd64(A, B, C) -#define _mm_cvt_roundi64_sd(A,B,C) (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C) -#define _mm_cvt_roundsi64_sd(A,B,C) (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C) -#define _mm_cvt_roundu32_ss(A,B,C) (__m128)__builtin_ia32_cvtusi2ss32(A, B, C) -#define _mm_cvt_roundi32_ss(A,B,C) (__m128)__builtin_ia32_cvtsi2ss32(A, B, C) -#define _mm_cvt_roundsi32_ss(A,B,C) (__m128)__builtin_ia32_cvtsi2ss32(A, B, C) -#define _mm_cvt_roundu64_ss(A,B,C) (__m128)__builtin_ia32_cvtusi2ss64(A, B, C) -#define _mm_cvt_roundi64_ss(A,B,C) (__m128)__builtin_ia32_cvtsi2ss64(A, B, C) -#define _mm_cvt_roundsi64_ss(A,B,C) (__m128)__builtin_ia32_cvtsi2ss64(A, B, C) -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepi32_epi8 (__m512i __A) +_mm512_mask_min_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { - return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask16) -1); + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) +_mm512_maskz_min_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) { - __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +_mm512_scalef_round_pd (__m512d __A, __m512d __B, const int __R) { - return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, - (__v16qi) __O, __M); + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A) +_mm512_mask_scalef_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { - return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtsepi32_epi8 (__m512i __A) +_mm512_maskz_scalef_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { - return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask16) -1); + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) +_mm512_scalef_round_ps (__m512 __A, __m512 __B, const int __R) { - __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +_mm512_mask_scalef_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { - return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, - (__v16qi) __O, __M); + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A) +_mm512_maskz_scalef_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { - return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtusepi32_epi8 (__m512i __A) +_mm_scalef_round_sd (__m128d __A, __m128d __B, const int __R) { - return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask16) -1); + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); } -extern __inline void +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) +_mm_mask_scalef_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) { - __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +_mm_maskz_scalef_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { - return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, - (__v16qi) __O, - __M); + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A) +_mm_scalef_round_ss (__m128 __A, __m128 __B, const int __R) { - return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, __R); } -extern __inline __m256i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepi32_epi16 (__m512i __A) +_mm_mask_scalef_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) { - return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, - (__v16hi) - _mm256_undefined_si256 (), - (__mmask16) -1); + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); } -extern __inline void +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A) +_mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) { - __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M); + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +_mm512_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) { - return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, - (__v16hi) __O, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A) +_mm512_mask_fmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { - return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, - (__v16hi) - _mm256_setzero_si256 (), - __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtsepi32_epi16 (__m512i __A) +_mm512_mask3_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { - return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, - (__v16hi) - _mm256_undefined_si256 (), - (__mmask16) -1); + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) +_mm512_maskz_fmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { - __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +_mm512_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) { - return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, - (__v16hi) __O, __M); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A) +_mm512_mask_fmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { - return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, - (__v16hi) - _mm256_setzero_si256 (), - __M); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtusepi32_epi16 (__m512i __A) +_mm512_mask3_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { - return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, - (__v16hi) - _mm256_undefined_si256 (), - (__mmask16) -1); + return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) +_mm512_maskz_fmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { - __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +_mm512_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) { - return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, - (__v16hi) __O, - __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, __R); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) +_mm512_mask_fmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { - return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, - (__v16hi) - _mm256_setzero_si256 (), - __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepi64_epi32 (__m512i __A) +_mm512_mask3_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { - return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, - (__v8si) - _mm256_undefined_si256 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) +_mm512_maskz_fmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { - __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +_mm512_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) { - return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, - (__v8si) __O, __M); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, __R); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A) +_mm512_mask_fmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { - return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtsepi64_epi32 (__m512i __A) +_mm512_mask3_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { - return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, - (__v8si) - _mm256_undefined_si256 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A) +_mm512_maskz_fmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { - __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +_mm512_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) { - return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, - (__v8si) __O, __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A) +_mm512_mask_fmaddsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { - return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtusepi64_epi32 (__m512i __A) +_mm512_mask3_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { - return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, - (__v8si) - _mm256_undefined_si256 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) +_mm512_maskz_fmaddsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { - __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +_mm512_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) { - return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, - (__v8si) __O, __M); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) +_mm512_mask_fmaddsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { - return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepi64_epi16 (__m512i __A) +_mm512_mask3_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { - return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) +_mm512_maskz_fmaddsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { - __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +_mm512_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) { - return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, - (__v8hi) __O, __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A) +_mm512_mask_fmsubadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { - return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtsepi64_epi16 (__m512i __A) +_mm512_mask3_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { - return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A) +_mm512_maskz_fmsubadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { - __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +_mm512_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) { - return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, - (__v8hi) __O, __M); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, __R); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A) +_mm512_mask_fmsubadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { - return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtusepi64_epi16 (__m512i __A) +_mm512_mask3_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { - return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) +_mm512_maskz_fmsubadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { - __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M); + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +_mm512_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) { - return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, - (__v8hi) __O, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) +_mm512_mask_fnmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { - return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepi64_epi8 (__m512i __A) +_mm512_mask3_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { - return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +_mm512_maskz_fnmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { - __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +_mm512_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) { - return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, - (__v16qi) __O, __M); + return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A) +_mm512_mask_fnmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { - return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); -} -extern __inline __m128i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtsepi64_epi8 (__m512i __A) -{ - return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +_mm512_mask3_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { - __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); + return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +_mm512_maskz_fnmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { - return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, - (__v16qi) __O, __M); + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A) +_mm512_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) { - return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtusepi64_epi8 (__m512i __A) +_mm512_mask_fnmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { - return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +_mm512_mask3_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { - __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); + return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +_mm512_maskz_fnmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { - return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, - (__v16qi) __O, - __M); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) +_mm512_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) { - return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, __R); } -extern __inline __m512d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepi32_pd (__m256i __A) +_mm512_mask_fnmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { - return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m512d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) +_mm512_mask3_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { - return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, - (__v8df) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m512d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) +_mm512_maskz_fnmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { - return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepu32_pd (__m256i __A) +_mm512_abs_epi64 (__m512i __A) { - return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) +_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, - (__v8df) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) +_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) { - return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -#define _mm512_cvt_roundepi32_ps(A,B) (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B) -#define _mm512_mask_cvt_roundepi32_ps(W,U,A,B) (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), W, U, B) -#define _mm512_maskz_cvt_roundepi32_ps(U,A,B) (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B) -#define _mm512_cvt_roundepu32_ps(A,B) (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B) -#define _mm512_mask_cvt_roundepu32_ps(W,U,A,B) (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), W, U, B) -#define _mm512_maskz_cvt_roundepu32_ps(U,A,B) (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B) -#define _mm512_extractf64x4_pd(X,C) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), (int) (C), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)) -#define _mm512_mask_extractf64x4_pd(W,U,X,C) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), (int) (C), (__v4df)(__m256d)(W), (__mmask8)(U))) -#define _mm512_maskz_extractf64x4_pd(U,X,C) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), (int) (C), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U))) -#define _mm512_extractf32x4_ps(X,C) ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), (int) (C), (__v4sf)(__m128)_mm_undefined_ps(), (__mmask8)-1)) -#define _mm512_mask_extractf32x4_ps(W,U,X,C) ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), (int) (C), (__v4sf)(__m128)(W), (__mmask8)(U))) -#define _mm512_maskz_extractf32x4_ps(U,X,C) ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), (int) (C), (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U))) -#define _mm512_extracti64x4_epi64(X,C) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), (int) (C), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)) -#define _mm512_mask_extracti64x4_epi64(W,U,X,C) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), (int) (C), (__v4di)(__m256i)(W), (__mmask8)(U))) -#define _mm512_maskz_extracti64x4_epi64(U,X,C) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), (int) (C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm512_extracti32x4_epi32(X,C) ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), (int) (C), (__v4si)(__m128i)_mm_undefined_si128 (), (__mmask8)-1)) -#define _mm512_mask_extracti32x4_epi32(W,U,X,C) ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), (int) (C), (__v4si)(__m128i)(W), (__mmask8)(U))) -#define _mm512_maskz_extracti32x4_epi32(U,X,C) ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm512_insertf32x4(X,Y,C) ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (X), (__mmask16)(-1))) -#define _mm512_inserti32x4(X,Y,C) ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (X), (__mmask16)(-1))) -#define _mm512_insertf64x4(X,Y,C) ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), (__v4df)(__m256d) (Y), (int) (C), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1)) -#define _mm512_mask_insertf64x4(W,U,X,Y,C) ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), (__v4df)(__m256d) (Y), (int) (C), (__v8df)(__m512d)(W), (__mmask8)(U))) -#define _mm512_maskz_insertf64x4(U,X,Y,C) ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), (__v4df)(__m256d) (Y), (int) (C), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) -#define _mm512_inserti64x4(X,Y,C) ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), (__v4di)(__m256i) (Y), (int) (C), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)) -#define _mm512_mask_inserti64x4(W,U,X,Y,C) ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), (__v4di)(__m256i) (Y), (int) (C), (__v8di)(__m512i)(W), (__mmask8)(U))) -#define _mm512_maskz_inserti64x4(U,X,Y,C) ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), (__v4di)(__m256i) (Y), (int) (C), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(U))) -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_loadu_pd (void const *__P) +_mm512_abs_epi32 (__m512i __A) { - return *(__m512d_u *)__P; + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) +_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { - return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, - (__v8df) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_loadu_pd (__mmask8 __U, void const *__P) +_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) { - return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_storeu_pd (void *__P, __m512d __A) +_mm512_broadcastss_ps (__m128 __A) { - *(__m512d_u *)__P = __A; + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_storeu_pd (void *__P, __mmask8 __U, __m512d __A) +_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) { - __builtin_ia32_storeupd512_mask ((double *) __P, (__v8df) __A, - (__mmask8) __U); + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, + (__v16sf) __O, __M); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_loadu_ps (void const *__P) +_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) { - return *(__m512_u *)__P; + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); } -extern __inline __m512 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) +_mm512_broadcastsd_pd (__m128d __A) { - return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, - (__v16sf) __W, - (__mmask16) __U); + return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } -extern __inline __m512 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_loadu_ps (__mmask16 __U, void const *__P) +_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) { - return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, + (__v8df) __O, __M); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_storeu_ps (void *__P, __m512 __A) +_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) { - *(__m512_u *)__P = __A; + return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, + (__v8df) + _mm512_setzero_pd (), + __M); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_storeu_ps (void *__P, __mmask16 __U, __m512 __A) +_mm512_broadcastd_epi32 (__m128i __A) { - __builtin_ia32_storeups512_mask ((float *) __P, (__v16sf) __A, - (__mmask16) __U); + return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) +_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, - (__v8di) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, + (__v16si) __O, __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, + (__v16si) + _mm512_setzero_si512 (), + __M); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_storeu_epi64 (void *__P, __mmask8 __U, __m512i __A) +_mm512_set1_epi32 (int __A) { - __builtin_ia32_storedqudi512_mask ((long long *) __P, (__v8di) __A, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16)(-1)); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_loadu_si512 (void const *__P) +_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) { - return *(__m512i_u *)__P; + return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O, + __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) +_mm512_maskz_set1_epi32 (__mmask16 __M, int __A) { - return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, - (__v16si) __W, - (__mmask16) __U); + return (__m512i) + __builtin_ia32_pbroadcastd512_gpr_mask (__A, + (__v16si) _mm512_setzero_si512 (), + __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_loadu_epi32 (__mmask16 __U, void const *__P) +_mm512_broadcastq_epi64 (__m128i __A) { - return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_storeu_si512 (void *__P, __m512i __A) +_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) { - *(__m512i_u *)__P = __A; + return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, + (__v8di) __O, __M); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_storeu_epi32 (void *__P, __mmask16 __U, __m512i __A) +_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) { - __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A, - (__mmask16) __U); + return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, + (__v8di) + _mm512_setzero_si512 (), + __M); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutevar_pd (__m512d __A, __m512i __C) +_mm512_set1_epi64 (long long __A) { - return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, - (__v8di) __C, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8)(-1)); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) +_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) { - return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, - (__v8di) __C, - (__v8df) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O, + __M); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C) +_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A) { - return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, - (__v8di) __C, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512i) + __builtin_ia32_pbroadcastq512_gpr_mask (__A, + (__v8di) _mm512_setzero_si512 (), + __M); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutevar_ps (__m512 __A, __m512i __C) +_mm512_broadcast_f32x4 (__m128 __A) { - return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, - (__v16si) __C, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) +_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, - (__v16si) __C, - (__v16sf) __W, - (__mmask16) __U); + return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, + (__v16sf) __O, + __M); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C) +_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, - (__v16si) __C, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutex2var_epi64 (__m512i __A, __m512i __I, __m512i __B) +_mm512_broadcast_i32x4 (__m128i __A) { - return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I - , - (__v8di) __A, - (__v8di) __B, - (__mmask8) -1); + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I, - __m512i __B) +_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I - , - (__v8di) __A, - (__v8di) __B, - (__mmask8) __U); + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + (__v16si) __O, + __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I, - __mmask8 __U, __m512i __B) +_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A, - (__v8di) __I - , - (__v8di) __B, - (__mmask8) __U); + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + (__v16si) + _mm512_setzero_si512 (), + __M); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A, - __m512i __I, __m512i __B) +_mm512_broadcast_f64x4 (__m256d __A) { - return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I - , - (__v8di) __A, - (__v8di) __B, - (__mmask8) __U); + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutex2var_epi32 (__m512i __A, __m512i __I, __m512i __B) +_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A) { - return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I - , - (__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + (__v8df) __O, + __M); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U, - __m512i __I, __m512i __B) +_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A) { - return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I - , - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + (__v8df) + _mm512_setzero_pd (), + __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I, - __mmask16 __U, __m512i __B) +_mm512_broadcast_i64x4 (__m256i __A) { - return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A, - (__v16si) __I - , - (__v16si) __B, - (__mmask16) __U); + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A, - __m512i __I, __m512i __B) -{ - return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I - , - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); -} -extern __inline __m512d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B) -{ - return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I - , - (__v8df) __A, - (__v8df) __B, - (__mmask8) -1); -} -extern __inline __m512d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, - __m512d __B) -{ - return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I - , - (__v8df) __A, - (__v8df) __B, - (__mmask8) __U); -} -extern __inline __m512d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U, - __m512d __B) +_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A) { - return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A, - (__v8di) __I - , - (__v8df) __B, - (__mmask8) __U); + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + (__v8di) __O, + __M); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I, - __m512d __B) +_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A) { - return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I - , - (__v8df) __A, - (__v8df) __B, - (__mmask8) __U); + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + (__v8di) + _mm512_setzero_si512 (), + __M); } -extern __inline __m512 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B) +typedef enum { - return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I - , - (__v16sf) __A, - (__v16sf) __B, - (__mmask16) -1); -} -extern __inline __m512 + _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02, + _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05, + _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08, + _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B, + _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E, + _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11, + _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14, + _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17, + _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A, + _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D, + _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20, + _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23, + _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26, + _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29, + _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C, + _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F, + _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32, + _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35, + _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38, + _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B, + _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E, + _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41, + _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44, + _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47, + _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A, + _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D, + _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50, + _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53, + _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56, + _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59, + _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C, + _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F, + _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62, + _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65, + _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68, + _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B, + _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E, + _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71, + _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74, + _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77, + _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A, + _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D, + _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80, + _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83, + _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86, + _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89, + _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C, + _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F, + _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92, + _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95, + _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98, + _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B, + _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E, + _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1, + _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4, + _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7, + _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA, + _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD, + _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0, + _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3, + _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6, + _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9, + _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC, + _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF, + _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2, + _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5, + _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8, + _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB, + _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE, + _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1, + _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4, + _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7, + _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA, + _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD, + _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0, + _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3, + _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6, + _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9, + _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC, + _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF, + _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2, + _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5, + _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8, + _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB, + _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE, + _MM_PERM_DDDD = 0xFF +} _MM_PERM_ENUM; +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) +_mm512_shuffle_epi32 (__m512i __A, _MM_PERM_ENUM __mask) { - return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I - , - (__v16sf) __A, - (__v16sf) __B, - (__mmask16) __U); + return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U, - __m512 __B) +_mm512_mask_shuffle_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + _MM_PERM_ENUM __mask) { - return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A, - (__v16si) __I - , - (__v16sf) __B, - (__mmask16) __U); + return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I, - __m512 __B) +_mm512_maskz_shuffle_epi32 (__mmask16 __U, __m512i __A, _MM_PERM_ENUM __mask) { - return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I - , - (__v16sf) __A, - (__v16sf) __B, - (__mmask16) __U); + return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -#define _mm512_permute_pd(X,C) ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)(-1))) -#define _mm512_mask_permute_pd(W,U,X,C) ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U))) -#define _mm512_maskz_permute_pd(U,X,C) ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) -#define _mm512_permute_ps(X,C) ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)(-1))) -#define _mm512_mask_permute_ps(W,U,X,C) ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U))) -#define _mm512_maskz_permute_ps(U,X,C) ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U))) -#define _mm512_permutex_pd(X,M) ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1)) -#define _mm512_mask_permutex_pd(W,U,X,M) ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)(W), (__mmask8)(U))) -#define _mm512_maskz_permutex_pd(U,X,M) ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) -#define _mm512_permutex_epi64(X,I) ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), (int)(I), (__v8di)(__m512i) (_mm512_undefined_epi32 ()), (__mmask8)(-1))) -#define _mm512_maskz_permutex_epi64(M,X,I) ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), (int)(I), (__v8di)(__m512i) (_mm512_setzero_si512 ()), (__mmask8)(M))) -#define _mm512_mask_permutex_epi64(W,M,X,I) ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), (int)(I), (__v8di)(__m512i)(W), (__mmask8)(M))) extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_shuffle_i64x2 (__m512i __A, __m512i __B, const int __imm) { - return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, - (__v8di) __X, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) +_mm512_mask_shuffle_i64x2 (__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B, const int __imm) { - return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, - (__v8di) __X, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) __W, + (__mmask8) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, - __m512i __Y) +_mm512_maskz_shuffle_i64x2 (__mmask8 __U, __m512i __A, __m512i __B, + const int __imm) { - return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, - (__v8di) __X, - (__v8di) __W, - __M); + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) +_mm512_shuffle_i32x4 (__m512i __A, __m512i __B, const int __imm) { - return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, - (__v16si) __X, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) +_mm512_mask_shuffle_i32x4 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B, const int __imm) { - return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, - (__v16si) __X, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) __W, + (__mmask16) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, - __m512i __Y) +_mm512_maskz_shuffle_i32x4 (__mmask16 __U, __m512i __A, __m512i __B, + const int __imm) { - return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, - (__v16si) __X, - (__v16si) __W, - __M); + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutexvar_pd (__m512i __X, __m512d __Y) +_mm512_shuffle_f64x2 (__m512d __A, __m512d __B, const int __imm) { - return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, - (__v8di) __X, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) +_mm512_mask_shuffle_f64x2 (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __imm) { - return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, - (__v8di) __X, - (__v8df) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) __W, + (__mmask8) __U); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) +_mm512_maskz_shuffle_f64x2 (__mmask8 __U, __m512d __A, __m512d __B, + const int __imm) { - return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, - (__v8di) __X, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutexvar_ps (__m512i __X, __m512 __Y) +_mm512_shuffle_f32x4 (__m512 __A, __m512 __B, const int __imm) { - return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, - (__v16si) __X, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) +_mm512_mask_shuffle_f32x4 (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __imm) { - return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, - (__v16si) __X, - (__v16sf) __W, - (__mmask16) __U); + return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) __W, + (__mmask16) __U); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) +_mm512_maskz_shuffle_f32x4 (__mmask16 __U, __m512 __A, __m512 __B, + const int __imm) { - return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, - (__v16si) __X, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -#define _mm512_shuffle_pd(X,Y,C) ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1)) -#define _mm512_mask_shuffle_pd(W,U,X,Y,C) ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U))) -#define _mm512_maskz_shuffle_pd(U,X,Y,C) ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) -#define _mm512_shuffle_ps(X,Y,C) ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1)) -#define _mm512_mask_shuffle_ps(W,U,X,Y,C) ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U))) -#define _mm512_maskz_shuffle_ps(U,X,Y,C) ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U))) -#define _mm512_fixupimm_round_pd(X,Y,Z,C,R) ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(-1), (R))) -#define _mm512_mask_fixupimm_round_pd(X,U,Y,Z,C,R) ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(U), (R))) -#define _mm512_maskz_fixupimm_round_pd(U,X,Y,Z,C,R) ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(U), (R))) -#define _mm512_fixupimm_round_ps(X,Y,Z,C,R) ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(-1), (R))) -#define _mm512_mask_fixupimm_round_ps(X,U,Y,Z,C,R) ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(U), (R))) -#define _mm512_maskz_fixupimm_round_ps(U,X,Y,Z,C,R) ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(U), (R))) -#define _mm_fixupimm_round_sd(X,Y,Z,C,R) ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(-1), (R))) -#define _mm_mask_fixupimm_round_sd(X,U,Y,Z,C,R) ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U), (R))) -#define _mm_maskz_fixupimm_round_sd(U,X,Y,Z,C,R) ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U), (R))) -#define _mm_fixupimm_round_ss(X,Y,Z,C,R) ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(-1), (R))) -#define _mm_mask_fixupimm_round_ss(X,U,Y,Z,C,R) ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U), (R))) -#define _mm_maskz_fixupimm_round_ss(U,X,Y,Z,C,R) ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U), (R))) -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_movehdup_ps (__m512 __A) +_mm512_rolv_epi32 (__m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) +_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, - (__v16sf) __W, - (__mmask16) __U); + return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) +_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_moveldup_ps (__m512 __A) +_mm512_rorv_epi32 (__m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) +_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A, - (__v16sf) __W, - (__mmask16) __U); + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) +_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_or_si512 (__m512i __A, __m512i __B) +_mm512_rolv_epi64 (__m512i __A, __m512i __B) { - return (__m512i) ((__v16su) __A | (__v16su) __B); + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_or_epi32 (__m512i __A, __m512i __B) +_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) ((__v16su) __A | (__v16su) __B); + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_or_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_or_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_rorv_epi64 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_or_epi64 (__m512i __A, __m512i __B) +_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) ((__v8du) __A | (__v8du) __B); + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_or_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_or_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_cvtt_roundpd_epi32 (__m512d __A, const int __R) { - return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_xor_si512 (__m512i __A, __m512i __B) +_mm512_mask_cvtt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) { - return (__m512i) ((__v16su) __A ^ (__v16su) __B); + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_xor_epi32 (__m512i __A, __m512i __B) +_mm512_maskz_cvtt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R) { - return (__m512i) ((__v16su) __A ^ (__v16su) __B); + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_xor_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_cvtt_roundpd_epu32 (__m512d __A, const int __R) { - return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_xor_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_cvtt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) { - return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_xor_epi64 (__m512i __A, __m512i __B) +_mm512_maskz_cvtt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R) { - return (__m512i) ((__v8du) __A ^ (__v8du) __B); + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_xor_epi64 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_cvt_roundpd_epi32 (__m512d __A, const int __R) { - return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_xor_epi64 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_cvt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) { - return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); } -#define _mm512_rol_epi32(A,B) ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)_mm512_undefined_epi32 (), (__mmask16)(-1))) -#define _mm512_mask_rol_epi32(W,U,A,B) ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)(__m512i)(W), (__mmask16)(U))) -#define _mm512_maskz_rol_epi32(U,A,B) ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)_mm512_setzero_si512 (), (__mmask16)(U))) -#define _mm512_ror_epi32(A,B) ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)_mm512_undefined_epi32 (), (__mmask16)(-1))) -#define _mm512_mask_ror_epi32(W,U,A,B) ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)(__m512i)(W), (__mmask16)(U))) -#define _mm512_maskz_ror_epi32(U,A,B) ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)_mm512_setzero_si512 (), (__mmask16)(U))) -#define _mm512_rol_epi64(A,B) ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)_mm512_undefined_epi32 (), (__mmask8)(-1))) -#define _mm512_mask_rol_epi64(W,U,A,B) ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)(__m512i)(W), (__mmask8)(U))) -#define _mm512_maskz_rol_epi64(U,A,B) ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)_mm512_setzero_si512 (), (__mmask8)(U))) -#define _mm512_ror_epi64(A,B) ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)_mm512_undefined_epi32 (), (__mmask8)(-1))) -#define _mm512_mask_ror_epi64(W,U,A,B) ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)(__m512i)(W), (__mmask8)(U))) -#define _mm512_maskz_ror_epi64(U,A,B) ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)_mm512_setzero_si512 (), (__mmask8)(U))) -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_and_si512 (__m512i __A, __m512i __B) +_mm512_maskz_cvt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R) { - return (__m512i) ((__v16su) __A & (__v16su) __B); + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_and_epi32 (__m512i __A, __m512i __B) +_mm512_cvt_roundpd_epu32 (__m512d __A, const int __R) { - return (__m512i) ((__v16su) __A & (__v16su) __B); + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_and_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_cvt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) { - return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_and_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_cvtt_roundps_epi32 (__m512 __A, const int __R) { - return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_and_epi64 (__m512i __A, __m512i __B) +_mm512_mask_cvtt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) { - return (__m512i) ((__v8du) __A & (__v8du) __B); + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_and_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_cvtt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R) { - return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, __U); + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_and_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_cvtt_roundps_epu32 (__m512 __A, const int __R) { - return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_pd (), - __U); + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_andnot_si512 (__m512i __A, __m512i __B) +_mm512_mask_cvtt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) { - return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_andnot_epi32 (__m512i __A, __m512i __B) +_mm512_maskz_cvtt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R) { - return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_cvt_roundps_epi32 (__m512 __A, const int __R) { - return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_cvt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) { - return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_andnot_epi64 (__m512i __A, __m512i __B) +_mm512_maskz_cvt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R) { - return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_cvt_roundps_epu32 (__m512 __A, const int __R) { - return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, __U); + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_mask_cvt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) { - return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_pd (), - __U); + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); } -extern __inline __mmask16 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_test_epi32_mask (__m512i __A, __m512i __B) +_mm512_maskz_cvt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R) { - return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); } -extern __inline __mmask16 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +_mm_cvtu32_sd (__m128d __A, unsigned __B) { - return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A, - (__v16si) __B, __U); + return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B); } -extern __inline __mmask8 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_test_epi64_mask (__m512i __A, __m512i __B) +_mm_cvt_roundu64_sd (__m128d __A, unsigned long long __B, const int __R) { - return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, - (__v8di) __B, - (__mmask8) -1); + return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, __R); } -extern __inline __mmask8 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +_mm_cvt_roundi64_sd (__m128d __A, long long __B, const int __R) { - return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U); + return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R); } -extern __inline __mmask16 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_testn_epi32_mask (__m512i __A, __m512i __B) +_mm_cvt_roundsi64_sd (__m128d __A, long long __B, const int __R) { - return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R); } -extern __inline __mmask16 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +_mm_cvt_roundu32_ss (__m128 __A, unsigned __B, const int __R) { - return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A, - (__v16si) __B, __U); + return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, __R); } -extern __inline __mmask8 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_testn_epi64_mask (__m512i __A, __m512i __B) +_mm_cvt_roundsi32_ss (__m128 __A, int __B, const int __R) { - return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A, - (__v8di) __B, - (__mmask8) -1); + return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R); } -extern __inline __mmask8 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +_mm_cvt_roundi32_ss (__m128 __A, int __B, const int __R) { - return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A, - (__v8di) __B, __U); + return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R); } -extern __inline __m512 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_abs_ps (__m512 __A) +_mm_cvt_roundu64_ss (__m128 __A, unsigned long long __B, const int __R) { - return (__m512) _mm512_and_epi32 ((__m512i) __A, - _mm512_set1_epi32 (0x7fffffff)); + return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, __R); } -extern __inline __m512 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_abs_ps (__m512 __W, __mmask16 __U, __m512 __A) +_mm_cvt_roundsi64_ss (__m128 __A, long long __B, const int __R) { - return (__m512) _mm512_mask_and_epi32 ((__m512i) __W, __U, (__m512i) __A, - _mm512_set1_epi32 (0x7fffffff)); + return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_abs_pd (__m512 __A) +_mm_cvt_roundi64_ss (__m128 __A, long long __B, const int __R) { - return (__m512d) _mm512_and_epi64 ((__m512i) __A, - _mm512_set1_epi64 (0x7fffffffffffffffLL)); + return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R); } -extern __inline __m512d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512 __A) +_mm512_cvtepi32_epi8 (__m512i __A) { - return (__m512d) - _mm512_mask_and_epi64 ((__m512i) __W, __U, (__m512i) __A, - _mm512_set1_epi64 (0x7fffffffffffffffLL)); + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); } -extern __inline __m512i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpackhi_epi32 (__m512i __A, __m512i __B) +_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) { - return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); } -extern __inline __m512i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpackhi_epi32 (__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B) +_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) { - return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) __O, __M); } -extern __inline __m512i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpackhi_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A) { - return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __m512i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpackhi_epi64 (__m512i __A, __m512i __B) +_mm512_cvtsepi32_epi8 (__m512i __A) { - return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); } -extern __inline __m512i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpackhi_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) { - return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); } -extern __inline __m512i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpackhi_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) { - return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) __O, __M); } -extern __inline __m512i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpacklo_epi32 (__m512i __A, __m512i __B) +_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A) { - return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __m512i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpacklo_epi32 (__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B) +_mm512_cvtusepi32_epi8 (__m512i __A) { - return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); } -extern __inline __m512i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpacklo_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) { - return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); } -extern __inline __m512i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpacklo_epi64 (__m512i __A, __m512i __B) +_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) { - return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) __O, + __M); } -extern __inline __m512i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A) { - return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_cvtepi32_epi16 (__m512i __A) { - return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_undefined_si256 (), + (__mmask16) -1); } -#define _mm_cvt_roundss_u64(A,B) ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B)) -#define _mm_cvt_roundss_si64(A,B) ((long long)__builtin_ia32_vcvtss2si64(A, B)) -#define _mm_cvt_roundss_i64(A,B) ((long long)__builtin_ia32_vcvtss2si64(A, B)) -#define _mm_cvtt_roundss_u64(A,B) ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B)) -#define _mm_cvtt_roundss_i64(A,B) ((long long)__builtin_ia32_vcvttss2si64(A, B)) -#define _mm_cvtt_roundss_si64(A,B) ((long long)__builtin_ia32_vcvttss2si64(A, B)) -#define _mm_cvt_roundss_u32(A,B) ((unsigned)__builtin_ia32_vcvtss2usi32(A, B)) -#define _mm_cvt_roundss_si32(A,B) ((int)__builtin_ia32_vcvtss2si32(A, B)) -#define _mm_cvt_roundss_i32(A,B) ((int)__builtin_ia32_vcvtss2si32(A, B)) -#define _mm_cvtt_roundss_u32(A,B) ((unsigned)__builtin_ia32_vcvttss2usi32(A, B)) -#define _mm_cvtt_roundss_si32(A,B) ((int)__builtin_ia32_vcvttss2si32(A, B)) -#define _mm_cvtt_roundss_i32(A,B) ((int)__builtin_ia32_vcvttss2si32(A, B)) -#define _mm_cvt_roundsd_u64(A,B) ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B)) -#define _mm_cvt_roundsd_si64(A,B) ((long long)__builtin_ia32_vcvtsd2si64(A, B)) -#define _mm_cvt_roundsd_i64(A,B) ((long long)__builtin_ia32_vcvtsd2si64(A, B)) -#define _mm_cvtt_roundsd_u64(A,B) ((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B)) -#define _mm_cvtt_roundsd_si64(A,B) ((long long)__builtin_ia32_vcvttsd2si64(A, B)) -#define _mm_cvtt_roundsd_i64(A,B) ((long long)__builtin_ia32_vcvttsd2si64(A, B)) -#define _mm_cvt_roundsd_u32(A,B) ((unsigned)__builtin_ia32_vcvtsd2usi32(A, B)) -#define _mm_cvt_roundsd_si32(A,B) ((int)__builtin_ia32_vcvtsd2si32(A, B)) -#define _mm_cvt_roundsd_i32(A,B) ((int)__builtin_ia32_vcvtsd2si32(A, B)) -#define _mm_cvtt_roundsd_u32(A,B) ((unsigned)__builtin_ia32_vcvttsd2usi32(A, B)) -#define _mm_cvtt_roundsd_si32(A,B) ((int)__builtin_ia32_vcvttsd2si32(A, B)) -#define _mm_cvtt_roundsd_i32(A,B) ((int)__builtin_ia32_vcvttsd2si32(A, B)) -extern __inline __m512d +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_movedup_pd (__m512d __A) +_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A) { - return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M); } -extern __inline __m512d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A) +_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) { - return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, - (__v8df) __W, - (__mmask8) __U); + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) __O, __M); } -extern __inline __m512d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) +_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A) { - return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); } -extern __inline __m512d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpacklo_pd (__m512d __A, __m512d __B) +_mm512_cvtsepi32_epi16 (__m512i __A) { - return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_undefined_si256 (), + (__mmask16) -1); } -extern __inline __m512d +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpacklo_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) { - return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U); + __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); } -extern __inline __m512d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) { - return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) __O, __M); } -extern __inline __m512d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpackhi_pd (__m512d __A, __m512d __B) +_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A) { - return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); } -extern __inline __m512d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpackhi_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +_mm512_cvtusepi32_epi16 (__m512i __A) { - return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U); + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_undefined_si256 (), + (__mmask16) -1); } -extern __inline __m512d +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpackhi_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) { - return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); } -extern __inline __m512 +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpackhi_ps (__m512 __A, __m512 __B) +_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) { - return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) __O, + __M); } -extern __inline __m512 +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpackhi_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) { - return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U); + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); } -extern __inline __m512 +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm512_cvtepi64_epi32 (__m512i __A) { - return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1); } -#define _mm512_cvt_roundps_pd(A,B) (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, B) -#define _mm512_mask_cvt_roundps_pd(W,U,A,B) (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)(W), U, B) -#define _mm512_maskz_cvt_roundps_pd(U,A,B) (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_setzero_pd(), U, B) -#define _mm512_cvt_roundph_ps(A,B) (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_undefined_ps(), -1, B) -#define _mm512_mask_cvt_roundph_ps(W,U,A,B) (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)(W), U, B) -#define _mm512_maskz_cvt_roundph_ps(U,A,B) (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_setzero_ps(), U, B) -#define _mm512_cvt_roundps_ph(A,I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I), (__v16hi)_mm256_undefined_si256 (), -1)) -#define _mm512_cvtps_ph(A,I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I), (__v16hi)_mm256_undefined_si256 (), -1)) -#define _mm512_mask_cvt_roundps_ph(U,W,A,I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I), (__v16hi)(__m256i)(U), (__mmask16) (W))) -#define _mm512_mask_cvtps_ph(U,W,A,I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I), (__v16hi)(__m256i)(U), (__mmask16) (W))) -#define _mm512_maskz_cvt_roundps_ph(W,A,I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I), (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W))) -#define _mm512_maskz_cvtps_ph(W,A,I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I), (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W))) -#define _mm512_cvt_roundpd_ps(A,B) (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B) -#define _mm512_mask_cvt_roundpd_ps(W,U,A,B) (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)(W), U, B) -#define _mm512_maskz_cvt_roundpd_ps(U,A,B) (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), U, B) -#define _mm_cvt_roundsd_ss(A,B,C) (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C) -#define _mm_cvt_roundss_sd(A,B,C) (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C) extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_stream_si512 (__m512i * __P, __m512i __A) +_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) { - __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A); + __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); } -extern __inline void +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_stream_ps (float *__P, __m512 __A) +_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) { - __builtin_ia32_movntps512 (__P, (__v16sf) __A); + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) __O, __M); } -extern __inline void +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_stream_pd (double *__P, __m512d __A) +_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A) { - __builtin_ia32_movntpd512 (__P, (__v8df) __A); + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); } -extern __inline __m512i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_stream_load_si512 (void *__P) +_mm512_cvtsepi64_epi32 (__m512i __A) { - return __builtin_ia32_movntdqa512 ((__v8di *)__P); + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1); } -typedef enum -{ - _MM_MANT_NORM_1_2, - _MM_MANT_NORM_p5_2, - _MM_MANT_NORM_p5_1, - _MM_MANT_NORM_p75_1p5 -} _MM_MANTISSA_NORM_ENUM; -typedef enum -{ - _MM_MANT_SIGN_src, - _MM_MANT_SIGN_zero, - _MM_MANT_SIGN_nan -} _MM_MANTISSA_SIGN_ENUM; -#define _mm512_getmant_round_pd(X,B,C,R) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1, (R))) -#define _mm512_mask_getmant_round_pd(W,U,X,B,C,R) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)(__m512d)(W), (__mmask8)(U), (R))) -#define _mm512_maskz_getmant_round_pd(U,X,B,C,R) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U), (R))) -#define _mm512_getmant_round_ps(X,B,C,R) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1, (R))) -#define _mm512_mask_getmant_round_ps(W,U,X,B,C,R) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)(__m512)(W), (__mmask16)(U), (R))) -#define _mm512_maskz_getmant_round_ps(U,X,B,C,R) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U), (R))) -#define _mm_getmant_round_sd(X,Y,C,D,R) ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), (R))) -#define _mm_mask_getmant_round_sd(W,U,X,Y,C,D,R) ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), (__v2df)(__m128d)(W), (__mmask8)(U), (R))) -#define _mm_maskz_getmant_round_sd(U,X,Y,C,D,R) ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)(U), (R))) -#define _mm_getmant_round_ss(X,Y,C,D,R) ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), (R))) -#define _mm_mask_getmant_round_ss(W,U,X,Y,C,D,R) ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), (__v4sf)(__m128)(W), (__mmask8)(U), (R))) -#define _mm_maskz_getmant_round_ss(U,X,Y,C,D,R) ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U), (R))) -#define _mm_getexp_round_ss(A,B,R) ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), R)) -#define _mm_mask_getexp_round_ss(W,U,A,B,C) (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U, C) -#define _mm_maskz_getexp_round_ss(U,A,B,C) (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) -#define _mm_getexp_round_sd(A,B,R) ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), R)) -#define _mm_mask_getexp_round_sd(W,U,A,B,C) (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U, C) -#define _mm_maskz_getexp_round_sd(U,A,B,C) (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) -#define _mm512_getexp_round_ps(A,R) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, R)) -#define _mm512_mask_getexp_round_ps(W,U,A,R) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)(__m512)(W), (__mmask16)(U), R)) -#define _mm512_maskz_getexp_round_ps(U,A,R) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), R)) -#define _mm512_getexp_round_pd(A,R) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)-1, R)) -#define _mm512_mask_getexp_round_pd(W,U,A,R) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)(__m512d)(W), (__mmask8)(U), R)) -#define _mm512_maskz_getexp_round_pd(U,A,R) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), R)) -#define _mm512_roundscale_round_ps(A,B,R) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), R)) -#define _mm512_mask_roundscale_round_ps(A,B,C,D,R) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), (int)(D), (__v16sf)(__m512)(A), (__mmask16)(B), R)) -#define _mm512_maskz_roundscale_round_ps(A,B,C,R) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps(), (__mmask16)(A), R)) -#define _mm512_roundscale_round_pd(A,B,R) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), R)) -#define _mm512_mask_roundscale_round_pd(A,B,C,D,R) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), (int)(D), (__v8df)(__m512d)(A), (__mmask8)(B), R)) -#define _mm512_maskz_roundscale_round_pd(A,B,C,R) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd(), (__mmask8)(A), R)) -#define _mm_roundscale_round_ss(A,B,C,R) ((__m128) __builtin_ia32_rndscaless_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), R)) -#define _mm_roundscale_round_sd(A,B,C,R) ((__m128d) __builtin_ia32_rndscalesd_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), R)) -extern __inline __m512 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_floor_ps (__m512 __A) +_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A) { - return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, - (0x01 | 0x00), - (__v16sf) __A, -1, - 0x04); + __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); } -extern __inline __m512d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_floor_pd (__m512d __A) +_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) { - return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, - (0x01 | 0x00), - (__v8df) __A, -1, - 0x04); + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) __O, __M); } -extern __inline __m512 +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_ceil_ps (__m512 __A) +_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A) { - return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, - (0x02 | 0x00), - (__v16sf) __A, -1, - 0x04); + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); } -extern __inline __m512d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_ceil_pd (__m512d __A) +_mm512_cvtusepi64_epi32 (__m512i __A) { - return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, - (0x02 | 0x00), - (__v8df) __A, -1, - 0x04); + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1); } -extern __inline __m512 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) +_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) { - return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, - (0x01 | 0x00), - (__v16sf) __W, __U, - 0x04); + __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M); } -extern __inline __m512d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) +_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) { - return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, - (0x01 | 0x00), - (__v8df) __W, __U, - 0x04); + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) __O, __M); } -extern __inline __m512 +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) +_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) { - return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, - (0x02 | 0x00), - (__v16sf) __W, __U, - 0x04); + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); } -extern __inline __m512d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) +_mm512_cvtepi64_epi16 (__m512i __A) { - return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, - (0x02 | 0x00), - (__v8df) __W, __U, - 0x04); + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); } -#define _mm512_alignr_epi32(X,Y,C) ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_undefined_epi32 (), (__mmask16)-1)) -#define _mm512_mask_alignr_epi32(W,U,X,Y,C) ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) -#define _mm512_maskz_alignr_epi32(U,X,Y,C) ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_setzero_si512 (), (__mmask16)(U))) -#define _mm512_alignr_epi64(X,Y,C) ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_undefined_epi32 (), (__mmask8)-1)) -#define _mm512_mask_alignr_epi64(W,U,X,Y,C) ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) -#define _mm512_maskz_alignr_epi64(U,X,Y,C) ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_setzero_si512 (), (__mmask8)(U))) -extern __inline __mmask16 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpeq_epi32_mask (__m512i __A, __m512i __B) +_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) { - return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); } -extern __inline __mmask16 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpeq_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) { - return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A, - (__v16si) __B, __U); + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpeq_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A, - (__v8di) __B, __U); + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpeq_epi64_mask (__m512i __A, __m512i __B) +_mm512_cvtsepi64_epi16 (__m512i __A) { - return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A, - (__v8di) __B, - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __mmask16 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpgt_epi32_mask (__m512i __A, __m512i __B) +_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A) { - return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); } -extern __inline __mmask16 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpgt_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) { - return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A, - (__v16si) __B, __U); + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpgt_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A, - (__v8di) __B, __U); + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpgt_epi64_mask (__m512i __A, __m512i __B) +_mm512_cvtusepi64_epi16 (__m512i __A) { - return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A, - (__v8di) __B, + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) + _mm_undefined_si128 (), (__mmask8) -1); } -extern __inline __mmask16 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpge_epi32_mask (__m512i __X, __m512i __Y) +_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) { - return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, - (__v16si) __Y, 5, - (__mmask16) -1); + __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M); } -extern __inline __mmask16 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpge_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) { - return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, - (__v16si) __Y, 5, - (__mmask16) __M); + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); } -extern __inline __mmask16 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpge_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, - (__v16si) __Y, 5, - (__mmask16) __M); + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } -extern __inline __mmask16 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpge_epu32_mask (__m512i __X, __m512i __Y) +_mm512_cvtepi64_epi8 (__m512i __A) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, - (__v16si) __Y, 5, - (__mmask16) -1); + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpge_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, - (__v8di) __Y, 5, - (__mmask8) __M); + __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpge_epi64_mask (__m512i __X, __m512i __Y) +_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, - (__v8di) __Y, 5, - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) __O, __M); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpge_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, - (__v8di) __Y, 5, - (__mmask8) __M); + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpge_epu64_mask (__m512i __X, __m512i __Y) +_mm512_cvtsepi64_epi8 (__m512i __A) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, - (__v8di) __Y, 5, - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __mmask16 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmple_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) { - return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, - (__v16si) __Y, 2, - (__mmask16) __M); + __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); } -extern __inline __mmask16 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmple_epi32_mask (__m512i __X, __m512i __Y) +_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) { - return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, - (__v16si) __Y, 2, - (__mmask16) -1); + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) __O, __M); } -extern __inline __mmask16 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmple_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, - (__v16si) __Y, 2, - (__mmask16) __M); + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __mmask16 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmple_epu32_mask (__m512i __X, __m512i __Y) +_mm512_cvtusepi64_epi8 (__m512i __A) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, - (__v16si) __Y, 2, - (__mmask16) -1); + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmple_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, - (__v8di) __Y, 2, - (__mmask8) __M); + __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmple_epi64_mask (__m512i __X, __m512i __Y) +_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, - (__v8di) __Y, 2, - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) __O, + __M); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmple_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, - (__v8di) __Y, 2, - (__mmask8) __M); + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __mmask8 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmple_epu64_mask (__m512i __X, __m512i __Y) +_mm512_cvtepi32_pd (__m256i __A) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, - (__v8di) __Y, 2, + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_undefined_pd (), (__mmask8) -1); } -extern __inline __mmask16 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmplt_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) { - return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, - (__v16si) __Y, 1, - (__mmask16) __M); + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __mmask16 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmplt_epi32_mask (__m512i __X, __m512i __Y) +_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) { - return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, - (__v16si) __Y, 1, - (__mmask16) -1); + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } -extern __inline __mmask16 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmplt_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +_mm512_cvtepu32_pd (__m256i __A) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, - (__v16si) __Y, 1, - (__mmask16) __M); + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } -extern __inline __mmask16 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmplt_epu32_mask (__m512i __X, __m512i __Y) +_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, - (__v16si) __Y, 1, - (__mmask16) -1); + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmplt_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) { - return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, - (__v8di) __Y, 1, - (__mmask8) __M); + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmplt_epi64_mask (__m512i __X, __m512i __Y) +_mm512_cvt_roundepi32_ps (__m512i __A, const int __R) { - return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, - (__v8di) __Y, 1, - (__mmask8) -1); + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } -extern __inline __mmask8 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmplt_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_mask_cvt_roundepi32_ps (__m512 __W, __mmask16 __U, __m512i __A, + const int __R) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, - (__v8di) __Y, 1, - (__mmask8) __M); + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } -extern __inline __mmask8 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmplt_epu64_mask (__m512i __X, __m512i __Y) +_mm512_maskz_cvt_roundepi32_ps (__mmask16 __U, __m512i __A, const int __R) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, - (__v8di) __Y, 1, - (__mmask8) -1); + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } -extern __inline __mmask16 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpneq_epi32_mask (__m512i __X, __m512i __Y) +_mm512_cvt_roundepu32_ps (__m512i __A, const int __R) { - return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, - (__v16si) __Y, 4, - (__mmask16) -1); + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } -extern __inline __mmask16 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpneq_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +_mm512_mask_cvt_roundepu32_ps (__m512 __W, __mmask16 __U, __m512i __A, + const int __R) { - return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, - (__v16si) __Y, 4, - (__mmask16) __M); + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } -extern __inline __mmask16 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpneq_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +_mm512_maskz_cvt_roundepu32_ps (__mmask16 __U, __m512i __A, const int __R) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, - (__v16si) __Y, 4, - (__mmask16) __M); + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } -extern __inline __mmask16 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpneq_epu32_mask (__m512i __X, __m512i __Y) +_mm512_extractf64x4_pd (__m512d __A, const int __imm) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, - (__v16si) __Y, 4, - (__mmask16) -1); + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) + _mm256_undefined_pd (), + (__mmask8) -1); } -extern __inline __mmask8 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpneq_epi64_mask (__mmask16 __M, __m512i __X, __m512i __Y) +_mm512_mask_extractf64x4_pd (__m256d __W, __mmask8 __U, __m512d __A, + const int __imm) { - return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, - (__v8di) __Y, 4, - (__mmask8) __M); + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpneq_epi64_mask (__m512i __X, __m512i __Y) +_mm512_maskz_extractf64x4_pd (__mmask8 __U, __m512d __A, const int __imm) { - return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, - (__v8di) __Y, 4, + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf32x4_ps (__m512 __A, const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) + _mm_undefined_ps (), (__mmask8) -1); } -extern __inline __mmask8 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpneq_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m512 __A, + const int __imm) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, - (__v8di) __Y, 4, - (__mmask8) __M); + return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpneq_epu64_mask (__m512i __X, __m512i __Y) +_mm512_maskz_extractf32x4_ps (__mmask8 __U, __m512 __A, const int __imm) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, - (__v8di) __Y, 4, - (__mmask8) -1); + return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } -#define _MM_CMPINT_EQ 0x0 -#define _MM_CMPINT_LT 0x1 -#define _MM_CMPINT_LE 0x2 -#define _MM_CMPINT_UNUSED 0x3 -#define _MM_CMPINT_NE 0x4 -#define _MM_CMPINT_NLT 0x5 -#define _MM_CMPINT_GE 0x5 -#define _MM_CMPINT_NLE 0x6 -#define _MM_CMPINT_GT 0x6 -#define _kshiftli_mask16(X,Y) ((__mmask16) __builtin_ia32_kshiftlihi ((__mmask16)(X), (__mmask8)(Y))) -#define _kshiftri_mask16(X,Y) ((__mmask16) __builtin_ia32_kshiftrihi ((__mmask16)(X), (__mmask8)(Y))) -#define _mm512_cmp_epi64_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)-1)) -#define _mm512_cmp_epi32_mask(X,Y,P) ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)-1)) -#define _mm512_cmp_epu64_mask(X,Y,P) ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)-1)) -#define _mm512_cmp_epu32_mask(X,Y,P) ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)-1)) -#define _mm512_cmp_round_pd_mask(X,Y,P,R) ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)-1, R)) -#define _mm512_cmp_round_ps_mask(X,Y,P,R) ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)-1, R)) -#define _mm512_mask_cmp_epi64_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)M)) -#define _mm512_mask_cmp_epi32_mask(M,X,Y,P) ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)M)) -#define _mm512_mask_cmp_epu64_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)M)) -#define _mm512_mask_cmp_epu32_mask(M,X,Y,P) ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)M)) -#define _mm512_mask_cmp_round_pd_mask(M,X,Y,P,R) ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)M, R)) -#define _mm512_mask_cmp_round_ps_mask(M,X,Y,P,R) ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)M, R)) -#define _mm_cmp_round_sd_mask(X,Y,P,R) ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1, R)) -#define _mm_mask_cmp_round_sd_mask(M,X,Y,P,R) ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (M), R)) -#define _mm_cmp_round_ss_mask(X,Y,P,R) ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1, R)) -#define _mm_mask_cmp_round_ss_mask(M,X,Y,P,R) ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (M), R)) -#define _mm512_i32gather_ps(INDEX,ADDR,SCALE) (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)_mm512_undefined_ps(), (void const *)ADDR, (__v16si)(__m512i)INDEX, (__mmask16)0xFFFF, (int)SCALE) -#define _mm512_mask_i32gather_ps(V1OLD,MASK,INDEX,ADDR,SCALE) (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)(__m512)V1OLD, (void const *)ADDR, (__v16si)(__m512i)INDEX, (__mmask16)MASK, (int)SCALE) -#define _mm512_i32gather_pd(INDEX,ADDR,SCALE) (__m512d) __builtin_ia32_gathersiv8df ((__v8df)_mm512_undefined_pd(), (void const *)ADDR, (__v8si)(__m256i)INDEX, (__mmask8)0xFF, (int)SCALE) -#define _mm512_mask_i32gather_pd(V1OLD,MASK,INDEX,ADDR,SCALE) (__m512d) __builtin_ia32_gathersiv8df ((__v8df)(__m512d)V1OLD, (void const *)ADDR, (__v8si)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm512_i64gather_ps(INDEX,ADDR,SCALE) (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)_mm256_undefined_ps(), (void const *)ADDR, (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE) -#define _mm512_mask_i64gather_ps(V1OLD,MASK,INDEX,ADDR,SCALE) (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)(__m256)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm512_i64gather_pd(INDEX,ADDR,SCALE) (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)_mm512_undefined_pd(), (void const *)ADDR, (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE) -#define _mm512_mask_i64gather_pd(V1OLD,MASK,INDEX,ADDR,SCALE) (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)(__m512d)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm512_i32gather_epi32(INDEX,ADDR,SCALE) (__m512i) __builtin_ia32_gathersiv16si ((__v16si)_mm512_undefined_epi32 (), (void const *)ADDR, (__v16si)(__m512i)INDEX, (__mmask16)0xFFFF, (int)SCALE) -#define _mm512_mask_i32gather_epi32(V1OLD,MASK,INDEX,ADDR,SCALE) (__m512i) __builtin_ia32_gathersiv16si ((__v16si)(__m512i)V1OLD, (void const *)ADDR, (__v16si)(__m512i)INDEX, (__mmask16)MASK, (int)SCALE) -#define _mm512_i32gather_epi64(INDEX,ADDR,SCALE) (__m512i) __builtin_ia32_gathersiv8di ((__v8di)_mm512_undefined_epi32 (), (void const *)ADDR, (__v8si)(__m256i)INDEX, (__mmask8)0xFF, (int)SCALE) -#define _mm512_mask_i32gather_epi64(V1OLD,MASK,INDEX,ADDR,SCALE) (__m512i) __builtin_ia32_gathersiv8di ((__v8di)(__m512i)V1OLD, (void const *)ADDR, (__v8si)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm512_i64gather_epi32(INDEX,ADDR,SCALE) (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)_mm256_undefined_si256(), (void const *)ADDR, (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE) -#define _mm512_mask_i64gather_epi32(V1OLD,MASK,INDEX,ADDR,SCALE) (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)(__m256i)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm512_i64gather_epi64(INDEX,ADDR,SCALE) (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)_mm512_undefined_epi32 (), (void const *)ADDR, (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE) -#define _mm512_mask_i64gather_epi64(V1OLD,MASK,INDEX,ADDR,SCALE) (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)(__m512i)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm512_i32scatter_ps(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv16sf ((void *)ADDR, (__mmask16)0xFFFF, (__v16si)(__m512i)INDEX, (__v16sf)(__m512)V1, (int)SCALE) -#define _mm512_mask_i32scatter_ps(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv16sf ((void *)ADDR, (__mmask16)MASK, (__v16si)(__m512i)INDEX, (__v16sf)(__m512)V1, (int)SCALE) -#define _mm512_i32scatter_pd(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv8df ((void *)ADDR, (__mmask8)0xFF, (__v8si)(__m256i)INDEX, (__v8df)(__m512d)V1, (int)SCALE) -#define _mm512_mask_i32scatter_pd(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv8df ((void *)ADDR, (__mmask8)MASK, (__v8si)(__m256i)INDEX, (__v8df)(__m512d)V1, (int)SCALE) -#define _mm512_i64scatter_ps(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv16sf ((void *)ADDR, (__mmask8)0xFF, (__v8di)(__m512i)INDEX, (__v8sf)(__m256)V1, (int)SCALE) -#define _mm512_mask_i64scatter_ps(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv16sf ((void *)ADDR, (__mmask16)MASK, (__v8di)(__m512i)INDEX, (__v8sf)(__m256)V1, (int)SCALE) -#define _mm512_i64scatter_pd(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv8df ((void *)ADDR, (__mmask8)0xFF, (__v8di)(__m512i)INDEX, (__v8df)(__m512d)V1, (int)SCALE) -#define _mm512_mask_i64scatter_pd(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv8df ((void *)ADDR, (__mmask8)MASK, (__v8di)(__m512i)INDEX, (__v8df)(__m512d)V1, (int)SCALE) -#define _mm512_i32scatter_epi32(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv16si ((void *)ADDR, (__mmask16)0xFFFF, (__v16si)(__m512i)INDEX, (__v16si)(__m512i)V1, (int)SCALE) -#define _mm512_mask_i32scatter_epi32(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv16si ((void *)ADDR, (__mmask16)MASK, (__v16si)(__m512i)INDEX, (__v16si)(__m512i)V1, (int)SCALE) -#define _mm512_i32scatter_epi64(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv8di ((void *)ADDR, (__mmask8)0xFF, (__v8si)(__m256i)INDEX, (__v8di)(__m512i)V1, (int)SCALE) -#define _mm512_mask_i32scatter_epi64(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv8di ((void *)ADDR, (__mmask8)MASK, (__v8si)(__m256i)INDEX, (__v8di)(__m512i)V1, (int)SCALE) -#define _mm512_i64scatter_epi32(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv16si ((void *)ADDR, (__mmask8)0xFF, (__v8di)(__m512i)INDEX, (__v8si)(__m256i)V1, (int)SCALE) -#define _mm512_mask_i64scatter_epi32(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv16si ((void *)ADDR, (__mmask8)MASK, (__v8di)(__m512i)INDEX, (__v8si)(__m256i)V1, (int)SCALE) -#define _mm512_i64scatter_epi64(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv8di ((void *)ADDR, (__mmask8)0xFF, (__v8di)(__m512i)INDEX, (__v8di)(__m512i)V1, (int)SCALE) -#define _mm512_mask_i64scatter_epi64(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv8di ((void *)ADDR, (__mmask8)MASK, (__v8di)(__m512i)INDEX, (__v8di)(__m512i)V1, (int)SCALE) -extern __inline __m512d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) +_mm512_extracti64x4_epi64 (__m512i __A, const int __imm) { - return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, - (__v8df) __W, - (__mmask8) __U); + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) + _mm256_undefined_si256 (), + (__mmask8) -1); } -extern __inline __m512d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) +_mm512_mask_extracti64x4_epi64 (__m256i __W, __mmask8 __U, __m512i __A, + const int __imm) { - return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) __W, + (__mmask8) __U); } -extern __inline void +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A) +_mm512_maskz_extracti64x4_epi64 (__mmask8 __U, __m512i __A, const int __imm) { - __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A, - (__mmask8) __U); + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) +_mm512_extracti32x4_epi32 (__m512i __A, const int __imm) { - return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, - (__v16sf) __W, - (__mmask16) __U); + return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A, + __imm, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m512 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) +_mm512_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m512i __A, + const int __imm) { - return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A, + __imm, + (__v4si) __W, + (__mmask8) __U); } -extern __inline void +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A) +_mm512_maskz_extracti32x4_epi32 (__mmask8 __U, __m512i __A, const int __imm) { - __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A, - (__mmask16) __U); + return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A, + __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +_mm512_inserti32x4 (__m512i __A, __m128i __B, const int __imm) { - return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __A, + (__v4si) __B, + __imm, + (__v16si) __A, -1); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) +_mm512_insertf32x4 (__m512 __A, __m128 __B, const int __imm) { - return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __A, + (__v4sf) __B, + __imm, + (__v16sf) __A, -1); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A) +_mm512_inserti64x4 (__m512i __A, __m256i __B, const int __imm) { - __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A, - (__mmask8) __U); + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +_mm512_mask_inserti64x4 (__m512i __W, __mmask8 __U, __m512i __A, + __m256i __B, const int __imm) { - return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) __W, + (__mmask8) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) +_mm512_maskz_inserti64x4 (__mmask8 __U, __m512i __A, __m256i __B, + const int __imm) { - return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) +_mm512_insertf64x4 (__m512d __A, __m256d __B, const int __imm) { - __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A, - (__mmask16) __U); + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) +_mm512_mask_insertf64x4 (__m512d __W, __mmask8 __U, __m512d __A, + __m256d __B, const int __imm) { - return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, (__v8df) __W, (__mmask8) __U); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) +_mm512_maskz_insertf64x4 (__mmask8 __U, __m512d __A, __m256d __B, + const int __imm) { - return (__m512d) __builtin_ia32_expanddf512_maskz ((__v8df) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_expandloadu_pd (__m512d __W, __mmask8 __U, void const *__P) +_mm512_loadu_pd (void const *__P) { - return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *) __P, - (__v8df) __W, - (__mmask8) __U); + return *(__m512d_u *)__P; } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) { - return (__m512d) __builtin_ia32_expandloaddf512_maskz ((const __v8df *) __P, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A) +_mm512_maskz_loadu_pd (__mmask8 __U, void const *__P) { - return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, - (__v16sf) __W, - (__mmask16) __U); + return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_pd (void *__P, __m512d __A) +{ + *(__m512d_u *)__P = __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_pd (void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_storeupd512_mask ((double *) __P, (__v8df) __A, + (__mmask8) __U); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A) +_mm512_loadu_ps (void const *__P) { - return (__m512) __builtin_ia32_expandsf512_maskz ((__v16sf) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return *(__m512_u *)__P; } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_expandloadu_ps (__m512 __W, __mmask16 __U, void const *__P) +_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) { - return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *) __P, - (__v16sf) __W, - (__mmask16) __U); + return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, + (__v16sf) __W, + (__mmask16) __U); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_expandloadu_ps (__mmask16 __U, void const *__P) +_mm512_maskz_loadu_ps (__mmask16 __U, void const *__P) { - return (__m512) __builtin_ia32_expandloadsf512_maskz ((const __v16sf *) __P, - (__v16sf) - _mm512_setzero_ps (), + return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_ps (void *__P, __m512 __A) +{ + *(__m512_u *)__P = __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_ps (void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_storeups512_mask ((float *) __P, (__v16sf) __A, (__mmask16) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) { - return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, + (__v8di) __W, + (__mmask8) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_expand_epi64 (__mmask8 __U, __m512i __A) +_mm512_maskz_loadu_epi64 (__mmask8 __U, void const *__P) { - return (__m512i) __builtin_ia32_expanddi512_maskz ((__v8di) __A, + return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, (__v8di) _mm512_setzero_si512 (), (__mmask8) __U); } -extern __inline __m512i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_expandloadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) +_mm512_mask_storeu_epi64 (void *__P, __mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *) __P, - (__v8di) __W, - (__mmask8) __U); + __builtin_ia32_storedqudi512_mask ((long long *) __P, (__v8di) __A, + (__mmask8) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +_mm512_loadu_si512 (void const *__P) { - return (__m512i) - __builtin_ia32_expandloaddi512_maskz ((const __v8di *) __P, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return *(__m512i_u *)__P; } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) { - return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, + (__v16si) __W, + (__mmask16) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) +_mm512_maskz_loadu_epi32 (__mmask16 __U, void const *__P) { - return (__m512i) __builtin_ia32_expandsi512_maskz ((__v16si) __A, + return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, (__v16si) _mm512_setzero_si512 (), (__mmask16) __U); } -extern __inline __m512i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_expandloadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) +_mm512_storeu_si512 (void *__P, __m512i __A) { - return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *) __P, - (__v16si) __W, - (__mmask16) __U); + *(__m512i_u *)__P = __A; } -extern __inline __m512i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_expandloadu_epi32 (__mmask16 __U, void const *__P) +_mm512_mask_storeu_epi32 (void *__P, __mmask16 __U, __m512i __A) { - return (__m512i) __builtin_ia32_expandloadsi512_maskz ((const __v16si *) __P, - (__v16si) - _mm512_setzero_si512 - (), (__mmask16) __U); + __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A, + (__mmask16) __U); } -#define _kand_mask16 _mm512_kand -#define _kandn_mask16 _mm512_kandn -#define _knot_mask16 _mm512_knot -#define _kor_mask16 _mm512_kor -#define _kxnor_mask16 _mm512_kxnor -#define _kxor_mask16 _mm512_kxor -extern __inline unsigned char +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF) +_mm512_permutevar_pd (__m512d __A, __m512i __C) { - *__CF = (unsigned char) __builtin_ia32_kortestchi (__A, __B); - return (unsigned char) __builtin_ia32_kortestzhi (__A, __B); + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } -extern __inline unsigned char +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestz_mask16_u8 (__mmask16 __A, __mmask16 __B) +_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) { - return (unsigned char) __builtin_ia32_kortestzhi ((__mmask16) __A, - (__mmask16) __B); + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) __W, + (__mmask8) __U); } -extern __inline unsigned char +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestc_mask16_u8 (__mmask16 __A, __mmask16 __B) +_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C) { - return (unsigned char) __builtin_ia32_kortestchi ((__mmask16) __A, - (__mmask16) __B); + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } -extern __inline unsigned int +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_cvtmask16_u32 (__mmask16 __A) +_mm512_permutevar_ps (__m512 __A, __m512i __C) { - return (unsigned int) __builtin_ia32_kmovw ((__mmask16 ) __A); + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } -extern __inline __mmask16 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_cvtu32_mask16 (unsigned int __A) +_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) { - return (__mmask16) __builtin_ia32_kmovw ((__mmask16 ) __A); + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline __mmask16 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_load_mask16 (__mmask16 *__A) +_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C) { - return (__mmask16) __builtin_ia32_kmovw (*(__mmask16 *) __A); + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_store_mask16 (__mmask16 *__A, __mmask16 __B) +_mm512_permutex2var_epi64 (__m512i __A, __m512i __I, __m512i __B) { - *(__mmask16 *) __A = __builtin_ia32_kmovw (__B); + return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I + , + (__v8di) __A, + (__v8di) __B, + (__mmask8) -1); } -extern __inline __mmask16 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_kand (__mmask16 __A, __mmask16 __B) +_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I, + __m512i __B) { - return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); + return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I + , + (__v8di) __A, + (__v8di) __B, + (__mmask8) __U); } -extern __inline __mmask16 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_kandn (__mmask16 __A, __mmask16 __B) +_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I, + __mmask8 __U, __m512i __B) { - return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, - (__mmask16) __B); + return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A, + (__v8di) __I + , + (__v8di) __B, + (__mmask8) __U); } -extern __inline __mmask16 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_kor (__mmask16 __A, __mmask16 __B) +_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A, + __m512i __I, __m512i __B) { - return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); + return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I + , + (__v8di) __A, + (__v8di) __B, + (__mmask8) __U); } -extern __inline int +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_kortestz (__mmask16 __A, __mmask16 __B) +_mm512_permutex2var_epi32 (__m512i __A, __m512i __I, __m512i __B) { - return (__mmask16) __builtin_ia32_kortestzhi ((__mmask16) __A, - (__mmask16) __B); + return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I + , + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); } -extern __inline int +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_kortestc (__mmask16 __A, __mmask16 __B) +_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U, + __m512i __I, __m512i __B) { - return (__mmask16) __builtin_ia32_kortestchi ((__mmask16) __A, - (__mmask16) __B); + return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I + , + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); } -extern __inline __mmask16 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_kxnor (__mmask16 __A, __mmask16 __B) +_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I, + __mmask16 __U, __m512i __B) { - return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); + return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A, + (__v16si) __I + , + (__v16si) __B, + (__mmask16) __U); } -extern __inline __mmask16 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_kxor (__mmask16 __A, __mmask16 __B) +_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A, + __m512i __I, __m512i __B) { - return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); + return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I + , + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); } -extern __inline __mmask16 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_knot (__mmask16 __A) +_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B) { - return (__mmask16) __builtin_ia32_knothi ((__mmask16) __A); + return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I + , + (__v8df) __A, + (__v8df) __B, + (__mmask8) -1); } -extern __inline __mmask16 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_kunpackb (__mmask16 __A, __mmask16 __B) +_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, + __m512d __B) { - return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); + return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I + , + (__v8df) __A, + (__v8df) __B, + (__mmask8) __U); } -extern __inline __mmask16 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kunpackb_mask16 (__mmask8 __A, __mmask8 __B) +_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U, + __m512d __B) { - return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); + return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A, + (__v8di) __I + , + (__v8df) __B, + (__mmask8) __U); } -#define _mm512_maskz_insertf32x4(A,X,Y,C) ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), (__v4sf)(__m128) (Y), (int) (C), (__v16sf)_mm512_setzero_ps(), (__mmask8)(A))) -#define _mm512_maskz_inserti32x4(A,X,Y,C) ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v16si)_mm512_setzero_si512 (), (__mmask8)(A))) -#define _mm512_mask_insertf32x4(A,B,X,Y,C) ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (A), (__mmask8)(B))) -#define _mm512_mask_inserti32x4(A,B,X,Y,C) ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (A), (__mmask8)(B))) -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_max_epi64 (__m512i __A, __m512i __B) +_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I, + __m512d __B) { - return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I + , + (__v8df) __A, + (__v8df) __B, + (__mmask8) __U); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) +_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B) { - return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I + , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) -1); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) { - return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, __M); + return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I + , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) __U); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_min_epi64 (__m512i __A, __m512i __B) +_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U, + __m512 __B) { - return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A, + (__v16si) __I + , + (__v16sf) __B, + (__mmask16) __U); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I, + __m512 __B) { - return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, __M); + return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I + , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) __U); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) +_mm512_permute_pd (__m512d __X, const int __C) { - return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_max_epu64 (__m512i __A, __m512i __B) +_mm512_mask_permute_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __C) { - return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) +_mm512_maskz_permute_pd (__mmask8 __U, __m512d __X, const int __C) { - return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +_mm512_permute_ps (__m512 __X, const int __C) { - return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, __M); + return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_min_epu64 (__m512i __A, __m512i __B) +_mm512_mask_permute_ps (__m512 __W, __mmask16 __U, __m512 __X, const int __C) { - return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +_mm512_maskz_permute_ps (__mmask16 __U, __m512 __X, const int __C) { - return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, __M); + return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) +_mm512_permutex_epi64 (__m512i __X, const int __I) { - return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, - (__v8di) __B, + return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I, (__v8di) - _mm512_setzero_si512 (), - __M); + _mm512_undefined_epi32 (), + (__mmask8) (-1)); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_max_epi32 (__m512i __A, __m512i __B) +_mm512_mask_permutex_epi64 (__m512i __W, __mmask8 __M, + __m512i __X, const int __I) { - return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I, + (__v8di) __W, + (__mmask8) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +_mm512_maskz_permutex_epi64 (__mmask8 __M, __m512i __X, const int __I) { - return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) + return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I, + (__v8di) _mm512_setzero_si512 (), - __M); + (__mmask8) __M); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +_mm512_permutex_pd (__m512d __X, const int __M) { - return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, __M); + return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_min_epi32 (__m512i __A, __m512i __B) +_mm512_mask_permutex_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __M) { - return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +_mm512_maskz_permutex_pd (__mmask8 __U, __m512d __X, const int __M) { - return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, __M); + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, + (__v8di) __X, + (__v8di) + _mm512_setzero_si512 (), + __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_max_epu32 (__m512i __A, __m512i __B) +_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, + (__v8di) __X, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) +_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) { - return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, + (__v8di) __X, + (__v8di) __W, + __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, __M); + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, + (__v16si) __X, + (__v16si) + _mm512_setzero_si512 (), + __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_min_epu32 (__m512i __A, __m512i __B) +_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, + (__v16si) __X, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) +_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, + __m512i __Y) { - return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, + (__v16si) __X, + (__v16si) __W, + __M); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +_mm512_permutexvar_pd (__m512i __X, __m512d __Y) { - return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, __M); + return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, + (__v8di) __X, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } -extern __inline __m512 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpacklo_ps (__m512 __A, __m512 __B) +_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) { - return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, + (__v8di) __X, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpacklo_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) { - return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U); + return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, + (__v8di) __X, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm512_permutexvar_ps (__m512i __X, __m512 __Y) { - return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, + (__v16si) __X, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } -#define _mm_max_round_sd(A,B,C) (__m128d)__builtin_ia32_maxsd_round(A, B, C) -#define _mm_mask_max_round_sd(W,U,A,B,C) (__m128d)__builtin_ia32_maxsd_mask_round(A, B, W, U, C) -#define _mm_maskz_max_round_sd(U,A,B,C) (__m128d)__builtin_ia32_maxsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) -#define _mm_max_round_ss(A,B,C) (__m128)__builtin_ia32_maxss_round(A, B, C) -#define _mm_mask_max_round_ss(W,U,A,B,C) (__m128)__builtin_ia32_maxss_mask_round(A, B, W, U, C) -#define _mm_maskz_max_round_ss(U,A,B,C) (__m128)__builtin_ia32_maxss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) -#define _mm_min_round_sd(A,B,C) (__m128d)__builtin_ia32_minsd_round(A, B, C) -#define _mm_mask_min_round_sd(W,U,A,B,C) (__m128d)__builtin_ia32_minsd_mask_round(A, B, W, U, C) -#define _mm_maskz_min_round_sd(U,A,B,C) (__m128d)__builtin_ia32_minsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) -#define _mm_min_round_ss(A,B,C) (__m128)__builtin_ia32_minss_round(A, B, C) -#define _mm_mask_min_round_ss(W,U,A,B,C) (__m128)__builtin_ia32_minss_mask_round(A, B, W, U, C) -#define _mm_maskz_min_round_ss(U,A,B,C) (__m128)__builtin_ia32_minss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) -extern __inline __m512d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_blend_pd (__mmask8 __U, __m512d __A, __m512d __W) +_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) { - return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A, - (__v8df) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, + (__v16si) __X, + (__v16sf) __W, + (__mmask16) __U); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_blend_ps (__mmask16 __U, __m512 __A, __m512 __W) +_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) { - return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A, - (__v16sf) __W, + return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, + (__v16si) __X, + (__v16sf) + _mm512_setzero_ps (), (__mmask16) __U); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_blend_epi64 (__mmask8 __U, __m512i __A, __m512i __W) +_mm512_shuffle_ps (__m512 __M, __m512 __V, const int __imm) { - return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M, + (__v16sf) __V, __imm, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_blend_epi32 (__mmask16 __U, __m512i __A, __m512i __W) +_mm512_mask_shuffle_ps (__m512 __W, __mmask16 __U, __m512 __M, + __m512 __V, const int __imm) { - return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M, + (__v16sf) __V, __imm, + (__v16sf) __W, + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_ps (__mmask16 __U, __m512 __M, __m512 __V, const int __imm) +{ + return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M, + (__v16sf) __V, __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -#define _mm_fmadd_round_sd(A,B,C,R) (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, C, R) -#define _mm_fmadd_round_ss(A,B,C,R) (__m128)__builtin_ia32_vfmaddss3_round(A, B, C, R) -#define _mm_fmsub_round_sd(A,B,C,R) (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, -(C), R) -#define _mm_fmsub_round_ss(A,B,C,R) (__m128)__builtin_ia32_vfmaddss3_round(A, B, -(C), R) -#define _mm_fnmadd_round_sd(A,B,C,R) (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), C, R) -#define _mm_fnmadd_round_ss(A,B,C,R) (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), C, R) -#define _mm_fnmsub_round_sd(A,B,C,R) (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R) -#define _mm_fnmsub_round_ss(A,B,C,R) (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), -(C), R) -#define _mm_comi_round_ss(A,B,C,D) __builtin_ia32_vcomiss(A, B, C, D) -#define _mm_comi_round_sd(A,B,C,D) __builtin_ia32_vcomisd(A, B, C, D) extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_sqrt_pd (__m512d __A) +_mm512_shuffle_pd (__m512d __M, __m512d __V, const int __imm) { - return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M, + (__v8df) __V, __imm, (__v8df) _mm512_undefined_pd (), - (__mmask8) -1, - 0x04); + (__mmask8) -1); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) +_mm512_mask_shuffle_pd (__m512d __W, __mmask8 __U, __m512d __M, + __m512d __V, const int __imm) { - return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M, + (__v8df) __V, __imm, (__v8df) __W, - (__mmask8) __U, - 0x04); + (__mmask8) __U); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) +_mm512_maskz_shuffle_pd (__mmask8 __U, __m512d __M, __m512d __V, + const int __imm) { - return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M, + (__v8df) __V, __imm, (__v8df) _mm512_setzero_pd (), - (__mmask8) __U, - 0x04); -} -extern __inline __m512 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_sqrt_ps (__m512 __A) -{ - return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1, - 0x04); -} -extern __inline __m512 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_sqrt_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, - (__v16sf) __W, - (__mmask16) __U, - 0x04); + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_sqrt_ps (__mmask16 __U, __m512 __A) +_mm512_fixupimm_round_pd (__m512d __A, __m512d __B, __m512i __C, + const int __imm, const int __R) { - return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - 0x04); + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) -1, __R); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_add_pd (__m512d __A, __m512d __B) +_mm512_mask_fixupimm_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512i __C, const int __imm, const int __R) { - return (__m512d) ((__v8df)__A + (__v8df)__B); + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) __U, __R); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +_mm512_maskz_fixupimm_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512i __C, const int __imm, const int __R) { - return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - 0x04); + return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) __U, __R); } -extern __inline __m512d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_add_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm512_fixupimm_round_ps (__m512 __A, __m512 __B, __m512i __C, + const int __imm, const int __R) { - return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) -1, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_add_ps (__m512 __A, __m512 __B) +_mm512_mask_fixupimm_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512i __C, const int __imm, const int __R) { - return (__m512) ((__v16sf)__A + (__v16sf)__B); + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_mm512_maskz_fixupimm_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512i __C, const int __imm, const int __R) { - return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - 0x04); + return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, __R); } -extern __inline __m512 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_add_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm_fixupimm_round_sd (__m128d __A, __m128d __B, __m128i __C, + const int __imm, const int __R) { - return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - 0x04); + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) -1, __R); } extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_add_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +_mm_mask_fixupimm_round_sd (__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm, const int __R) { - return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - 0x04); + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) __U, __R); } extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_add_sd (__mmask8 __U, __m128d __A, __m128d __B) +_mm_maskz_fixupimm_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm, const int __R) { - return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U, __R); } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_add_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +_mm_fixupimm_round_ss (__m128 __A, __m128 __B, __m128i __C, + const int __imm, const int __R) { - return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - 0x04); + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) -1, __R); } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_add_ss (__mmask8 __U, __m128 __A, __m128 __B) +_mm_mask_fixupimm_round_ss (__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm, const int __R) { - return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, - 0x04); + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, __R); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_sub_pd (__m512d __A, __m512d __B) +_mm_maskz_fixupimm_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm, const int __R) { - return (__m512d) ((__v8df)__A - (__v8df)__B); + return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, __R); } -extern __inline __m512d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +_mm512_movehdup_ps (__m512 __A) { - return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - 0x04); + return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } -extern __inline __m512d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_sub_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_sub_ps (__m512 __A, __m512 __B) +_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) { - return (__m512) ((__v16sf)__A - (__v16sf)__B); + return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_mm512_moveldup_ps (__m512 __A) { - return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - 0x04); + return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_sub_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - 0x04); + return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline __m128d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) { - return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - 0x04); + return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sub_sd (__mmask8 __U, __m128d __A, __m128d __B) +_mm512_or_si512 (__m512i __A, __m512i __B) { - return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m512i) ((__v16su) __A | (__v16su) __B); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +_mm512_or_epi32 (__m512i __A, __m512i __B) { - return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - 0x04); + return (__m512i) ((__v16su) __A | (__v16su) __B); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sub_ss (__mmask8 __U, __m128 __A, __m128 __B) +_mm512_mask_or_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mul_pd (__m512d __A, __m512d __B) +_mm512_maskz_or_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512d) ((__v8df)__A * (__v8df)__B); + return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +_mm512_or_epi64 (__m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - 0x04); + return (__m512i) ((__v8du) __A | (__v8du) __B); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_mul_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm512_mask_or_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mul_ps (__m512 __A, __m512 __B) +_mm512_maskz_or_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512) ((__v16sf)__A * (__v16sf)__B); + return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_mm512_xor_si512 (__m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - 0x04); + return (__m512i) ((__v16su) __A ^ (__v16su) __B); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_mul_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm512_xor_epi32 (__m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - 0x04); + return (__m512i) ((__v16su) __A ^ (__v16su) __B); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mul_sd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) +_mm512_mask_xor_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mul_sd (__mmask8 __U, __m128d __A, __m128d __B) +_mm512_maskz_xor_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mul_ss (__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) +_mm512_xor_epi64 (__m512i __A, __m512i __B) { - return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - 0x04); + return (__m512i) ((__v8du) __A ^ (__v8du) __B); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mul_ss (__mmask8 __U, __m128 __A, __m128 __B) +_mm512_mask_xor_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_div_pd (__m512d __M, __m512d __V) +_mm512_maskz_xor_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512d) ((__v8df)__M / (__v8df)__V); + return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V) +_mm512_rol_epi32 (__m512i __A, const int __B) { - return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, - (__v8df) __V, - (__v8df) __W, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_div_pd (__mmask8 __U, __m512d __M, __m512d __V) +_mm512_mask_rol_epi32 (__m512i __W, __mmask16 __U, __m512i __A, const int __B) { - return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, - (__v8df) __V, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_div_ps (__m512 __A, __m512 __B) +_mm512_maskz_rol_epi32 (__mmask16 __U, __m512i __A, const int __B) { - return (__m512) ((__v16sf)__A / (__v16sf)__B); + return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_mm512_ror_epi32 (__m512i __A, int __B) { - return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - 0x04); + return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_div_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm512_mask_ror_epi32 (__m512i __W, __mmask16 __U, __m512i __A, int __B) { - return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - 0x04); + return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_div_sd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) +_mm512_maskz_ror_epi32 (__mmask16 __U, __m512i __A, int __B) { - return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_div_sd (__mmask8 __U, __m128d __A, __m128d __B) +_mm512_rol_epi64 (__m512i __A, const int __B) { - return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_div_ss (__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) +_mm512_mask_rol_epi64 (__m512i __W, __mmask8 __U, __m512i __A, const int __B) { - return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_div_ss (__mmask8 __U, __m128 __A, __m128 __B) +_mm512_maskz_rol_epi64 (__mmask8 __U, __m512i __A, const int __B) { - return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_max_pd (__m512d __A, __m512d __B) +_mm512_ror_epi64 (__m512i __A, int __B) { - return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1, - 0x04); + return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +_mm512_mask_ror_epi64 (__m512i __W, __mmask8 __U, __m512i __A, int __B) { - return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, int __B) { - return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_max_ps (__m512 __A, __m512 __B) +_mm512_and_si512 (__m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1, - 0x04); + return (__m512i) ((__v16su) __A & (__v16su) __B); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_mm512_and_epi32 (__m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - 0x04); + return (__m512i) ((__v16su) __A & (__v16su) __B); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm512_mask_and_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - 0x04); + return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_max_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +_mm512_maskz_and_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_max_sd (__mmask8 __U, __m128d __A, __m128d __B) +_mm512_and_epi64 (__m512i __A, __m512i __B) { - return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m512i) ((__v8du) __A & (__v8du) __B); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_max_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +_mm512_mask_and_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __U); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_max_ss (__mmask8 __U, __m128 __A, __m128 __B) +_mm512_maskz_and_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_pd (), + __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_min_pd (__m512d __A, __m512d __B) +_mm512_andnot_si512 (__m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1, - 0x04); + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +_mm512_andnot_epi32 (__m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_min_ps (__m512 __A, __m512 __B) +_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1, - 0x04); + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_mm512_andnot_epi64 (__m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - 0x04); + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - 0x04); + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __U); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_min_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_pd (), + __U); } -extern __inline __m128d +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_min_sd (__mmask8 __U, __m128d __A, __m128d __B) +_mm512_test_epi32_mask (__m512i __A, __m512i __B) { - return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A, + (__v16si) __B, + (__mmask16) -1); } -extern __inline __m128 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_min_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - 0x04); + return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A, + (__v16si) __B, __U); } -extern __inline __m128 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_min_ss (__mmask8 __U, __m128 __A, __m128 __B) +_mm512_test_epi64_mask (__m512i __A, __m512i __B) { - return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, - 0x04); + return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, + (__v8di) __B, + (__mmask8) -1); } -extern __inline __m512d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_scalef_pd (__m512d __A, __m512d __B) +_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1, - 0x04); + return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U); } -extern __inline __m512d +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +_mm512_testn_epi32_mask (__m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - 0x04); + return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A, + (__v16si) __B, + (__mmask16) -1); } -extern __inline __m512d +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A, + (__v16si) __B, __U); } -extern __inline __m512 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_scalef_ps (__m512 __A, __m512 __B) +_mm512_testn_epi64_mask (__m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1, - 0x04); + return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A, + (__v8di) __B, + (__mmask8) -1); } -extern __inline __m512 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - 0x04); + return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A, + (__v8di) __B, __U); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm512_abs_ps (__m512 __A) { - return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - 0x04); + return (__m512) _mm512_and_epi32 ((__m512i) __A, + _mm512_set1_epi32 (0x7fffffff)); } -extern __inline __m128d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_scalef_sd (__m128d __A, __m128d __B) +_mm512_mask_abs_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1, - 0x04); + return (__m512) _mm512_mask_and_epi32 ((__m512i) __W, __U, (__m512i) __A, + _mm512_set1_epi32 (0x7fffffff)); } -extern __inline __m128 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_scalef_ss (__m128 __A, __m128 __B) +_mm512_abs_pd (__m512 __A) { - return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1, - 0x04); + return (__m512d) _mm512_and_epi64 ((__m512i) __A, + _mm512_set1_epi64 (0x7fffffffffffffffLL)); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fmadd_pd (__m512d __A, __m512d __B, __m512d __C) +_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512 __A) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) -1, - 0x04); + return (__m512d) + _mm512_mask_and_epi64 ((__m512i) __W, __U, (__m512i) __A, + _mm512_set1_epi64 (0x7fffffffffffffffLL)); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +_mm512_unpackhi_epi32 (__m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +_mm512_mask_unpackhi_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +_mm512_maskz_unpackhi_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fmadd_ps (__m512 __A, __m512 __B, __m512 __C) +_mm512_unpackhi_epi64 (__m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) -1, - 0x04); + return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +_mm512_mask_unpackhi_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +_mm512_maskz_unpackhi_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +_mm512_unpacklo_epi32 (__m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fmsub_pd (__m512d __A, __m512d __B, __m512d __C) +_mm512_mask_unpacklo_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) -1, - 0x04); + return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +_mm512_maskz_unpacklo_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +_mm512_unpacklo_epi64 (__m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - 0x04); + return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fmsub_ps (__m512 __A, __m512 __B, __m512 __C) +_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) -1, - 0x04); + return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __m512 +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +_mm_cvt_roundss_u64 (__m128 __A, const int __R) { - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - 0x04); + return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) __A, __R); } -extern __inline __m512 +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +_mm_cvt_roundss_si64 (__m128 __A, const int __R) { - return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R); } -extern __inline __m512 +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +_mm_cvt_roundss_i64 (__m128 __A, const int __R) { - return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - 0x04); + return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R); } -extern __inline __m512d +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C) +_mm_cvtt_roundss_u64 (__m128 __A, const int __R) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) -1, - 0x04); + return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) __A, __R); } -extern __inline __m512d +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fmaddsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +_mm_cvtt_roundss_i64 (__m128 __A, const int __R) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R); } -extern __inline __m512d +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +_mm_cvtt_roundss_si64 (__m128 __A, const int __R) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R); } -extern __inline __m512d +extern __inline unsigned __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fmaddsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +_mm_cvt_roundss_u32 (__m128 __A, const int __R) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, __R); } -extern __inline __m512 +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C) +_mm_cvt_roundss_si32 (__m128 __A, const int __R) { - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) -1, - 0x04); + return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R); } -extern __inline __m512 +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fmaddsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +_mm_cvt_roundss_i32 (__m128 __A, const int __R) { - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R); } -extern __inline __m512 +extern __inline unsigned __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +_mm_cvtt_roundss_u32 (__m128 __A, const int __R) { - return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, __R); } -extern __inline __m512 +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fmaddsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +_mm_cvtt_roundss_i32 (__m128 __A, const int __R) { - return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R); } -extern __inline __m512d +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C) +_mm_cvtt_roundss_si32 (__m128 __A, const int __R) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) -1, - 0x04); + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R); } -extern __inline __m512d +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fmsubadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +_mm_cvt_roundsd_u64 (__m128d __A, const int __R) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - 0x04); + return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) __A, __R); } -extern __inline __m512d +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +_mm_cvt_roundsd_si64 (__m128d __A, const int __R) { - return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R); } -extern __inline __m512d +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fmsubadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +_mm_cvt_roundsd_i64 (__m128d __A, const int __R) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - 0x04); + return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R); } -extern __inline __m512 +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C) +_mm_cvtt_roundsd_u64 (__m128d __A, const int __R) { - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) -1, - 0x04); + return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) __A, __R); } -extern __inline __m512 +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fmsubadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +_mm_cvtt_roundsd_si64 (__m128d __A, const int __R) { - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - 0x04); + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R); } -extern __inline __m512 +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +_mm_cvtt_roundsd_i64 (__m128d __A, const int __R) { - return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R); } -extern __inline __m512 +extern __inline unsigned __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fmsubadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +_mm_cvt_roundsd_u32 (__m128d __A, const int __R) { - return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - 0x04); + return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, __R); } -extern __inline __m512d +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C) +_mm_cvt_roundsd_si32 (__m128d __A, const int __R) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) -1, - 0x04); + return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R); } -extern __inline __m512d +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fnmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +_mm_cvt_roundsd_i32 (__m128d __A, const int __R) { - return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R); } -extern __inline __m512d +extern __inline unsigned __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +_mm_cvtt_roundsd_u32 (__m128d __A, const int __R) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, __R); } -extern __inline __m512d +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fnmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +_mm_cvtt_roundsd_i32 (__m128d __A, const int __R) { - return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R); } -extern __inline __m512 +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C) +_mm_cvtt_roundsd_si32 (__m128d __A, const int __R) { - return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) -1, - 0x04); + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R); } -extern __inline __m512 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +_mm512_movedup_pd (__m512d __A) { - return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } -extern __inline __m512 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A) { - return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fnmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) { - return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C) +_mm512_unpacklo_pd (__m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, (__v8df) __B, - -(__v8df) __C, - (__mmask8) -1, - 0x04); + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fnmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +_mm512_mask_unpacklo_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - 0x04); + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fnmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +_mm512_unpackhi_pd (__m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - 0x04); + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } -extern __inline __m512 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C) +_mm512_mask_unpackhi_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) -1, - 0x04); + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_fnmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +_mm512_unpackhi_ps (__m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask3_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +_mm512_mask_unpackhi_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - 0x04); + return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_fnmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - 0x04); + return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvttpd_epi32 (__m512d __A) +_mm512_cvt_roundps_pd (__m256 __A, const int __R) { - return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, - (__v8si) - _mm256_undefined_si256 (), - (__mmask8) -1, - 0x04); + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) +_mm512_mask_cvt_roundps_pd (__m512d __W, __mmask8 __U, __m256 __A, + const int __R) { - return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, - (__v8si) __W, - (__mmask8) __U, - 0x04); + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) __W, + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) +_mm512_maskz_cvt_roundps_pd (__mmask8 __U, __m256 __A, const int __R) { - return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U, - 0x04); + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvttpd_epu32 (__m512d __A) +_mm512_cvt_roundph_ps (__m256i __A, const int __R) { - return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, - (__v8si) - _mm256_undefined_si256 (), - (__mmask8) -1, - 0x04); + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) +_mm512_mask_cvt_roundph_ps (__m512 __W, __mmask16 __U, __m256i __A, + const int __R) { - return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, - (__v8si) __W, - (__mmask8) __U, - 0x04); + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) +_mm512_maskz_cvt_roundph_ps (__mmask16 __U, __m256i __A, const int __R) { - return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U, - 0x04); + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtpd_epi32 (__m512d __A) +_mm512_cvt_roundps_ph (__m512 __A, const int __I) { - return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, - (__v8si) - _mm256_undefined_si256 (), - (__mmask8) -1, - 0x04); + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_undefined_si256 (), + -1); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) +_mm512_cvtps_ph (__m512 __A, const int __I) { - return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, - (__v8si) __W, - (__mmask8) __U, - 0x04); + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_undefined_si256 (), + -1); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) +_mm512_mask_cvt_roundps_ph (__m256i __U, __mmask16 __W, __m512 __A, + const int __I) { - return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U, - 0x04); + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) __U, + (__mmask16) __W); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtpd_epu32 (__m512d __A) +_mm512_mask_cvtps_ph (__m256i __U, __mmask16 __W, __m512 __A, const int __I) { - return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, - (__v8si) - _mm256_undefined_si256 (), - (__mmask8) -1, - 0x04); + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) __U, + (__mmask16) __W); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) +_mm512_maskz_cvt_roundps_ph (__mmask16 __W, __m512 __A, const int __I) { - return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, - (__v8si) __W, - (__mmask8) __U, - 0x04); + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __W); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) +_mm512_maskz_cvtps_ph (__mmask16 __W, __m512 __A, const int __I) { - return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, - (__v8si) + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) _mm256_setzero_si256 (), - (__mmask8) __U, - 0x04); + (__mmask16) __W); } -extern __inline __m512i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvttps_epi32 (__m512 __A) +_mm512_cvt_roundpd_ps (__m512d __A, const int __R) { - return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1, - 0x04); + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_undefined_ps (), + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) +_mm512_mask_cvt_roundpd_ps (__m256 __W, __mmask8 __U, __m512d __A, + const int __R) { - return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, - (__v16si) __W, - (__mmask16) __U, - 0x04); + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) __W, + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) +_mm512_maskz_cvt_roundpd_ps (__mmask8 __U, __m512d __A, const int __R) { - return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U, - 0x04); + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvttps_epu32 (__m512 __A) +_mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R) { - return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1, - 0x04); + return (__m128) __builtin_ia32_cvtsd2ss_round ((__v4sf) __A, + (__v2df) __B, + __R); } -extern __inline __m512i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) +_mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R) { - return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, - (__v16si) __W, - (__mmask16) __U, - 0x04); + return (__m128d) __builtin_ia32_cvtss2sd_round ((__v2df) __A, + (__v4sf) __B, + __R); } -extern __inline __m512i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) +_mm512_stream_si512 (__m512i * __P, __m512i __A) { - return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U, - 0x04); + __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A); } -extern __inline __m512i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtps_epi32 (__m512 __A) +_mm512_stream_ps (float *__P, __m512 __A) { - return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1, - 0x04); + __builtin_ia32_movntps512 (__P, (__v16sf) __A); } -extern __inline __m512i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) +_mm512_stream_pd (double *__P, __m512d __A) { - return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, - (__v16si) __W, - (__mmask16) __U, - 0x04); + __builtin_ia32_movntpd512 (__P, (__v8df) __A); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) +_mm512_stream_load_si512 (void *__P) { - return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U, - 0x04); + return __builtin_ia32_movntdqa512 ((__v8di *)__P); } -extern __inline __m512i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtps_epu32 (__m512 __A) +typedef enum { - return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1, - 0x04); -} -extern __inline __m512i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) + _MM_MANT_NORM_1_2, + _MM_MANT_NORM_p5_2, + _MM_MANT_NORM_p5_1, + _MM_MANT_NORM_p75_1p5 +} _MM_MANTISSA_NORM_ENUM; +typedef enum { - return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, - (__v16si) __W, - (__mmask16) __U, - 0x04); -} -extern __inline __m512i + _MM_MANT_SIGN_src, + _MM_MANT_SIGN_zero, + _MM_MANT_SIGN_nan +} _MM_MANTISSA_SIGN_ENUM; +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A) +_mm_getexp_round_ss (__m128 __A, __m128 __B, const int __R) { - return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U, - 0x04); + return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A, + (__v4sf) __B, + __R); } -extern __inline double +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtsd_f64 (__m512d __A) +_mm_mask_getexp_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { - return __A[0]; + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); } -extern __inline float +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtss_f32 (__m512 __A) +_mm_maskz_getexp_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { - return __A[0]; + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); } -extern __inline __m128 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtu64_ss (__m128 __A, unsigned long long __B) +_mm_getexp_round_sd (__m128d __A, __m128d __B, const int __R) { - return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, - 0x04); + return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A, + (__v2df) __B, + __R); } extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtu64_sd (__m128d __A, unsigned long long __B) +_mm_mask_getexp_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { - return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, - 0x04); + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); } -extern __inline __m128 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtu32_ss (__m128 __A, unsigned __B) +_mm_maskz_getexp_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { - return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, - 0x04); + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepi32_ps (__m512i __A) +_mm512_getexp_round_ps (__m512 __A, const int __R) { - return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, (__v16sf) _mm512_undefined_ps (), - (__mmask16) -1, - 0x04); + (__mmask16) -1, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) +_mm512_mask_getexp_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + const int __R) { - return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, (__v16sf) __W, - (__mmask16) __U, - 0x04); + (__mmask16) __U, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) +_mm512_maskz_getexp_round_ps (__mmask16 __U, __m512 __A, const int __R) { - return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, (__v16sf) _mm512_setzero_ps (), - (__mmask16) __U, - 0x04); -} -extern __inline __m512 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepu32_ps (__m512i __A) -{ - return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1, - 0x04); -} -extern __inline __m512 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) -{ - return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, - (__v16sf) __W, - (__mmask16) __U, - 0x04); -} -extern __inline __m512 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) -{ - return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - 0x04); + (__mmask16) __U, __R); } -#define _mm512_fixupimm_pd(X,Y,Z,C) ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_fixupimm_pd(X,U,Y,Z,C) ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_fixupimm_pd(U,X,Y,Z,C) ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_fixupimm_ps(X,Y,Z,C) ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_fixupimm_ps(X,U,Y,Z,C) ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_fixupimm_ps(U,X,Y,Z,C) ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_fixupimm_sd(X,Y,Z,C) ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) -#define _mm_mask_fixupimm_sd(X,U,Y,Z,C) ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_maskz_fixupimm_sd(U,X,Y,Z,C) ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_fixupimm_ss(X,Y,Z,C) ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) -#define _mm_mask_fixupimm_ss(X,U,Y,Z,C) ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_maskz_fixupimm_ss(U,X,Y,Z,C) ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -extern __inline unsigned long long +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtss_u64 (__m128 __A) +_mm512_getexp_round_pd (__m512d __A, const int __R) { - return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) - __A, - 0x04); + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } -extern __inline unsigned long long +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttss_u64 (__m128 __A) +_mm512_mask_getexp_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + const int __R) { - return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) - __A, - 0x04); + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); } -extern __inline long long +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttss_i64 (__m128 __A) +_mm512_maskz_getexp_round_pd (__mmask8 __U, __m512d __A, const int __R) { - return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, - 0x04); + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline unsigned +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtss_u32 (__m128 __A) +_mm512_getmant_round_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) { - return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, - 0x04); + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + _mm512_undefined_pd (), + (__mmask8) -1, __R); } -extern __inline unsigned +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttss_u32 (__m128 __A) +_mm512_mask_getmant_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) { - return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, - 0x04); + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) __W, __U, + __R); } -extern __inline int +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttss_i32 (__m128 __A) +_mm512_maskz_getmant_round_pd (__mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) { - return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, - 0x04); + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) + _mm512_setzero_pd (), + __U, __R); } -extern __inline unsigned long long +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsd_u64 (__m128d __A) +_mm512_getmant_round_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) { - return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) - __A, - 0x04); + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + _mm512_undefined_ps (), + (__mmask16) -1, __R); } -extern __inline unsigned long long +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttsd_u64 (__m128d __A) +_mm512_mask_getmant_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) { - return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) - __A, - 0x04); + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) __W, __U, + __R); } -extern __inline long long +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttsd_i64 (__m128d __A) +_mm512_maskz_getmant_round_ps (__mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) { - return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, - 0x04); + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) + _mm512_setzero_ps (), + __U, __R); } -extern __inline unsigned +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsd_u32 (__m128d __A) +_mm_getmant_round_sd (__m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) { - return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, - 0x04); + return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + __R); } -extern __inline unsigned +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttsd_u32 (__m128d __A) +_mm_mask_getmant_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) { - return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, - 0x04); + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) __W, + __U, __R); } -extern __inline int +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttsd_i32 (__m128d __A) +_mm_maskz_getmant_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) { - return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, - 0x04); + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) + _mm_setzero_pd(), + __U, __R); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtps_pd (__m256 __A) +_mm_getmant_round_ss (__m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) { - return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1, - 0x04); + return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + __R); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) +_mm_mask_getmant_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) { - return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, - (__v8df) __W, - (__mmask8) __U, - 0x04); + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) __W, + __U, __R); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) +_mm_maskz_getmant_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) { - return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - 0x04); + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) + _mm_setzero_ps(), + __U, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtph_ps (__m256i __A) +_mm512_roundscale_round_ps (__m512 __A, const int __imm, const int __R) { - return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1, - 0x04); + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm, + (__v16sf) + _mm512_undefined_ps (), + -1, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) +_mm512_mask_roundscale_round_ps (__m512 __A, __mmask16 __B, __m512 __C, + const int __imm, const int __R) { - return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, - (__v16sf) __W, - (__mmask16) __U, - 0x04); + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm, + (__v16sf) __A, + (__mmask16) __B, __R); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) +_mm512_maskz_roundscale_round_ps (__mmask16 __A, __m512 __B, + const int __imm, const int __R) { - return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - 0x04); + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __A, __R); } -extern __inline __m256 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtpd_ps (__m512d __A) +_mm512_roundscale_round_pd (__m512d __A, const int __imm, const int __R) { - return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, - (__v8sf) - _mm256_undefined_ps (), - (__mmask8) -1, - 0x04); + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm, + (__v8df) + _mm512_undefined_pd (), + -1, __R); } -extern __inline __m256 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) +_mm512_mask_roundscale_round_pd (__m512d __A, __mmask8 __B, + __m512d __C, const int __imm, const int __R) { - return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, - (__v8sf) __W, - (__mmask8) __U, - 0x04); + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm, + (__v8df) __A, + (__mmask8) __B, __R); } -extern __inline __m256 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) +_mm512_maskz_roundscale_round_pd (__mmask8 __A, __m512d __B, + const int __imm, const int __R) { - return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U, - 0x04); + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __A, __R); } -#define _mm512_getmant_pd(X,B,C) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_getmant_pd(W,U,X,B,C) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_getmant_pd(U,X,B,C) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_getmant_ps(X,B,C) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_getmant_ps(W,U,X,B,C) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_getmant_ps(U,X,B,C) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_getmant_sd(X,Y,C,D) ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), _MM_FROUND_CUR_DIRECTION)) -#define _mm_mask_getmant_sd(W,U,X,Y,C,D) ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_maskz_getmant_sd(U,X,Y,C,D) ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_getmant_ss(X,Y,C,D) ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), _MM_FROUND_CUR_DIRECTION)) -#define _mm_mask_getmant_ss(W,U,X,Y,C,D) ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), (__v4sf)(__m128)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_maskz_getmant_ss(U,X,Y,C,D) ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_getexp_ss(A,B) ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), _MM_FROUND_CUR_DIRECTION)) -#define _mm_mask_getexp_ss(W,U,A,B) (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U, _MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_getexp_ss(U,A,B) (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, _MM_FROUND_CUR_DIRECTION) -#define _mm_getexp_sd(A,B) ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), _MM_FROUND_CUR_DIRECTION)) -#define _mm_mask_getexp_sd(W,U,A,B) (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U, _MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_getexp_sd(U,A,B) (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, _MM_FROUND_CUR_DIRECTION) -#define _mm512_getexp_ps(A) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_getexp_ps(W,U,A) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_getexp_ps(U,A) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_getexp_pd(A) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_getexp_pd(W,U,A) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_getexp_pd(U,A) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_roundscale_ps(A,B) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_roundscale_ps(A,B,C,D) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), (int)(D), (__v16sf)(__m512)(A), (__mmask16)(B), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_roundscale_ps(A,B,C) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps(), (__mmask16)(A), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_roundscale_pd(A,B) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_roundscale_pd(A,B,C,D) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), (int)(D), (__v8df)(__m512d)(A), (__mmask8)(B), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_roundscale_pd(A,B,C) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd(), (__mmask8)(A), _MM_FROUND_CUR_DIRECTION)) -#define _mm_roundscale_ss(A,B,C) ((__m128) __builtin_ia32_rndscaless_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), _MM_FROUND_CUR_DIRECTION)) -#define _mm_roundscale_sd(A,B,C) ((__m128d) __builtin_ia32_rndscalesd_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_cmp_pd_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) -#define _mm512_cmp_ps_mask(X,Y,P) ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)-1,_MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_cmp_pd_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)M, _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_cmp_ps_mask(M,X,Y,P) ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)M,_MM_FROUND_CUR_DIRECTION)) -#define _mm_cmp_sd_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) -#define _mm_mask_cmp_sd_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), M,_MM_FROUND_CUR_DIRECTION)) -#define _mm_cmp_ss_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) -#define _mm_mask_cmp_ss_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), M,_MM_FROUND_CUR_DIRECTION)) -extern __inline __mmask16 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_kmov (__mmask16 __A) +_mm_roundscale_round_ss (__m128 __A, __m128 __B, const int __imm, const int __R) { - return __builtin_ia32_kmovw (__A); + return (__m128) __builtin_ia32_rndscaless_round ((__v4sf) __A, + (__v4sf) __B, __imm, __R); } -extern __inline __m512 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castpd_ps (__m512d __A) +_mm_roundscale_round_sd (__m128d __A, __m128d __B, const int __imm, + const int __R) { - return (__m512) (__A); + return (__m128d) __builtin_ia32_rndscalesd_round ((__v2df) __A, + (__v2df) __B, __imm, __R); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castpd_si512 (__m512d __A) +_mm512_floor_ps (__m512 __A) { - return (__m512i) (__A); + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + (0x01 | 0x00), + (__v16sf) __A, -1, + 0x04); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castps_pd (__m512 __A) -{ - return (__m512d) (__A); -} -extern __inline __m512i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castps_si512 (__m512 __A) +_mm512_floor_pd (__m512d __A) { - return (__m512i) (__A); + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + (0x01 | 0x00), + (__v8df) __A, -1, + 0x04); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castsi512_ps (__m512i __A) +_mm512_ceil_ps (__m512 __A) { - return (__m512) (__A); + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + (0x02 | 0x00), + (__v16sf) __A, -1, + 0x04); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castsi512_pd (__m512i __A) -{ - return (__m512d) (__A); -} -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castpd512_pd128 (__m512d __A) -{ - return (__m128d)((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) ((__m512)__A), (int) (0), (__v4sf)(__m128)_mm_undefined_ps(), (__mmask8)-1)); -} -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castps512_ps128 (__m512 __A) +_mm512_ceil_pd (__m512d __A) { - return ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (__A), (int) (0), (__v4sf)(__m128)_mm_undefined_ps(), (__mmask8)-1)); + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + (0x02 | 0x00), + (__v8df) __A, -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castsi512_si128 (__m512i __A) +_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m128i)((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) ((__m512i)__A), (int) (0), (__v4si)(__m128i)_mm_undefined_si128 (), (__mmask8)-1)); + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + (0x01 | 0x00), + (__v16sf) __W, __U, + 0x04); } -extern __inline __m256d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castpd512_pd256 (__m512d __A) +_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) { - return ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + (0x01 | 0x00), + (__v8df) __W, __U, + 0x04); } -extern __inline __m256 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castps512_ps256 (__m512 __A) +_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m256)((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d)__A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + (0x02 | 0x00), + (__v16sf) __W, __U, + 0x04); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castsi512_si256 (__m512i __A) +_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) { - return (__m256i)((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d)__A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + (0x02 | 0x00), + (__v8df) __W, __U, + 0x04); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castpd128_pd512 (__m128d __A) +_mm512_alignr_epi32 (__m512i __A, __m512i __B, const int __imm) { - return (__m512d) __builtin_ia32_pd512_pd((__m128d)__A); + return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A, + (__v16si) __B, __imm, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castps128_ps512 (__m128 __A) +_mm512_mask_alignr_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B, const int __imm) { - return (__m512) __builtin_ia32_ps512_ps((__m128)__A); + return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A, + (__v16si) __B, __imm, + (__v16si) __W, + (__mmask16) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castsi128_si512 (__m128i __A) +_mm512_maskz_alignr_epi32 (__mmask16 __U, __m512i __A, __m512i __B, + const int __imm) { - return (__m512i) __builtin_ia32_si512_si((__v4si)__A); + return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A, + (__v16si) __B, __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castpd256_pd512 (__m256d __A) +_mm512_alignr_epi64 (__m512i __A, __m512i __B, const int __imm) { - return __builtin_ia32_pd512_256pd (__A); + return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castps256_ps512 (__m256 __A) +_mm512_mask_alignr_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B, const int __imm) { - return __builtin_ia32_ps512_256ps (__A); + return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) __W, + (__mmask8) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_castsi256_si512 (__m256i __A) +_mm512_maskz_alignr_epi64 (__mmask8 __U, __m512i __A, __m512i __B, + const int __imm) { - return (__m512i)__builtin_ia32_si512_256si ((__v8si)__A); + return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpeq_epu32_mask (__m512i __A, __m512i __B) +_mm512_cmpeq_epi32_mask (__m512i __A, __m512i __B) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, - (__v16si) __B, 0, + return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A, + (__v16si) __B, (__mmask16) -1); } extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpeq_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_cmpeq_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, - (__v16si) __B, 0, __U); + return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A, + (__v16si) __B, __U); } extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpeq_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_mask_cmpeq_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, - (__v8di) __B, 0, __U); + return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A, + (__v8di) __B, __U); } extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpeq_epu64_mask (__m512i __A, __m512i __B) +_mm512_cmpeq_epi64_mask (__m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, - (__v8di) __B, 0, + return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A, + (__v8di) __B, (__mmask8) -1); } extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpgt_epu32_mask (__m512i __A, __m512i __B) +_mm512_cmpgt_epi32_mask (__m512i __A, __m512i __B) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, - (__v16si) __B, 6, + return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A, + (__v16si) __B, (__mmask16) -1); } extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpgt_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_cmpgt_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) { - return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, - (__v16si) __B, 6, __U); + return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A, + (__v16si) __B, __U); } extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpgt_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_mask_cmpgt_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, - (__v8di) __B, 6, __U); + return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A, + (__v8di) __B, __U); } extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpgt_epu64_mask (__m512i __A, __m512i __B) +_mm512_cmpgt_epi64_mask (__m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, - (__v8di) __B, 6, + return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A, + (__v8di) __B, (__mmask8) -1); } -#undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 op __T2); __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); __v4si __T6 = __T4 op __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 op __T7; return __T8[0] op __T8[1] -extern __inline int +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_add_epi32 (__m512i __A) +_mm512_cmpge_epi32_mask (__m512i __X, __m512i __Y) { - __v8si __T1 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v8si __T2 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 + __T2); __v4si __T4 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v4si __T5 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v4si __T6 = __T4 + __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 + __T7; return __T8[0] + __T8[1]; + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 5, + (__mmask16) -1); } -extern __inline int +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_mul_epi32 (__m512i __A) +_mm512_mask_cmpge_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) { - __v8si __T1 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v8si __T2 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 * __T2); __v4si __T4 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v4si __T5 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v4si __T6 = __T4 * __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 * __T7; return __T8[0] * __T8[1]; + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 5, + (__mmask16) __M); } -extern __inline int +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_and_epi32 (__m512i __A) +_mm512_mask_cmpge_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) { - __v8si __T1 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v8si __T2 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 & __T2); __v4si __T4 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v4si __T5 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v4si __T6 = __T4 & __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 & __T7; return __T8[0] & __T8[1]; + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 5, + (__mmask16) __M); } -extern __inline int +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_or_epi32 (__m512i __A) +_mm512_cmpge_epu32_mask (__m512i __X, __m512i __Y) { - __v8si __T1 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v8si __T2 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 | __T2); __v4si __T4 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v4si __T5 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v4si __T6 = __T4 | __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 | __T7; return __T8[0] | __T8[1]; + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 5, + (__mmask16) -1); } -extern __inline int +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A) +_mm512_mask_cmpge_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) { - __A = _mm512_maskz_mov_epi32 (__U, __A); - __v8si __T1 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v8si __T2 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 + __T2); __v4si __T4 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v4si __T5 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v4si __T6 = __T4 + __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 + __T7; return __T8[0] + __T8[1]; + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 5, + (__mmask8) __M); } -extern __inline int +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_mul_epi32 (__mmask16 __U, __m512i __A) +_mm512_cmpge_epi64_mask (__m512i __X, __m512i __Y) { - __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (1), __U, __A); - __v8si __T1 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v8si __T2 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 * __T2); __v4si __T4 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v4si __T5 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v4si __T6 = __T4 * __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 * __T7; return __T8[0] * __T8[1]; + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 5, + (__mmask8) -1); } -extern __inline int +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_and_epi32 (__mmask16 __U, __m512i __A) +_mm512_mask_cmpge_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) { - __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A); - __v8si __T1 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v8si __T2 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 & __T2); __v4si __T4 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v4si __T5 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v4si __T6 = __T4 & __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 & __T7; return __T8[0] & __T8[1]; + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 5, + (__mmask8) __M); } -extern __inline int +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_or_epi32 (__mmask16 __U, __m512i __A) +_mm512_cmpge_epu64_mask (__m512i __X, __m512i __Y) { - __A = _mm512_maskz_mov_epi32 (__U, __A); - __v8si __T1 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v8si __T2 = (__v8si) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 | __T2); __v4si __T4 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v4si __T5 = (__v4si) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v4si __T6 = __T4 | __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 | __T7; return __T8[0] | __T8[1]; + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 5, + (__mmask8) -1); } -#undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = _mm256_ ##op (__T1, __T2); __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); __m128i __T6 = _mm_ ##op (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_ ##op (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_ ##op (__T8, __T9); return __T10[0] -extern __inline int +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_min_epi32 (__m512i __A) +_mm512_mask_cmple_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) { - __m256i __T1 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T2 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = _mm256_min_epi32 (__T1, __T2); __m128i __T4 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __m128i __T5 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __m128i __T6 = _mm_min_epi32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_min_epi32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_min_epi32 (__T8, __T9); return __T10[0]; + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 2, + (__mmask16) __M); } -extern __inline int +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_max_epi32 (__m512i __A) +_mm512_cmple_epi32_mask (__m512i __X, __m512i __Y) { - __m256i __T1 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T2 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = _mm256_max_epi32 (__T1, __T2); __m128i __T4 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __m128i __T5 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __m128i __T6 = _mm_max_epi32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_max_epi32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_max_epi32 (__T8, __T9); return __T10[0]; + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 2, + (__mmask16) -1); } -extern __inline unsigned int +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_min_epu32 (__m512i __A) +_mm512_mask_cmple_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) { - __m256i __T1 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T2 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = _mm256_min_epu32 (__T1, __T2); __m128i __T4 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __m128i __T5 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __m128i __T6 = _mm_min_epu32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_min_epu32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_min_epu32 (__T8, __T9); return __T10[0]; + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 2, + (__mmask16) __M); } -extern __inline unsigned int +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_max_epu32 (__m512i __A) +_mm512_cmple_epu32_mask (__m512i __X, __m512i __Y) { - __m256i __T1 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T2 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = _mm256_max_epu32 (__T1, __T2); __m128i __T4 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __m128i __T5 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __m128i __T6 = _mm_max_epu32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_max_epu32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_max_epu32 (__T8, __T9); return __T10[0]; + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 2, + (__mmask16) -1); } -extern __inline int +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_min_epi32 (__mmask16 __U, __m512i __A) +_mm512_mask_cmple_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) { - __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (0x7fffffff), __U, __A); - __m256i __T1 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T2 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = _mm256_min_epi32 (__T1, __T2); __m128i __T4 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __m128i __T5 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __m128i __T6 = _mm_min_epi32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_min_epi32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_min_epi32 (__T8, __T9); return __T10[0]; + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 2, + (__mmask8) __M); } -extern __inline int +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_max_epi32 (__mmask16 __U, __m512i __A) +_mm512_cmple_epi64_mask (__m512i __X, __m512i __Y) { - __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (-0x7fffffff - 1), __U, __A); - __m256i __T1 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T2 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = _mm256_max_epi32 (__T1, __T2); __m128i __T4 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __m128i __T5 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __m128i __T6 = _mm_max_epi32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_max_epi32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_max_epi32 (__T8, __T9); return __T10[0]; + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 2, + (__mmask8) -1); } -extern __inline unsigned int +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_min_epu32 (__mmask16 __U, __m512i __A) +_mm512_mask_cmple_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) { - __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A); - __m256i __T1 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T2 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = _mm256_min_epu32 (__T1, __T2); __m128i __T4 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __m128i __T5 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __m128i __T6 = _mm_min_epu32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_min_epu32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_min_epu32 (__T8, __T9); return __T10[0]; + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 2, + (__mmask8) __M); } -extern __inline unsigned int +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_max_epu32 (__mmask16 __U, __m512i __A) +_mm512_cmple_epu64_mask (__m512i __X, __m512i __Y) { - __A = _mm512_maskz_mov_epi32 (__U, __A); - __m256i __T1 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T2 = (__m256i) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = _mm256_max_epu32 (__T1, __T2); __m128i __T4 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __m128i __T5 = (__m128i) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __m128i __T6 = _mm_max_epu32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_max_epu32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_max_epu32 (__T8, __T9); return __T10[0]; + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 2, + (__mmask8) -1); } -#undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = __T1 op __T2; __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = __T4 op __T5; __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = __T6 op __T7; return __T8[0] op __T8[1] -extern __inline float +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_add_ps (__m512 __A) +_mm512_mask_cmplt_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) { - __m256 __T1 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T2 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T3 = __T1 + __T2; __m128 __T4 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(1))); __m128 __T5 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(0))); __m128 __T6 = __T4 + __T5; __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = __T6 + __T7; return __T8[0] + __T8[1]; + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 1, + (__mmask16) __M); } -extern __inline float +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_mul_ps (__m512 __A) +_mm512_cmplt_epi32_mask (__m512i __X, __m512i __Y) { - __m256 __T1 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T2 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T3 = __T1 * __T2; __m128 __T4 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(1))); __m128 __T5 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(0))); __m128 __T6 = __T4 * __T5; __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = __T6 * __T7; return __T8[0] * __T8[1]; + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 1, + (__mmask16) -1); } -extern __inline float +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_add_ps (__mmask16 __U, __m512 __A) +_mm512_mask_cmplt_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) { - __A = _mm512_maskz_mov_ps (__U, __A); - __m256 __T1 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T2 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T3 = __T1 + __T2; __m128 __T4 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(1))); __m128 __T5 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(0))); __m128 __T6 = __T4 + __T5; __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = __T6 + __T7; return __T8[0] + __T8[1]; + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 1, + (__mmask16) __M); } -extern __inline float +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_mul_ps (__mmask16 __U, __m512 __A) +_mm512_cmplt_epu32_mask (__m512i __X, __m512i __Y) { - __A = _mm512_mask_mov_ps (_mm512_set1_ps (1.0f), __U, __A); - __m256 __T1 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T2 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T3 = __T1 * __T2; __m128 __T4 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(1))); __m128 __T5 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(0))); __m128 __T6 = __T4 * __T5; __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = __T6 * __T7; return __T8[0] * __T8[1]; + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 1, + (__mmask16) -1); } -#undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = _mm256_ ##op (__T1, __T2); __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = _mm_ ##op (__T4, __T5); __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = _mm_ ##op (__T6, __T7); __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); __m128 __T10 = _mm_ ##op (__T8, __T9); return __T10[0] -extern __inline float +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_min_ps (__m512 __A) +_mm512_mask_cmplt_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) { - __m256 __T1 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T2 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T3 = _mm256_min_ps (__T1, __T2); __m128 __T4 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(1))); __m128 __T5 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(0))); __m128 __T6 = _mm_min_ps (__T4, __T5); __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = _mm_min_ps (__T6, __T7); __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); __m128 __T10 = _mm_min_ps (__T8, __T9); return __T10[0]; + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 1, + (__mmask8) __M); } -extern __inline float +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_max_ps (__m512 __A) +_mm512_cmplt_epi64_mask (__m512i __X, __m512i __Y) { - __m256 __T1 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T2 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T3 = _mm256_max_ps (__T1, __T2); __m128 __T4 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(1))); __m128 __T5 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(0))); __m128 __T6 = _mm_max_ps (__T4, __T5); __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = _mm_max_ps (__T6, __T7); __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); __m128 __T10 = _mm_max_ps (__T8, __T9); return __T10[0]; + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 1, + (__mmask8) -1); } -extern __inline float +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_min_ps (__mmask16 __U, __m512 __A) +_mm512_mask_cmplt_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) { - __A = _mm512_mask_mov_ps (_mm512_set1_ps (__builtin_inff ()), __U, __A); - __m256 __T1 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T2 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T3 = _mm256_min_ps (__T1, __T2); __m128 __T4 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(1))); __m128 __T5 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(0))); __m128 __T6 = _mm_min_ps (__T4, __T5); __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = _mm_min_ps (__T6, __T7); __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); __m128 __T10 = _mm_min_ps (__T8, __T9); return __T10[0]; + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 1, + (__mmask8) __M); } -extern __inline float +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_max_ps (__mmask16 __U, __m512 __A) +_mm512_cmplt_epu64_mask (__m512i __X, __m512i __Y) { - __A = _mm512_mask_mov_ps (_mm512_set1_ps (-__builtin_inff ()), __U, __A); - __m256 __T1 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T2 = (__m256) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) ((__m512d) __A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256 __T3 = _mm256_max_ps (__T1, __T2); __m128 __T4 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(1))); __m128 __T5 = ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(__T3), (int)(0))); __m128 __T6 = _mm_max_ps (__T4, __T5); __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = _mm_max_ps (__T6, __T7); __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); __m128 __T10 = _mm_max_ps (__T8, __T9); return __T10[0]; + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 1, + (__mmask8) -1); } -#undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 op __T2); __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); __v2di __T6 = __T4 op __T5; return __T6[0] op __T6[1] -extern __inline long long +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_add_epi64 (__m512i __A) +_mm512_cmpneq_epi32_mask (__m512i __X, __m512i __Y) { - __v4di __T1 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v4di __T2 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 + __T2); __v2di __T4 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v2di __T5 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v2di __T6 = __T4 + __T5; return __T6[0] + __T6[1]; + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 4, + (__mmask16) -1); } -extern __inline long long +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_mul_epi64 (__m512i __A) +_mm512_mask_cmpneq_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) { - __v4di __T1 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v4di __T2 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 * __T2); __v2di __T4 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v2di __T5 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v2di __T6 = __T4 * __T5; return __T6[0] * __T6[1]; + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 4, + (__mmask16) __M); } -extern __inline long long +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_and_epi64 (__m512i __A) +_mm512_mask_cmpneq_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) { - __v4di __T1 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v4di __T2 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 & __T2); __v2di __T4 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v2di __T5 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v2di __T6 = __T4 & __T5; return __T6[0] & __T6[1]; + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 4, + (__mmask16) __M); } -extern __inline long long +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_or_epi64 (__m512i __A) +_mm512_cmpneq_epu32_mask (__m512i __X, __m512i __Y) { - __v4di __T1 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v4di __T2 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 | __T2); __v2di __T4 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v2di __T5 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v2di __T6 = __T4 | __T5; return __T6[0] | __T6[1]; + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 4, + (__mmask16) -1); } -extern __inline long long +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_add_epi64 (__mmask8 __U, __m512i __A) +_mm512_mask_cmpneq_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) { - __A = _mm512_maskz_mov_epi64 (__U, __A); - __v4di __T1 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v4di __T2 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 + __T2); __v2di __T4 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v2di __T5 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v2di __T6 = __T4 + __T5; return __T6[0] + __T6[1]; + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 4, + (__mmask8) __M); } -extern __inline long long +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_mul_epi64 (__mmask8 __U, __m512i __A) +_mm512_cmpneq_epi64_mask (__m512i __X, __m512i __Y) { - __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (1LL), __U, __A); - __v4di __T1 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v4di __T2 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 * __T2); __v2di __T4 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v2di __T5 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v2di __T6 = __T4 * __T5; return __T6[0] * __T6[1]; + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 4, + (__mmask8) -1); } -extern __inline long long +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_and_epi64 (__mmask8 __U, __m512i __A) +_mm512_mask_cmpneq_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) { - __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A); - __v4di __T1 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v4di __T2 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 & __T2); __v2di __T4 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v2di __T5 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v2di __T6 = __T4 & __T5; return __T6[0] & __T6[1]; + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 4, + (__mmask8) __M); } -extern __inline long long +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_or_epi64 (__mmask8 __U, __m512i __A) +_mm512_cmpneq_epu64_mask (__m512i __X, __m512i __Y) { - __A = _mm512_maskz_mov_epi64 (__U, __A); - __v4di __T1 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (1), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __v4di __T2 = (__v4di) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (__A), (int) (0), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)); __m256i __T3 = (__m256i) (__T1 | __T2); __v2di __T4 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(1))); __v2di __T5 = (__v2di) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(__T3), (int)(0))); __v2di __T6 = __T4 | __T5; return __T6[0] | __T6[1]; + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 4, + (__mmask8) -1); } -#undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); __m512i __T2 = _mm512_ ##op (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_ ##op (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_ ##op (__T4, __T5); return __T6[0] -extern __inline long long +#define _MM_CMPINT_EQ 0x0 +#define _MM_CMPINT_LT 0x1 +#define _MM_CMPINT_LE 0x2 +#define _MM_CMPINT_UNUSED 0x3 +#define _MM_CMPINT_NE 0x4 +#define _MM_CMPINT_NLT 0x5 +#define _MM_CMPINT_GE 0x5 +#define _MM_CMPINT_NLE 0x6 +#define _MM_CMPINT_GT 0x6 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_min_epi64 (__m512i __A) +_kshiftli_mask16 (__mmask16 __A, unsigned int __B) { - __m512i __T1 = ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(__A), (__v8di)(__m512i)(__A), (int)(0x4e), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)); __m512i __T2 = _mm512_min_epi64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_min_epi64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_min_epi64 (__T4, __T5); return __T6[0]; + return (__mmask16) __builtin_ia32_kshiftlihi ((__mmask16) __A, + (__mmask8) __B); } -extern __inline long long +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_max_epi64 (__m512i __A) +_kshiftri_mask16 (__mmask16 __A, unsigned int __B) { - __m512i __T1 = ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(__A), (__v8di)(__m512i)(__A), (int)(0x4e), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)); __m512i __T2 = _mm512_max_epi64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_max_epi64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_max_epi64 (__T4, __T5); return __T6[0]; + return (__mmask16) __builtin_ia32_kshiftrihi ((__mmask16) __A, + (__mmask8) __B); } -extern __inline long long +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_min_epi64 (__mmask8 __U, __m512i __A) +_mm512_cmp_epi64_mask (__m512i __X, __m512i __Y, const int __P) { - __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (0x7fffffffffffffffLL), - __U, __A); - __m512i __T1 = ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(__A), (__v8di)(__m512i)(__A), (int)(0x4e), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)); __m512i __T2 = _mm512_min_epi64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_min_epi64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_min_epi64 (__T4, __T5); return __T6[0]; + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) -1); } -extern __inline long long +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_max_epi64 (__mmask8 __U, __m512i __A) +_mm512_cmp_epi32_mask (__m512i __X, __m512i __Y, const int __P) { - __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (-0x7fffffffffffffffLL - 1), - __U, __A); - __m512i __T1 = ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(__A), (__v8di)(__m512i)(__A), (int)(0x4e), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)); __m512i __T2 = _mm512_max_epi64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_max_epi64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_max_epi64 (__T4, __T5); return __T6[0]; + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) -1); } -extern __inline unsigned long long +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_min_epu64 (__m512i __A) +_mm512_cmp_epu64_mask (__m512i __X, __m512i __Y, const int __P) { - __m512i __T1 = ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(__A), (__v8di)(__m512i)(__A), (int)(0x4e), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)); __m512i __T2 = _mm512_min_epu64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_min_epu64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_min_epu64 (__T4, __T5); return __T6[0]; + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) -1); } -extern __inline unsigned long long +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_max_epu64 (__m512i __A) +_mm512_cmp_epu32_mask (__m512i __X, __m512i __Y, const int __P) { - __m512i __T1 = ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(__A), (__v8di)(__m512i)(__A), (int)(0x4e), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)); __m512i __T2 = _mm512_max_epu64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_max_epu64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_max_epu64 (__T4, __T5); return __T6[0]; + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) -1); } -extern __inline unsigned long long +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_min_epu64 (__mmask8 __U, __m512i __A) +_mm512_cmp_round_pd_mask (__m512d __X, __m512d __Y, const int __P, + const int __R) { - __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A); - __m512i __T1 = ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(__A), (__v8di)(__m512i)(__A), (int)(0x4e), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)); __m512i __T2 = _mm512_min_epu64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_min_epu64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_min_epu64 (__T4, __T5); return __T6[0]; + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) -1, __R); } -extern __inline unsigned long long +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_max_epu64 (__mmask8 __U, __m512i __A) +_mm512_cmp_round_ps_mask (__m512 __X, __m512 __Y, const int __P, const int __R) { - __A = _mm512_maskz_mov_epi64 (__U, __A); - __m512i __T1 = ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(__A), (__v8di)(__m512i)(__A), (int)(0x4e), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)); __m512i __T2 = _mm512_max_epu64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_max_epu64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_max_epu64 (__T4, __T5); return __T6[0]; + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) -1, __R); } -#undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = __T1 op __T2; __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = __T4 op __T5; return __T6[0] op __T6[1] -extern __inline double +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_add_pd (__m512d __A) +_mm512_mask_cmp_epi64_mask (__mmask8 __U, __m512i __X, __m512i __Y, + const int __P) { - __m256d __T1 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T2 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T3 = __T1 + __T2; __m128d __T4 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(1))); __m128d __T5 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(0))); __m128d __T6 = __T4 + __T5; return __T6[0] + __T6[1]; + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) __U); } -extern __inline double +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_mul_pd (__m512d __A) +_mm512_mask_cmp_epi32_mask (__mmask16 __U, __m512i __X, __m512i __Y, + const int __P) { - __m256d __T1 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T2 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T3 = __T1 * __T2; __m128d __T4 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(1))); __m128d __T5 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(0))); __m128d __T6 = __T4 * __T5; return __T6[0] * __T6[1]; + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) __U); } -extern __inline double +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_add_pd (__mmask8 __U, __m512d __A) +_mm512_mask_cmp_epu64_mask (__mmask8 __U, __m512i __X, __m512i __Y, + const int __P) { - __A = _mm512_maskz_mov_pd (__U, __A); - __m256d __T1 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T2 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T3 = __T1 + __T2; __m128d __T4 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(1))); __m128d __T5 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(0))); __m128d __T6 = __T4 + __T5; return __T6[0] + __T6[1]; + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) __U); } -extern __inline double +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_mul_pd (__mmask8 __U, __m512d __A) +_mm512_mask_cmp_epu32_mask (__mmask16 __U, __m512i __X, __m512i __Y, + const int __P) { - __A = _mm512_mask_mov_pd (_mm512_set1_pd (1.0), __U, __A); - __m256d __T1 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T2 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T3 = __T1 * __T2; __m128d __T4 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(1))); __m128d __T5 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(0))); __m128d __T6 = __T4 * __T5; return __T6[0] * __T6[1]; + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) __U); } -#undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = _mm256_ ##op (__T1, __T2); __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = _mm_ ##op (__T4, __T5); __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); __m128d __T8 = _mm_ ##op (__T6, __T7); return __T8[0] -extern __inline double +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_min_pd (__m512d __A) +_mm512_mask_cmp_round_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, + const int __P, const int __R) { - __m256d __T1 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T2 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T3 = _mm256_min_pd (__T1, __T2); __m128d __T4 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(1))); __m128d __T5 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(0))); __m128d __T6 = _mm_min_pd (__T4, __T5); __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); __m128d __T8 = _mm_min_pd (__T6, __T7); return __T8[0]; + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) __U, __R); } -extern __inline double +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_reduce_max_pd (__m512d __A) +_mm512_mask_cmp_round_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, + const int __P, const int __R) { - __m256d __T1 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T2 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T3 = _mm256_max_pd (__T1, __T2); __m128d __T4 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(1))); __m128d __T5 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(0))); __m128d __T6 = _mm_max_pd (__T4, __T5); __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); __m128d __T8 = _mm_max_pd (__T6, __T7); return __T8[0]; + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) __U, __R); } -extern __inline double +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_min_pd (__mmask8 __U, __m512d __A) +_mm_cmp_round_sd_mask (__m128d __X, __m128d __Y, const int __P, const int __R) { - __A = _mm512_mask_mov_pd (_mm512_set1_pd (__builtin_inf ()), __U, __A); - __m256d __T1 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T2 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T3 = _mm256_min_pd (__T1, __T2); __m128d __T4 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(1))); __m128d __T5 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(0))); __m128d __T6 = _mm_min_pd (__T4, __T5); __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); __m128d __T8 = _mm_min_pd (__T6, __T7); return __T8[0]; + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) -1, __R); } -extern __inline double +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_reduce_max_pd (__mmask8 __U, __m512d __A) +_mm_mask_cmp_round_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, + const int __P, const int __R) { - __A = _mm512_mask_mov_pd (_mm512_set1_pd (-__builtin_inf ()), __U, __A); - __m256d __T1 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (1), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T2 = (__m256d) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (__A), (int) (0), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)); __m256d __T3 = _mm256_max_pd (__T1, __T2); __m128d __T4 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(1))); __m128d __T5 = ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(__T3), (int)(0))); __m128d __T6 = _mm_max_pd (__T4, __T5); __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); __m128d __T8 = _mm_max_pd (__T6, __T7); return __T8[0]; + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) __M, __R); } -#undef __MM512_REDUCE_OP -#undef __DISABLE_AVX512F__ -#pragma GCC pop_options -#define _AVX512ERINTRIN_H_INCLUDED -#pragma GCC push_options -#pragma GCC target("avx512er") -#define __DISABLE_AVX512ER__ -typedef double __v8df __attribute__ ((__vector_size__ (64))); -typedef float __v16sf __attribute__ ((__vector_size__ (64))); -typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__)); -typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); -typedef unsigned char __mmask8; -typedef unsigned short __mmask16; -#define _mm512_exp2a23_round_pd(A,C) __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) -#define _mm512_mask_exp2a23_round_pd(W,U,A,C) __builtin_ia32_exp2pd_mask(A, W, U, C) -#define _mm512_maskz_exp2a23_round_pd(U,A,C) __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) -#define _mm512_exp2a23_round_ps(A,C) __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) -#define _mm512_mask_exp2a23_round_ps(W,U,A,C) __builtin_ia32_exp2ps_mask(A, W, U, C) -#define _mm512_maskz_exp2a23_round_ps(U,A,C) __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) -#define _mm512_rcp28_round_pd(A,C) __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) -#define _mm512_mask_rcp28_round_pd(W,U,A,C) __builtin_ia32_rcp28pd_mask(A, W, U, C) -#define _mm512_maskz_rcp28_round_pd(U,A,C) __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) -#define _mm512_rcp28_round_ps(A,C) __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) -#define _mm512_mask_rcp28_round_ps(W,U,A,C) __builtin_ia32_rcp28ps_mask(A, W, U, C) -#define _mm512_maskz_rcp28_round_ps(U,A,C) __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) -#define _mm512_rsqrt28_round_pd(A,C) __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) -#define _mm512_mask_rsqrt28_round_pd(W,U,A,C) __builtin_ia32_rsqrt28pd_mask(A, W, U, C) -#define _mm512_maskz_rsqrt28_round_pd(U,A,C) __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) -#define _mm512_rsqrt28_round_ps(A,C) __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) -#define _mm512_mask_rsqrt28_round_ps(W,U,A,C) __builtin_ia32_rsqrt28ps_mask(A, W, U, C) -#define _mm512_maskz_rsqrt28_round_ps(U,A,C) __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) -#define _mm_rcp28_round_sd(A,B,R) __builtin_ia32_rcp28sd_round(A, B, R) -#define _mm_rcp28_round_ss(A,B,R) __builtin_ia32_rcp28ss_round(A, B, R) -#define _mm_rsqrt28_round_sd(A,B,R) __builtin_ia32_rsqrt28sd_round(A, B, R) -#define _mm_rsqrt28_round_ss(A,B,R) __builtin_ia32_rsqrt28ss_round(A, B, R) -#define _mm512_exp2a23_pd(A) _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_exp2a23_pd(W,U,A) _mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_exp2a23_pd(U,A) _mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_exp2a23_ps(A) _mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_exp2a23_ps(W,U,A) _mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_exp2a23_ps(U,A) _mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_rcp28_pd(A) _mm512_rcp28_round_pd(A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_rcp28_pd(W,U,A) _mm512_mask_rcp28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_rcp28_pd(U,A) _mm512_maskz_rcp28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_rcp28_ps(A) _mm512_rcp28_round_ps(A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_rcp28_ps(W,U,A) _mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_rcp28_ps(U,A) _mm512_maskz_rcp28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_rsqrt28_pd(A) _mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_rsqrt28_pd(W,U,A) _mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_rsqrt28_pd(U,A) _mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_rsqrt28_ps(A) _mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_rsqrt28_ps(W,U,A) _mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_rsqrt28_ps(U,A) _mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) -#define _mm_rcp28_sd(A,B) __builtin_ia32_rcp28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) -#define _mm_rcp28_ss(A,B) __builtin_ia32_rcp28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) -#define _mm_rsqrt28_sd(A,B) __builtin_ia32_rsqrt28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) -#define _mm_rsqrt28_ss(A,B) __builtin_ia32_rsqrt28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) -#undef __DISABLE_AVX512ER__ -#pragma GCC pop_options -#define _AVX512PFINTRIN_H_INCLUDED -#pragma GCC push_options -#pragma GCC target("avx512pf") -#define __DISABLE_AVX512PF__ -typedef long long __v8di __attribute__ ((__vector_size__ (64))); -typedef int __v16si __attribute__ ((__vector_size__ (64))); -typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); -typedef unsigned char __mmask8; -typedef unsigned short __mmask16; -#define _mm512_prefetch_i32gather_pd(INDEX,ADDR,SCALE,HINT) __builtin_ia32_gatherpfdpd ((__mmask8)0xFF, (__v8si)(__m256i)INDEX, (void const *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_prefetch_i32gather_ps(INDEX,ADDR,SCALE,HINT) __builtin_ia32_gatherpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i)INDEX, (void const *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_mask_prefetch_i32gather_pd(INDEX,MASK,ADDR,SCALE,HINT) __builtin_ia32_gatherpfdpd ((__mmask8)MASK, (__v8si)(__m256i)INDEX, (void const *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_mask_prefetch_i32gather_ps(INDEX,MASK,ADDR,SCALE,HINT) __builtin_ia32_gatherpfdps ((__mmask16)MASK, (__v16si)(__m512i)INDEX, (void const *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_prefetch_i64gather_pd(INDEX,ADDR,SCALE,HINT) __builtin_ia32_gatherpfqpd ((__mmask8)0xFF, (__v8di)(__m512i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_prefetch_i64gather_ps(INDEX,ADDR,SCALE,HINT) __builtin_ia32_gatherpfqps ((__mmask8)0xFF, (__v8di)(__m512i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_mask_prefetch_i64gather_pd(INDEX,MASK,ADDR,SCALE,HINT) __builtin_ia32_gatherpfqpd ((__mmask8)MASK, (__v8di)(__m512i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_mask_prefetch_i64gather_ps(INDEX,MASK,ADDR,SCALE,HINT) __builtin_ia32_gatherpfqps ((__mmask8)MASK, (__v8di)(__m512i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_prefetch_i32scatter_pd(ADDR,INDEX,SCALE,HINT) __builtin_ia32_scatterpfdpd ((__mmask8)0xFF, (__v8si)(__m256i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_prefetch_i32scatter_ps(ADDR,INDEX,SCALE,HINT) __builtin_ia32_scatterpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_mask_prefetch_i32scatter_pd(ADDR,MASK,INDEX,SCALE,HINT) __builtin_ia32_scatterpfdpd ((__mmask8)MASK, (__v8si)(__m256i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_mask_prefetch_i32scatter_ps(ADDR,MASK,INDEX,SCALE,HINT) __builtin_ia32_scatterpfdps ((__mmask16)MASK, (__v16si)(__m512i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_prefetch_i64scatter_pd(ADDR,INDEX,SCALE,HINT) __builtin_ia32_scatterpfqpd ((__mmask8)0xFF, (__v8di)(__m512i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_prefetch_i64scatter_ps(ADDR,INDEX,SCALE,HINT) __builtin_ia32_scatterpfqps ((__mmask8)0xFF, (__v8di)(__m512i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_mask_prefetch_i64scatter_pd(ADDR,MASK,INDEX,SCALE,HINT) __builtin_ia32_scatterpfqpd ((__mmask8)MASK, (__v8di)(__m512i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#define _mm512_mask_prefetch_i64scatter_ps(ADDR,MASK,INDEX,SCALE,HINT) __builtin_ia32_scatterpfqps ((__mmask8)MASK, (__v8di)(__m512i)INDEX, (void *)ADDR, (int)SCALE, (int)HINT) -#undef __DISABLE_AVX512PF__ -#pragma GCC pop_options -#define _AVX512CDINTRIN_H_INCLUDED -#pragma GCC push_options -#pragma GCC target("avx512cd") -#define __DISABLE_AVX512CD__ -typedef long long __v8di __attribute__ ((__vector_size__ (64))); -typedef int __v16si __attribute__ ((__vector_size__ (64))); -typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); -typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); -typedef unsigned char __mmask8; -typedef unsigned short __mmask16; -extern __inline __m512i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_conflict_epi32 (__m512i __A) +_mm_cmp_round_ss_mask (__m128 __X, __m128 __Y, const int __P, const int __R) { - return (__m512i) - __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) -1); + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +_mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, + const int __P, const int __R) { - return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) __M, __R); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) +_mm512_i32gather_ps (__m512i __index, void const *__addr, int __scale) { - return (__m512i) - __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) __U); + __m512 __v1_old = _mm512_undefined_ps (); + __mmask16 __mask = 0xFFFF; + return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_conflict_epi64 (__m512i __A) +_mm512_mask_i32gather_ps (__m512 __v1_old, __mmask16 __mask, + __m512i __index, void const *__addr, int __scale) { - return (__m512i) - __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +_mm512_i32gather_pd (__m256i __index, void const *__addr, int __scale) { - return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); + __m512d __v1_old = _mm512_undefined_pd (); + __mmask8 __mask = 0xFF; + return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old, + __addr, + (__v8si) __index, __mask, + __scale); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A) +_mm512_mask_i32gather_pd (__m512d __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, int __scale) { - return (__m512i) - __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); } -extern __inline __m512i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_lzcnt_epi64 (__m512i __A) +_mm512_i64gather_ps (__m512i __index, void const *__addr, int __scale) { - return (__m512i) - __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_si512 (), - (__mmask8) -1); + __m256 __v1_old = _mm256_undefined_ps (); + __mmask8 __mask = 0xFF; + return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old, + __addr, + (__v8di) __index, __mask, + __scale); } -extern __inline __m512i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +_mm512_mask_i64gather_ps (__m256 __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, int __scale) { - return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) +_mm512_i64gather_pd (__m512i __index, void const *__addr, int __scale) { - return (__m512i) - __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_si512 (), - (__mmask8) __U); + __m512d __v1_old = _mm512_undefined_pd (); + __mmask8 __mask = 0xFF; + return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old, + __addr, + (__v8di) __index, __mask, + __scale); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_lzcnt_epi32 (__m512i __A) +_mm512_mask_i64gather_pd (__m512d __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, int __scale) { - return (__m512i) - __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +_mm512_i32gather_epi32 (__m512i __index, void const *__addr, int __scale) { - return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); + __m512i __v1_old = _mm512_undefined_epi32 (); + __mmask16 __mask = 0xFFFF; + return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A) +_mm512_mask_i32gather_epi32 (__m512i __v1_old, __mmask16 __mask, + __m512i __index, void const *__addr, int __scale) { - return (__m512i) - __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcastmb_epi64 (__mmask8 __A) +_mm512_i32gather_epi64 (__m256i __index, void const *__addr, int __scale) { - return (__m512i) __builtin_ia32_broadcastmb512 (__A); + __m512i __v1_old = _mm512_undefined_epi32 (); + __mmask8 __mask = 0xFF; + return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old, + __addr, + (__v8si) __index, __mask, + __scale); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcastmw_epi32 (__mmask16 __A) +_mm512_mask_i32gather_epi64 (__m512i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) { - return (__m512i) __builtin_ia32_broadcastmw512 (__A); + return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); } -#undef __DISABLE_AVX512CD__ -#pragma GCC pop_options -#define _AVX512VLINTRIN_H_INCLUDED -#pragma GCC push_options -#pragma GCC target("avx512vl") -#define __DISABLE_AVX512VL__ -typedef unsigned int __mmask32; -extern __inline __m256d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A) +_mm512_i64gather_epi32 (__m512i __index, void const *__addr, int __scale) { - return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); + __m256i __v1_old = _mm256_undefined_si256 (); + __mmask8 __mask = 0xFF; + return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); } -extern __inline __m256d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A) +_mm512_mask_i64gather_epi32 (__m256i __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, int __scale) { - return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) +_mm512_i64gather_epi64 (__m512i __index, void const *__addr, int __scale) { - return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); + __m512i __v1_old = _mm512_undefined_epi32 (); + __mmask8 __mask = 0xFF; + return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old, + __addr, + (__v8di) __index, __mask, + __scale); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mov_pd (__mmask8 __U, __m128d __A) +_mm512_mask_i64gather_epi64 (__m512i __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, + int __scale) { - return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); } -extern __inline __m256d +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P) +_mm512_i32scatter_ps (void *__addr, __m512i __index, __m512 __v1, int __scale) { - return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P, - (__v4df) __W, - (__mmask8) __U); + __builtin_ia32_scattersiv16sf (__addr, (__mmask16) 0xFFFF, + (__v16si) __index, (__v16sf) __v1, __scale); } -extern __inline __m256d +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_load_pd (__mmask8 __U, void const *__P) +_mm512_mask_i32scatter_ps (void *__addr, __mmask16 __mask, + __m512i __index, __m512 __v1, int __scale) { - return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + __builtin_ia32_scattersiv16sf (__addr, __mask, (__v16si) __index, + (__v16sf) __v1, __scale); } -extern __inline __m128d +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P) +_mm512_i32scatter_pd (void *__addr, __m256i __index, __m512d __v1, + int __scale) { - return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P, - (__v2df) __W, - (__mmask8) __U); + __builtin_ia32_scattersiv8df (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8df) __v1, __scale); } -extern __inline __m128d +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_load_pd (__mmask8 __U, void const *__P) +_mm512_mask_i32scatter_pd (void *__addr, __mmask8 __mask, + __m256i __index, __m512d __v1, int __scale) { - return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + __builtin_ia32_scattersiv8df (__addr, __mask, (__v8si) __index, + (__v8df) __v1, __scale); } extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A) +_mm512_i64scatter_ps (void *__addr, __m512i __index, __m256 __v1, int __scale) { - __builtin_ia32_storeapd256_mask ((__v4df *) __P, - (__v4df) __A, - (__mmask8) __U); + __builtin_ia32_scatterdiv16sf (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8sf) __v1, __scale); } extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A) +_mm512_mask_i64scatter_ps (void *__addr, __mmask8 __mask, + __m512i __index, __m256 __v1, int __scale) { - __builtin_ia32_storeapd128_mask ((__v2df *) __P, - (__v2df) __A, - (__mmask8) __U); + __builtin_ia32_scatterdiv16sf (__addr, __mask, (__v8di) __index, + (__v8sf) __v1, __scale); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A) +_mm512_i64scatter_pd (void *__addr, __m512i __index, __m512d __v1, + int __scale) { - return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); + __builtin_ia32_scatterdiv8df (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8df) __v1, __scale); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A) +_mm512_mask_i64scatter_pd (void *__addr, __mmask8 __mask, + __m512i __index, __m512d __v1, int __scale) { - return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + __builtin_ia32_scatterdiv8df (__addr, __mask, (__v8di) __index, + (__v8df) __v1, __scale); } -extern __inline __m128 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A) +_mm512_i32scatter_epi32 (void *__addr, __m512i __index, + __m512i __v1, int __scale) { - return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); + __builtin_ia32_scattersiv16si (__addr, (__mmask16) 0xFFFF, + (__v16si) __index, (__v16si) __v1, __scale); } -extern __inline __m128 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mov_ps (__mmask8 __U, __m128 __A) +_mm512_mask_i32scatter_epi32 (void *__addr, __mmask16 __mask, + __m512i __index, __m512i __v1, int __scale) { - return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + __builtin_ia32_scattersiv16si (__addr, __mask, (__v16si) __index, + (__v16si) __v1, __scale); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P) +_mm512_i32scatter_epi64 (void *__addr, __m256i __index, + __m512i __v1, int __scale) { - return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P, - (__v8sf) __W, - (__mmask8) __U); + __builtin_ia32_scattersiv8di (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8di) __v1, __scale); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_load_ps (__mmask8 __U, void const *__P) +_mm512_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask, + __m256i __index, __m512i __v1, int __scale) { - return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + __builtin_ia32_scattersiv8di (__addr, __mask, (__v8si) __index, + (__v8di) __v1, __scale); } -extern __inline __m128 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P) +_mm512_i64scatter_epi32 (void *__addr, __m512i __index, + __m256i __v1, int __scale) { - return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P, - (__v4sf) __W, - (__mmask8) __U); + __builtin_ia32_scatterdiv16si (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8si) __v1, __scale); } -extern __inline __m128 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_load_ps (__mmask8 __U, void const *__P) +_mm512_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask, + __m512i __index, __m256i __v1, int __scale) { - return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + __builtin_ia32_scatterdiv16si (__addr, __mask, (__v8di) __index, + (__v8si) __v1, __scale); } extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A) +_mm512_i64scatter_epi64 (void *__addr, __m512i __index, + __m512i __v1, int __scale) { - __builtin_ia32_storeaps256_mask ((__v8sf *) __P, - (__v8sf) __A, - (__mmask8) __U); + __builtin_ia32_scatterdiv8di (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8di) __v1, __scale); } extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A) +_mm512_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask, + __m512i __index, __m512i __v1, int __scale) { - __builtin_ia32_storeaps128_mask ((__v4sf *) __P, - (__v4sf) __A, - (__mmask8) __U); + __builtin_ia32_scatterdiv8di (__addr, __mask, (__v8di) __index, + (__v8di) __v1, __scale); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) { - return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A) +_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) { - return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A) { - return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) __U); + __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A) +_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P) +_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) { - return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P, - (__v4di) __W, - (__mmask8) - __U); + return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P) +_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A) { - return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) - __U); + __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A, + (__mmask16) __U); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P) +_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { - return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, - (__v2di) __W, - (__mmask8) - __U); + return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_load_epi64 (__mmask8 __U, void const *__P) +_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) { - return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) - __U); + return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A) +_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A) { - __builtin_ia32_movdqa64store256_mask ((__v4di *) __P, - (__v4di) __A, - (__mmask8) __U); + __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A, + (__mmask8) __U); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A) +_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { - __builtin_ia32_movdqa64store128_mask ((__v2di *) __P, - (__v2di) __A, - (__mmask8) __U); + return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) { - return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A) +_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) { - return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A, + (__mmask16) __U); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) { - return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A) +_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) { - return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), + return (__m512d) __builtin_ia32_expanddf512_maskz ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), (__mmask8) __U); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P) +_mm512_mask_expandloadu_pd (__m512d __W, __mmask8 __U, void const *__P) { - return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P, - (__v8si) __W, - (__mmask8) - __U); + return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *) __P, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P) +_mm512_maskz_expandloadu_pd (__mmask8 __U, void const *__P) { - return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) - __U); + return (__m512d) __builtin_ia32_expandloaddf512_maskz ((const __v8df *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P) +_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, - (__v4si) __W, - (__mmask8) - __U); + return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_load_epi32 (__mmask8 __U, void const *__P) +_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A) { - return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) - __U); + return (__m512) __builtin_ia32_expandsf512_maskz ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) +_mm512_mask_expandloadu_ps (__m512 __W, __mmask16 __U, void const *__P) { - __builtin_ia32_movdqa32store256_mask ((__v8si *) __P, - (__v8si) __A, - (__mmask8) __U); + return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *) __P, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) +_mm512_maskz_expandloadu_ps (__mmask16 __U, void const *__P) { - __builtin_ia32_movdqa32store128_mask ((__v4si *) __P, - (__v4si) __A, - (__mmask8) __U); + return (__m512) __builtin_ia32_expandloadsf512_maskz ((const __v16sf *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { - return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m128d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B) +_mm512_maskz_expand_epi64 (__mmask8 __U, __m512i __A) { - return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_expanddi512_maskz ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __m256d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) +_mm512_mask_expandloadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) { - return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, + return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *) __P, + (__v8di) __W, (__mmask8) __U); } -extern __inline __m256d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B) +_mm512_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) { - return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m512i) + __builtin_ia32_expandloaddi512_maskz ((const __v8di *) __P, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) +_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { - return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) +_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) { - return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_expandsi512_maskz ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m256 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) +_mm512_mask_expandloadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) { - return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *) __P, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m256 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) +_mm512_maskz_expandloadu_epi32 (__mmask16 __U, void const *__P) { - return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_expandloadsi512_maskz ((const __v16si *) __P, + (__v16si) + _mm512_setzero_si512 + (), (__mmask16) __U); } -extern __inline __m128d +#define _kand_mask16 _mm512_kand +#define _kandn_mask16 _mm512_kandn +#define _knot_mask16 _mm512_knot +#define _kor_mask16 _mm512_kor +#define _kxnor_mask16 _mm512_kxnor +#define _kxor_mask16 _mm512_kxor +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +_kortest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF) { - return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); + *__CF = (unsigned char) __builtin_ia32_kortestchi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzhi (__A, __B); } -extern __inline __m128d +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B) +_kortestz_mask16_u8 (__mmask16 __A, __mmask16 __B) { - return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (unsigned char) __builtin_ia32_kortestzhi ((__mmask16) __A, + (__mmask16) __B); } -extern __inline __m256d +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) +_kortestc_mask16_u8 (__mmask16 __A, __mmask16 __B) { - return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); + return (unsigned char) __builtin_ia32_kortestchi ((__mmask16) __A, + (__mmask16) __B); } -extern __inline __m256d +extern __inline unsigned int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B) +_cvtmask16_u32 (__mmask16 __A) { - return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (unsigned int) __builtin_ia32_kmovw ((__mmask16 ) __A); } -extern __inline __m128 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sub_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) +_cvtu32_mask16 (unsigned int __A) { - return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_kmovw ((__mmask16 ) __A); } -extern __inline __m128 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sub_ps (__mmask16 __U, __m128 __A, __m128 __B) +_load_mask16 (__mmask16 *__A) { - return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_kmovw (*(__mmask16 *) __A); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sub_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) +_store_mask16 (__mmask16 *__A, __mmask16 __B) { - return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); + *(__mmask16 *) __A = __builtin_ia32_kmovw (__B); } -extern __inline __m256 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sub_ps (__mmask16 __U, __m256 __A, __m256 __B) +_mm512_kand (__mmask16 __A, __mmask16 __B) { - return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); } -extern __inline void +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_store_epi64 (void *__P, __m256i __A) +_mm512_kandn (__mmask16 __A, __mmask16 __B) { - *(__m256i *) __P = __A; + return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, + (__mmask16) __B); } -extern __inline void +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_store_epi64 (void *__P, __m128i __A) +_mm512_kor (__mmask16 __A, __mmask16 __B) { - *(__m128i *) __P = __A; + return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); } -extern __inline __m256d +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P) +_mm512_kortestz (__mmask16 __A, __mmask16 __B) { - return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P, - (__v4df) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_kortestzhi ((__mmask16) __A, + (__mmask16) __B); } -extern __inline __m256d +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P) +_mm512_kortestc (__mmask16 __A, __mmask16 __B) { - return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_kortestchi ((__mmask16) __A, + (__mmask16) __B); } -extern __inline __m128d +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P) +_mm512_kxnor (__mmask16 __A, __mmask16 __B) { - return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P, - (__v2df) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); } -extern __inline __m128d +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_loadu_pd (__mmask8 __U, void const *__P) +_mm512_kxor (__mmask16 __A, __mmask16 __B) { - return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); } -extern __inline void +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A) +_mm512_knot (__mmask16 __A) { - __builtin_ia32_storeupd256_mask ((double *) __P, - (__v4df) __A, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_knothi ((__mmask16) __A); } -extern __inline void +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A) +_mm512_kunpackb (__mmask16 __A, __mmask16 __B) { - __builtin_ia32_storeupd128_mask ((double *) __P, - (__v2df) __A, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); } -extern __inline __m256 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P) +_kunpackb_mask16 (__mmask8 __A, __mmask8 __B) { - return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P, - (__v8sf) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); } -extern __inline __m256 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P) +_mm512_maskz_inserti32x4 (__mmask16 __B, __m512i __C, __m128i __D, + const int __imm) { - return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C, + (__v4si) __D, + __imm, + (__v16si) + _mm512_setzero_si512 (), + __B); } -extern __inline __m128 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P) +_mm512_maskz_insertf32x4 (__mmask16 __B, __m512 __C, __m128 __D, + const int __imm) { - return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P, - (__v4sf) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C, + (__v4sf) __D, + __imm, + (__v16sf) + _mm512_setzero_ps (), __B); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_loadu_ps (__mmask8 __U, void const *__P) +_mm512_mask_inserti32x4 (__m512i __A, __mmask16 __B, __m512i __C, + __m128i __D, const int __imm) { - return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C, + (__v4si) __D, + __imm, + (__v16si) __A, + __B); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A) +_mm512_mask_insertf32x4 (__m512 __A, __mmask16 __B, __m512 __C, + __m128 __D, const int __imm) { - __builtin_ia32_storeups256_mask ((float *) __P, - (__v8sf) __A, - (__mmask8) __U); + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C, + (__v4sf) __D, + __imm, + (__v16sf) __A, __B); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A) +_mm512_max_epi64 (__m512i __A, __m512i __B) { - __builtin_ia32_storeups128_mask ((float *) __P, - (__v4sf) __A, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) +_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P, - (__v4di) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) +_mm512_min_epi64 (__m512i __A, __m512i __B) { - return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P, - (__v2di) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { - return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A) +_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) { - __builtin_ia32_storedqudi256_mask ((long long *) __P, - (__v4di) __A, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) +_mm512_max_epu64 (__m512i __A, __m512i __B) { - __builtin_ia32_storedqudi128_mask ((long long *) __P, - (__v2di) __A, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) +_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P, - (__v8si) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P) +_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) +_mm512_min_epu64 (__m512i __A, __m512i __B) { - return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P, - (__v4si) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P) +_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { - return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A) +_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) { - __builtin_ia32_storedqusi256_mask ((int *) __P, - (__v8si) __A, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A) +_mm512_max_epi32 (__m512i __A, __m512i __B) { - __builtin_ia32_storedqusi128_mask ((int *) __P, - (__v4si) __A, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A) +_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm512_min_epi32 (__m512i __A, __m512i __B) { - return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A) +_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B) { - return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_abs_epi64 (__m256i __A) +_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +_mm512_max_epu32 (__m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) +_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_abs_epi64 (__m128i __A) +_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm512_min_epu32 (__m512i __A, __m512i __B) { - return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) +_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) { - return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtpd_epu32 (__m256d __A) +_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) +_mm512_unpacklo_ps (__m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) +_mm512_mask_unpacklo_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtpd_epu32 (__m128d __A) +_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) +_mm_max_round_sd (__m128d __A, __m128d __B, const int __R) { - return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, + (__v2df) __B, + __R); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) +_mm_mask_max_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { - return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) +_mm_maskz_max_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { - return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) +_mm_max_round_ss (__m128 __A, __m128 __B, const int __R) { - return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, + (__v4sf) __B, + __R); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) +_mm_mask_max_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { - return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) +_mm_maskz_max_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { - return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvttps_epu32 (__m256 __A) +_mm_min_round_sd (__m128d __A, __m128d __B, const int __R) { - return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, + (__v2df) __B, + __R); } -extern __inline __m256i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) +_mm_mask_min_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { - return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); } -extern __inline __m256i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) +_mm_maskz_min_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { - return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttps_epu32 (__m128 __A) +_mm_min_round_ss (__m128 __A, __m128 __B, const int __R) { - return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128) __builtin_ia32_minss_round ((__v4sf) __A, + (__v4sf) __B, + __R); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) +_mm_mask_min_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { - return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) +_mm_maskz_min_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { - return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) +_mm512_mask_blend_pd (__mmask8 __U, __m512d __A, __m512d __W) { - return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A, - (__v4si) __W, + return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A, + (__v8df) __W, (__mmask8) __U); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) +_mm512_mask_blend_ps (__mmask16 __U, __m512 __A, __m512 __W) { - return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) +_mm512_mask_blend_epi64 (__mmask8 __U, __m512i __A, __m512i __W) { - return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) +_mm512_mask_blend_epi32 (__mmask16 __U, __m512i __A, __m512i __W) { - return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvttpd_epu32 (__m256d __A) +_mm_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) { - return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + __R); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) +_mm_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) { - return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + __R); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) +_mm_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) { - return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + __R); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvttpd_epu32 (__m128d __A) +_mm_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) { - return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B, + __R); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) +_mm_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) { - return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + __R); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) +_mm_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) { - return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + __R); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) +_mm_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) { - return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + __R); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) +_mm_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) { - return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + __R); } -extern __inline __m128i +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) +_mm_comi_round_ss (__m128 __A, __m128 __B, const int __P, const int __R) { - return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, - (__v4si) __W, - (__mmask8) __U); + return __builtin_ia32_vcomiss ((__v4sf) __A, (__v4sf) __B, __P, __R); } -extern __inline __m128i +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) +_mm_comi_round_sd (__m128d __A, __m128d __B, const int __P, const int __R) { - return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return __builtin_ia32_vcomisd ((__v2df) __A, (__v2df) __B, __P, __R); } -extern __inline __m256d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) +_mm512_sqrt_pd (__m512d __A) { - return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + 0x04); } -extern __inline __m256d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) +_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) { - return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) +_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) { - return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) +_mm512_sqrt_ps (__m512 __A) { - return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + 0x04); } -extern __inline __m256d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtepu32_pd (__m128i __A) +_mm512_mask_sqrt_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) -1); + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m256d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) +_mm512_maskz_sqrt_ps (__mmask16 __U, __m512 __A) { - return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } -extern __inline __m256d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) +_mm512_add_pd (__m512d __A, __m512d __B) { - return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m512d) ((__v8df)__A + (__v8df)__B); } -extern __inline __m128d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtepu32_pd (__m128i __A) +_mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) +_mm512_maskz_add_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) +_mm512_add_ps (__m512 __A, __m512 __B) { - return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m512) ((__v16sf)__A + (__v16sf)__B); } -extern __inline __m256 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) +_mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m256 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) +_mm512_maskz_add_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } -extern __inline __m128 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) +_mm_mask_add_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) +_mm_maskz_add_sd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m256 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtepu32_ps (__m256i __A) +_mm_mask_add_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) -1); + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m256 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) +_mm_maskz_add_ss (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + 0x04); } -extern __inline __m256 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) +_mm512_sub_pd (__m512d __A, __m512d __B) { - return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m512d) ((__v8df)__A - (__v8df)__B); } -extern __inline __m128 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtepu32_ps (__m128i __A) +_mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) +_mm512_maskz_sub_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) +_mm512_sub_ps (__m512 __A, __m512 __B) { - return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m512) ((__v16sf)__A - (__v16sf)__B); } -extern __inline __m256d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) +_mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m256d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) +_mm512_maskz_sub_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) +_mm_mask_sub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + 0x04); } extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) +_mm_maskz_sub_sd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtepi32_epi8 (__m128i __A) +_mm_mask_sub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +_mm_maskz_sub_ss (__mmask8 __U, __m128 __A, __m128 __B) { - __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_mul_pd (__m512d __A, __m512d __B) { - return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, - (__v16qi) __O, __M); + return (__m512d) ((__v8df)__A * (__v8df)__B); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A) +_mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtepi32_epi8 (__m256i __A) +_mm512_maskz_mul_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_mul_ps (__m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, - (__v16qi) __O, __M); + return (__m512) ((__v16sf)__A * (__v16sf)__B); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +_mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A) +_mm512_maskz_mul_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsepi32_epi8 (__m128i __A) +_mm_mask_mul_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { - return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +_mm_maskz_mul_sd (__mmask8 __U, __m128d __A, __m128d __B) { - __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +_mm_mask_mul_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { - return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, - (__v16qi) __O, __M); -} -extern __inline __m128i + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + 0x04); +} +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A) +_mm_maskz_mul_ss (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtsepi32_epi8 (__m256i __A) +_mm512_div_pd (__m512d __M, __m512d __V) { - return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) ((__v8df)__M / (__v8df)__V); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +_mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V) { - __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_maskz_div_pd (__mmask8 __U, __m512d __M, __m512d __V) { - return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, - (__v16qi) __O, __M); + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A) +_mm512_div_ps (__m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512) ((__v16sf)__A / (__v16sf)__B); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtusepi32_epi8 (__m128i __A) +_mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +_mm512_maskz_div_ps (__mmask16 __U, __m512 __A, __m512 __B) { - __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +_mm_mask_div_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { - return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, - (__v16qi) __O, - __M); + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A) +_mm_maskz_div_sd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtusepi32_epi8 (__m256i __A) +_mm_mask_div_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { - return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +_mm_maskz_div_ss (__mmask8 __U, __m128 __A, __m128 __B) { - __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M); + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_max_pd (__m512d __A, __m512d __B) { - return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, - (__v16qi) __O, - __M); + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A) +_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtepi32_epi16 (__m128i __A) +_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +_mm512_max_ps (__m512 __A, __m512 __B) { - __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, - (__v8hi) __O, __M); + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A) +_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtepi32_epi16 (__m256i __A) +_mm_mask_max_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +_mm_maskz_max_sd (__mmask8 __U, __m128d __A, __m128d __B) { - __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +_mm_mask_max_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, - (__v8hi) __O, __M); + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A) +_mm_maskz_max_ss (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsepi32_epi16 (__m128i __A) +_mm512_min_pd (__m512d __A, __m512d __B) { - return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + 0x04); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, - (__v8hi)__O, - __M); + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A) +_mm512_min_ps (__m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtsepi32_epi16 (__m256i __A) +_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B) { - __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +_mm_mask_min_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, - (__v8hi) __O, __M); + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A) +_mm_maskz_min_sd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtusepi32_epi16 (__m128i __A) +_mm_mask_min_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +_mm_maskz_min_ss (__mmask8 __U, __m128 __A, __m128 __B) { - __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_scalef_pd (__m512d __A, __m512d __B) { - return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, - (__v8hi) __O, __M); + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A) +_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtusepi32_epi16 (__m256i __A) +_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +_mm512_scalef_ps (__m512 __A, __m512 __B) { - __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, - (__v8hi) __O, __M); + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A) +_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtepi64_epi8 (__m128i __A) +_mm_scalef_sd (__m128d __A, __m128d __B) { - return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, + 0x04); } -extern __inline void +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +_mm_scalef_ss (__m128 __A, __m128 __B) { - __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_fmadd_pd (__m512d __A, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, - (__v16qi) __O, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A) +_mm512_mask_fmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtepi64_epi8 (__m256i __A) +_mm512_mask3_fmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +_mm512_maskz_fmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_fmadd_ps (__m512 __A, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, - (__v16qi) __O, __M); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A) +_mm512_mask_fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsepi64_epi8 (__m128i __A) +_mm512_mask3_fmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +_mm512_maskz_fmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_fmsub_pd (__m512d __A, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, - (__v16qi) __O, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A) +_mm512_mask_fmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtsepi64_epi8 (__m256i __A) +_mm512_mask3_fmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +_mm512_maskz_fmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_fmsub_ps (__m512 __A, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, - (__v16qi) __O, __M); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A) +_mm512_mask_fmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtusepi64_epi8 (__m128i __A) +_mm512_mask3_fmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +_mm512_maskz_fmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, - (__v16qi) __O, - __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A) +_mm512_mask_fmaddsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtusepi64_epi8 (__m256i __A) +_mm512_mask3_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, - (__v16qi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +_mm512_maskz_fmaddsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, - (__v16qi) __O, - __M); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A) +_mm512_mask_fmaddsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtepi64_epi16 (__m128i __A) +_mm512_mask3_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +_mm512_maskz_fmaddsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, - (__v8hi)__O, - __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A) +_mm512_mask_fmsubadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtepi64_epi16 (__m256i __A) +_mm512_mask3_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +_mm512_maskz_fmsubadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, - (__v8hi) __O, __M); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A) +_mm512_mask_fmsubadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsepi64_epi16 (__m128i __A) +_mm512_mask3_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +_mm512_maskz_fmsubadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, - (__v8hi) __O, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A) +_mm512_mask_fnmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtsepi64_epi16 (__m256i __A) +_mm512_mask3_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +_mm512_maskz_fnmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, - (__v8hi) __O, __M); + return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A) +_mm512_mask_fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtusepi64_epi16 (__m128i __A) +_mm512_mask3_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +_mm512_maskz_fnmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, - (__v8hi) __O, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A) +_mm512_mask_fnmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtusepi64_epi16 (__m256i __A) +_mm512_mask3_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, - (__v8hi) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +_mm512_maskz_fnmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, - (__v8hi) __O, __M); + return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A) +_mm512_mask_fnmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, - (__v8hi) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtepi64_epi32 (__m128i __A) +_mm512_mask3_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, - (__v4si) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline void +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +_mm512_maskz_fnmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_cvttpd_epi32 (__m512d __A) { - return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, - (__v4si) __O, __M); + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A) +_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) { - return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtepi64_epi32 (__m256i __A) +_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) { - return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, - (__v4si) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) +_mm512_cvttpd_epu32 (__m512d __A) { - __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) { - return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, - (__v4si) __O, __M); + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A) +_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) { - return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtsepi64_epi32 (__m128i __A) +_mm512_cvtpd_epi32 (__m512d __A) { - return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, - (__v4si) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + 0x04); } -extern __inline void +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) { - __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) { - return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, - (__v4si) __O, __M); + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A) +_mm512_cvtpd_epu32 (__m512d __A) { - return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtsepi64_epi32 (__m256i __A) +_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) { - return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, - (__v4si) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) +_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) { - __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_cvttps_epi32 (__m512 __A) { - return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, - (__v4si)__O, - __M); + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A) +_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) { - return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtusepi64_epi32 (__m128i __A) +_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) { - return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, - (__v4si) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + 0x04); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +_mm512_cvttps_epu32 (__m512 __A) { - __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) { - return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, - (__v4si) __O, __M); + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A) +_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) { - return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtusepi64_epi32 (__m256i __A) +_mm512_cvtps_epi32 (__m512 __A) { - return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, - (__v4si) - _mm_undefined_si128 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, + 0x04); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) +_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) { - __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) +_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) { - return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, - (__v4si) __O, __M); + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A) +_mm512_cvtps_epu32 (__m512 __A) { - return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, + 0x04); } -extern __inline __m256 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A) +_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) { - return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A, - (__v8sf) __O, - __M); + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m256 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) +_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A) { - return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A, - (__v8sf) - _mm256_setzero_ps (), - __M); + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + 0x04); } -extern __inline __m128 +extern __inline double __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) +_mm512_cvtsd_f64 (__m512d __A) { - return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A, - (__v4sf) __O, - __M); + return __A[0]; +} +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtss_f32 (__m512 __A) +{ + return __A[0]; } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) +_mm_cvtu64_ss (__m128 __A, unsigned long long __B) { - return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - __M); + return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, + 0x04); } -extern __inline __m256d +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) +_mm_cvtu64_sd (__m128d __A, unsigned long long __B) { - return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A, - (__v4df) __O, - __M); + return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, + 0x04); } -extern __inline __m256d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) +_mm_cvtu32_ss (__m128 __A, unsigned __B) { - return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A, - (__v4df) - _mm256_setzero_pd (), - __M); + return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) +_mm512_cvtepi32_ps (__m512i __A) { - return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A, - (__v8si) __O, - __M); + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) +_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) { - return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_set1_epi32 (__m256i __O, __mmask8 __M, int __A) +_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) { - return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A, (__v8si) __O, - __M); + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_set1_epi32 (__mmask8 __M, int __A) +_mm512_cvtepu32_ps (__m512i __A) { - return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) { - return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A, - (__v4si) __O, - __M); + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) +_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) { - return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_set1_epi32 (__m128i __O, __mmask8 __M, int __A) +_mm512_fixupimm_pd (__m512d __A, __m512d __B, __m512i __C, const int __imm) { - return (__m128i) __builtin_ia32_pbroadcastd128_gpr_mask (__A, (__v4si) __O, - __M); + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_set1_epi32 (__mmask8 __M, int __A) +_mm512_mask_fixupimm_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512i __C, const int __imm) { - return (__m128i) - __builtin_ia32_pbroadcastd128_gpr_mask (__A, - (__v4si) _mm_setzero_si128 (), - __M); + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) __U, + 0x04); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) +_mm512_maskz_fixupimm_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512i __C, const int __imm) { - return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A, - (__v4di) __O, - __M); + return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) __U, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +_mm512_fixupimm_ps (__m512 __A, __m512 __B, __m512i __C, const int __imm) { - return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) -1, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) +_mm512_mask_fixupimm_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512i __C, const int __imm) { - return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O, - __M); + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) +_mm512_maskz_fixupimm_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512i __C, const int __imm) { - return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) +_mm_fixupimm_sd (__m128d __A, __m128d __B, __m128i __C, const int __imm) { - return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A, - (__v2di) __O, - __M); + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +_mm_mask_fixupimm_sd (__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm) { - return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) +_mm_maskz_fixupimm_sd (__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm) { - return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O, - __M); + return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_set1_epi64 (__mmask8 __M, long long __A) +_mm_fixupimm_ss (__m128 __A, __m128 __B, __m128i __C, const int __imm) { - return (__m128i) - __builtin_ia32_pbroadcastq128_gpr_mask (__A, - (__v2di) _mm_setzero_si128 (), - __M); + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) -1, + 0x04); } -extern __inline __m256 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_broadcast_f32x4 (__m128 __A) +_mm_mask_fixupimm_ss (__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm) { - return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, - (__v8sf)_mm256_undefined_pd (), - (__mmask8) -1); + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, + 0x04); } -extern __inline __m256 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A) +_mm_maskz_fixupimm_ss (__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm) { - return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, - (__v8sf) __O, - __M); + return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, + 0x04); } -extern __inline __m256 +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) +_mm_cvtss_u64 (__m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, - (__v8sf) - _mm256_setzero_ps (), - __M); + return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) + __A, + 0x04); } -extern __inline __m256i +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_broadcast_i32x4 (__m128i __A) +_mm_cvttss_u64 (__m128 __A) { - return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) - __A, - (__v8si)_mm256_undefined_si256 (), - (__mmask8) -1); + return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) + __A, + 0x04); } -extern __inline __m256i +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A) +_mm_cvttss_i64 (__m128 __A) { - return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) - __A, - (__v8si) - __O, __M); + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, + 0x04); } -extern __inline __m256i +extern __inline unsigned __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A) +_mm_cvtss_u32 (__m128 __A) { - return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) - __A, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, + 0x04); } -extern __inline __m256i +extern __inline unsigned __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +_mm_cvttss_u32 (__m128 __A) { - return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A, - (__v8si) __W, - (__mmask8) __U); + return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, + 0x04); } -extern __inline __m256i +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) +_mm_cvttss_i32 (__m128 __A) { - return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, + 0x04); } -extern __inline __m128i +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi8_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_cvtsd_u64 (__m128d __A) { - return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A, - (__v4si) __W, - (__mmask8) __U); + return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) + __A, + 0x04); } -extern __inline __m128i +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) +_mm_cvttsd_u64 (__m128d __A) { - return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) + __A, + 0x04); } -extern __inline __m256i +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi8_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +_mm_cvttsd_i64 (__m128d __A) { - return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A, - (__v4di) __W, - (__mmask8) __U); + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, + 0x04); } -extern __inline __m256i +extern __inline unsigned __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +_mm_cvtsd_u32 (__m128d __A) { - return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, + 0x04); } -extern __inline __m128i +extern __inline unsigned __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi8_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_cvttsd_u32 (__m128d __A) { - return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A, - (__v2di) __W, - (__mmask8) __U); + return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, + 0x04); } -extern __inline __m128i +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +_mm_cvttsd_i32 (__m128d __A) { - return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, + 0x04); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi16_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +_mm512_cvtps_pd (__m256 __A) { - return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + 0x04); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) +_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi16_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) { - return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) +_mm512_cvtph_ps (__m256i __A) { - return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi16_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) { - return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) +_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) { - return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi16_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm512_cvtpd_ps (__m512d __A) { - return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_undefined_ps (), + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) +_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) { - return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m256i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi32_epi64 (__m256i __W, __mmask8 __U, __m128i __X) +_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) { - return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X, - (__v4di) __W, - (__mmask8) __U); + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X) +_mm512_getexp_ps (__m512 __A) { - return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi32_epi64 (__m128i __W, __mmask8 __U, __m128i __X) +_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X, - (__v2di) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X) +_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) { - return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepu8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +_mm512_getexp_pd (__m512d __A) { - return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + 0x04); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A) +_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A) { - return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepu8_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) { - return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A) +_mm_getexp_ss (__m128 __A, __m128 __B) { - return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A, + (__v4sf) __B, + 0x04); } -extern __inline __m256i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepu8_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m256i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) +_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepu8_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_getexp_sd (__m128d __A, __m128d __B) { - return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A, + (__v2df) __B, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) +_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m256i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepu16_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A) +_mm512_getmant_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { - return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + _mm512_undefined_pd (), + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepu16_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm512_mask_getmant_pd (__m512d __W, __mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) { - return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) __W, __U, + 0x04); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A) +_mm512_maskz_getmant_pd (__mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) { - return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) + _mm512_setzero_pd (), + __U, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepu16_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +_mm512_getmant_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { - return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + _mm512_undefined_ps (), + (__mmask16) -1, + 0x04); } -extern __inline __m256i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +_mm512_mask_getmant_ps (__m512 __W, __mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) { - return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) __W, __U, + 0x04); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepu16_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm512_maskz_getmant_ps (__mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) { - return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) + _mm512_setzero_ps (), + __U, + 0x04); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +_mm_getmant_sd (__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) { - return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + 0x04); } -extern __inline __m256i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepu32_epi64 (__m256i __W, __mmask8 __U, __m128i __X) +_mm_mask_getmant_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) { - return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X, - (__v4di) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) __W, + __U, + 0x04); } -extern __inline __m256i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X) +_mm_maskz_getmant_sd (__mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) { - return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) + _mm_setzero_pd(), + __U, + 0x04); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepu32_epi64 (__m128i __W, __mmask8 __U, __m128i __X) +_mm_getmant_ss (__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) { - return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X, - (__v2di) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + 0x04); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X) +_mm_mask_getmant_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) { - return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) __W, + __U, + 0x04); } -extern __inline __m256d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_rcp14_pd (__m256d __A) +_mm_maskz_getmant_ss (__mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) { - return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) -1); + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) + _mm_setzero_ps(), + __U, + 0x04); } -extern __inline __m256d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A) +_mm512_roundscale_ps (__m512 __A, const int __imm) { - return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm, + (__v16sf) + _mm512_undefined_ps (), + -1, + 0x04); } -extern __inline __m256d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A) +_mm512_mask_roundscale_ps (__m512 __A, __mmask16 __B, __m512 __C, + const int __imm) { - return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm, + (__v16sf) __A, + (__mmask16) __B, + 0x04); } -extern __inline __m128d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rcp14_pd (__m128d __A) +_mm512_maskz_roundscale_ps (__mmask16 __A, __m512 __B, const int __imm) { - return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __A, + 0x04); } -extern __inline __m128d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A) +_mm512_roundscale_pd (__m512d __A, const int __imm) { - return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm, + (__v8df) + _mm512_undefined_pd (), + -1, + 0x04); } -extern __inline __m128d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A) +_mm512_mask_roundscale_pd (__m512d __A, __mmask8 __B, __m512d __C, + const int __imm) { - return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm, + (__v8df) __A, + (__mmask8) __B, + 0x04); } -extern __inline __m256 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_rcp14_ps (__m256 __A) +_mm512_maskz_roundscale_pd (__mmask8 __A, __m512d __B, const int __imm) { - return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __A, + 0x04); } -extern __inline __m256 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A) +_mm_roundscale_ss (__m128 __A, __m128 __B, const int __imm) { - return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_rndscaless_round ((__v4sf) __A, + (__v4sf) __B, __imm, + 0x04); } -extern __inline __m256 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) +_mm_roundscale_sd (__m128d __A, __m128d __B, const int __imm) { - return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_rndscalesd_round ((__v2df) __A, + (__v2df) __B, __imm, + 0x04); } -extern __inline __m128 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rcp14_ps (__m128 __A) +_mm512_cmp_pd_mask (__m512d __X, __m512d __Y, const int __P) { - return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) -1, + 0x04); } -extern __inline __m128 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A) +_mm512_cmp_ps_mask (__m512 __X, __m512 __Y, const int __P) { - return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) -1, + 0x04); } -extern __inline __m128 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A) +_mm512_mask_cmp_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, const int __P) { - return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) __U, + 0x04); } -extern __inline __m256d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_rsqrt14_pd (__m256d __A) +_mm512_mask_cmp_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, const int __P) { - return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) -1); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) __U, + 0x04); } -extern __inline __m256d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A) +_mm512_cmpeq_pd_mask (__m512d __X, __m512d __Y) { - return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x00, + (__mmask8) -1, + 0x04); } -extern __inline __m256d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A) +_mm512_mask_cmpeq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) { - return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x00, + (__mmask8) __U, + 0x04); } -extern __inline __m128d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rsqrt14_pd (__m128d __A) +_mm512_cmplt_pd_mask (__m512d __X, __m512d __Y) { - return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x01, + (__mmask8) -1, + 0x04); } -extern __inline __m128d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A) +_mm512_mask_cmplt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) { - return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x01, + (__mmask8) __U, + 0x04); } -extern __inline __m128d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A) +_mm512_cmple_pd_mask (__m512d __X, __m512d __Y) { - return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x02, + (__mmask8) -1, + 0x04); } -extern __inline __m256 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_rsqrt14_ps (__m256 __A) +_mm512_mask_cmple_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) { - return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) -1); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x02, + (__mmask8) __U, + 0x04); } -extern __inline __m256 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A) +_mm512_cmpunord_pd_mask (__m512d __X, __m512d __Y) { - return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x03, + (__mmask8) -1, + 0x04); } -extern __inline __m256 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A) +_mm512_mask_cmpunord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) { - return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x03, + (__mmask8) __U, + 0x04); } -extern __inline __m128 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rsqrt14_ps (__m128 __A) +_mm512_cmpneq_pd_mask (__m512d __X, __m512d __Y) { - return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x04, + (__mmask8) -1, + 0x04); } -extern __inline __m128 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A) +_mm512_mask_cmpneq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) { - return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x04, + (__mmask8) __U, + 0x04); } -extern __inline __m128 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A) +_mm512_cmpnlt_pd_mask (__m512d __X, __m512d __Y) { - return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x05, + (__mmask8) -1, + 0x04); } -extern __inline __m256d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A) +_mm512_mask_cmpnlt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) { - return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x05, + (__mmask8) __U, + 0x04); } -extern __inline __m256d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A) +_mm512_cmpnle_pd_mask (__m512d __X, __m512d __Y) { - return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x06, + (__mmask8) -1, + 0x04); } -extern __inline __m128d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) +_mm512_mask_cmpnle_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) { - return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x06, + (__mmask8) __U, + 0x04); } -extern __inline __m128d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A) +_mm512_cmpord_pd_mask (__m512d __X, __m512d __Y) { - return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x07, + (__mmask8) -1, + 0x04); } -extern __inline __m256 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A) +_mm512_mask_cmpord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) { - return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 0x07, + (__mmask8) __U, + 0x04); } -extern __inline __m256 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A) +_mm512_cmpeq_ps_mask (__m512 __X, __m512 __Y) { - return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x00, + (__mmask16) -1, + 0x04); } -extern __inline __m128 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A) +_mm512_mask_cmpeq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) { - return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x00, + (__mmask16) __U, + 0x04); } -extern __inline __m128 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A) +_mm512_cmplt_ps_mask (__m512 __X, __m512 __Y) { - return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x01, + (__mmask16) -1, + 0x04); } -extern __inline __m256i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm512_mask_cmplt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) { - return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x01, + (__mmask16) __U, + 0x04); } -extern __inline __m256i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm512_cmple_ps_mask (__m512 __X, __m512 __Y) { - return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x02, + (__mmask16) -1, + 0x04); } -extern __inline __m256i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm512_mask_cmple_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) { - return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x02, + (__mmask16) __U, + 0x04); } -extern __inline __m256i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm512_cmpunord_ps_mask (__m512 __X, __m512 __Y) { - return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x03, + (__mmask16) -1, + 0x04); } -extern __inline __m256i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm512_mask_cmpunord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) { - return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x03, + (__mmask16) __U, + 0x04); } -extern __inline __m256i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm512_cmpneq_ps_mask (__m512 __X, __m512 __Y) { - return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x04, + (__mmask16) -1, + 0x04); } -extern __inline __m256i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm512_mask_cmpneq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) { - return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x04, + (__mmask16) __U, + 0x04); } -extern __inline __m256i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm512_cmpnlt_ps_mask (__m512 __X, __m512 __Y) { - return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x05, + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm512_mask_cmpnlt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) { - return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x05, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm512_cmpnle_ps_mask (__m512 __X, __m512 __Y) { - return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x06, + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm512_mask_cmpnle_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) { - return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x06, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm512_cmpord_ps_mask (__m512 __X, __m512 __Y) { - return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x07, + (__mmask16) -1, + 0x04); } -extern __inline __m128i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm512_mask_cmpord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) { - return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, 0x07, + (__mmask16) __U, + 0x04); } -extern __inline __m128i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P) { - return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) -1, + 0x04); } -extern __inline __m128i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P) { - return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) __M, + 0x04); } -extern __inline __m128i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P) { - return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) -1, + 0x04); } -extern __inline __m256 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_getexp_ps (__m256 __A) +_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P) { - return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) -1); + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) __M, + 0x04); } -extern __inline __m256 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) +_mm512_kmov (__mmask16 __A) { - return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); + return __builtin_ia32_kmovw (__A); } -extern __inline __m256 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) +_mm512_castpd_ps (__m512d __A) { - return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m512) (__A); } -extern __inline __m256d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_getexp_pd (__m256d __A) +_mm512_castpd_si512 (__m512d __A) { - return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) -1); + return (__m512i) (__A); } -extern __inline __m256d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) +_mm512_castps_pd (__m512 __A) { - return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m512d) (__A); } -extern __inline __m256d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) +_mm512_castps_si512 (__m512 __A) { - return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m512i) (__A); } -extern __inline __m128 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_getexp_ps (__m128 __A) +_mm512_castsi512_ps (__m512i __A) { - return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); + return (__m512) (__A); } -extern __inline __m128 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) +_mm512_castsi512_pd (__m512i __A) { - return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m512d) (__A); } -extern __inline __m128 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) +_mm512_castpd512_pd128 (__m512d __A) { - return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128d)_mm512_extractf32x4_ps((__m512)__A, 0); } -extern __inline __m128d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_getexp_pd (__m128d __A) +_mm512_castps512_ps128 (__m512 __A) { - return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); + return _mm512_extractf32x4_ps(__A, 0); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) +_mm512_castsi512_si128 (__m512i __A) { - return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m128i)_mm512_extracti32x4_epi32((__m512i)__A, 0); } -extern __inline __m128d +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) +_mm512_castpd512_pd256 (__m512d __A) { - return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return _mm512_extractf64x4_pd(__A, 0); } -extern __inline __m256i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_srl_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) +_mm512_castps512_ps256 (__m512 __A) { - return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A, - (__v4si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_srl_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +_mm512_castsi512_si256 (__m512i __A) { - return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A, - (__v4si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)_mm512_extractf64x4_pd((__m512d)__A, 0); } -extern __inline __m128i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_srl_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm512_castpd128_pd512 (__m128d __A) { - return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_pd512_pd((__m128d)__A); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_srl_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm512_castps128_ps512 (__m128 __A) { - return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512) __builtin_ia32_ps512_ps((__m128)__A); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_srl_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) +_mm512_castsi128_si512 (__m128i __A) { - return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A, - (__v2di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_si512_si((__v4si)__A); } -extern __inline __m256i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_srl_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +_mm512_castpd256_pd512 (__m256d __A) { - return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A, - (__v2di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return __builtin_ia32_pd512_256pd (__A); } -extern __inline __m128i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_srl_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm512_castps256_ps512 (__m256 __A) { - return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return __builtin_ia32_ps512_256ps (__A); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_srl_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm512_castsi256_si512 (__m256i __A) { - return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_si512_256si ((__v8si)__A); } -extern __inline __m256i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm512_cmpeq_epu32_mask (__m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 0, + (__mmask16) -1); } -extern __inline __m256i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm512_mask_cmpeq_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 0, __U); } -extern __inline __m256d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_scalef_pd (__m256d __A, __m256d __B) +_mm512_mask_cmpeq_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) -1); + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 0, __U); } -extern __inline __m256d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) +_mm512_cmpeq_epu64_mask (__m512i __A, __m512i __B) { - return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 0, + (__mmask8) -1); } -extern __inline __m256d +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) +_mm512_cmpgt_epu32_mask (__m512i __A, __m512i __B) { - return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 6, + (__mmask16) -1); } -extern __inline __m256 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_scalef_ps (__m256 __A, __m256 __B) +_mm512_mask_cmpgt_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) -1); + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 6, __U); } -extern __inline __m256 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) +_mm512_mask_cmpgt_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 6, __U); } -extern __inline __m256 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) +_mm512_cmpgt_epu64_mask (__m512i __A, __m512i __B) { - return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 6, + (__mmask8) -1); } -extern __inline __m128d +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 op __T2); __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); __v4si __T6 = __T4 op __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 op __T7; return __T8[0] op __T8[1] +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_scalef_pd (__m128d __A, __m128d __B) +_mm512_reduce_add_epi32 (__m512i __A) { - return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); + __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 + __T2); __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); __v4si __T6 = __T4 + __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 + __T7; return __T8[0] + __T8[1]; } -extern __inline __m128d +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) +_mm512_reduce_mul_epi32 (__m512i __A) { - return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); + __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 * __T2); __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); __v4si __T6 = __T4 * __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 * __T7; return __T8[0] * __T8[1]; } -extern __inline __m128d +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) +_mm512_reduce_and_epi32 (__m512i __A) { - return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 & __T2); __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); __v4si __T6 = __T4 & __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 & __T7; return __T8[0] & __T8[1]; } -extern __inline __m128 +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_scalef_ps (__m128 __A, __m128 __B) +_mm512_reduce_or_epi32 (__m512i __A) { - return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); + __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 | __T2); __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); __v4si __T6 = __T4 | __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 | __T7; return __T8[0] | __T8[1]; } -extern __inline __m128 +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A) { - return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); + __A = _mm512_maskz_mov_epi32 (__U, __A); + __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 + __T2); __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); __v4si __T6 = __T4 + __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 + __T7; return __T8[0] + __T8[1]; } -extern __inline __m128 +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) +_mm512_mask_reduce_mul_epi32 (__mmask16 __U, __m512i __A) { - return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (1), __U, __A); + __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 * __T2); __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); __v4si __T6 = __T4 * __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 * __T7; return __T8[0] * __T8[1]; } -extern __inline __m256d +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fmadd_pd (__m256d __A, __mmask8 __U, __m256d __B, - __m256d __C) +_mm512_mask_reduce_and_epi32 (__mmask16 __U, __m512i __A) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A); + __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 & __T2); __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); __v4si __T6 = __T4 & __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 & __T7; return __T8[0] & __T8[1]; } -extern __inline __m256d +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fmadd_pd (__m256d __A, __m256d __B, __m256d __C, - __mmask8 __U) +_mm512_mask_reduce_or_epi32 (__mmask16 __U, __m512i __A) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + __A = _mm512_maskz_mov_epi32 (__U, __A); + __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 | __T2); __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); __v4si __T6 = __T4 | __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 | __T7; return __T8[0] | __T8[1]; } -extern __inline __m256d +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = _mm256_ ##op (__T1, __T2); __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); __m128i __T6 = _mm_ ##op (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_ ##op (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_ ##op (__T8, __T9); return __T10[0] +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fmadd_pd (__mmask8 __U, __m256d __A, __m256d __B, - __m256d __C) +_mm512_reduce_min_epi32 (__m512i __A) { - return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = _mm256_min_epi32 (__T1, __T2); __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); __m128i __T6 = _mm_min_epi32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_min_epi32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_min_epi32 (__T8, __T9); return __T10[0]; } -extern __inline __m128d +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fmadd_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +_mm512_reduce_max_epi32 (__m512i __A) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = _mm256_max_epi32 (__T1, __T2); __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); __m128i __T6 = _mm_max_epi32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_max_epi32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_max_epi32 (__T8, __T9); return __T10[0]; } -extern __inline __m128d +extern __inline unsigned int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fmadd_pd (__m128d __A, __m128d __B, __m128d __C, - __mmask8 __U) +_mm512_reduce_min_epu32 (__m512i __A) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = _mm256_min_epu32 (__T1, __T2); __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); __m128i __T6 = _mm_min_epu32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_min_epu32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_min_epu32 (__T8, __T9); return __T10[0]; } -extern __inline __m128d +extern __inline unsigned int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fmadd_pd (__mmask8 __U, __m128d __A, __m128d __B, - __m128d __C) +_mm512_reduce_max_epu32 (__m512i __A) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = _mm256_max_epu32 (__T1, __T2); __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); __m128i __T6 = _mm_max_epu32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_max_epu32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_max_epu32 (__T8, __T9); return __T10[0]; } -extern __inline __m256 +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fmadd_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +_mm512_mask_reduce_min_epi32 (__mmask16 __U, __m512i __A) { - return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (0x7fffffff), __U, __A); + __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = _mm256_min_epi32 (__T1, __T2); __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); __m128i __T6 = _mm_min_epi32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_min_epi32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_min_epi32 (__T8, __T9); return __T10[0]; } -extern __inline __m256 +extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fmadd_ps (__m256 __A, __m256 __B, __m256 __C, - __mmask8 __U) +_mm512_mask_reduce_max_epi32 (__mmask16 __U, __m512i __A) { - return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (-0x7fffffff - 1), __U, __A); + __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = _mm256_max_epi32 (__T1, __T2); __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); __m128i __T6 = _mm_max_epi32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_max_epi32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_max_epi32 (__T8, __T9); return __T10[0]; } -extern __inline __m256 +extern __inline unsigned int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fmadd_ps (__mmask8 __U, __m256 __A, __m256 __B, - __m256 __C) +_mm512_mask_reduce_min_epu32 (__mmask16 __U, __m512i __A) { - return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A); + __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = _mm256_min_epu32 (__T1, __T2); __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); __m128i __T6 = _mm_min_epu32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_min_epu32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_min_epu32 (__T8, __T9); return __T10[0]; } -extern __inline __m128 +extern __inline unsigned int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +_mm512_mask_reduce_max_epu32 (__mmask16 __U, __m512i __A) { - return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + __A = _mm512_maskz_mov_epi32 (__U, __A); + __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = _mm256_max_epu32 (__T1, __T2); __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); __m128i __T6 = _mm_max_epu32 (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_max_epu32 (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_max_epu32 (__T8, __T9); return __T10[0]; } -extern __inline __m128 +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = __T1 op __T2; __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = __T4 op __T5; __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = __T6 op __T7; return __T8[0] op __T8[1] +extern __inline float __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +_mm512_reduce_add_ps (__m512 __A) { - return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = __T1 + __T2; __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = __T4 + __T5; __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = __T6 + __T7; return __T8[0] + __T8[1]; } -extern __inline __m128 +extern __inline float __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +_mm512_reduce_mul_ps (__m512 __A) { - return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = __T1 * __T2; __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = __T4 * __T5; __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = __T6 * __T7; return __T8[0] * __T8[1]; } -extern __inline __m256d +extern __inline float __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fmsub_pd (__m256d __A, __mmask8 __U, __m256d __B, - __m256d __C) +_mm512_mask_reduce_add_ps (__mmask16 __U, __m512 __A) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + __A = _mm512_maskz_mov_ps (__U, __A); + __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = __T1 + __T2; __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = __T4 + __T5; __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = __T6 + __T7; return __T8[0] + __T8[1]; } -extern __inline __m256d +extern __inline float __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fmsub_pd (__m256d __A, __m256d __B, __m256d __C, - __mmask8 __U) +_mm512_mask_reduce_mul_ps (__mmask16 __U, __m512 __A) { - return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + __A = _mm512_mask_mov_ps (_mm512_set1_ps (1.0f), __U, __A); + __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = __T1 * __T2; __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = __T4 * __T5; __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = __T6 * __T7; return __T8[0] * __T8[1]; } -extern __inline __m256d +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = _mm256_ ##op (__T1, __T2); __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = _mm_ ##op (__T4, __T5); __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = _mm_ ##op (__T6, __T7); __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); __m128 __T10 = _mm_ ##op (__T8, __T9); return __T10[0] +extern __inline float __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fmsub_pd (__mmask8 __U, __m256d __A, __m256d __B, - __m256d __C) +_mm512_reduce_min_ps (__m512 __A) { - return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = _mm256_min_ps (__T1, __T2); __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = _mm_min_ps (__T4, __T5); __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = _mm_min_ps (__T6, __T7); __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); __m128 __T10 = _mm_min_ps (__T8, __T9); return __T10[0]; } -extern __inline __m128d +extern __inline float __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fmsub_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +_mm512_reduce_max_ps (__m512 __A) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = _mm256_max_ps (__T1, __T2); __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = _mm_max_ps (__T4, __T5); __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = _mm_max_ps (__T6, __T7); __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); __m128 __T10 = _mm_max_ps (__T8, __T9); return __T10[0]; } -extern __inline __m128d +extern __inline float __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fmsub_pd (__m128d __A, __m128d __B, __m128d __C, - __mmask8 __U) +_mm512_mask_reduce_min_ps (__mmask16 __U, __m512 __A) { - return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + __A = _mm512_mask_mov_ps (_mm512_set1_ps (__builtin_inff ()), __U, __A); + __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = _mm256_min_ps (__T1, __T2); __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = _mm_min_ps (__T4, __T5); __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = _mm_min_ps (__T6, __T7); __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); __m128 __T10 = _mm_min_ps (__T8, __T9); return __T10[0]; } -extern __inline __m128d +extern __inline float __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fmsub_pd (__mmask8 __U, __m128d __A, __m128d __B, - __m128d __C) +_mm512_mask_reduce_max_ps (__mmask16 __U, __m512 __A) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + __A = _mm512_mask_mov_ps (_mm512_set1_ps (-__builtin_inff ()), __U, __A); + __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = _mm256_max_ps (__T1, __T2); __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = _mm_max_ps (__T4, __T5); __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = _mm_max_ps (__T6, __T7); __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); __m128 __T10 = _mm_max_ps (__T8, __T9); return __T10[0]; } -extern __inline __m256 +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 op __T2); __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); __v2di __T6 = __T4 op __T5; return __T6[0] op __T6[1] +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fmsub_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +_mm512_reduce_add_epi64 (__m512i __A) { - return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 + __T2); __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); __v2di __T6 = __T4 + __T5; return __T6[0] + __T6[1]; } -extern __inline __m256 +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fmsub_ps (__m256 __A, __m256 __B, __m256 __C, - __mmask8 __U) +_mm512_reduce_mul_epi64 (__m512i __A) { - return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 * __T2); __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); __v2di __T6 = __T4 * __T5; return __T6[0] * __T6[1]; } -extern __inline __m256 +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fmsub_ps (__mmask8 __U, __m256 __A, __m256 __B, - __m256 __C) +_mm512_reduce_and_epi64 (__m512i __A) { - return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 & __T2); __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); __v2di __T6 = __T4 & __T5; return __T6[0] & __T6[1]; } -extern __inline __m128 +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +_mm512_reduce_or_epi64 (__m512i __A) { - return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 | __T2); __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); __v2di __T6 = __T4 | __T5; return __T6[0] | __T6[1]; } -extern __inline __m128 +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +_mm512_mask_reduce_add_epi64 (__mmask8 __U, __m512i __A) { - return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + __A = _mm512_maskz_mov_epi64 (__U, __A); + __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 + __T2); __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); __v2di __T6 = __T4 + __T5; return __T6[0] + __T6[1]; } -extern __inline __m128 +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +_mm512_mask_reduce_mul_epi64 (__mmask8 __U, __m512i __A) { - return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (1LL), __U, __A); + __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 * __T2); __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); __v2di __T6 = __T4 * __T5; return __T6[0] * __T6[1]; } -extern __inline __m256d +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fmaddsub_pd (__m256d __A, __mmask8 __U, __m256d __B, - __m256d __C) +_mm512_mask_reduce_and_epi64 (__mmask8 __U, __m512i __A) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A); + __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 & __T2); __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); __v2di __T6 = __T4 & __T5; return __T6[0] & __T6[1]; } -extern __inline __m256d +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fmaddsub_pd (__m256d __A, __m256d __B, __m256d __C, - __mmask8 __U) +_mm512_mask_reduce_or_epi64 (__mmask8 __U, __m512i __A) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) - __U); + __A = _mm512_maskz_mov_epi64 (__U, __A); + __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 | __T2); __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); __v2di __T6 = __T4 | __T5; return __T6[0] | __T6[1]; } -extern __inline __m256d +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); __m512i __T2 = _mm512_ ##op (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_ ##op (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_ ##op (__T4, __T5); return __T6[0] +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fmaddsub_pd (__mmask8 __U, __m256d __A, __m256d __B, - __m256d __C) +_mm512_reduce_min_epi64 (__m512i __A) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) - __U); + __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); __m512i __T2 = _mm512_min_epi64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_min_epi64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_min_epi64 (__T4, __T5); return __T6[0]; } -extern __inline __m128d +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fmaddsub_pd (__m128d __A, __mmask8 __U, __m128d __B, - __m128d __C) +_mm512_reduce_max_epi64 (__m512i __A) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); __m512i __T2 = _mm512_max_epi64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_max_epi64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_max_epi64 (__T4, __T5); return __T6[0]; } -extern __inline __m128d +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fmaddsub_pd (__m128d __A, __m128d __B, __m128d __C, - __mmask8 __U) +_mm512_mask_reduce_min_epi64 (__mmask8 __U, __m512i __A) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) - __U); + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (0x7fffffffffffffffLL), + __U, __A); + __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); __m512i __T2 = _mm512_min_epi64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_min_epi64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_min_epi64 (__T4, __T5); return __T6[0]; } -extern __inline __m128d +extern __inline long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fmaddsub_pd (__mmask8 __U, __m128d __A, __m128d __B, - __m128d __C) +_mm512_mask_reduce_max_epi64 (__mmask8 __U, __m512i __A) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) - __U); + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (-0x7fffffffffffffffLL - 1), + __U, __A); + __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); __m512i __T2 = _mm512_max_epi64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_max_epi64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_max_epi64 (__T4, __T5); return __T6[0]; } -extern __inline __m256 +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fmaddsub_ps (__m256 __A, __mmask8 __U, __m256 __B, - __m256 __C) +_mm512_reduce_min_epu64 (__m512i __A) { - return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); __m512i __T2 = _mm512_min_epu64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_min_epu64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_min_epu64 (__T4, __T5); return __T6[0]; } -extern __inline __m256 +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fmaddsub_ps (__m256 __A, __m256 __B, __m256 __C, - __mmask8 __U) +_mm512_reduce_max_epu64 (__m512i __A) { - return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); __m512i __T2 = _mm512_max_epu64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_max_epu64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_max_epu64 (__T4, __T5); return __T6[0]; } -extern __inline __m256 +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fmaddsub_ps (__mmask8 __U, __m256 __A, __m256 __B, - __m256 __C) +_mm512_mask_reduce_min_epu64 (__mmask8 __U, __m512i __A) { - return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A); + __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); __m512i __T2 = _mm512_min_epu64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_min_epu64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_min_epu64 (__T4, __T5); return __T6[0]; } -extern __inline __m128 +extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fmaddsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +_mm512_mask_reduce_max_epu64 (__mmask8 __U, __m512i __A) { - return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + __A = _mm512_maskz_mov_epi64 (__U, __A); + __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); __m512i __T2 = _mm512_max_epu64 (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_max_epu64 (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_max_epu64 (__T4, __T5); return __T6[0]; } -extern __inline __m128 +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = __T1 op __T2; __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = __T4 op __T5; return __T6[0] op __T6[1] +extern __inline double __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fmaddsub_ps (__m128 __A, __m128 __B, __m128 __C, - __mmask8 __U) +_mm512_reduce_add_pd (__m512d __A) { - return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = __T1 + __T2; __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = __T4 + __T5; return __T6[0] + __T6[1]; } -extern __inline __m128 +extern __inline double __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fmaddsub_ps (__mmask8 __U, __m128 __A, __m128 __B, - __m128 __C) +_mm512_reduce_mul_pd (__m512d __A) { - return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = __T1 * __T2; __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = __T4 * __T5; return __T6[0] * __T6[1]; } -extern __inline __m256d +extern __inline double __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fmsubadd_pd (__m256d __A, __mmask8 __U, __m256d __B, - __m256d __C) +_mm512_mask_reduce_add_pd (__mmask8 __U, __m512d __A) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + __A = _mm512_maskz_mov_pd (__U, __A); + __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = __T1 + __T2; __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = __T4 + __T5; return __T6[0] + __T6[1]; } -extern __inline __m256d +extern __inline double __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fmsubadd_pd (__m256d __A, __m256d __B, __m256d __C, - __mmask8 __U) +_mm512_mask_reduce_mul_pd (__mmask8 __U, __m512d __A) { - return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) - __U); + __A = _mm512_mask_mov_pd (_mm512_set1_pd (1.0), __U, __A); + __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = __T1 * __T2; __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = __T4 * __T5; return __T6[0] * __T6[1]; } -extern __inline __m256d +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = _mm256_ ##op (__T1, __T2); __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = _mm_ ##op (__T4, __T5); __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); __m128d __T8 = _mm_ ##op (__T6, __T7); return __T8[0] +extern __inline double __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fmsubadd_pd (__mmask8 __U, __m256d __A, __m256d __B, - __m256d __C) +_mm512_reduce_min_pd (__m512d __A) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) - __U); + __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = _mm256_min_pd (__T1, __T2); __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = _mm_min_pd (__T4, __T5); __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); __m128d __T8 = _mm_min_pd (__T6, __T7); return __T8[0]; } -extern __inline __m128d +extern __inline double __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fmsubadd_pd (__m128d __A, __mmask8 __U, __m128d __B, - __m128d __C) +_mm512_reduce_max_pd (__m512d __A) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = _mm256_max_pd (__T1, __T2); __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = _mm_max_pd (__T4, __T5); __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); __m128d __T8 = _mm_max_pd (__T6, __T7); return __T8[0]; } -extern __inline __m128d +extern __inline double __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fmsubadd_pd (__m128d __A, __m128d __B, __m128d __C, - __mmask8 __U) +_mm512_mask_reduce_min_pd (__mmask8 __U, __m512d __A) { - return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) - __U); + __A = _mm512_mask_mov_pd (_mm512_set1_pd (__builtin_inf ()), __U, __A); + __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = _mm256_min_pd (__T1, __T2); __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = _mm_min_pd (__T4, __T5); __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); __m128d __T8 = _mm_min_pd (__T6, __T7); return __T8[0]; } -extern __inline __m128d +extern __inline double __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fmsubadd_pd (__mmask8 __U, __m128d __A, __m128d __B, - __m128d __C) +_mm512_mask_reduce_max_pd (__mmask8 __U, __m512d __A) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) - __U); + __A = _mm512_mask_mov_pd (_mm512_set1_pd (-__builtin_inf ()), __U, __A); + __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = _mm256_max_pd (__T1, __T2); __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = _mm_max_pd (__T4, __T5); __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); __m128d __T8 = _mm_max_pd (__T6, __T7); return __T8[0]; } -extern __inline __m256 +#undef __MM512_REDUCE_OP +#undef __DISABLE_AVX512F__ +#pragma GCC pop_options +#define _AVX512ERINTRIN_H_INCLUDED +#pragma GCC push_options +#pragma GCC target("avx512er") +#define __DISABLE_AVX512ER__ +typedef double __v8df __attribute__ ((__vector_size__ (64))); +typedef float __v16sf __attribute__ ((__vector_size__ (64))); +typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fmsubadd_ps (__m256 __A, __mmask8 __U, __m256 __B, - __m256 __C) +_mm512_exp2a23_round_pd (__m512d __A, int __R) { - return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + __m512d __W; + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) -1, __R); } -extern __inline __m256 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fmsubadd_ps (__m256 __A, __m256 __B, __m256 __C, - __mmask8 __U) +_mm512_mask_exp2a23_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) { - return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); } -extern __inline __m256 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fmsubadd_ps (__mmask8 __U, __m256 __A, __m256 __B, - __m256 __C) +_mm512_maskz_exp2a23_round_pd (__mmask8 __U, __m512d __A, int __R) { - return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m128 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fmsubadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +_mm512_exp2a23_round_ps (__m512 __A, int __R) { - return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + __m512 __W; + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) -1, __R); } -extern __inline __m128 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fmsubadd_ps (__m128 __A, __m128 __B, __m128 __C, - __mmask8 __U) +_mm512_mask_exp2a23_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) { - return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } -extern __inline __m128 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fmsubadd_ps (__mmask8 __U, __m128 __A, __m128 __B, - __m128 __C) +_mm512_maskz_exp2a23_round_ps (__mmask16 __U, __m512 __A, int __R) { - return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); } -extern __inline __m256d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fnmadd_pd (__m256d __A, __mmask8 __U, __m256d __B, - __m256d __C) +_mm512_rcp28_round_pd (__m512d __A, int __R) { - return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + __m512d __W; + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) -1, __R); } -extern __inline __m256d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fnmadd_pd (__m256d __A, __m256d __B, __m256d __C, - __mmask8 __U) +_mm512_mask_rcp28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask3 (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); } -extern __inline __m256d +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fnmadd_pd (__mmask8 __U, __m256d __A, __m256d __B, - __m256d __C) +_mm512_maskz_rcp28_round_pd (__mmask8 __U, __m512d __A, int __R) { - return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m128d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fnmadd_pd (__m128d __A, __mmask8 __U, __m128d __B, - __m128d __C) +_mm512_rcp28_round_ps (__m512 __A, int __R) { - return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + __m512 __W; + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) -1, __R); } -extern __inline __m128d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fnmadd_pd (__m128d __A, __m128d __B, __m128d __C, - __mmask8 __U) +_mm512_mask_rcp28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask3 (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } -extern __inline __m128d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fnmadd_pd (__mmask8 __U, __m128d __A, __m128d __B, - __m128d __C) +_mm512_maskz_rcp28_round_ps (__mmask16 __U, __m512 __A, int __R) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); } -extern __inline __m256 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fnmadd_ps (__m256 __A, __mmask8 __U, __m256 __B, - __m256 __C) +_mm_rcp28_round_sd (__m128d __A, __m128d __B, int __R) { - return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_rcp28sd_round ((__v2df) __B, + (__v2df) __A, + __R); } -extern __inline __m256 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fnmadd_ps (__m256 __A, __m256 __B, __m256 __C, - __mmask8 __U) +_mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R) { - return (__m256) __builtin_ia32_vfmaddps256_mask3 (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_rcp28ss_round ((__v4sf) __B, + (__v4sf) __A, + __R); } -extern __inline __m256 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fnmadd_ps (__mmask8 __U, __m256 __A, __m256 __B, - __m256 __C) +_mm512_rsqrt28_round_pd (__m512d __A, int __R) { - return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + __m512d __W; + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) -1, __R); } -extern __inline __m128 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fnmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +_mm512_mask_rsqrt28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) { - return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); } -extern __inline __m128 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fnmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +_mm512_maskz_rsqrt28_round_pd (__mmask8 __U, __m512d __A, int __R) { - return (__m128) __builtin_ia32_vfmaddps128_mask3 (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m128 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fnmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +_mm512_rsqrt28_round_ps (__m512 __A, int __R) { - return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + __m512 __W; + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) -1, __R); } -extern __inline __m256d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fnmsub_pd (__m256d __A, __mmask8 __U, __m256d __B, - __m256d __C) +_mm512_mask_rsqrt28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) { - return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } -extern __inline __m256d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fnmsub_pd (__m256d __A, __m256d __B, __m256d __C, - __mmask8 __U) +_mm512_maskz_rsqrt28_round_ps (__mmask16 __U, __m512 __A, int __R) { - return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); } -extern __inline __m256d +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fnmsub_pd (__mmask8 __U, __m256d __A, __m256d __B, - __m256d __C) +_mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R) { - return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_rsqrt28sd_round ((__v2df) __B, + (__v2df) __A, + __R); } -extern __inline __m128d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fnmsub_pd (__m128d __A, __mmask8 __U, __m128d __B, - __m128d __C) +_mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R) { - return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_rsqrt28ss_round ((__v4sf) __B, + (__v4sf) __A, + __R); } -extern __inline __m128d +#define _mm512_exp2a23_pd(A) _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_exp2a23_pd(W,U,A) _mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_maskz_exp2a23_pd(U,A) _mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_exp2a23_ps(A) _mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_exp2a23_ps(W,U,A) _mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_maskz_exp2a23_ps(U,A) _mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_rcp28_pd(A) _mm512_rcp28_round_pd(A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_rcp28_pd(W,U,A) _mm512_mask_rcp28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_maskz_rcp28_pd(U,A) _mm512_maskz_rcp28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_rcp28_ps(A) _mm512_rcp28_round_ps(A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_rcp28_ps(W,U,A) _mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_maskz_rcp28_ps(U,A) _mm512_maskz_rcp28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_rsqrt28_pd(A) _mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_rsqrt28_pd(W,U,A) _mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_maskz_rsqrt28_pd(U,A) _mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_rsqrt28_ps(A) _mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_rsqrt28_ps(W,U,A) _mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_maskz_rsqrt28_ps(U,A) _mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm_rcp28_sd(A,B) __builtin_ia32_rcp28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) +#define _mm_rcp28_ss(A,B) __builtin_ia32_rcp28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) +#define _mm_rsqrt28_sd(A,B) __builtin_ia32_rsqrt28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) +#define _mm_rsqrt28_ss(A,B) __builtin_ia32_rsqrt28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) +#undef __DISABLE_AVX512ER__ +#pragma GCC pop_options +#define _AVX512PFINTRIN_H_INCLUDED +#pragma GCC push_options +#pragma GCC target("avx512pf") +#define __DISABLE_AVX512PF__ +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fnmsub_pd (__m128d __A, __m128d __B, __m128d __C, - __mmask8 __U) +_mm512_prefetch_i32gather_pd (__m256i __index, void const *__addr, + int __scale, int __hint) { - return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + __builtin_ia32_gatherpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr, + __scale, __hint); } -extern __inline __m128d +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fnmsub_pd (__mmask8 __U, __m128d __A, __m128d __B, - __m128d __C) +_mm512_prefetch_i32gather_ps (__m512i __index, void const *__addr, + int __scale, int __hint) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + __builtin_ia32_gatherpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr, + __scale, __hint); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_fnmsub_ps (__m256 __A, __mmask8 __U, __m256 __B, - __m256 __C) +_mm512_mask_prefetch_i32gather_pd (__m256i __index, __mmask8 __mask, + void const *__addr, int __scale, int __hint) { - return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + __builtin_ia32_gatherpfdpd (__mask, (__v8si) __index, __addr, __scale, + __hint); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask3_fnmsub_ps (__m256 __A, __m256 __B, __m256 __C, - __mmask8 __U) +_mm512_mask_prefetch_i32gather_ps (__m512i __index, __mmask16 __mask, + void const *__addr, int __scale, int __hint) { - return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + __builtin_ia32_gatherpfdps (__mask, (__v16si) __index, __addr, __scale, + __hint); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_fnmsub_ps (__mmask8 __U, __m256 __A, __m256 __B, - __m256 __C) +_mm512_prefetch_i64gather_pd (__m512i __index, void const *__addr, + int __scale, int __hint) { - return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + __builtin_ia32_gatherpfqpd ((__mmask8) 0xFF, (__v8di) __index, __addr, + __scale, __hint); } -extern __inline __m128 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fnmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +_mm512_prefetch_i64gather_ps (__m512i __index, void const *__addr, + int __scale, int __hint) { - return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + __builtin_ia32_gatherpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr, + __scale, __hint); } -extern __inline __m128 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask3_fnmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +_mm512_mask_prefetch_i64gather_pd (__m512i __index, __mmask8 __mask, + void const *__addr, int __scale, int __hint) { - return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + __builtin_ia32_gatherpfqpd (__mask, (__v8di) __index, __addr, __scale, + __hint); } -extern __inline __m128 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fnmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +_mm512_mask_prefetch_i64gather_ps (__m512i __index, __mmask8 __mask, + void const *__addr, int __scale, int __hint) { - return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + __builtin_ia32_gatherpfqps (__mask, (__v8di) __index, __addr, __scale, + __hint); } -extern __inline __m128i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm512_prefetch_i32scatter_pd (void *__addr, __m256i __index, int __scale, + int __hint) { - return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + __builtin_ia32_scatterpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr, + __scale, __hint); } -extern __inline __m128i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm512_prefetch_i32scatter_ps (void *__addr, __m512i __index, int __scale, + int __hint) { - return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + __builtin_ia32_scatterpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr, + __scale, __hint); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm512_mask_prefetch_i32scatter_pd (void *__addr, __mmask8 __mask, + __m256i __index, int __scale, int __hint) { - return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + __builtin_ia32_scatterpfdpd (__mask, (__v8si) __index, __addr, __scale, + __hint); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm512_mask_prefetch_i32scatter_ps (void *__addr, __mmask16 __mask, + __m512i __index, int __scale, int __hint) { - return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + __builtin_ia32_scatterpfdps (__mask, (__v16si) __index, __addr, __scale, + __hint); } -extern __inline __m128i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm512_prefetch_i64scatter_pd (void *__addr, __m512i __index, int __scale, + int __hint) { - return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + __builtin_ia32_scatterpfqpd ((__mmask8) 0xFF, (__v8di) __index,__addr, + __scale, __hint); } -extern __inline __m128i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm512_prefetch_i64scatter_ps (void *__addr, __m512i __index, int __scale, + int __hint) { - return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + __builtin_ia32_scatterpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr, + __scale, __hint); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm512_mask_prefetch_i64scatter_pd (void *__addr, __mmask16 __mask, + __m512i __index, int __scale, int __hint) { - return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + __builtin_ia32_scatterpfqpd (__mask, (__v8di) __index, __addr, __scale, + __hint); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm512_mask_prefetch_i64scatter_ps (void *__addr, __mmask16 __mask, + __m512i __index, int __scale, int __hint) { - return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + __builtin_ia32_scatterpfqps (__mask, (__v8di) __index, __addr, __scale, + __hint); } -extern __inline __m128i +#undef __DISABLE_AVX512PF__ +#pragma GCC pop_options +#define _AVX512CDINTRIN_H_INCLUDED +#pragma GCC push_options +#pragma GCC target("avx512cd") +#define __DISABLE_AVX512CD__ +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +_mm512_conflict_epi32 (__m512i __A) { - return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m512i) + __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) -1); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { - return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) { - return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__m512i) + __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm512_conflict_epi64 (__m512i __A) { - return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); -} -extern __inline __m128i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) -{ - return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); -} -extern __inline __m128i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512i) + __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) -1); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) +_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { - return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) +_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A) { - return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m512i) + __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) +_mm512_lzcnt_epi64 (__m512i __A) { - return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m512i) + __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) -1); } -extern __inline __m128 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) +_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { - return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) +_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) { - return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m512i) + __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) +_mm512_lzcnt_epi32 (__m512i __A) { - return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m512i) + __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) -1); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) +_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { - return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); } -extern __inline __m128i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) +_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A) { - return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m512i) + __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cvtps_epu32 (__m256 __A) +_mm512_broadcastmb_epi64 (__mmask8 __A) { - return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_broadcastmb512 (__A); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) +_mm512_broadcastmw_epi32 (__mmask16 __A) { - return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m512i) __builtin_ia32_broadcastmw512 (__A); } -extern __inline __m256i +#undef __DISABLE_AVX512CD__ +#pragma GCC pop_options +#define _AVX512VLINTRIN_H_INCLUDED +#pragma GCC push_options +#pragma GCC target("avx512vl") +#define __DISABLE_AVX512VL__ +typedef unsigned int __mmask32; +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) +_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtps_epu32 (__m128 __A) +_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A) { - return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) +_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) +_mm_maskz_mov_pd (__mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); } extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A) +_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P) { - return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A, + return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P, (__v4df) __W, (__mmask8) __U); } extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) +_mm256_maskz_load_pd (__mmask8 __U, void const *__P) { - return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A, + return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P, (__v4df) _mm256_setzero_pd (), (__mmask8) __U); } extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A) +_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P) { - return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A, + return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P, (__v2df) __W, (__mmask8) __U); } extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A) +_mm_maskz_load_pd (__mmask8 __U, void const *__P) { - return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A, + return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P, (__v2df) _mm_setzero_pd (), (__mmask8) __U); } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A) +{ + __builtin_ia32_storeapd256_mask ((__v4df *) __P, + (__v4df) __A, + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_storeapd128_mask ((__v2df *) __P, + (__v2df) __A, + (__mmask8) __U); +} extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) +_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) +_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) +_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) +_mm_maskz_mov_ps (__mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) +_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P) { - return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P, + (__v8sf) __W, + (__mmask8) __U); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) +_mm256_maskz_load_ps (__mmask8 __U, void const *__P) { - return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) +_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P) { - return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P, + (__v4sf) __W, + (__mmask8) __U); } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) +_mm_maskz_load_ps (__mmask8 __U, void const *__P) { - return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_unpackhi_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A) { - return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + __builtin_ia32_storeaps256_mask ((__v8sf *) __P, + (__v8sf) __A, + (__mmask8) __U); } -extern __inline __m128i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_unpackhi_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + __builtin_ia32_storeaps128_mask ((__v4sf *) __P, + (__v4sf) __A, + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_unpackhi_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, + return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A, + (__v4di) __W, (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_unpackhi_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) + return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A, + (__v4di) _mm256_setzero_si256 (), (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_unpackhi_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_unpackhi_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_unpackhi_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P) { - return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P, + (__v4di) __W, + (__mmask8) + __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_unpackhi_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P) { - return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_unpacklo_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P) { - return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, + (__v2di) __W, + (__mmask8) + __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_unpacklo_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_load_epi64 (__mmask8 __U, void const *__P) { - return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_movdqa64store256_mask ((__v4di *) __P, + (__v4di) __A, + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_movdqa64store128_mask ((__v2di *) __P, + (__v2di) __A, + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_unpacklo_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A, - (__v8si) __B, + return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A, (__v8si) __W, (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_unpacklo_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A, - (__v8si) __B, + return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A, (__v8si) _mm256_setzero_si256 (), (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_unpacklo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_unpacklo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_unpacklo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P) { - return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P, + (__v8si) __W, + (__mmask8) + __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_unpacklo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P) { - return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpeq_epu32_mask (__m128i __A, __m128i __B) +_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, - (__v4si) __B, 0, - (__mmask8) -1); + return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, + (__v4si) __W, + (__mmask8) + __U); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpeq_epi32_mask (__m128i __A, __m128i __B) +_mm_maskz_load_epi32 (__mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpeq_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, - (__v4si) __B, 0, __U); + __builtin_ia32_movdqa32store256_mask ((__v8si *) __P, + (__v8si) __A, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpeq_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) { - return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A, - (__v4si) __B, __U); + __builtin_ia32_movdqa32store128_mask ((__v4si *) __P, + (__v4si) __A, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpeq_epu32_mask (__m256i __A, __m256i __B) +_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, - (__v8si) __B, 0, - (__mmask8) -1); -} -extern __inline __mmask8 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpeq_epi32_mask (__m256i __A, __m256i __B) -{ - return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpeq_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, - (__v8si) __B, 0, __U); + return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpeq_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { - return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A, - (__v8si) __B, __U); + return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpeq_epu64_mask (__m128i __A, __m128i __B) +_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, - (__v2di) __B, 0, - (__mmask8) -1); + return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpeq_epi64_mask (__m128i __A, __m128i __B) +_mm_mask_add_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A, - (__v2di) __B, - (__mmask8) -1); + return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpeq_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_add_ps (__mmask8 __U, __m128 __A, __m128 __B) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, - (__v2di) __B, 0, __U); + return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpeq_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_mask_add_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A, - (__v2di) __B, __U); + return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpeq_epu64_mask (__m256i __A, __m256i __B) +_mm256_maskz_add_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, - (__v4di) __B, 0, - (__mmask8) -1); + return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpeq_epi64_mask (__m256i __A, __m256i __B) +_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A, - (__v4di) __B, - (__mmask8) -1); + return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpeq_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, - (__v4di) __B, 0, __U); + return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpeq_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { - return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A, - (__v4di) __B, __U); + return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpgt_epu32_mask (__m128i __A, __m128i __B) +_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, - (__v4si) __B, 6, - (__mmask8) -1); + return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpgt_epi32_mask (__m128i __A, __m128i __B) +_mm_mask_sub_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpgt_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_sub_ps (__mmask8 __U, __m128 __A, __m128 __B) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, - (__v4si) __B, 6, __U); + return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpgt_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_mask_sub_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A, - (__v4si) __B, __U); + return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpgt_epu32_mask (__m256i __A, __m256i __B) +_mm256_maskz_sub_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, - (__v8si) __B, 6, - (__mmask8) -1); + return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpgt_epi32_mask (__m256i __A, __m256i __B) +_mm256_store_epi64 (void *__P, __m256i __A) { - return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + *(__m256i *) __P = __A; } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpgt_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm_store_epi64 (void *__P, __m128i __A) { - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, - (__v8si) __B, 6, __U); + *(__m128i *) __P = __A; } -extern __inline __mmask8 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpgt_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A, - (__v8si) __B, __U); + return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P, + (__v4df) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpgt_epu64_mask (__m128i __A, __m128i __B) +_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, - (__v2di) __B, 6, - (__mmask8) -1); + return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpgt_epi64_mask (__m128i __A, __m128i __B) +_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A, - (__v2di) __B, - (__mmask8) -1); + return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P, + (__v2df) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpgt_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_loadu_pd (__mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, - (__v2di) __B, 6, __U); + return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpgt_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A) { - return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A, - (__v2di) __B, __U); + __builtin_ia32_storeupd256_mask ((double *) __P, + (__v4df) __A, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpgt_epu64_mask (__m256i __A, __m256i __B) +_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A) { - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, - (__v4di) __B, 6, - (__mmask8) -1); + __builtin_ia32_storeupd128_mask ((double *) __P, + (__v2df) __A, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpgt_epi64_mask (__m256i __A, __m256i __B) +_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A, - (__v4di) __B, - (__mmask8) -1); + return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P, + (__v8sf) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpgt_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, - (__v4di) __B, 6, __U); + return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpgt_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A, - (__v4di) __B, __U); + return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P, + (__v4sf) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_test_epi32_mask (__m128i __A, __m128i __B) +_mm_maskz_loadu_ps (__mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A) { - return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, - (__v4si) __B, __U); + __builtin_ia32_storeups256_mask ((float *) __P, + (__v8sf) __A, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_test_epi32_mask (__m256i __A, __m256i __B) +_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A) { - return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + __builtin_ia32_storeups128_mask ((float *) __P, + (__v4sf) __A, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, - (__v8si) __B, __U); + return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P, + (__v4di) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_test_epi64_mask (__m128i __A, __m128i __B) +_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A, - (__v2di) __B, - (__mmask8) -1); + return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A, - (__v2di) __B, __U); + return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P, + (__v2di) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_test_epi64_mask (__m256i __A, __m256i __B) +_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, - (__v4di) __B, - (__mmask8) -1); + return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A) { - return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, - (__v4di) __B, __U); + __builtin_ia32_storedqudi256_mask ((long long *) __P, + (__v4di) __A, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_testn_epi32_mask (__m128i __A, __m128i __B) +_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) { - return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + __builtin_ia32_storedqudi128_mask ((long long *) __P, + (__v2di) __A, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, - (__v4si) __B, __U); + return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P, + (__v8si) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_testn_epi32_mask (__m256i __A, __m256i __B) +_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, - (__v8si) __B, __U); + return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P, + (__v4si) __W, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_testn_epi64_mask (__m128i __A, __m128i __B) +_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, - (__v2di) __B, - (__mmask8) -1); + return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A) { - return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, - (__v2di) __B, __U); + __builtin_ia32_storedqusi256_mask ((int *) __P, + (__v8si) __A, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_testn_epi64_mask (__m256i __A, __m256i __B) +_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A) { - return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, - (__v4di) __B, - (__mmask8) -1); + __builtin_ia32_storedqusi128_mask ((int *) __P, + (__v4si) __A, + (__mmask8) __U); } -extern __inline __mmask8 +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, - (__v4di) __B, __U); + return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); } -extern __inline __m256d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) +_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A) { - return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); } -extern __inline __m256d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) +_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); } -extern __inline void +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) +_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A) { - __builtin_ia32_compressstoredf256_mask ((__v4df *) __P, - (__v4df) __A, + return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), (__mmask8) __U); } -extern __inline __m128d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) +_mm256_abs_epi64 (__m256i __A) { - return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); } -extern __inline __m128d +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) +_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); } -extern __inline void +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) +_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) { - __builtin_ia32_compressstoredf128_mask ((__v2df *) __P, - (__v2df) __A, + return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), (__mmask8) __U); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) +_mm_abs_epi64 (__m128i __A) { - return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) +_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); -} -extern __inline void + return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) +_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) { - __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P, - (__v8sf) __A, + return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), (__mmask8) __U); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) +_mm256_cvtpd_epu32 (__m256d __A) { - return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, - (__v4sf) __W, + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) __W, (__mmask8) __U); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) +_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) { - return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), (__mmask8) __U); } -extern __inline void +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) +_mm_cvtpd_epu32 (__m128d __A) { - __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P, - (__v4sf) __A, - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) +_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); } -extern __inline void +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) +_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) { - __builtin_ia32_compressstoredi256_mask ((__v4di *) __P, - (__v4di) __A, - (__mmask8) __U); + return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) +_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline void +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) +_mm256_cvttps_epu32 (__m256 __A) { - __builtin_ia32_compressstoredi128_mask ((__v2di *) __P, - (__v2di) __A, - (__mmask8) __U); + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, (__v8si) __W, (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) +_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, (__v8si) _mm256_setzero_si256 (), (__mmask8) __U); } -extern __inline void +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) +_mm_cvttps_epu32 (__m128 __A) { - __builtin_ia32_compressstoresi256_mask ((__v8si *) __P, - (__v8si) __A, - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, (__v4si) __W, (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) +_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, (__v4si) _mm_setzero_si128 (), (__mmask8) __U); } -extern __inline void +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) +_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) { - __builtin_ia32_compressstoresi128_mask ((__v4si *) __P, - (__v4si) __A, - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); } -extern __inline __m256d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) +_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m256d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) +_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m256d) __builtin_ia32_expanddf256_maskz ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), + return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, + (__v4si) __W, (__mmask8) __U); } -extern __inline __m256d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) +_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) { - return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P, - (__v4df) __W, - (__mmask8) - __U); + return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m256d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +_mm256_cvttpd_epu32 (__m256d __A) { - return (__m256d) __builtin_ia32_expandloaddf256_maskz ((__v4df *) __P, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) - __U); + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) +_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) { - return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) +_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) { - return (__m128d) __builtin_ia32_expanddf128_maskz ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) +_mm_cvttpd_epu32 (__m128d __A) { - return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P, - (__v2df) __W, - (__mmask8) - __U); + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_expandloaddf128_maskz ((__v2df *) __P, - (__v2df) - _mm_setzero_pd (), - (__mmask8) - __U); + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) +_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) { - return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) +_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) { - return (__m256) __builtin_ia32_expandsf256_maskz ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), + return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A, + (__v4si) __W, (__mmask8) __U); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) +_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) { - return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P, - (__v8sf) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) +_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m256) __builtin_ia32_expandloadsf256_maskz ((__v8sf *) __P, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) - __U); + return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) +_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) { - return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m128 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) +_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) { - return (__m128) __builtin_ia32_expandsf128_maskz ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), + return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A, + (__v4df) __W, (__mmask8) __U); } -extern __inline __m128 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) +_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) { - return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P, - (__v4sf) __W, - (__mmask8) __U); + return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } -extern __inline __m128 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) +_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128) __builtin_ia32_expandloadsf128_maskz ((__v4sf *) __P, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) - __U); + return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A, + (__v2df) __W, + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A, - (__v4di) __W, + return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A, + (__v2df) + _mm_setzero_pd (), (__mmask8) __U); } -extern __inline __m256i +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) +_mm256_cvtepu32_pd (__m128i __A) { - return (__m256i) __builtin_ia32_expanddi256_maskz ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), + return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) +{ + return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, + (__v4df) __W, (__mmask8) __U); } -extern __inline __m256i +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U, - void const *__P) +_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P, - (__v4di) __W, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +_mm_cvtepu32_pd (__m128i __A) { - return (__m256i) __builtin_ia32_expandloaddi256_maskz ((__v4di *) __P, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) - __U); + return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, + (__v2df) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_expanddi128_maskz ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), + return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, + (__v2df) + _mm_setzero_pd (), (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) +_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) { - return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P, - (__v2di) __W, - (__mmask8) - __U); + return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, + (__v8sf) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) { - return (__m128i) __builtin_ia32_expandloaddi128_maskz ((__v2di *) __P, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) - __U); + return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, + (__v4sf) __W, + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) +_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_expandsi256_maskz ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U, - void const *__P) +_mm256_cvtepu32_ps (__m256i __A) { - return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P, - (__v8si) __W, - (__mmask8) - __U); + return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) +_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_expandloadsi256_maskz ((__v8si *) __P, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) - __U); + return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, + (__v8sf) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) { - return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A, - (__v4si) __W, + return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, + (__v8sf) + _mm256_setzero_ps (), (__mmask8) __U); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) +_mm_cvtepu32_ps (__m128i __A) { - return (__m128i) __builtin_ia32_expandsi128_maskz ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) +_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P, - (__v4si) __W, - (__mmask8) - __U); + return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, + (__v4sf) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) +_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_expandloadsi128_maskz ((__v4si *) __P, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) - __U); + return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B) +_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) { - return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I - , - (__v4df) __A, - (__v4df) __B, - (__mmask8) -1); + return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A, + (__v4df) __W, + (__mmask8) __U); } extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I, - __m256d __B) +_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) { - return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I - , - (__v4df) __A, - (__v4df) __B, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } -extern __inline __m256d +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U, - __m256d __B) +_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) { - return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A, - (__v4di) __I - , - (__v4df) __B, - (__mmask8) - __U); + return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A, + (__v2df) __W, + (__mmask8) __U); } -extern __inline __m256d +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I, - __m256d __B) +_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) { - return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I - , - (__v4df) __A, - (__v4df) __B, - (__mmask8) - __U); + return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B) +_mm_cvtepi32_epi8 (__m128i __A) { - return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I - , - (__v8sf) __A, - (__v8sf) __B, - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I, - __m256 __B) +_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { - return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I - , - (__v8sf) __A, - (__v8sf) __B, - (__mmask8) __U); + __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U, - __m256 __B) +_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A, - (__v8si) __I - , - (__v8sf) __B, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, + (__v16qi) __O, __M); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I, - __m256 __B) +_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A) { - return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I - , - (__v8sf) __A, - (__v8sf) __B, - (__mmask8) - __U); + return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B) +_mm256_cvtepi32_epi8 (__m256i __A) { - return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I - , - (__v2di) __A, - (__v2di) __B, - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) +_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I - , - (__v2di) __A, - (__v2di) __B, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, + (__v16qi) __O, __M); } -extern __inline __m128i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) +_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A, - (__v2di) __I - , - (__v2di) __B, - (__mmask8) __U); + __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) +_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I - , - (__v2di) __A, - (__v2di) __B, - (__mmask8) - __U); + return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B) +_mm_cvtsepi32_epi8 (__m128i __A) { - return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I - , - (__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m128i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) +_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I - , - (__v4si) __A, - (__v4si) __B, - (__mmask8) __U); + __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) +_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A, - (__v4si) __I - , - (__v4si) __B, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, + (__v16qi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) +_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I - , - (__v4si) __A, - (__v4si) __B, - (__mmask8) - __U); + return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B) +_mm256_cvtsepi32_epi8 (__m256i __A) { - return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I - , - (__v4di) __A, - (__v4di) __B, - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I, - __m256i __B) +_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I - , - (__v4di) __A, - (__v4di) __B, - (__mmask8) __U); + __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I, - __mmask8 __U, __m256i __B) +_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A, - (__v4di) __I - , - (__v4di) __B, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, + (__v16qi) __O, __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A, - __m256i __I, __m256i __B) +_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I - , - (__v4di) __A, - (__v4di) __B, - (__mmask8) - __U); + return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B) +_mm_cvtusepi32_epi8 (__m128i __A) { - return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I - , - (__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I, - __m256i __B) +_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I - , - (__v8si) __A, - (__v8si) __B, - (__mmask8) __U); + __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I, - __mmask8 __U, __m256i __B) +_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A, - (__v8si) __I - , - (__v8si) __B, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, + (__v16qi) __O, + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A, - __m256i __I, __m256i __B) +_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I - , - (__v8si) __A, - (__v8si) __B, - (__mmask8) - __U); + return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B) +_mm256_cvtusepi32_epi8 (__m256i __A) { - return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I - , - (__v2df) __A, - (__v2df) __B, - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m128d +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I, - __m128d __B) +_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { - return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I - , - (__v2df) __A, - (__v2df) __B, - (__mmask8) - __U); + __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U, - __m128d __B) +_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A, - (__v2di) __I - , - (__v2df) __B, - (__mmask8) - __U); + return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, + (__v16qi) __O, + __M); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I, - __m128d __B) +_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A) { - return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I - , - (__v2df) __A, - (__v2df) __B, - (__mmask8) - __U); + return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B) +_mm_cvtepi32_epi16 (__m128i __A) { - return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I - , - (__v4sf) __A, - (__v4sf) __B, - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); } -extern __inline __m128 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I, - __m128 __B) +_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { - return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I - , - (__v4sf) __A, - (__v4sf) __B, - (__mmask8) __U); + __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U, - __m128 __B) +_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A, - (__v4si) __I - , - (__v4sf) __B, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, + (__v8hi) __O, __M); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I, - __m128 __B) +_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A) { - return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I - , - (__v4sf) __A, - (__v4sf) __B, - (__mmask8) - __U); + return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_srav_epi64 (__m128i __X, __m128i __Y) +_mm256_cvtepi32_epi16 (__m256i __A) { - return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) + return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, + (__v8hi) _mm_setzero_si128 (), (__mmask8) -1); } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); +} extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_srav_epi64 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, + (__v8hi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_srav_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) + return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, + (__v8hi) _mm_setzero_si128 (), - (__mmask8) __U); + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sllv_epi32 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm_cvtsepi32_epi16 (__m128i __A) { - return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sllv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sllv_epi32 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, + (__v8hi)__O, + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sllv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sllv_epi64 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm256_cvtsepi32_epi16 (__m256i __A) { - return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sllv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sllv_epi64 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, + (__v8hi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sllv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_srav_epi32 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm_cvtusepi32_epi16 (__m128i __A) { - return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_srav_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_srav_epi32 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, + (__v8hi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_srav_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_srlv_epi32 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm256_cvtusepi32_epi16 (__m256i __A) { - return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_srlv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_srlv_epi32 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, + (__v8hi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_srlv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_srlv_epi64 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm_cvtepi64_epi8 (__m128i __A) { - return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_srlv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_srlv_epi64 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, + (__v16qi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_srlv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_rolv_epi32 (__m256i __A, __m256i __B) +_mm256_cvtepi64_epi8 (__m256i __A) { - return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), + return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, + (__v16qi) + _mm_undefined_si128 (), (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, + (__v16qi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rolv_epi32 (__m128i __A, __m128i __B) +_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) + return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, + (__v16qi) _mm_setzero_si128 (), - (__mmask8) -1); + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_cvtsepi64_epi8 (__m128i __A) { - return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, + (__v16qi) __O, __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_rorv_epi32 (__m256i __A, __m256i __B) +_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_cvtsepi64_epi8 (__m256i __A) { - return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rorv_epi32 (__m128i __A, __m128i __B) +_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, + (__v16qi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_cvtusepi64_epi8 (__m128i __A) { - return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_rolv_epi64 (__m256i __A, __m256i __B) +_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi) __O, + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rolv_epi64 (__m128i __A, __m128i __B) +_mm256_cvtusepi64_epi8 (__m256i __A) { - return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, + (__v16qi) __O, + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_rorv_epi64 (__m256i __A, __m256i __B) +_mm_cvtepi64_epi16 (__m128i __A) { - return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), + return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, + (__v8hi) + _mm_undefined_si128 (), (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, + (__v8hi)__O, + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rorv_epi64 (__m128i __A, __m128i __B) +_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) + return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, + (__v8hi) _mm_setzero_si128 (), - (__mmask8) -1); + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm256_cvtepi64_epi16 (__m256i __A) { - return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m128i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_srav_epi64 (__m256i __X, __m256i __Y) +_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_srav_epi64 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +_mm_cvtsepi64_epi16 (__m128i __A) { - return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, __U); + __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_pd (), - __U); + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, __U); + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_cvtsepi64_epi16 (__m256i __A) { - return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_pd (), - __U); + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, __U); + __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_pd (), - __U); + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, __U); + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_pd (), - __U); -} -extern __inline __m256i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm_cvtusepi64_epi16 (__m128i __A) { - return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_cvtusepi64_epi16 (__m256i __A) { - return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) { - return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); -} -extern __inline __m256d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) -{ - return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); -} -extern __inline __m256d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B) +_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A) { - return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +_mm_cvtepi64_epi32 (__m128i __A) { - return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B) +_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) { - return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, + (__v4si) __O, __M); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B) +_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A) { - return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +_mm256_cvtepi64_epi32 (__m256i __A) { - return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m128d +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B) +_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) { - return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); } -extern __inline __m256d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) +_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, + (__v4si) __O, __M); } -extern __inline __m256d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) +_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A) { - return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); } -extern __inline __m256d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B) +_mm_cvtsepi64_epi32 (__m128i __A) { - return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) { - return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); + __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); } -extern __inline __m256d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B) +_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si) __O, __M); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A) { - return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); } -extern __inline __m256 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B) +_mm256_cvtsepi64_epi32 (__m256i __A) { - return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m256 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B) +_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) { - return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si)__O, + __M); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A) { - return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B) +_mm_cvtusepi64_epi32 (__m128i __A) { - return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m128 +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B) +_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) { - return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si) __O, __M); } -extern __inline __m128 +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B) +_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A) { - return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +_mm256_cvtusepi64_epi32 (__m256i __A) { - return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); } -extern __inline __m128d +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B) +_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) { - return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si) __O, __M); } -extern __inline __m128d +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B) +_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A) { - return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); } -extern __inline __m128d +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A) { - return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); + return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A, + (__v8sf) __O, + __M); } -extern __inline __m128d +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B) +_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) { - return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A, + (__v8sf) + _mm256_setzero_ps (), + __M); } -extern __inline __m256 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A, + (__v4sf) __O, + __M); } -extern __inline __m256 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B) +_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + __M); } extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) +_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) { - return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); + return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A, + (__v4df) __O, + __M); } extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B) +_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) { - return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A, + (__v4df) + _mm256_setzero_pd (), + __M); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) +_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A, + (__v8si) __O, + __M); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) +_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, __M); + return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_min_epi64 (__m256i __A, __m256i __B) +_mm256_mask_set1_epi32 (__m256i __O, __mmask8 __M, int __A) { - return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A, (__v8si) __O, + __M); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) +_mm256_maskz_set1_epi32 (__mmask8 __M, int __A) { - return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, __M); + return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) +_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A, + (__v4si) __O, + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) +_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_max_epi64 (__m256i __A, __m256i __B) +_mm_mask_set1_epi32 (__m128i __O, __mmask8 __M, int __A) { - return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_pbroadcastd128_gpr_mask (__A, (__v4si) __O, + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_max_epu64 (__m256i __A, __m256i __B) +_mm_maskz_set1_epi32 (__mmask8 __M, int __A) { - return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m128i) + __builtin_ia32_pbroadcastd128_gpr_mask (__A, + (__v4si) _mm_setzero_si128 (), + __M); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) +_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, __M); + return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A, + (__v4di) __O, + __M); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_min_epu64 (__m256i __A, __m256i __B) +_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A, + (__v4di) + _mm256_setzero_si256 (), + __M); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) +_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) { - return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, __M); + return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O, + __M); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) +_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) { - return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, + (__v4di) + _mm256_setzero_si256 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A, + (__v2di) __O, + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) +_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, __M); + return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { - return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O, + __M); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) +_mm_maskz_set1_epi64 (__mmask8 __M, long long __A) { - return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, __M); + return (__m128i) + __builtin_ia32_pbroadcastq128_gpr_mask (__A, + (__v2di) _mm_setzero_si128 (), + __M); } -extern __inline __m256i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B) +_mm256_broadcast_f32x4 (__m128 __A) { - return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, + (__v8sf)_mm256_undefined_pd (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) +_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A) { - return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, __M); + return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, + (__v8sf) __O, + __M); } -extern __inline __m256i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B) +_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) { - return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, + (__v8sf) + _mm256_setzero_ps (), + __M); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) +_mm256_broadcast_i32x4 (__m128i __A) { - return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, __M); + return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) + __A, + (__v8si)_mm256_undefined_si256 (), + (__mmask8) -1); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) +_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) + __A, + (__v8si) + __O, __M); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, __M); + return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) + __A, + (__v8si) + _mm256_setzero_si256 (), + __M); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_epi64 (__m128i __A, __m128i __B) +_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A, + (__v8si) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, __M); + return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) +_mm_mask_cvtepi8_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A, + (__v4si) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) +_mm_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_epi64 (__m128i __A, __m128i __B) +_mm256_mask_cvtepi8_epi64 (__m256i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A, + (__v4di) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_epu64 (__m128i __A, __m128i __B) +_mm256_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm_mask_cvtepi8_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, __M); + return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A, + (__v2di) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_epu64 (__m128i __A, __m128i __B) +_mm_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm256_mask_cvtepi16_epi32 (__m256i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, __M); + return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A, + (__v8si) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) +_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +_mm_mask_cvtepi16_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A, + (__v4si) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, __M); + return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +_mm256_mask_cvtepi16_epi64 (__m256i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A, + (__v4di) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm256_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, __M); + return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B) +_mm_mask_cvtepi16_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A, + (__v2di) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, __M); + return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B) +_mm256_mask_cvtepi32_epi64 (__m256i __W, __mmask8 __U, __m128i __X) { - return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X, + (__v4di) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm256_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X) { - return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, __M); + return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } -#pragma GCC push_options -#pragma GCC target("avx512vl,avx512cd") -#define __DISABLE_AVX512VLCD__ extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_broadcastmb_epi64 (__mmask8 __A) -{ - return (__m128i) __builtin_ia32_broadcastmb128 (__A); -} -extern __inline __m256i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_broadcastmb_epi64 (__mmask8 __A) +_mm_mask_cvtepi32_epi64 (__m128i __W, __mmask8 __U, __m128i __X) { - return (__m256i) __builtin_ia32_broadcastmb256 (__A); + return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X, + (__v2di) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_broadcastmw_epi32 (__mmask16 __A) +_mm_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X) { - return (__m128i) __builtin_ia32_broadcastmw128 (__A); + return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_broadcastmw_epi32 (__mmask16 __A) +_mm256_mask_cvtepu8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_broadcastmw256 (__A); + return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A, + (__v8si) __W, + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_lzcnt_epi32 (__m256i __A) +_mm256_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +_mm_mask_cvtepu8_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A, + (__v4si) __W, + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A) +_mm_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_lzcnt_epi64 (__m256i __A) +_mm256_mask_cvtepu8_epi64 (__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A, + (__v4di) __W, + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) +_mm_mask_cvtepu8_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A, + (__v2di) __W, + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_conflict_epi64 (__m256i __A) +_mm_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +_mm256_mask_cvtepu16_epi32 (__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, - (__v4di) __W, - (__mmask8) - __U); + return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A, + (__v8si) __W, + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) +_mm256_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) - __U); + return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_conflict_epi32 (__m256i __A) +_mm_mask_cvtepu16_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +_mm256_mask_cvtepu16_epi64 (__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, - (__v8si) __W, - (__mmask8) - __U); + return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A, + (__v4di) __W, + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) +_mm256_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) - __U); + return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_lzcnt_epi32 (__m128i __A) +_mm_mask_cvtepu16_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A, + (__v2di) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A) +_mm256_mask_cvtepu32_epi64 (__m256i __W, __mmask8 __U, __m128i __X) { - return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X, + (__v4di) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_lzcnt_epi64 (__m128i __A) +_mm256_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X) { - return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvtepu32_epi64 (__m128i __W, __mmask8 __U, __m128i __X) { - return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X, + (__v2di) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X) { - return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_conflict_epi64 (__m128i __A) +_mm256_rcp14_pd (__m256d __A) { - return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); } -extern __inline __m128i +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A) { - return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) +_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A) { - return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) - __U); + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_conflict_epi32 (__m128i __A) +_mm_rcp14_pd (__m128d __A) { - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, - (__v4si) __W, - (__mmask8) - __U); + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) +_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) - __U); + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); } -#pragma GCC pop_options -extern __inline __m256d +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) +_mm256_rcp14_ps (__m256 __A) { - return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) __W, (__mmask8) __U); } -extern __inline __m256d +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B) +_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) { - return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), (__mmask8) __U); } -extern __inline __m128d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) +_mm_rcp14_ps (__m128 __A) { - return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + (__v4sf) __W, (__mmask8) __U); } -extern __inline __m128d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B) +_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A) { - return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), (__mmask8) __U); } -extern __inline __m256 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) +_mm256_rsqrt14_pd (__m256d __A) { - return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); } extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) +_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); } extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B) +_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) +_mm_rsqrt14_pd (__m128d __A) { - return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); } extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B) +_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) +_mm256_rsqrt14_ps (__m256 __A) { - return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B) +_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); } -extern __inline __m128 +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A) { - return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B) +_mm_rsqrt14_ps (__m128 __A) { - return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A) +_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A) +_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } -extern __inline __m256 +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B) +_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A) +_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) +_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); } extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B) +_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) +_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { - return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, - (__v4si) __B, + return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, + (__v8si) __B, (__v8si) __W, (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, - (__v4si) __B, + return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, + (__v8si) __B, (__v8si) _mm256_setzero_si256 (), (__mmask8) __U); } -extern __inline __m128i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) -{ - return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); -} -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { - return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), + return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_sra_epi64 (__m256i __A, __m128i __B) +_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, - (__v2di) __B, + return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, + (__v4di) __B, (__v4di) _mm256_setzero_si256 (), - (__mmask8) -1); + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) +_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { - return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, - (__v2di) __B, - (__v4di) __W, + return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, - (__v2di) __B, - (__v4di) + return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) _mm256_setzero_si256 (), (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sra_epi64 (__m128i __A, __m128i __B) +_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { - return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); -} -extern __inline __m128i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) -{ - return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, + return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), + return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A, +_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, + return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, (__v4si) __B, (__v4si) __W, (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, + return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, (__v4si) __B, (__v4si) _mm_setzero_si128 (), @@ -26549,2149 +27797,9912 @@ _mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B) } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A, +_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, + return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, (__v2di) __B, (__v2di) __W, (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, + return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, (__v2di) __B, (__v2di) _mm_setzero_si128 (), (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) +_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { - return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, + return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, (__v4si) __B, - (__v8si) __W, + (__v4si) __W, (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, + return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, (__v4si) __B, - (__v8si) - _mm256_setzero_si256 (), + (__v4si) + _mm_setzero_si128 (), (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) +_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { - return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, + return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, (__v2di) __B, - (__v4di) __W, + (__v2di) __W, (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, + return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, (__v2di) __B, - (__v4di) - _mm256_setzero_si256 (), + (__v2di) + _mm_setzero_si128 (), (__mmask8) __U); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X, - __m256 __Y) +_mm256_getexp_ps (__m256 __A) { - return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, - (__v8si) __X, - (__v8sf) __W, - (__mmask8) __U); + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y) +_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, - (__v8si) __X, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); } -extern __inline __m256d +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_permutexvar_pd (__m256i __X, __m256d __Y) +_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) { - return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, - (__v4di) __X, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) -1); + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); } extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X, - __m256d __Y) +_mm256_getexp_pd (__m256d __A) { - return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, - (__v4di) __X, - (__v4df) __W, - (__mmask8) __U); + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); } extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y) +_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, - (__v4di) __X, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); } extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256i __C) +_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, - (__v4di) __C, - (__v4df) __W, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } -extern __inline __m256d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C) +_mm_getexp_ps (__m128 __A) { - return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, - (__v4di) __C, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) - __U); + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); } -extern __inline __m256 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A, - __m256i __C) +_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) { - return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, - (__v8si) __C, - (__v8sf) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); } -extern __inline __m256 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C) +_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) { - return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, - (__v8si) __C, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A, - __m128i __C) +_mm_getexp_pd (__m128d __A) { - return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, - (__v2di) __C, - (__v2df) __W, - (__mmask8) __U); + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); } extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C) -{ - return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, - (__v2di) __C, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); -} -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A, - __m128i __C) +_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, - (__v4si) __C, - (__v4sf) __W, + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) __W, (__mmask8) __U); } -extern __inline __m128 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C) +_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) { - return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, - (__v4si) __C, - (__v4sf) - _mm_setzero_ps (), + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B) -{ - return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - __M); -} -extern __inline __m256i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_srl_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) { - return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, - (__v4di) __X, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) __W, + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) +_mm256_maskz_srl_epi32 (__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, __M); + return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +_mm_mask_srl_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { - return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mullo_epi32 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) +_mm_maskz_srl_epi32 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, __M); + return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) +_mm256_mask_srl_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) { - return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, - (__v8si) __Y, - (__v4di) __W, __M); + return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) __W, + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_maskz_srl_epi64 (__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, - (__v8si) __Y, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, - __m128i __Y) +_mm_mask_srl_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { - return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, - (__v4si) __Y, - (__v2di) __W, __M); + return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) +_mm_maskz_srl_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, - (__v4si) __Y, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y) +_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { - return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, - (__v4di) __X, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) +_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, - (__v4di) __X, - (__v4di) __W, - __M); + return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) +_mm256_scalef_pd (__m256d __A, __m256d __B) { - return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, - (__v8si) __Y, - (__v4di) __W, __M); + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); } -extern __inline __m256i +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { - return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, - (__v8si) __X, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, - (__v8si) __Y, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } -extern __inline __m128i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, - __m128i __Y) +_mm256_scalef_ps (__m256 __A, __m256 __B) { - return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, - (__v4si) __Y, - (__v2di) __W, __M); + return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); } -extern __inline __m128i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) +_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) { - return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, - (__v4si) __Y, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y) +_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, - (__v8si) __X, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); } -extern __inline __m256i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) +_mm_scalef_pd (__m128d __A, __m128d __B) { - return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, - (__v8si) __X, - (__v8si) __W, - __M); + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, - (__v8si) __Y, 4, - (__mmask8) __M); + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y) +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, - (__v8si) __Y, 4, - (__mmask8) -1); + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_ps (__m128 __A, __m128 __B) { - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, - (__v8si) __Y, 1, - (__mmask8) __M); + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y) +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, - (__v8si) __Y, 1, - (__mmask8) -1); + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) -{ - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmadd_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmadd_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmadd_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmadd_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmadd_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmadd_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsub_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsub_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsub_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsub_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsub_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsub_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmaddsub_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmaddsub_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmaddsub_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmaddsub_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmaddsub_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmaddsub_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmaddsub_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmaddsub_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmaddsub_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmaddsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmaddsub_ps (__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmaddsub_ps (__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsubadd_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsubadd_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsubadd_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsubadd_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsubadd_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsubadd_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C, + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsubadd_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsubadd_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsubadd_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsubadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsubadd_ps (__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsubadd_ps (__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmadd_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmadd_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_mask3 (-(__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmadd_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_mask3 (-(__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmadd_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmadd_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmaddps256_mask3 (-(__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmadd_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmaddps128_mask3 (-(__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmsub_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmsub_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmsub_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A, + (__v4df) __B, + -(__v4df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A, + (__v2df) __B, + -(__v2df) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmsub_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmsub_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmsub_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_epu32 (__m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_epu32 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epu32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 0, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 0, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A, + (__v4si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epu32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + (__v8si) __B, 0, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + (__v8si) __B, 0, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A, + (__v8si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epu64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 0, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 0, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A, + (__v2di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epu64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 0, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 0, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A, + (__v4di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epu32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 6, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 6, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A, + (__v4si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epu32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + (__v8si) __B, 6, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + (__v8si) __B, 6, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A, + (__v8si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epu64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 6, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 6, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A, + (__v2di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epu64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 6, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 6, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A, + (__v4di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_test_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, + (__v4si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_test_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, + (__v8si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_test_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A, + (__v2di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_test_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, + (__v4di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testn_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, + (__v4si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testn_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, + (__v8si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testn_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, + (__v2di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testn_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, + (__v4di) __B, __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) +{ + __builtin_ia32_compressstoredf256_mask ((__v4df *) __P, + (__v4df) __A, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_compressstoredf128_mask ((__v2df *) __P, + (__v2df) __A, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) +{ + __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P, + (__v8sf) __A, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P, + (__v4sf) __A, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_compressstoredi256_mask ((__v4di *) __P, + (__v4di) __A, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_compressstoredi128_mask ((__v2di *) __P, + (__v2di) __A, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_compressstoresi256_mask ((__v8si *) __P, + (__v8si) __A, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_compressstoresi128_mask ((__v4si *) __P, + (__v4si) __A, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_expanddf256_maskz ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P, + (__v4df) __W, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_expandloaddf256_maskz ((__v4df *) __P, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_expanddf128_maskz ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P, + (__v2df) __W, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_expandloaddf128_maskz ((__v2df *) __P, + (__v2df) + _mm_setzero_pd (), + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_expandsf256_maskz ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_expandloadsf256_maskz ((__v8sf *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) + __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_expandsf128_maskz ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_expandloadsf128_maskz ((__v4sf *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expanddi256_maskz ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U, + void const *__P) +{ + return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P, + (__v4di) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloaddi256_maskz ((__v4di *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expanddi128_maskz ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P, + (__v2di) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloaddi128_maskz ((__v2di *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expandsi256_maskz ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U, + void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P, + (__v8si) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadsi256_maskz ((__v8si *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expandsi128_maskz ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P, + (__v4si) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadsi128_maskz ((__v4si *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I + , + (__v4df) __A, + (__v4df) __B, + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I, + __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I + , + (__v4df) __A, + (__v4df) __B, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U, + __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A, + (__v4di) __I + , + (__v4df) __B, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I, + __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I + , + (__v4df) __A, + (__v4df) __B, + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B) +{ + return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I + , + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I, + __m256 __B) +{ + return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I + , + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U, + __m256 __B) +{ + return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A, + (__v8si) __I + , + (__v8sf) __B, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I, + __m256 __B) +{ + return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I + , + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I + , + (__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I + , + (__v2di) __A, + (__v2di) __B, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A, + (__v2di) __I + , + (__v2di) __B, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I + , + (__v2di) __A, + (__v2di) __B, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I + , + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I + , + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A, + (__v4si) __I + , + (__v4si) __B, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I + , + (__v4si) __A, + (__v4si) __B, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I + , + (__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I, + __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I + , + (__v4di) __A, + (__v4di) __B, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I, + __mmask8 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A, + (__v4di) __I + , + (__v4di) __B, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I + , + (__v4di) __A, + (__v4di) __B, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I + , + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I, + __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I + , + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I, + __mmask8 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A, + (__v8si) __I + , + (__v8si) __B, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I + , + (__v8si) __A, + (__v8si) __B, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I + , + (__v2df) __A, + (__v2df) __B, + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I, + __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I + , + (__v2df) __A, + (__v2df) __B, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U, + __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A, + (__v2di) __I + , + (__v2df) __B, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I, + __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I + , + (__v2df) __A, + (__v2df) __B, + (__mmask8) + __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B) +{ + return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I + , + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I, + __m128 __B) +{ + return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I + , + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U, + __m128 __B) +{ + return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A, + (__v4si) __I + , + (__v4sf) __B, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I, + __m128 __B) +{ + return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I + , + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srav_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srav_epi64 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srav_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sllv_epi32 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sllv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sllv_epi32 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sllv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sllv_epi64 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sllv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sllv_epi64 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sllv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srav_epi32 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srav_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srav_epi32 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srav_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srlv_epi32 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srlv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srlv_epi32 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srlv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srlv_epi64 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srlv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srlv_epi64 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srlv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rolv_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rolv_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rorv_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rorv_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rolv_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rolv_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rorv_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rorv_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srav_epi64 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srav_epi64 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_pd (), + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_pd (), + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_pd (), + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_pd (), + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epu64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} +#pragma GCC push_options +#pragma GCC target("avx512vl,avx512cd") +#define __DISABLE_AVX512VLCD__ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m128i) __builtin_ia32_broadcastmb128 (__A); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m256i) __builtin_ia32_broadcastmb256 (__A); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m128i) __builtin_ia32_broadcastmw128 (__A); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m256i) __builtin_ia32_broadcastmw256 (__A); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_lzcnt_epi32 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_lzcnt_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_conflict_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_conflict_epi32 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lzcnt_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lzcnt_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_conflict_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_conflict_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +#pragma GCC pop_options +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) +{ + return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) +{ + return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A) +{ + return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) +{ + return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sra_epi64 (__m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X, + __m256 __Y) +{ + return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, + (__v8si) __X, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, + (__v8si) __X, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_pd (__m256i __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, + (__v4di) __X, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X, + __m256d __Y) +{ + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, + (__v4di) __X, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, + (__v4di) __X, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256i __C) +{ + return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, + (__v4di) __C, + (__v4df) __W, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C) +{ + return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, + (__v4di) __C, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256i __C) +{ + return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, + (__v8si) __C, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C) +{ + return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, + (__v8si) __C, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128i __C) +{ + return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, + (__v2di) __C, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C) +{ + return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, + (__v2di) __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A, + __m128i __C) +{ + return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, + (__v4si) __C, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C) +{ + return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, + (__v4si) __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, + (__v4di) __X, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mullo_epi32 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, + (__v4di) __X, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, + (__v4di) __X, + (__v4di) __W, + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, + (__v8si) __X, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, + (__v8si) __X, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, + (__v8si) __X, + (__v8si) __W, + __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, (__v8si) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex_epi64 (__m256i __X, const int __I) +{ + return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X, + __I, + (__v4di) + _mm256_setzero_si256(), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex_epi64 (__m256i __W, __mmask8 __M, + __m256i __X, const int __I) +{ + return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X, + __I, + (__v4di) __W, + (__mmask8) __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex_epi64 (__mmask8 __M, __m256i __X, const int __I) +{ + return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X, + __I, + (__v4di) + _mm256_setzero_si256 (), (__mmask8) __M); } +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A, + (__v4df) __B, __imm, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_pd (__mmask8 __U, __m256d __A, __m256d __B, + const int __imm) +{ + return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A, + (__v4df) __B, __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shuffle_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __imm) +{ + return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A, + (__v2df) __B, __imm, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shuffle_pd (__mmask8 __U, __m128d __A, __m128d __B, + const int __imm) +{ + return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A, + (__v2df) __B, __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B, const int __imm) +{ + return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A, + (__v8sf) __B, __imm, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_ps (__mmask8 __U, __m256 __A, __m256 __B, + const int __imm) +{ + return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A, + (__v8sf) __B, __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shuffle_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __imm) +{ + return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A, + (__v4sf) __B, __imm, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shuffle_ps (__mmask8 __U, __m128 __A, __m128 __B, + const int __imm) +{ + return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A, + (__v4sf) __B, __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_inserti32x4 (__m256i __A, __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A, + (__v4si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_inserti32x4 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A, + (__v4si) __B, + __imm, + (__v8si) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_inserti32x4 (__mmask8 __U, __m256i __A, __m128i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A, + (__v4si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf32x4 (__m256 __A, __m128 __B, const int __imm) +{ + return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A, + (__v4sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_insertf32x4 (__m256 __W, __mmask8 __U, __m256 __A, + __m128 __B, const int __imm) +{ + return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A, + (__v4sf) __B, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_insertf32x4 (__mmask8 __U, __m256 __A, __m128 __B, + const int __imm) +{ + return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A, + (__v4sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extracti32x4_epi32 (__m256i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A, + __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A, + __imm, + (__v4si) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_extracti32x4_epi32 (__mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A, + __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf32x4_ps (__m256 __A, const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m256 __A, + const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A, + __imm, + (__v4sf) __W, + (__mmask8) + __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_extractf32x4_ps (__mmask8 __U, __m256 __A, + const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_i64x2 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A, + (__v4di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_i64x2 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A, + (__v4di) __B, + __imm, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_i64x2 (__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A, + (__v4di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_i32x4 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A, + (__v8si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_i32x4 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A, + (__v8si) __B, + __imm, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_i32x4 (__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A, + (__v8si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_f64x2 (__m256d __A, __m256d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A, + (__v4df) __B, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_f64x2 (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A, + (__v4df) __B, + __imm, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_f64x2 (__mmask8 __U, __m256d __A, __m256d __B, + const int __imm) +{ + return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A, + (__v4df) __B, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_f32x4 (__m256 __A, __m256 __B, const int __imm) +{ + return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A, + (__v8sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_f32x4 (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B, const int __imm) +{ + return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A, + (__v8sf) __B, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_f32x4 (__mmask8 __U, __m256 __A, __m256 __B, + const int __imm) +{ + return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A, + (__v8sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fixupimm_pd (__m256d __A, __m256d __B, __m256i __C, + const int __imm) +{ + return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4di) __C, + __imm, + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fixupimm_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256i __C, const int __imm) +{ + return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fixupimm_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256i __C, const int __imm) +{ + return (__m256d) __builtin_ia32_fixupimmpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fixupimm_ps (__m256 __A, __m256 __B, __m256i __C, + const int __imm) +{ + return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8si) __C, + __imm, + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fixupimm_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256i __C, const int __imm) +{ + return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fixupimm_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256i __C, const int __imm) +{ + return (__m256) __builtin_ia32_fixupimmps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_pd (__m128d __A, __m128d __B, __m128i __C, + const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_ps (__m128 __A, __m128 __B, __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, + __imm, + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_ps (__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_ps (__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srli_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srli_epi32 (__mmask8 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srli_epi32 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srli_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srli_epi64 (__mmask8 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srli_epi64 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_ternarylogic_epi64 (__m256i __A, __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, __imm, + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_ternarylogic_epi64 (__m256i __A, __mmask8 __U, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, __imm, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_ternarylogic_epi64 (__mmask8 __U, __m256i __A, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogq256_maskz ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_ternarylogic_epi32 (__m256i __A, __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, __imm, + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_ternarylogic_epi32 (__m256i __A, __mmask8 __U, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, __imm, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_ternarylogic_epi32 (__mmask8 __U, __m256i __A, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogd256_maskz ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ternarylogic_epi64 (__m128i __A, __m128i __B, __m128i __C, + const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, __imm, + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_ternarylogic_epi64 (__m128i __A, __mmask8 __U, + __m128i __B, __m128i __C, const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, __imm, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_ternarylogic_epi64 (__mmask8 __U, __m128i __A, + __m128i __B, __m128i __C, const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogq128_maskz ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ternarylogic_epi32 (__m128i __A, __m128i __B, __m128i __C, + const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, __imm, + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_ternarylogic_epi32 (__m128i __A, __mmask8 __U, + __m128i __B, __m128i __C, const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, __imm, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_ternarylogic_epi32 (__mmask8 __U, __m128i __A, + __m128i __B, __m128i __C, const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogd128_maskz ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_roundscale_ps (__m256 __A, const int __imm) +{ + return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_roundscale_ps (__m256 __W, __mmask8 __U, __m256 __A, + const int __imm) +{ + return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_roundscale_ps (__mmask8 __U, __m256 __A, const int __imm) +{ + return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_roundscale_pd (__m256d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_roundscale_pd (__m256d __W, __mmask8 __U, __m256d __A, + const int __imm) +{ + return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, + __imm, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_roundscale_pd (__mmask8 __U, __m256d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_ps (__m128 __A, const int __imm) +{ + return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_ps (__m128 __W, __mmask8 __U, __m128 __A, + const int __imm) +{ + return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, + __imm, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_ps (__mmask8 __U, __m128 __A, const int __imm) +{ + return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_pd (__m128d __A, const int __imm) +{ + return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_pd (__m128d __W, __mmask8 __U, __m128d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, + __imm, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_pd (__mmask8 __U, __m128d __A, const int __imm) +{ + return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getmant_ps (__m256 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A, + (__C << 2) | __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getmant_ps (__m256 __W, __mmask8 __U, __m256 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A, + (__C << 2) | __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getmant_ps (__mmask8 __U, __m256 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A, + (__C << 2) | __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_ps (__m128 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A, + (__C << 2) | __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_ps (__m128 __W, __mmask8 __U, __m128 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A, + (__C << 2) | __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_ps (__mmask8 __U, __m128 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A, + (__C << 2) | __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getmant_pd (__m256d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A, + (__C << 2) | __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getmant_pd (__m256d __W, __mmask8 __U, __m256d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A, + (__C << 2) | __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getmant_pd (__mmask8 __U, __m256d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A, + (__C << 2) | __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_pd (__m128d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A, + (__C << 2) | __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_pd (__m128d __W, __mmask8 __U, __m128d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A, + (__C << 2) | __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_pd (__mmask8 __U, __m128d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A, + (__C << 2) | __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_ps (__m256 __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256) __builtin_ia32_gather3siv8sf ((__v8sf) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_ps (__m128 __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128) __builtin_ia32_gather3siv4sf ((__v4sf) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_pd (__m256d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m256d) __builtin_ia32_gather3siv4df ((__v4df) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_pd (__m128d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128d) __builtin_ia32_gather3siv2df ((__v2df) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m128) __builtin_ia32_gather3div8sf ((__v4sf) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128) __builtin_ia32_gather3div4sf ((__v4sf) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_pd (__m256d __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256d) __builtin_ia32_gather3div4df ((__v4df) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_pd (__m128d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128d) __builtin_ia32_gather3div2df ((__v2df) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_epi32 (__m256i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256i) __builtin_ia32_gather3siv8si ((__v8si) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_epi32 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3siv4si ((__v4si) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_epi64 (__m256i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m256i) __builtin_ia32_gather3siv4di ((__v4di) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_epi64 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3siv2di ((__v2di) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3div8si ((__v4si) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3div4si ((__v4si) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_epi64 (__m256i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256i) __builtin_ia32_gather3div4di ((__v4di) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_epi64 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3div2di ((__v2di) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32scatter_ps (void *__addr, __m256i __index, + __m256 __v1, const int __scale) +{ + __builtin_ia32_scattersiv8sf (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8sf) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32scatter_ps (void *__addr, __mmask8 __mask, + __m256i __index, __m256 __v1, + const int __scale) +{ + __builtin_ia32_scattersiv8sf (__addr, __mask, (__v8si) __index, + (__v8sf) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32scatter_ps (void *__addr, __m128i __index, __m128 __v1, + const int __scale) +{ + __builtin_ia32_scattersiv4sf (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v4sf) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32scatter_ps (void *__addr, __mmask8 __mask, + __m128i __index, __m128 __v1, + const int __scale) +{ + __builtin_ia32_scattersiv4sf (__addr, __mask, (__v4si) __index, + (__v4sf) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32scatter_pd (void *__addr, __m128i __index, + __m256d __v1, const int __scale) +{ + __builtin_ia32_scattersiv4df (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v4df) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32scatter_pd (void *__addr, __mmask8 __mask, + __m128i __index, __m256d __v1, + const int __scale) +{ + __builtin_ia32_scattersiv4df (__addr, __mask, (__v4si) __index, + (__v4df) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32scatter_pd (void *__addr, __m128i __index, + __m128d __v1, const int __scale) +{ + __builtin_ia32_scattersiv2df (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v2df) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32scatter_pd (void *__addr, __mmask8 __mask, + __m128i __index, __m128d __v1, + const int __scale) +{ + __builtin_ia32_scattersiv2df (__addr, __mask, (__v4si) __index, + (__v2df) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64scatter_ps (void *__addr, __m256i __index, + __m128 __v1, const int __scale) +{ + __builtin_ia32_scatterdiv8sf (__addr, (__mmask8) 0xFF, + (__v4di) __index, (__v4sf) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64scatter_ps (void *__addr, __mmask8 __mask, + __m256i __index, __m128 __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv8sf (__addr, __mask, (__v4di) __index, + (__v4sf) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64scatter_ps (void *__addr, __m128i __index, __m128 __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv4sf (__addr, (__mmask8) 0xFF, + (__v2di) __index, (__v4sf) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64scatter_ps (void *__addr, __mmask8 __mask, + __m128i __index, __m128 __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv4sf (__addr, __mask, (__v2di) __index, + (__v4sf) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64scatter_pd (void *__addr, __m256i __index, + __m256d __v1, const int __scale) +{ + __builtin_ia32_scatterdiv4df (__addr, (__mmask8) 0xFF, + (__v4di) __index, (__v4df) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64scatter_pd (void *__addr, __mmask8 __mask, + __m256i __index, __m256d __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv4df (__addr, __mask, (__v4di) __index, + (__v4df) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64scatter_pd (void *__addr, __m128i __index, + __m128d __v1, const int __scale) +{ + __builtin_ia32_scatterdiv2df (__addr, (__mmask8) 0xFF, + (__v2di) __index, (__v2df) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64scatter_pd (void *__addr, __mmask8 __mask, + __m128i __index, __m128d __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv2df (__addr, __mask, (__v2di) __index, + (__v2df) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32scatter_epi32 (void *__addr, __m256i __index, + __m256i __v1, const int __scale) +{ + __builtin_ia32_scattersiv8si (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8si) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask, + __m256i __index, __m256i __v1, + const int __scale) +{ + __builtin_ia32_scattersiv8si (__addr, __mask, (__v8si) __index, + (__v8si) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32scatter_epi32 (void *__addr, __m128i __index, + __m128i __v1, const int __scale) +{ + __builtin_ia32_scattersiv4si (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v4si) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) +{ + __builtin_ia32_scattersiv4si (__addr, __mask, (__v4si) __index, + (__v4si) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32scatter_epi64 (void *__addr, __m128i __index, + __m256i __v1, const int __scale) +{ + __builtin_ia32_scattersiv4di (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v4di) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask, + __m128i __index, __m256i __v1, + const int __scale) +{ + __builtin_ia32_scattersiv4di (__addr, __mask, (__v4si) __index, + (__v4di) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32scatter_epi64 (void *__addr, __m128i __index, + __m128i __v1, const int __scale) +{ + __builtin_ia32_scattersiv2di (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v2di) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) +{ + __builtin_ia32_scattersiv2di (__addr, __mask, (__v4si) __index, + (__v2di) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64scatter_epi32 (void *__addr, __m256i __index, + __m128i __v1, const int __scale) +{ + __builtin_ia32_scatterdiv8si (__addr, (__mmask8) 0xFF, + (__v4di) __index, (__v4si) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask, + __m256i __index, __m128i __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv8si (__addr, __mask, (__v4di) __index, + (__v4si) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64scatter_epi32 (void *__addr, __m128i __index, + __m128i __v1, const int __scale) +{ + __builtin_ia32_scatterdiv4si (__addr, (__mmask8) 0xFF, + (__v2di) __index, (__v4si) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv4si (__addr, __mask, (__v2di) __index, + (__v4si) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64scatter_epi64 (void *__addr, __m256i __index, + __m256i __v1, const int __scale) +{ + __builtin_ia32_scatterdiv4di (__addr, (__mmask8) 0xFF, + (__v4di) __index, (__v4di) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask, + __m256i __index, __m256i __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv4di (__addr, __mask, (__v4di) __index, + (__v4di) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64scatter_epi64 (void *__addr, __m128i __index, + __m128i __v1, const int __scale) +{ + __builtin_ia32_scatterdiv2di (__addr, (__mmask8) 0xFF, + (__v2di) __index, (__v2di) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv2di (__addr, __mask, (__v2di) __index, + (__v2di) __v1, __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + _MM_PERM_ENUM __mask) +{ + return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_epi32 (__mmask8 __U, __m256i __A, + _MM_PERM_ENUM __mask) +{ + return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shuffle_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + _MM_PERM_ENUM __mask) +{ + return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shuffle_epi32 (__mmask8 __U, __m128i __A, + _MM_PERM_ENUM __mask) +{ + return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rol_epi32 (__m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rol_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + const int __B) +{ + return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rol_epi32 (__mmask8 __U, __m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rol_epi32 (__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rol_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + const int __B) +{ + return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rol_epi32 (__mmask8 __U, __m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_ror_epi32 (__m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_ror_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + const int __B) +{ + return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_ror_epi32 (__mmask8 __U, __m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ror_epi32 (__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_ror_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + const int __B) +{ + return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_ror_epi32 (__mmask8 __U, __m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rol_epi64 (__m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rol_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + const int __B) +{ + return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rol_epi64 (__mmask8 __U, __m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rol_epi64 (__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rol_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + const int __B) +{ + return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rol_epi64 (__mmask8 __U, __m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_ror_epi64 (__m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_ror_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + const int __B) +{ + return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_ror_epi64 (__mmask8 __U, __m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ror_epi64 (__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_ror_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + const int __B) +{ + return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_ror_epi64 (__mmask8 __U, __m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_epi32 (__m128i __A, __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A, + (__v4si) __B, __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_alignr_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A, + (__v4si) __B, __imm, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_alignr_epi32 (__mmask8 __U, __m128i __A, __m128i __B, + const int __imm) +{ + return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A, + (__v4si) __B, __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_epi64 (__m128i __A, __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A, + (__v2di) __B, __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_alignr_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A, + (__v2di) __B, __imm, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_alignr_epi64 (__mmask8 __U, __m128i __A, __m128i __B, + const int __imm) +{ + return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A, + (__v2di) __B, __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_alignr_epi32 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A, + (__v8si) __B, __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_alignr_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A, + (__v8si) __B, __imm, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_alignr_epi32 (__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A, + (__v8si) __B, __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_alignr_epi64 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A, + (__v4di) __B, __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_alignr_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A, + (__v4di) __B, __imm, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_alignr_epi64 (__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A, + (__v4di) __B, __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A, + const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A, + const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_ph (__mmask8 __U, __m256 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srai_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srai_epi32 (__mmask8 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srai_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srai_epi32 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srai_epi64 (__m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srai_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srai_epi64 (__mmask8 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi64 (__m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srai_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srai_epi64 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_slli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_slli_epi32 (__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_slli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_slli_epi64 (__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_slli_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + int __B) +{ + return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_slli_epi32 (__mmask8 __U, __m256i __A, int __B) +{ + return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_slli_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + int __B) +{ + return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_slli_epi64 (__mmask8 __U, __m256i __A, int __B) +{ + return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex_pd (__m256d __W, __mmask8 __U, __m256d __X, + const int __imm) +{ + return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex_pd (__mmask8 __U, __m256d __X, const int __imm) +{ + return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permute_pd (__m256d __W, __mmask8 __U, __m256d __X, + const int __C) +{ + return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permute_pd (__mmask8 __U, __m256d __X, const int __C) +{ + return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permute_pd (__m128d __W, __mmask8 __U, __m128d __X, + const int __C) +{ + return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permute_pd (__mmask8 __U, __m128d __X, const int __C) +{ + return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permute_ps (__m256 __W, __mmask8 __U, __m256 __X, + const int __C) +{ + return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permute_ps (__mmask8 __U, __m256 __X, const int __C) +{ + return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permute_ps (__m128 __W, __mmask8 __U, __m128 __X, + const int __C) +{ + return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permute_ps (__mmask8 __U, __m128 __X, const int __C) +{ + return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) +{ + return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) +{ + return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) +{ + return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) +{ + return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) +{ + return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) +{ + return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) +{ + return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) +{ + return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y) +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epi64_mask (__m256i __X, __m256i __Y, const int __P) { - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, - (__v8si) __Y, 5, + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, __P, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epi32_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, __P, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epu64_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, __P, (__mmask8) -1); } extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epu32_mask (__m256i __X, __m256i __Y, const int __P) { return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, - (__v8si) __Y, 2, - (__mmask8) __M); + (__v8si) __Y, __P, + (__mmask8) -1); } extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y) +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_pd_mask (__m256d __X, __m256d __Y, const int __P) { - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, - (__v8si) __Y, 2, + return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X, + (__v4df) __Y, __P, (__mmask8) -1); } extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_ps_mask (__m256 __X, __m256 __Y, const int __P) { - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, - (__v4di) __Y, 4, - (__mmask8) __M); + return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X, + (__v8sf) __Y, __P, + (__mmask8) -1); } extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y) +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epi64_mask (__mmask8 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epi32_mask (__mmask8 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epu64_mask (__mmask8 __U, __m256i __X, __m256i __Y, + const int __P) { return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, - (__v4di) __Y, 4, + (__v4di) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epu32_mask (__mmask8 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_pd_mask (__mmask8 __U, __m256d __X, __m256d __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X, + (__v4df) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_ps_mask (__mmask8 __U, __m256 __X, __m256 __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X, + (__v8sf) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epi64_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, __P, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epi32_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, __P, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epu64_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, __P, (__mmask8) -1); } extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epu32_mask (__m128i __X, __m128i __Y, const int __P) { - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, - (__v4di) __Y, 1, - (__mmask8) __M); + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, __P, + (__mmask8) -1); } extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y) +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_pd_mask (__m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ps_mask (__m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epi64_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epi32_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epu64_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epu32_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_pd_mask (__mmask8 __U, __m128d __X, __m128d __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_ps_mask (__mmask8 __U, __m128 __X, __m128 __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex_pd (__m256d __X, const int __M) +{ + return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __M, + (__v4df) + _mm256_undefined_pd (), + (__mmask8) -1); +} +#define _mm256_permutexvar_ps(A,B) _mm256_permutevar8x32_ps ((B), (A)) +#undef __DISABLE_AVX512VL__ +#pragma GCC pop_options +#define _AVX512BWINTRIN_H_INCLUDED +#pragma GCC push_options +#pragma GCC target("avx512bw") +#define __DISABLE_AVX512BW__ +typedef short __v32hi __attribute__ ((__vector_size__ (64))); +typedef char __v64qi __attribute__ ((__vector_size__ (64))); +typedef unsigned long long __mmask64; +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_ktestcsi (__A, __B); + return (unsigned char) __builtin_ia32_ktestzsi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B); + return (unsigned char) __builtin_ia32_ktestzdi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B) +{ + return (unsigned char) __builtin_ia32_ktestzsi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B) +{ + return (unsigned char) __builtin_ia32_ktestzdi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B) +{ + return (unsigned char) __builtin_ia32_ktestcsi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B) +{ + return (unsigned char) __builtin_ia32_ktestcdi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_kortestcdi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzdi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B) +{ + return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestz_mask64_u8 (__mmask64 __A, __mmask64 __B) +{ + return (unsigned char) __builtin_ia32_kortestzdi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B) +{ + return (unsigned char) __builtin_ia32_kortestcsi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestc_mask64_u8 (__mmask64 __A, __mmask64 __B) +{ + return (unsigned char) __builtin_ia32_kortestcdi (__A, __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kadd_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kaddsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kadd_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kadddi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtmask32_u32 (__mmask32 __A) +{ + return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A); +} +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtmask64_u64 (__mmask64 __A) +{ + return (unsigned long long) __builtin_ia32_kmovq ((__mmask64) __A); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtu32_mask32 (unsigned int __A) +{ + return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtu64_mask64 (unsigned long long __A) +{ + return (__mmask64) __builtin_ia32_kmovq ((__mmask64) __A); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_load_mask32 (__mmask32 *__A) +{ + return (__mmask32) __builtin_ia32_kmovd (*__A); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_load_mask64 (__mmask64 *__A) +{ + return (__mmask64) __builtin_ia32_kmovq (*(__mmask64 *) __A); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_store_mask32 (__mmask32 *__A, __mmask32 __B) +{ + *(__mmask32 *) __A = __builtin_ia32_kmovd (__B); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_store_mask64 (__mmask64 *__A, __mmask64 __B) +{ + *(__mmask64 *) __A = __builtin_ia32_kmovq (__B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_knot_mask32 (__mmask32 __A) +{ + return (__mmask32) __builtin_ia32_knotsi ((__mmask32) __A); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_knot_mask64 (__mmask64 __A) +{ + return (__mmask64) __builtin_ia32_knotdi ((__mmask64) __A); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_korsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kor_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kordi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxnor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kxnorsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxnor_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kxnordi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kxorsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxor_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kxordi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kand_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kandsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kand_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kanddi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kandn_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kandnsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kandn_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kandndi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A) +{ + __builtin_ia32_storedquhi512_mask ((short *) __P, + (__v32hi) __A, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kunpackw (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, + (__mmask32) __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kunpackw_mask32 (__mmask16 __A, __mmask16 __B) +{ + return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, + (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kunpackd (__mmask64 __A, __mmask64 __B) { - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, - (__v4di) __Y, 1, - (__mmask8) -1); + return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, + (__mmask64) __B); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kunpackd_mask64 (__mmask32 __A, __mmask32 __B) { - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, - (__v4di) __Y, 5, - (__mmask8) __M); + return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, + (__mmask64) __B); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, - (__v4di) __Y, 5, - (__mmask8) -1); + return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P, + (__v64qi) __W, + (__mmask64) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P) { - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, - (__v4di) __Y, 2, - (__mmask8) __M); + return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y) +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) { - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, - (__v4di) __Y, 2, - (__mmask8) -1); + __builtin_ia32_storedquqi512_mask ((char *) __P, + (__v64qi) __A, + (__mmask64) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sad_epu8 (__m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, - (__v8si) __Y, 4, - (__mmask8) __M); + return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A, + (__v64qi) __B); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi16_epi8 (__m512i __A) { - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, - (__v8si) __Y, 4, - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) _mm256_undefined_si256(), + (__mmask32) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, - (__v8si) __Y, 1, - (__mmask8) __M); + __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, - (__v8si) __Y, 1, - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) __O, __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, - (__v8si) __Y, 5, - (__mmask8) __M); + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) + _mm256_setzero_si256 (), + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi16_epi8 (__m512i __A) { - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, - (__v8si) __Y, 5, - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi)_mm256_undefined_si256(), + (__mmask32) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, - (__v8si) __Y, 2, - (__mmask8) __M); + __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, - (__v8si) __Y, 2, - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi)__O, + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, - (__v4di) __Y, 4, - (__mmask8) __M); + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi) + _mm256_setzero_si256 (), + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi16_epi8 (__m512i __A) { - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, - (__v4di) __Y, 4, - (__mmask8) -1); + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi)_mm256_undefined_si256(), + (__mmask32) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, - (__v4di) __Y, 1, - (__mmask8) __M); + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi) __O, + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y) +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, - (__v4di) __Y, 1, - (__mmask8) -1); + __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) { - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, - (__v4di) __Y, 5, - (__mmask8) __M); + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi) + _mm256_setzero_si256 (), + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmpge_epi64_mask (__m256i __X, __m256i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastb_epi8 (__m128i __A) { - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, - (__v4di) __Y, 5, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, + (__v64qi)_mm512_undefined_epi32(), + (__mmask64) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) { - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, - (__v4di) __Y, 2, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, + (__v64qi) __O, + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_cmple_epi64_mask (__m256i __X, __m256i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) { - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, - (__v4di) __Y, 2, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, + (__v64qi) + _mm512_setzero_si512 (), + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpneq_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, - (__v4si) __Y, 4, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A, + (__v64qi) __O, + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpneq_epu32_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi8 (__mmask64 __M, char __A) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, - (__v4si) __Y, 4, - (__mmask8) -1); + return (__m512i) + __builtin_ia32_pbroadcastb512_gpr_mask (__A, + (__v64qi) + _mm512_setzero_si512 (), + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmplt_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastw_epi16 (__m128i __A) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, - (__v4si) __Y, 1, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, + (__v32hi)_mm512_undefined_epi32(), + (__mmask32) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmplt_epu32_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, - (__v4si) __Y, 1, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, + (__v32hi) __O, + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpge_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, - (__v4si) __Y, 5, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpge_epu32_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, - (__v4si) __Y, 5, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A, + (__v32hi) __O, + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmple_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi16 (__mmask32 __M, short __A) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, - (__v4si) __Y, 2, - (__mmask8) __M); + return (__m512i) + __builtin_ia32_pbroadcastw512_gpr_mask (__A, + (__v32hi) + _mm512_setzero_si512 (), + __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmple_epu32_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mulhrs_epi16 (__m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, - (__v4si) __Y, 2, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpneq_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, - (__v2di) __Y, 4, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpneq_epu64_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, - (__v2di) __Y, 4, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmplt_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mulhi_epi16 (__m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, - (__v2di) __Y, 1, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmplt_epu64_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, - (__v2di) __Y, 1, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpge_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, - (__v2di) __Y, 5, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpge_epu64_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mulhi_epu16 (__m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, - (__v2di) __Y, 5, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmple_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, - (__v2di) __Y, 2, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmple_epu64_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, - (__v2di) __Y, 2, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpneq_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mullo_epi16 (__m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, - (__v4si) __Y, 4, - (__mmask8) __M); + return (__m512i) ((__v32hu) __A * (__v32hu) __B); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpneq_epi32_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, - (__v4si) __Y, 4, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmplt_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, - (__v4si) __Y, 1, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmplt_epi32_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi8_epi16 (__m256i __A) { - return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, - (__v4si) __Y, 1, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpge_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A) { - return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, - (__v4si) __Y, 5, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpge_epi32_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A) { - return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, - (__v4si) __Y, 5, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmple_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu8_epi16 (__m256i __A) { - return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, - (__v4si) __Y, 2, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmple_epi32_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A) { - return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, - (__v4si) __Y, 2, - (__mmask8) -1); + return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpneq_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A) { - return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, - (__v2di) __Y, 4, - (__mmask8) __M); + return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpneq_epi64_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_epi16 (__m512i __A, __m512i __B) { - return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, - (__v2di) __Y, 4, - (__mmask8) -1); + return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmplt_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, + __m512i __B) { - return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, - (__v2di) __Y, 1, - (__mmask8) __M); + return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmplt_epi64_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { - return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, - (__v2di) __Y, 1, - (__mmask8) -1); + return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) __W, + (__mmask32) __M); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmpge_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B) { - return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, - (__v2di) __Y, 5, - (__mmask8) __M); + return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I + , + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmpge_epi64_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U, + __m512i __I, __m512i __B) { - return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, - (__v2di) __Y, 5, - (__mmask8) -1); + return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I + , + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) + __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmple_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I, + __mmask32 __U, __m512i __B) { - return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, - (__v2di) __Y, 2, - (__mmask8) __M); + return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A, + (__v32hi) __I + , + (__v32hi) __B, + (__mmask32) + __U); } -extern __inline __mmask8 - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmple_epi64_mask (__m128i __X, __m128i __Y) +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A, + __m512i __I, __m512i __B) { - return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, - (__v2di) __Y, 2, - (__mmask8) -1); + return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I + , + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) + __U); } -#define _mm256_permutex_pd(X,M) ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(X), (int)(M), (__v4df)(__m256d) _mm256_undefined_pd (), (__mmask8)-1)) -#define _mm256_permutex_epi64(X,I) ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), (int)(I), (__v4di)(__m256i) (_mm256_setzero_si256 ()), (__mmask8) -1)) -#define _mm256_maskz_permutex_epi64(M,X,I) ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), (int)(I), (__v4di)(__m256i) (_mm256_setzero_si256 ()), (__mmask8)(M))) -#define _mm256_mask_permutex_epi64(W,M,X,I) ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), (int)(I), (__v4di)(__m256i)(W), (__mmask8)(M))) -#define _mm256_insertf32x4(X,Y,C) ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), (__v4sf)(__m128) (Y), (int) (C), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1)) -#define _mm256_mask_insertf32x4(W,U,X,Y,C) ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), (__v4sf)(__m128) (Y), (int) (C), (__v8sf)(__m256)(W), (__mmask8)(U))) -#define _mm256_maskz_insertf32x4(U,X,Y,C) ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), (__v4sf)(__m128) (Y), (int) (C), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) -#define _mm256_inserti32x4(X,Y,C) ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) -#define _mm256_mask_inserti32x4(W,U,X,Y,C) ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v8si)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_inserti32x4(U,X,Y,C) ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm256_extractf32x4_ps(X,C) ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), (int) (C), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)-1)) -#define _mm256_mask_extractf32x4_ps(W,U,X,C) ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), (int) (C), (__v4sf)(__m128)(W), (__mmask8)(U))) -#define _mm256_maskz_extractf32x4_ps(U,X,C) ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), (int) (C), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U))) -#define _mm256_extracti32x4_epi32(X,C) ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X), (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)-1)) -#define _mm256_mask_extracti32x4_epi32(W,U,X,C) ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X), (int) (C), (__v4si)(__m128i)(W), (__mmask8)(U))) -#define _mm256_maskz_extracti32x4_epi32(U,X,C) ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X), (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_shuffle_i64x2(X,Y,C) ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) -#define _mm256_mask_shuffle_i64x2(W,U,X,Y,C) ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_shuffle_i64x2(U,X,Y,C) ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm256_shuffle_i32x4(X,Y,C) ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)-1)) -#define _mm256_mask_shuffle_i32x4(W,U,X,Y,C) ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_shuffle_i32x4(U,X,Y,C) ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm256_shuffle_f64x2(X,Y,C) ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)-1)) -#define _mm256_mask_shuffle_f64x2(W,U,X,Y,C) ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) -#define _mm256_maskz_shuffle_f64x2(U,X,Y,C) ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd( ), (__mmask8)(U))) -#define _mm256_shuffle_f32x4(X,Y,C) ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1)) -#define _mm256_mask_shuffle_f32x4(W,U,X,Y,C) ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U))) -#define _mm256_maskz_shuffle_f32x4(U,X,Y,C) ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) -#define _mm256_mask_shuffle_pd(W,U,A,B,C) ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) -#define _mm256_maskz_shuffle_pd(U,A,B,C) ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)(__m256d) _mm256_setzero_pd (), (__mmask8)(U))) -#define _mm_mask_shuffle_pd(W,U,A,B,C) ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) -#define _mm_maskz_shuffle_pd(U,A,B,C) ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U))) -#define _mm256_mask_shuffle_ps(W,U,A,B,C) ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U))) -#define _mm256_maskz_shuffle_ps(U,A,B,C) ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) -#define _mm_mask_shuffle_ps(W,U,A,B,C) ((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) -#define _mm_maskz_shuffle_ps(U,A,B,C) ((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U))) -#define _mm256_fixupimm_pd(X,Y,Z,C) ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), (int)(C), (__mmask8)(-1))) -#define _mm256_mask_fixupimm_pd(X,U,Y,Z,C) ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), (int)(C), (__mmask8)(U))) -#define _mm256_maskz_fixupimm_pd(U,X,Y,Z,C) ((__m256d)__builtin_ia32_fixupimmpd256_maskz ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), (int)(C), (__mmask8)(U))) -#define _mm256_fixupimm_ps(X,Y,Z,C) ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), (int)(C), (__mmask8)(-1))) -#define _mm256_mask_fixupimm_ps(X,U,Y,Z,C) ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), (int)(C), (__mmask8)(U))) -#define _mm256_maskz_fixupimm_ps(U,X,Y,Z,C) ((__m256)__builtin_ia32_fixupimmps256_maskz ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), (int)(C), (__mmask8)(U))) -#define _mm_fixupimm_pd(X,Y,Z,C) ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(-1))) -#define _mm_mask_fixupimm_pd(X,U,Y,Z,C) ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U))) -#define _mm_maskz_fixupimm_pd(U,X,Y,Z,C) ((__m128d)__builtin_ia32_fixupimmpd128_maskz ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U))) -#define _mm_fixupimm_ps(X,Y,Z,C) ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(-1))) -#define _mm_mask_fixupimm_ps(X,U,Y,Z,C) ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U))) -#define _mm_maskz_fixupimm_ps(U,X,Y,Z,C) ((__m128)__builtin_ia32_fixupimmps128_maskz ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U))) -#define _mm256_mask_srli_epi32(W,U,A,B) ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_srli_epi32(U,A,B) ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm_mask_srli_epi32(W,U,A,B) ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_srli_epi32(U,A,B) ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_mask_srli_epi64(W,U,A,B) ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_srli_epi64(U,A,B) ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm_mask_srli_epi64(W,U,A,B) ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_srli_epi64(U,A,B) ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_mask_slli_epi32(W,U,X,C) ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_slli_epi32(U,X,C) ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm256_mask_slli_epi64(W,U,X,C) ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_slli_epi64(U,X,C) ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm_mask_slli_epi32(W,U,X,C) ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_slli_epi32(U,X,C) ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm_mask_slli_epi64(W,U,X,C) ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_slli_epi64(U,X,C) ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_ternarylogic_epi64(A,B,C,I) ((__m256i) __builtin_ia32_pternlogq256_mask ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)-1)) -#define _mm256_mask_ternarylogic_epi64(A,U,B,C,I) ((__m256i) __builtin_ia32_pternlogq256_mask ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)(U))) -#define _mm256_maskz_ternarylogic_epi64(U,A,B,C,I) ((__m256i) __builtin_ia32_pternlogq256_maskz ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)(U))) -#define _mm256_ternarylogic_epi32(A,B,C,I) ((__m256i) __builtin_ia32_pternlogd256_mask ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)-1)) -#define _mm256_mask_ternarylogic_epi32(A,U,B,C,I) ((__m256i) __builtin_ia32_pternlogd256_mask ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)(U))) -#define _mm256_maskz_ternarylogic_epi32(U,A,B,C,I) ((__m256i) __builtin_ia32_pternlogd256_maskz ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)(U))) -#define _mm_ternarylogic_epi64(A,B,C,I) ((__m128i) __builtin_ia32_pternlogq128_mask ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)-1)) -#define _mm_mask_ternarylogic_epi64(A,U,B,C,I) ((__m128i) __builtin_ia32_pternlogq128_mask ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)(U))) -#define _mm_maskz_ternarylogic_epi64(U,A,B,C,I) ((__m128i) __builtin_ia32_pternlogq128_maskz ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)(U))) -#define _mm_ternarylogic_epi32(A,B,C,I) ((__m128i) __builtin_ia32_pternlogd128_mask ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)-1)) -#define _mm_mask_ternarylogic_epi32(A,U,B,C,I) ((__m128i) __builtin_ia32_pternlogd128_mask ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)(U))) -#define _mm_maskz_ternarylogic_epi32(U,A,B,C,I) ((__m128i) __builtin_ia32_pternlogd128_maskz ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)(U))) -#define _mm256_roundscale_ps(A,B) ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1)) -#define _mm256_mask_roundscale_ps(W,U,A,B) ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U))) -#define _mm256_maskz_roundscale_ps(U,A,B) ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) -#define _mm256_roundscale_pd(A,B) ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)-1)) -#define _mm256_mask_roundscale_pd(W,U,A,B) ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) -#define _mm256_maskz_roundscale_pd(U,A,B) ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U))) -#define _mm_roundscale_ps(A,B) ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)-1)) -#define _mm_mask_roundscale_ps(W,U,A,B) ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U))) -#define _mm_maskz_roundscale_ps(U,A,B) ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U))) -#define _mm_roundscale_pd(A,B) ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)-1)) -#define _mm_mask_roundscale_pd(W,U,A,B) ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U))) -#define _mm_maskz_roundscale_pd(U,A,B) ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U))) -#define _mm256_getmant_ps(X,B,C) ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), (int)(((C)<<2) | (B)), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1)) -#define _mm256_mask_getmant_ps(W,U,X,B,C) ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), (int)(((C)<<2) | (B)), (__v8sf)(__m256)(W), (__mmask8)(U))) -#define _mm256_maskz_getmant_ps(U,X,B,C) ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), (int)(((C)<<2) | (B)), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) -#define _mm_getmant_ps(X,B,C) ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), (int)(((C)<<2) | (B)), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)-1)) -#define _mm_mask_getmant_ps(W,U,X,B,C) ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), (int)(((C)<<2) | (B)), (__v4sf)(__m128)(W), (__mmask8)(U))) -#define _mm_maskz_getmant_ps(U,X,B,C) ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), (int)(((C)<<2) | (B)), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U))) -#define _mm256_getmant_pd(X,B,C) ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), (int)(((C)<<2) | (B)), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)-1)) -#define _mm256_mask_getmant_pd(W,U,X,B,C) ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), (int)(((C)<<2) | (B)), (__v4df)(__m256d)(W), (__mmask8)(U))) -#define _mm256_maskz_getmant_pd(U,X,B,C) ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), (int)(((C)<<2) | (B)), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U))) -#define _mm_getmant_pd(X,B,C) ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), (int)(((C)<<2) | (B)), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)-1)) -#define _mm_mask_getmant_pd(W,U,X,B,C) ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), (int)(((C)<<2) | (B)), (__v2df)(__m128d)(W), (__mmask8)(U))) -#define _mm_maskz_getmant_pd(U,X,B,C) ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), (int)(((C)<<2) | (B)), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U))) -#define _mm256_mmask_i32gather_ps(V1OLD,MASK,INDEX,ADDR,SCALE) (__m256) __builtin_ia32_gather3siv8sf ((__v8sf)(__m256)V1OLD, (void const *)ADDR, (__v8si)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm_mmask_i32gather_ps(V1OLD,MASK,INDEX,ADDR,SCALE) (__m128) __builtin_ia32_gather3siv4sf ((__v4sf)(__m128)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm256_mmask_i32gather_pd(V1OLD,MASK,INDEX,ADDR,SCALE) (__m256d) __builtin_ia32_gather3siv4df ((__v4df)(__m256d)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm_mmask_i32gather_pd(V1OLD,MASK,INDEX,ADDR,SCALE) (__m128d) __builtin_ia32_gather3siv2df ((__v2df)(__m128d)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm256_mmask_i64gather_ps(V1OLD,MASK,INDEX,ADDR,SCALE) (__m128) __builtin_ia32_gather3div8sf ((__v4sf)(__m128)V1OLD, (void const *)ADDR, (__v4di)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm_mmask_i64gather_ps(V1OLD,MASK,INDEX,ADDR,SCALE) (__m128) __builtin_ia32_gather3div4sf ((__v4sf)(__m128)V1OLD, (void const *)ADDR, (__v2di)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm256_mmask_i64gather_pd(V1OLD,MASK,INDEX,ADDR,SCALE) (__m256d) __builtin_ia32_gather3div4df ((__v4df)(__m256d)V1OLD, (void const *)ADDR, (__v4di)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm_mmask_i64gather_pd(V1OLD,MASK,INDEX,ADDR,SCALE) (__m128d) __builtin_ia32_gather3div2df ((__v2df)(__m128d)V1OLD, (void const *)ADDR, (__v2di)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm256_mmask_i32gather_epi32(V1OLD,MASK,INDEX,ADDR,SCALE) (__m256i) __builtin_ia32_gather3siv8si ((__v8si)(__m256i)V1OLD, (void const *)ADDR, (__v8si)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm_mmask_i32gather_epi32(V1OLD,MASK,INDEX,ADDR,SCALE) (__m128i) __builtin_ia32_gather3siv4si ((__v4si)(__m128i)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm256_mmask_i32gather_epi64(V1OLD,MASK,INDEX,ADDR,SCALE) (__m256i) __builtin_ia32_gather3siv4di ((__v4di)(__m256i)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm_mmask_i32gather_epi64(V1OLD,MASK,INDEX,ADDR,SCALE) (__m128i) __builtin_ia32_gather3siv2di ((__v2di)(__m128i)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm256_mmask_i64gather_epi32(V1OLD,MASK,INDEX,ADDR,SCALE) (__m128i) __builtin_ia32_gather3div8si ((__v4si)(__m128i)V1OLD, (void const *)ADDR, (__v4di)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm_mmask_i64gather_epi32(V1OLD,MASK,INDEX,ADDR,SCALE) (__m128i) __builtin_ia32_gather3div4si ((__v4si)(__m128i)V1OLD, (void const *)ADDR, (__v2di)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm256_mmask_i64gather_epi64(V1OLD,MASK,INDEX,ADDR,SCALE) (__m256i) __builtin_ia32_gather3div4di ((__v4di)(__m256i)V1OLD, (void const *)ADDR, (__v4di)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm_mmask_i64gather_epi64(V1OLD,MASK,INDEX,ADDR,SCALE) (__m128i) __builtin_ia32_gather3div2di ((__v2di)(__m128i)V1OLD, (void const *)ADDR, (__v2di)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE) -#define _mm256_i32scatter_ps(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv8sf ((void *)ADDR, (__mmask8)0xFF, (__v8si)(__m256i)INDEX, (__v8sf)(__m256)V1, (int)SCALE) -#define _mm256_mask_i32scatter_ps(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv8sf ((void *)ADDR, (__mmask8)MASK, (__v8si)(__m256i)INDEX, (__v8sf)(__m256)V1, (int)SCALE) -#define _mm_i32scatter_ps(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv4sf ((void *)ADDR, (__mmask8)0xFF, (__v4si)(__m128i)INDEX, (__v4sf)(__m128)V1, (int)SCALE) -#define _mm_mask_i32scatter_ps(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv4sf ((void *)ADDR, (__mmask8)MASK, (__v4si)(__m128i)INDEX, (__v4sf)(__m128)V1, (int)SCALE) -#define _mm256_i32scatter_pd(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv4df ((void *)ADDR, (__mmask8)0xFF, (__v4si)(__m128i)INDEX, (__v4df)(__m256d)V1, (int)SCALE) -#define _mm256_mask_i32scatter_pd(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv4df ((void *)ADDR, (__mmask8)MASK, (__v4si)(__m128i)INDEX, (__v4df)(__m256d)V1, (int)SCALE) -#define _mm_i32scatter_pd(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv2df ((void *)ADDR, (__mmask8)0xFF, (__v4si)(__m128i)INDEX, (__v2df)(__m128d)V1, (int)SCALE) -#define _mm_mask_i32scatter_pd(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv2df ((void *)ADDR, (__mmask8)MASK, (__v4si)(__m128i)INDEX, (__v2df)(__m128d)V1, (int)SCALE) -#define _mm256_i64scatter_ps(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv8sf ((void *)ADDR, (__mmask8)0xFF, (__v4di)(__m256i)INDEX, (__v4sf)(__m128)V1, (int)SCALE) -#define _mm256_mask_i64scatter_ps(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv8sf ((void *)ADDR, (__mmask8)MASK, (__v4di)(__m256i)INDEX, (__v4sf)(__m128)V1, (int)SCALE) -#define _mm_i64scatter_ps(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv4sf ((void *)ADDR, (__mmask8)0xFF, (__v2di)(__m128i)INDEX, (__v4sf)(__m128)V1, (int)SCALE) -#define _mm_mask_i64scatter_ps(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv4sf ((void *)ADDR, (__mmask8)MASK, (__v2di)(__m128i)INDEX, (__v4sf)(__m128)V1, (int)SCALE) -#define _mm256_i64scatter_pd(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv4df ((void *)ADDR, (__mmask8)0xFF, (__v4di)(__m256i)INDEX, (__v4df)(__m256d)V1, (int)SCALE) -#define _mm256_mask_i64scatter_pd(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv4df ((void *)ADDR, (__mmask8)MASK, (__v4di)(__m256i)INDEX, (__v4df)(__m256d)V1, (int)SCALE) -#define _mm_i64scatter_pd(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv2df ((void *)ADDR, (__mmask8)0xFF, (__v2di)(__m128i)INDEX, (__v2df)(__m128d)V1, (int)SCALE) -#define _mm_mask_i64scatter_pd(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv2df ((void *)ADDR, (__mmask8)MASK, (__v2di)(__m128i)INDEX, (__v2df)(__m128d)V1, (int)SCALE) -#define _mm256_i32scatter_epi32(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv8si ((void *)ADDR, (__mmask8)0xFF, (__v8si)(__m256i)INDEX, (__v8si)(__m256i)V1, (int)SCALE) -#define _mm256_mask_i32scatter_epi32(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv8si ((void *)ADDR, (__mmask8)MASK, (__v8si)(__m256i)INDEX, (__v8si)(__m256i)V1, (int)SCALE) -#define _mm_i32scatter_epi32(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv4si ((void *)ADDR, (__mmask8)0xFF, (__v4si)(__m128i)INDEX, (__v4si)(__m128i)V1, (int)SCALE) -#define _mm_mask_i32scatter_epi32(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv4si ((void *)ADDR, (__mmask8)MASK, (__v4si)(__m128i)INDEX, (__v4si)(__m128i)V1, (int)SCALE) -#define _mm256_i32scatter_epi64(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv4di ((void *)ADDR, (__mmask8)0xFF, (__v4si)(__m128i)INDEX, (__v4di)(__m256i)V1, (int)SCALE) -#define _mm256_mask_i32scatter_epi64(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv4di ((void *)ADDR, (__mmask8)MASK, (__v4si)(__m128i)INDEX, (__v4di)(__m256i)V1, (int)SCALE) -#define _mm_i32scatter_epi64(ADDR,INDEX,V1,SCALE) __builtin_ia32_scattersiv2di ((void *)ADDR, (__mmask8)0xFF, (__v4si)(__m128i)INDEX, (__v2di)(__m128i)V1, (int)SCALE) -#define _mm_mask_i32scatter_epi64(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scattersiv2di ((void *)ADDR, (__mmask8)MASK, (__v4si)(__m128i)INDEX, (__v2di)(__m128i)V1, (int)SCALE) -#define _mm256_i64scatter_epi32(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv8si ((void *)ADDR, (__mmask8)0xFF, (__v4di)(__m256i)INDEX, (__v4si)(__m128i)V1, (int)SCALE) -#define _mm256_mask_i64scatter_epi32(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv8si ((void *)ADDR, (__mmask8)MASK, (__v4di)(__m256i)INDEX, (__v4si)(__m128i)V1, (int)SCALE) -#define _mm_i64scatter_epi32(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv4si ((void *)ADDR, (__mmask8)0xFF, (__v2di)(__m128i)INDEX, (__v4si)(__m128i)V1, (int)SCALE) -#define _mm_mask_i64scatter_epi32(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv4si ((void *)ADDR, (__mmask8)MASK, (__v2di)(__m128i)INDEX, (__v4si)(__m128i)V1, (int)SCALE) -#define _mm256_i64scatter_epi64(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv4di ((void *)ADDR, (__mmask8)0xFF, (__v4di)(__m256i)INDEX, (__v4di)(__m256i)V1, (int)SCALE) -#define _mm256_mask_i64scatter_epi64(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv4di ((void *)ADDR, (__mmask8)MASK, (__v4di)(__m256i)INDEX, (__v4di)(__m256i)V1, (int)SCALE) -#define _mm_i64scatter_epi64(ADDR,INDEX,V1,SCALE) __builtin_ia32_scatterdiv2di ((void *)ADDR, (__mmask8)0xFF, (__v2di)(__m128i)INDEX, (__v2di)(__m128i)V1, (int)SCALE) -#define _mm_mask_i64scatter_epi64(ADDR,MASK,INDEX,V1,SCALE) __builtin_ia32_scatterdiv2di ((void *)ADDR, (__mmask8)MASK, (__v2di)(__m128i)INDEX, (__v2di)(__m128i)V1, (int)SCALE) -#define _mm256_mask_shuffle_epi32(W,U,X,C) ((__m256i) __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_shuffle_epi32(U,X,C) ((__m256i) __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm_mask_shuffle_epi32(W,U,X,C) ((__m128i) __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_shuffle_epi32(U,X,C) ((__m128i) __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_rol_epi64(A,B) ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) -#define _mm256_mask_rol_epi64(W,U,A,B) ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_rol_epi64(U,A,B) ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm_rol_epi64(A,B) ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)-1)) -#define _mm_mask_rol_epi64(W,U,A,B) ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_rol_epi64(U,A,B) ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_ror_epi64(A,B) ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) -#define _mm256_mask_ror_epi64(W,U,A,B) ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_ror_epi64(U,A,B) ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm_ror_epi64(A,B) ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)-1)) -#define _mm_mask_ror_epi64(W,U,A,B) ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_ror_epi64(U,A,B) ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_rol_epi32(A,B) ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) -#define _mm256_mask_rol_epi32(W,U,A,B) ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_rol_epi32(U,A,B) ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm_rol_epi32(A,B) ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)-1)) -#define _mm_mask_rol_epi32(W,U,A,B) ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_rol_epi32(U,A,B) ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_ror_epi32(A,B) ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) -#define _mm256_mask_ror_epi32(W,U,A,B) ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_ror_epi32(U,A,B) ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm_ror_epi32(A,B) ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)-1)) -#define _mm_mask_ror_epi32(W,U,A,B) ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_ror_epi32(U,A,B) ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_alignr_epi32(X,Y,C) ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(X), (__mmask8)-1)) -#define _mm256_mask_alignr_epi32(W,U,X,Y,C) ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_alignr_epi32(U,X,Y,C) ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm256_alignr_epi64(X,Y,C) ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(X), (__mmask8)-1)) -#define _mm256_mask_alignr_epi64(W,U,X,Y,C) ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_alignr_epi64(U,X,Y,C) ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm_alignr_epi32(X,Y,C) ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(X), (__mmask8)-1)) -#define _mm_mask_alignr_epi32(W,U,X,Y,C) ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_alignr_epi32(U,X,Y,C) ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm_alignr_epi64(X,Y,C) ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(X), (__mmask8)-1)) -#define _mm_mask_alignr_epi64(W,U,X,Y,C) ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(X), (__mmask8)-1)) -#define _mm_maskz_alignr_epi64(U,X,Y,C) ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm_mask_cvtps_ph(W,U,A,I) ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) A, (int) (I), (__v8hi)(__m128i) (W), (__mmask8) (U))) -#define _mm_maskz_cvtps_ph(U,A,I) ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) A, (int) (I), (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U))) -#define _mm256_mask_cvtps_ph(W,U,A,I) ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) A, (int) (I), (__v8hi)(__m128i) (W), (__mmask8) (U))) -#define _mm256_maskz_cvtps_ph(U,A,I) ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) A, (int) (I), (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U))) -#define _mm256_mask_srai_epi32(W,U,A,B) ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_srai_epi32(U,A,B) ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm_mask_srai_epi32(W,U,A,B) ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_srai_epi32(U,A,B) ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_srai_epi64(A,B) ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)-1)) -#define _mm256_mask_srai_epi64(W,U,A,B) ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_srai_epi64(U,A,B) ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm_srai_epi64(A,B) ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)-1)) -#define _mm_mask_srai_epi64(W,U,A,B) ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_srai_epi64(U,A,B) ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_mask_permutex_pd(W,U,A,B) ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) -#define _mm256_maskz_permutex_pd(U,A,B) ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U))) -#define _mm256_mask_permute_pd(W,U,X,C) ((__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) -#define _mm256_maskz_permute_pd(U,X,C) ((__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U))) -#define _mm256_mask_permute_ps(W,U,X,C) ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U))) -#define _mm256_maskz_permute_ps(U,X,C) ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) -#define _mm_mask_permute_pd(W,U,X,C) ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) -#define _mm_maskz_permute_pd(U,X,C) ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U))) -#define _mm_mask_permute_ps(W,U,X,C) ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) -#define _mm_maskz_permute_ps(U,X,C) ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U))) -#define _mm256_mask_blend_pd(__U,__A,__W) ((__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) (__A), (__v4df) (__W), (__mmask8) (__U))) -#define _mm256_mask_blend_ps(__U,__A,__W) ((__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) (__A), (__v8sf) (__W), (__mmask8) (__U))) -#define _mm256_mask_blend_epi64(__U,__A,__W) ((__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) (__A), (__v4di) (__W), (__mmask8) (__U))) -#define _mm256_mask_blend_epi32(__U,__A,__W) ((__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) (__A), (__v8si) (__W), (__mmask8) (__U))) -#define _mm_mask_blend_pd(__U,__A,__W) ((__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) (__A), (__v2df) (__W), (__mmask8) (__U))) -#define _mm_mask_blend_ps(__U,__A,__W) ((__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) (__A), (__v4sf) (__W), (__mmask8) (__U))) -#define _mm_mask_blend_epi64(__U,__A,__W) ((__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) (__A), (__v2di) (__W), (__mmask8) (__U))) -#define _mm_mask_blend_epi32(__U,__A,__W) ((__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) (__A), (__v4si) (__W), (__mmask8) (__U))) -#define _mm256_cmp_epu32_mask(X,Y,P) ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)-1)) -#define _mm256_cmp_epi64_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)-1)) -#define _mm256_cmp_epi32_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)-1)) -#define _mm256_cmp_epu64_mask(X,Y,P) ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)-1)) -#define _mm256_cmp_pd_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(P), (__mmask8)-1)) -#define _mm256_cmp_ps_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(P), (__mmask8)-1)) -#define _mm256_mask_cmp_epi64_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)(M))) -#define _mm256_mask_cmp_epi32_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)(M))) -#define _mm256_mask_cmp_epu64_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)(M))) -#define _mm256_mask_cmp_epu32_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)(M))) -#define _mm256_mask_cmp_pd_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(P), (__mmask8)(M))) -#define _mm256_mask_cmp_ps_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(P), (__mmask8)(M))) -#define _mm_cmp_epi64_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)-1)) -#define _mm_cmp_epi32_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)-1)) -#define _mm_cmp_epu64_mask(X,Y,P) ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)-1)) -#define _mm_cmp_epu32_mask(X,Y,P) ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)-1)) -#define _mm_cmp_pd_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1)) -#define _mm_cmp_ps_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1)) -#define _mm_mask_cmp_epi64_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)(M))) -#define _mm_mask_cmp_epi32_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)(M))) -#define _mm_mask_cmp_epu64_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)(M))) -#define _mm_mask_cmp_epu32_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)(M))) -#define _mm_mask_cmp_pd_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)(M))) -#define _mm_mask_cmp_ps_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)(M))) -#define _mm256_permutexvar_ps(A,B) _mm256_permutevar8x32_ps ((B), (A)) -#undef __DISABLE_AVX512VL__ -#pragma GCC pop_options -#define _AVX512BWINTRIN_H_INCLUDED -#pragma GCC push_options -#pragma GCC target("avx512bw") -#define __DISABLE_AVX512BW__ -typedef short __v32hi __attribute__ ((__vector_size__ (64))); -typedef char __v64qi __attribute__ ((__vector_size__ (64))); -typedef unsigned long long __mmask64; -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF) +_mm512_avg_epu8 (__m512i __A, __m512i __B) { - *__CF = (unsigned char) __builtin_ia32_ktestcsi (__A, __B); - return (unsigned char) __builtin_ia32_ktestzsi (__A, __B); + return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF) +_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { - *__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B); - return (unsigned char) __builtin_ia32_ktestzdi (__A, __B); + return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B) +_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (unsigned char) __builtin_ia32_ktestzsi (__A, __B); + return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B) +_mm512_add_epi8 (__m512i __A, __m512i __B) { - return (unsigned char) __builtin_ia32_ktestzdi (__A, __B); + return (__m512i) ((__v64qu) __A + (__v64qu) __B); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B) +_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { - return (unsigned char) __builtin_ia32_ktestcsi (__A, __B); + return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B) +_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (unsigned char) __builtin_ia32_ktestcdi (__A, __B); + return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF) +_mm512_sub_epi8 (__m512i __A, __m512i __B) { - *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B); - return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); + return (__m512i) ((__v64qu) __A - (__v64qu) __B); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF) +_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { - *__CF = (unsigned char) __builtin_ia32_kortestcdi (__A, __B); - return (unsigned char) __builtin_ia32_kortestzdi (__A, __B); + return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B) +_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); + return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestz_mask64_u8 (__mmask64 __A, __mmask64 __B) +_mm512_avg_epu16 (__m512i __A, __m512i __B) { - return (unsigned char) __builtin_ia32_kortestzdi (__A, __B); + return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B) +_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (unsigned char) __builtin_ia32_kortestcsi (__A, __B); + return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestc_mask64_u8 (__mmask64 __A, __mmask64 __B) +_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (unsigned char) __builtin_ia32_kortestcdi (__A, __B); + return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kadd_mask32 (__mmask32 __A, __mmask32 __B) +_mm512_subs_epi8 (__m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_kaddsi ((__mmask32) __A, (__mmask32) __B); + return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kadd_mask64 (__mmask64 __A, __mmask64 __B) +_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { - return (__mmask64) __builtin_ia32_kadddi ((__mmask64) __A, (__mmask64) __B); + return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); } -extern __inline unsigned int +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_cvtmask32_u32 (__mmask32 __A) +_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A); + return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); } -extern __inline unsigned long long +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_cvtmask64_u64 (__mmask64 __A) +_mm512_subs_epu8 (__m512i __A, __m512i __B) { - return (unsigned long long) __builtin_ia32_kmovq ((__mmask64) __A); + return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_cvtu32_mask32 (unsigned int __A) +_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { - return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A); + return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_cvtu64_mask64 (unsigned long long __A) +_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_kmovq ((__mmask64) __A); + return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_load_mask32 (__mmask32 *__A) +_mm512_adds_epi8 (__m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_kmovd (*__A); + return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_load_mask64 (__mmask64 *__A) +_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { - return (__mmask64) __builtin_ia32_kmovq (*(__mmask64 *) __A); + return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_store_mask32 (__mmask32 *__A, __mmask32 __B) +_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - *(__mmask32 *) __A = __builtin_ia32_kmovd (__B); + return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_store_mask64 (__mmask64 *__A, __mmask64 __B) +_mm512_adds_epu8 (__m512i __A, __m512i __B) { - *(__mmask64 *) __A = __builtin_ia32_kmovq (__B); + return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_knot_mask32 (__mmask32 __A) +_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { - return (__mmask32) __builtin_ia32_knotsi ((__mmask32) __A); + return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_knot_mask64 (__mmask64 __A) +_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_knotdi ((__mmask64) __A); + return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kor_mask32 (__mmask32 __A, __mmask32 __B) +_mm512_sub_epi16 (__m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_korsi ((__mmask32) __A, (__mmask32) __B); + return (__m512i) ((__v32hu) __A - (__v32hu) __B); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kor_mask64 (__mmask64 __A, __mmask64 __B) +_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (__mmask64) __builtin_ia32_kordi ((__mmask64) __A, (__mmask64) __B); + return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kxnor_mask32 (__mmask32 __A, __mmask32 __B) +_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_kxnorsi ((__mmask32) __A, (__mmask32) __B); + return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kxnor_mask64 (__mmask64 __A, __mmask64 __B) +_mm512_subs_epi16 (__m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_kxnordi ((__mmask64) __A, (__mmask64) __B); + return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kxor_mask32 (__mmask32 __A, __mmask32 __B) +_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (__mmask32) __builtin_ia32_kxorsi ((__mmask32) __A, (__mmask32) __B); + return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kxor_mask64 (__mmask64 __A, __mmask64 __B) +_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_kxordi ((__mmask64) __A, (__mmask64) __B); + return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kand_mask32 (__mmask32 __A, __mmask32 __B) +_mm512_subs_epu16 (__m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_kandsi ((__mmask32) __A, (__mmask32) __B); + return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kand_mask64 (__mmask64 __A, __mmask64 __B) +_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (__mmask64) __builtin_ia32_kanddi ((__mmask64) __A, (__mmask64) __B); + return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kandn_mask32 (__mmask32 __A, __mmask32 __B) +_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_kandnsi ((__mmask32) __A, (__mmask32) __B); + return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kandn_mask64 (__mmask64 __A, __mmask64 __B) +_mm512_add_epi16 (__m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_kandndi ((__mmask64) __A, (__mmask64) __B); + return (__m512i) ((__v32hu) __A + (__v32hu) __B); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) +_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) +_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P) +_mm512_adds_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P) +_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A) +_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - __builtin_ia32_storedquhi512_mask ((short *) __P, - (__v32hi) __A, - (__mmask32) __U); + return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A) +_mm512_adds_epu16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) +_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); + return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_kunpackw (__mmask32 __A, __mmask32 __B) +_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, - (__mmask32) __B); + return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kunpackw_mask32 (__mmask16 __A, __mmask16 __B) +_mm512_srl_epi16 (__m512i __A, __m128i __B) { - return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, - (__mmask32) __B); + return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_kunpackd (__mmask64 __A, __mmask64 __B) +_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) { - return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, - (__mmask64) __B); + return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kunpackd_mask64 (__mmask32 __A, __mmask32 __B) +_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B) { - return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, - (__mmask64) __B); + return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P) +_mm512_packs_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P) +_mm512_sll_epi16 (__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); + return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) +_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) { - __builtin_ia32_storedquqi512_mask ((char *) __P, - (__v64qi) __A, - (__mmask64) __U); + return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) __W, + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_sad_epu8 (__m512i __A, __m512i __B) +_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A, - (__v64qi) __B); + return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepi16_epi8 (__m512i __A) +_mm512_maddubs_epi16 (__m512i __X, __m512i __Y) { - return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, - (__v32qi) _mm256_undefined_si256(), - (__mmask32) -1); + return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) +_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X, + __m512i __Y) { - __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); + return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) +_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y) { - return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, - (__v32qi) __O, __M); + return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) +_mm512_madd_epi16 (__m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, - (__v32qi) - _mm256_setzero_si256 (), - __M); + return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtsepi16_epi8 (__m512i __A) +_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) { - return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, - (__v32qi)_mm256_undefined_si256(), - (__mmask32) -1); + return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v16si) __W, + (__mmask16) __U); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) +_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B) { - __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); + return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) +_mm512_unpackhi_epi8 (__m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, - (__v32qi)__O, - __M); + return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) +_mm512_mask_unpackhi_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { - return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, - (__v32qi) - _mm256_setzero_si256 (), - __M); + return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtusepi16_epi8 (__m512i __A) +_mm512_maskz_unpackhi_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, - (__v32qi)_mm256_undefined_si256(), - (__mmask32) -1); + return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) +_mm512_unpackhi_epi16 (__m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, - (__v32qi) __O, - __M); + return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) +_mm512_mask_unpackhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); + return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } -extern __inline __m256i +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) +_mm512_maskz_unpackhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, - (__v32qi) - _mm256_setzero_si256 (), - __M); + return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcastb_epi8 (__m128i __A) +_mm512_unpacklo_epi8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, - (__v64qi)_mm512_undefined_epi32(), - (__mmask64) -1); + return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) +_mm512_mask_unpacklo_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, - (__v64qi) __O, - __M); + return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) +_mm512_maskz_unpacklo_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, - (__v64qi) - _mm512_setzero_si512 (), - __M); + return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) +_mm512_unpacklo_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A, - (__v64qi) __O, - __M); + return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_set1_epi8 (__mmask64 __M, char __A) +_mm512_mask_unpacklo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { - return (__m512i) - __builtin_ia32_pbroadcastb512_gpr_mask (__A, - (__v64qi) - _mm512_setzero_si512 (), - __M); + return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcastw_epi16 (__m128i __A) +_mm512_maskz_unpacklo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, - (__v32hi)_mm512_undefined_epi32(), - (__mmask32) -1); + return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) +_mm512_cmpeq_epu8_mask (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, - (__v32hi) __O, - __M); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 0, + (__mmask64) -1); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) +_mm512_cmpeq_epi8_mask (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, - (__v32hi) - _mm512_setzero_si512 (), - __M); + return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) +_mm512_mask_cmpeq_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A, - (__v32hi) __O, - __M); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 0, + __U); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_set1_epi16 (__mmask32 __M, short __A) +_mm512_mask_cmpeq_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) - __builtin_ia32_pbroadcastw512_gpr_mask (__A, - (__v32hi) - _mm512_setzero_si512 (), - __M); + return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A, + (__v64qi) __B, + __U); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mulhrs_epi16 (__m512i __A, __m512i __B) +_mm512_cmpeq_epu16_mask (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 0, (__mmask32) -1); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_cmpeq_epi16_mask (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_mask_cmpeq_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 0, + __U); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mulhi_epi16 (__m512i __A, __m512i __B) +_mm512_mask_cmpeq_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A, + (__v32hi) __B, + __U); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_cmpgt_epu8_mask (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 6, + (__mmask64) -1); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_cmpgt_epi8_mask (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mulhi_epu16 (__m512i __A, __m512i __B) +_mm512_mask_cmpgt_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 6, + __U); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_mask_cmpgt_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A, + (__v64qi) __B, + __U); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_cmpgt_epu16_mask (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 6, + (__mmask32) -1); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mullo_epi16 (__m512i __A, __m512i __B) +_mm512_cmpgt_epi16_mask (__m512i __A, __m512i __B) { - return (__m512i) ((__v32hu) __A * (__v32hu) __B); + return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_mask_cmpgt_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 6, + __U); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_mask_cmpgt_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A, + (__v32hi) __B, + __U); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepi8_epi16 (__m256i __A) +_mm512_movepi8_mask (__m512i __A) { - return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A) +_mm512_movepi16_mask (__m512i __A) { - return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, - (__v32hi) __W, - (__mmask32) __U); + return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A) +_mm512_movm_epi8 (__mmask64 __A) { - return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__m512i) __builtin_ia32_cvtmask2b512 (__A); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepu8_epi16 (__m256i __A) +_mm512_movm_epi16 (__mmask32 __A) { - return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__m512i) __builtin_ia32_cvtmask2w512 (__A); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A) +_mm512_test_epi8_mask (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, - (__v32hi) __W, - (__mmask32) __U); + return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A) +_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, + (__v64qi) __B, __U); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutexvar_epi16 (__m512i __A, __m512i __B) +_mm512_test_epi16_mask (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, - (__v32hi) __A, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, - __m512i __B) +_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, - (__v32hi) __A, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __M); + return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, + (__v32hi) __B, __U); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_testn_epi8_mask (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, - (__v32hi) __A, - (__v32hi) __W, - (__mmask32) __M); + return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B) +_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I - , - (__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, + (__v64qi) __B, __U); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U, - __m512i __I, __m512i __B) +_mm512_testn_epi16_mask (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I - , - (__v32hi) __A, + return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, (__v32hi) __B, - (__mmask32) - __U); + (__mmask32) -1); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I, - __mmask32 __U, __m512i __B) +_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A, - (__v32hi) __I - , - (__v32hi) __B, - (__mmask32) - __U); + return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, + (__v32hi) __B, __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A, - __m512i __I, __m512i __B) +_mm512_shuffle_epi8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I - , - (__v32hi) __A, - (__v32hi) __B, - (__mmask32) - __U); + return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_avg_epu8 (__m512i __A, __m512i __B) +_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) -1); + return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_min_epu16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); + return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_add_epi8 (__m512i __A, __m512i __B) +_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) ((__v64qu) __A + (__v64qu) __B); + return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_min_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); + return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_sub_epi8 (__m512i __A, __m512i __B) +_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) ((__v64qu) __A - (__v64qu) __B); + return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_max_epu8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); + return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_avg_epu16 (__m512i __A, __m512i __B) +_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_max_epi8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_subs_epi8 (__m512i __A, __m512i __B) +_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, (__v64qi) __B, (__v64qi) _mm512_setzero_si512 (), - (__mmask64) -1); + (__mmask64) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, (__v64qi) __B, (__v64qi) __W, - (__mmask64) __U); + (__mmask64) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_min_epu8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, (__v64qi) __B, (__v64qi) _mm512_setzero_si512 (), - (__mmask64) __U); + (__mmask64) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_subs_epu8 (__m512i __A, __m512i __B) +_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) -1); + return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_min_epi8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); + return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_adds_epi8 (__m512i __A, __m512i __B) +_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, + return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, (__v64qi) __B, (__v64qi) _mm512_setzero_si512 (), - (__mmask64) -1); + (__mmask64) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, + return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, (__v64qi) __B, (__v64qi) __W, - (__mmask64) __U); + (__mmask64) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_max_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) + return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) _mm512_setzero_si512 (), - (__mmask64) __U); + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_adds_epu8 (__m512i __A, __m512i __B) +_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) -1); + return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, +_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_max_epu16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); + return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_sub_epi16 (__m512i __A, __m512i __B) +_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) ((__v32hu) __A - (__v32hu) __B); + return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, +_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, - (__v32hi) __B, + return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sra_epi16 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) +{ + return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, + (__v8hi) __B, (__v32hi) __W, (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, - (__v32hi) __B, + return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, + (__v8hi) __B, (__v32hi) _mm512_setzero_si512 (), (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_subs_epi16 (__m512i __A, __m512i __B) +_mm512_srav_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, + return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) _mm512_setzero_si512 (), @@ -28699,19 +37710,19 @@ _mm512_subs_epi16 (__m512i __A, __m512i __B) } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, +_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, + return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) __W, (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, + return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) _mm512_setzero_si512 (), @@ -28719,65 +37730,39 @@ _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_subs_epu16 (__m512i __A, __m512i __B) +_mm512_srlv_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, +_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); -} -extern __inline __m512i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); -} -extern __inline __m512i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_add_epi16 (__m512i __A, __m512i __B) -{ - return (__m512i) ((__v32hu) __A + (__v32hu) __B); -} -extern __inline __m512i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) -{ - return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_adds_epi16 (__m512i __A, __m512i __B) +_mm512_sllv_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) _mm512_setzero_si512 (), @@ -28785,19 +37770,19 @@ _mm512_adds_epi16 (__m512i __A, __m512i __B) } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, +_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) __W, (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) _mm512_setzero_si512 (), @@ -28805,2380 +37790,2755 @@ _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_adds_epu16 (__m512i __A, __m512i __B) +_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) __W, + (__mmask64) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) + _mm512_setzero_si512 (), + __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_packus_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_srl_epi16 (__m512i __A, __m128i __B) +_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) __W, + (__mmask64) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m128i __B) +_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +_mm512_abs_epi8 (__m512i __A) { - return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) + return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, + (__v64qi) _mm512_setzero_si512 (), - (__mmask32) __U); + (__mmask64) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_packs_epi16 (__m512i __A, __m512i __B) +_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A) { - return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) -1); + return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, + (__v64qi) __W, + (__mmask64) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_sll_epi16 (__m512i __A, __m128i __B) +_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A) { - return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, - (__v8hi) __B, + return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_epi16 (__m512i __A) +{ + return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m128i __B) +_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A) { - return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, - (__v8hi) __B, + return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, (__v32hi) __W, (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A) { - return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, - (__v8hi) __B, + return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, (__v32hi) _mm512_setzero_si512 (), (__mmask32) __U); } -extern __inline __m512i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maddubs_epi16 (__m512i __X, __m512i __Y) -{ - return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, - (__v64qi) __Y, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); -} -extern __inline __m512i -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X, - __m512i __Y) -{ - return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, - (__v64qi) __Y, - (__v32hi) __W, - (__mmask32) __U); -} -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y) +_mm512_mask_cmpneq_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, - (__v64qi) __Y, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 4, + (__mmask64) __M); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_madd_epi16 (__m512i __A, __m512i __B) +_mm512_mask_cmplt_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 1, + (__mmask64) __M); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B) +_mm512_mask_cmpge_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v16si) __W, - (__mmask16) __U); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 5, + (__mmask64) __M); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_cmple_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 2, + (__mmask64) __M); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpackhi_epi8 (__m512i __A, __m512i __B) +_mm512_mask_cmpneq_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) -1); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 4, + (__mmask32) __M); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpackhi_epi8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +_mm512_mask_cmplt_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 1, + (__mmask32) __M); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpackhi_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_mask_cmpge_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 5, + (__mmask32) __M); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpackhi_epi16 (__m512i __A, __m512i __B) +_mm512_mask_cmple_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 2, + (__mmask32) __M); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpackhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_mask_cmpneq_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 4, + (__mmask64) __M); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpackhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_mask_cmplt_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 1, + (__mmask64) __M); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpacklo_epi8 (__m512i __A, __m512i __B) +_mm512_mask_cmpge_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) -1); + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 5, + (__mmask64) __M); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpacklo_epi8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +_mm512_mask_cmple_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 2, + (__mmask64) __M); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpacklo_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_mask_cmpneq_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 4, + (__mmask32) __M); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_unpacklo_epi16 (__m512i __A, __m512i __B) +_mm512_mask_cmplt_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 1, + (__mmask32) __M); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_unpacklo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_mask_cmpge_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 5, + (__mmask32) __M); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_unpacklo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_mask_cmple_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 2, + (__mmask32) __M); } extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpeq_epu8_mask (__m512i __A, __m512i __B) +_mm512_cmpneq_epu8_mask (__m512i __X, __m512i __Y) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, - (__v64qi) __B, 0, - (__mmask64) -1); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 4, + (__mmask64) -1); } extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpeq_epi8_mask (__m512i __A, __m512i __B) +_mm512_cmplt_epu8_mask (__m512i __X, __m512i __Y) { - return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__mmask64) -1); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 1, + (__mmask64) -1); } extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpeq_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_cmpge_epu8_mask (__m512i __X, __m512i __Y) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, - (__v64qi) __B, 0, - __U); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 5, + (__mmask64) -1); } extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpeq_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_cmple_epu8_mask (__m512i __X, __m512i __Y) { - return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A, - (__v64qi) __B, - __U); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 2, + (__mmask64) -1); } extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpeq_epu16_mask (__m512i __A, __m512i __B) +_mm512_cmpneq_epu16_mask (__m512i __X, __m512i __Y) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, - (__v32hi) __B, 0, - (__mmask32) -1); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 4, + (__mmask32) -1); } extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpeq_epi16_mask (__m512i __A, __m512i __B) +_mm512_cmplt_epu16_mask (__m512i __X, __m512i __Y) { - return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 1, + (__mmask32) -1); } extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpeq_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_cmpge_epu16_mask (__m512i __X, __m512i __Y) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, - (__v32hi) __B, 0, - __U); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 5, + (__mmask32) -1); } extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpeq_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_cmple_epu16_mask (__m512i __X, __m512i __Y) { - return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A, - (__v32hi) __B, - __U); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 2, + (__mmask32) -1); } extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpgt_epu8_mask (__m512i __A, __m512i __B) +_mm512_cmpneq_epi8_mask (__m512i __X, __m512i __Y) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, - (__v64qi) __B, 6, - (__mmask64) -1); + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 4, + (__mmask64) -1); } extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpgt_epi8_mask (__m512i __A, __m512i __B) +_mm512_cmplt_epi8_mask (__m512i __X, __m512i __Y) { - return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__mmask64) -1); + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 1, + (__mmask64) -1); } extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpgt_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_cmpge_epi8_mask (__m512i __X, __m512i __Y) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, - (__v64qi) __B, 6, - __U); + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 5, + (__mmask64) -1); } extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpgt_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_cmple_epi8_mask (__m512i __X, __m512i __Y) { - return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A, - (__v64qi) __B, - __U); + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 2, + (__mmask64) -1); } extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpgt_epu16_mask (__m512i __A, __m512i __B) +_mm512_cmpneq_epi16_mask (__m512i __X, __m512i __Y) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, - (__v32hi) __B, 6, - (__mmask32) -1); + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 4, + (__mmask32) -1); } extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpgt_epi16_mask (__m512i __A, __m512i __B) +_mm512_cmplt_epi16_mask (__m512i __X, __m512i __Y) { - return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 1, + (__mmask32) -1); } extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpgt_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_cmpge_epi16_mask (__m512i __X, __m512i __Y) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, - (__v32hi) __B, 6, - __U); + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 5, + (__mmask32) -1); } extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpgt_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_cmple_epi16_mask (__m512i __X, __m512i __Y) { - return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A, - (__v32hi) __B, - __U); + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 2, + (__mmask32) -1); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_movepi8_mask (__m512i __A) +_mm512_packs_epi32 (__m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A); + return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_movepi16_mask (__m512i __A) +_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A); + return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) + _mm512_setzero_si512 (), + __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_movm_epi8 (__mmask64 __A) +_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { - return (__m512i) __builtin_ia32_cvtmask2b512 (__A); + return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) __W, + __M); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_movm_epi16 (__mmask32 __A) +_mm512_packus_epi32 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_cvtmask2w512 (__A); + return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_test_epi8_mask (__m512i __A, __m512i __B) +_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, - (__v64qi) __B, - (__mmask64) -1); + return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) + _mm512_setzero_si512 (), + __M); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { - return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, - (__v64qi) __B, __U); + return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) __W, + __M); } extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_test_epi16_mask (__m512i __A, __m512i __B) +_kshiftli_mask32 (__mmask32 __A, unsigned int __B) { - return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return (__mmask32) __builtin_ia32_kshiftlisi ((__mmask32) __A, + (__mmask8) __B); } -extern __inline __mmask32 +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +_kshiftli_mask64 (__mmask64 __A, unsigned int __B) { - return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, - (__v32hi) __B, __U); + return (__mmask64) __builtin_ia32_kshiftlidi ((__mmask64) __A, + (__mmask8) __B); } -extern __inline __mmask64 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_testn_epi8_mask (__m512i __A, __m512i __B) +_kshiftri_mask32 (__mmask32 __A, unsigned int __B) { - return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, - (__v64qi) __B, - (__mmask64) -1); + return (__mmask32) __builtin_ia32_kshiftrisi ((__mmask32) __A, + (__mmask8) __B); } extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +_kshiftri_mask64 (__mmask64 __A, unsigned int __B) { - return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, - (__v64qi) __B, __U); + return (__mmask64) __builtin_ia32_kshiftridi ((__mmask64) __A, + (__mmask8) __B); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_testn_epi16_mask (__m512i __A, __m512i __B) +_mm512_alignr_epi8 (__m512i __A, __m512i __B, const int __N) { - return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return (__m512i) __builtin_ia32_palignr512 ((__v8di) __A, + (__v8di) __B, __N * 8); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_mask_alignr_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B, const int __N) { - return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, - (__v32hi) __B, __U); + return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A, + (__v8di) __B, + __N * 8, + (__v8di) __W, + (__mmask64) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_shuffle_epi8 (__m512i __A, __m512i __B) +_mm512_maskz_alignr_epi8 (__mmask64 __U, __m512i __A, __m512i __B, + const int __N) { - return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) -1); + return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A, + (__v8di) __B, + __N * 8, + (__v8di) + _mm512_setzero_si512 (), + (__mmask64) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +_mm512_dbsad_epu8 (__m512i __A, __m512i __B, const int __imm) { - return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A, + (__v64qi) __B, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_mask_dbsad_epu8 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B, const int __imm) { - return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); + return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A, + (__v64qi) __B, + __imm, + (__v32hi) __W, + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_min_epu16 (__m512i __A, __m512i __B) +_mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, __m512i __B, + const int __imm) { - return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A, + (__v64qi) __B, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B) +_mm512_srli_epi16 (__m512i __A, const int __imm) { - return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, - (__v32hi) __B, + return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, (__v32hi) _mm512_setzero_si512 (), - (__mmask32) __M); + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) { - return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, - (__v32hi) __B, + return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, (__v32hi) __W, - (__mmask32) __M); + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_min_epi16 (__m512i __A, __m512i __B) +_mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, const int __imm) { - return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, - (__v32hi) __B, + return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, (__v32hi) _mm512_setzero_si512 (), - (__mmask32) -1); + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B) +_mm512_slli_epi16 (__m512i __A, const int __B) { - return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, - (__v32hi) __B, + return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, (__v32hi) _mm512_setzero_si512 (), - (__mmask32) __M); + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __B) { - return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, - (__v32hi) __B, + return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, (__v32hi) __W, - (__mmask32) __M); + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_max_epu8 (__m512i __A, __m512i __B) +_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const int __B) { - return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) + return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, + (__v32hi) _mm512_setzero_si512 (), - (__mmask64) -1); + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B) +_mm512_shufflehi_epi16 (__m512i __A, const int __imm) { - return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __M); + return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_mm512_mask_shufflehi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) { - return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A, + __imm, + (__v32hi) __W, + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_max_epi8 (__m512i __A, __m512i __B) +_mm512_maskz_shufflehi_epi16 (__mmask32 __U, __m512i __A, + const int __imm) { - return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) -1); + return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B) +_mm512_shufflelo_epi16 (__m512i __A, const int __imm) { - return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __M); + return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_mm512_mask_shufflelo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) { - return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A, + __imm, + (__v32hi) __W, + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_min_epu8 (__m512i __A, __m512i __B) +_mm512_maskz_shufflelo_epi16 (__mmask32 __U, __m512i __A, + const int __imm) { - return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) -1); + return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B) +_mm512_srai_epi16 (__m512i __A, const int __imm) { - return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) + return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm, + (__v32hi) _mm512_setzero_si512 (), - (__mmask64) __M); + (__mmask32) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_mm512_mask_srai_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) { - return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm, + (__v32hi) __W, + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_min_epi8 (__m512i __A, __m512i __B) +_mm512_maskz_srai_epi16 (__mmask32 __U, __m512i __A, const int __imm) { - return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) + return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm, + (__v32hi) _mm512_setzero_si512 (), - (__mmask64) -1); + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B) +_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W) { - return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __M); + return (__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) __A, + (__v32hi) __W, + (__mmask32) __U); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W) { - return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) __A, + (__v64qi) __W, + (__mmask64) __U); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_max_epi16 (__m512i __A, __m512i __B) +_mm512_mask_cmp_epi16_mask (__mmask32 __U, __m512i __X, __m512i __Y, + const int __P) { - return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, + (__mmask32) __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epi16_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, (__mmask32) -1); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B) +_mm512_mask_cmp_epi8_mask (__mmask64 __U, __m512i __X, __m512i __Y, + const int __P) { - return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __M); + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) __U); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_cmp_epi8_mask (__m512i __X, __m512i __Y, const int __P) { - return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __M); + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) -1); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_max_epu16 (__m512i __A, __m512i __B) +_mm512_mask_cmp_epu16_mask (__mmask32 __U, __m512i __X, __m512i __Y, + const int __P) { - return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, + (__mmask32) __U); } -extern __inline __m512i +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B) +_mm512_cmp_epu16_mask (__m512i __X, __m512i __Y, const int __P) { - return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __M); + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, + (__mmask32) -1); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_mask_cmp_epu8_mask (__mmask64 __U, __m512i __X, __m512i __Y, + const int __P) { - return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __M); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) __U); } -extern __inline __m512i +extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_sra_epi16 (__m512i __A, __m128i __B) +_mm512_cmp_epu8_mask (__m512i __X, __m512i __Y, const int __P) { - return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) -1); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m128i __B) +_mm512_bslli_epi128 (__m512i __A, const int __N) { - return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i) __builtin_ia32_pslldq512 (__A, __N * 8); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +_mm512_bsrli_epi128 (__m512i __A, const int __N) { - return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__m512i) __builtin_ia32_psrldq512 (__A, __N * 8); } -extern __inline __m512i +#undef __DISABLE_AVX512BW__ +#pragma GCC pop_options +#define _AVX512DQINTRIN_H_INCLUDED +#pragma GCC push_options +#pragma GCC target("avx512dq") +#define __DISABLE_AVX512DQ__ +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_srav_epi16 (__m512i __A, __m512i __B) +_ktest_mask8_u8 (__mmask8 __A, __mmask8 __B, unsigned char *__CF) { - return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + *__CF = (unsigned char) __builtin_ia32_ktestcqi (__A, __B); + return (unsigned char) __builtin_ia32_ktestzqi (__A, __B); } -extern __inline __m512i +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_ktestz_mask8_u8 (__mmask8 __A, __mmask8 __B) { - return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (unsigned char) __builtin_ia32_ktestzqi (__A, __B); } -extern __inline __m512i +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_ktestc_mask8_u8 (__mmask8 __A, __mmask8 __B) { - return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (unsigned char) __builtin_ia32_ktestcqi (__A, __B); } -extern __inline __m512i +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_srlv_epi16 (__m512i __A, __m512i __B) +_ktest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF) { - return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + *__CF = (unsigned char) __builtin_ia32_ktestchi (__A, __B); + return (unsigned char) __builtin_ia32_ktestzhi (__A, __B); } -extern __inline __m512i +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_ktestz_mask16_u8 (__mmask16 __A, __mmask16 __B) { - return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (unsigned char) __builtin_ia32_ktestzhi (__A, __B); } -extern __inline __m512i +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_ktestc_mask16_u8 (__mmask16 __A, __mmask16 __B) { - return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (unsigned char) __builtin_ia32_ktestchi (__A, __B); } -extern __inline __m512i +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_sllv_epi16 (__m512i __A, __m512i __B) +_kortest_mask8_u8 (__mmask8 __A, __mmask8 __B, unsigned char *__CF) { - return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + *__CF = (unsigned char) __builtin_ia32_kortestcqi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzqi (__A, __B); } -extern __inline __m512i +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_kortestz_mask8_u8 (__mmask8 __A, __mmask8 __B) { - return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (unsigned char) __builtin_ia32_kortestzqi (__A, __B); } -extern __inline __m512i +extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_kortestc_mask8_u8 (__mmask8 __A, __mmask8 __B) { - return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (unsigned char) __builtin_ia32_kortestcqi (__A, __B); } -extern __inline __m512i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_kadd_mask8 (__mmask8 __A, __mmask8 __B) { - return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__mmask8) __builtin_ia32_kaddqi ((__mmask8) __A, (__mmask8) __B); } -extern __inline __m512i +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B) +_kadd_mask16 (__mmask16 __A, __mmask16 __B) { - return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) - _mm512_setzero_si512 (), - __M); + return (__mmask16) __builtin_ia32_kaddhi ((__mmask16) __A, (__mmask16) __B); } -extern __inline __m512i +extern __inline unsigned int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_packus_epi16 (__m512i __A, __m512i __B) +_cvtmask8_u32 (__mmask8 __A) { - return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) -1); + return (unsigned int) __builtin_ia32_kmovb ((__mmask8 ) __A); } -extern __inline __m512i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_cvtu32_mask8 (unsigned int __A) { - return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__mmask8) __builtin_ia32_kmovb ((__mmask8) __A); } -extern __inline __m512i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B) +_load_mask8 (__mmask8 *__A) { - return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __M); + return (__mmask8) __builtin_ia32_kmovb (*(__mmask8 *) __A); } -extern __inline __m512i +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_abs_epi8 (__m512i __A) +_store_mask8 (__mmask8 *__A, __mmask8 __B) { - return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) -1); + *(__mmask8 *) __A = __builtin_ia32_kmovb (__B); } -extern __inline __m512i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A) +_knot_mask8 (__mmask8 __A) { - return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, - (__v64qi) __W, - (__mmask64) __U); + return (__mmask8) __builtin_ia32_knotqi ((__mmask8) __A); } -extern __inline __m512i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A) +_kor_mask8 (__mmask8 __A, __mmask8 __B) { - return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); + return (__mmask8) __builtin_ia32_korqi ((__mmask8) __A, (__mmask8) __B); } -extern __inline __m512i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_abs_epi16 (__m512i __A) +_kxnor_mask8 (__mmask8 __A, __mmask8 __B) { - return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__mmask8) __builtin_ia32_kxnorqi ((__mmask8) __A, (__mmask8) __B); } -extern __inline __m512i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A) +_kxor_mask8 (__mmask8 __A, __mmask8 __B) { - return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, - (__v32hi) __W, - (__mmask32) __U); + return (__mmask8) __builtin_ia32_kxorqi ((__mmask8) __A, (__mmask8) __B); } -extern __inline __m512i +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kand_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_kandqi ((__mmask8) __A, (__mmask8) __B); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kandn_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_kandnqi ((__mmask8) __A, (__mmask8) __B); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_f64x2 (__m128d __A) +{ + return (__m512d) + __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A, + _mm512_undefined_pd (), + (__mmask8) -1); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A) +{ + return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) + __A, + (__v8df) + __O, __M); +} +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A) +_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) { - return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); + return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) + __A, + (__v8df) + _mm512_setzero_ps (), + __M); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpneq_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +_mm512_broadcast_i64x2 (__m128i __A) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 4, - (__mmask64) __M); + return (__m512i) + __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A, + _mm512_undefined_epi32 (), + (__mmask8) -1); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmplt_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 1, - (__mmask64) __M); + return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) + __A, + (__v8di) + __O, __M); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpge_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 5, - (__mmask64) __M); + return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) + __A, + (__v8di) + _mm512_setzero_si512 (), + __M); } -extern __inline __mmask64 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmple_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +_mm512_broadcast_f32x2 (__m128 __A) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 2, - (__mmask64) __M); + return (__m512) + __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, + (__v16sf)_mm512_undefined_ps (), + (__mmask16) -1); } -extern __inline __mmask32 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpneq_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 4, - (__mmask32) __M); + return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, + (__v16sf) + __O, __M); } -extern __inline __mmask32 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmplt_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 1, - (__mmask32) __M); + return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpge_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +_mm512_broadcast_i32x2 (__m128i __A) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 5, - (__mmask32) __M); + return (__m512i) + __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmple_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 2, - (__mmask32) __M); + return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) + __A, + (__v16si) + __O, __M); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpneq_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) { - return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 4, - (__mmask64) __M); + return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) + __A, + (__v16si) + _mm512_setzero_si512 (), + __M); } -extern __inline __mmask64 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmplt_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +_mm512_broadcast_f32x8 (__m256 __A) { - return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 1, - (__mmask64) __M); + return (__m512) + __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, + _mm512_undefined_ps (), + (__mmask16) -1); } -extern __inline __mmask64 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpge_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A) { - return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 5, - (__mmask64) __M); + return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, + (__v16sf)__O, + __M); } -extern __inline __mmask64 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmple_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A) { - return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 2, - (__mmask64) __M); + return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpneq_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +_mm512_broadcast_i32x8 (__m256i __A) { - return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 4, - (__mmask32) __M); + return (__m512i) + __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmplt_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A) { - return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 1, - (__mmask32) __M); + return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) + __A, + (__v16si)__O, + __M); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpge_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A) { - return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 5, - (__mmask32) __M); + return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) + __A, + (__v16si) + _mm512_setzero_si512 (), + __M); } -extern __inline __mmask32 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmple_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +_mm512_mullo_epi64 (__m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 2, - (__mmask32) __M); + return (__m512i) ((__v8du) __A * (__v8du) __B); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpneq_epu8_mask (__m512i __X, __m512i __Y) +_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 4, - (__mmask64) -1); + return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); } -extern __inline __mmask64 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmplt_epu8_mask (__m512i __X, __m512i __Y) +_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 1, - (__mmask64) -1); + return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } -extern __inline __mmask64 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpge_epu8_mask (__m512i __X, __m512i __Y) +_mm512_xor_pd (__m512d __A, __m512d __B) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 5, - (__mmask64) -1); + return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); } -extern __inline __mmask64 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmple_epu8_mask (__m512i __X, __m512i __Y) +_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { - return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 2, - (__mmask64) -1); + return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __mmask32 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpneq_epu16_mask (__m512i __X, __m512i __Y) +_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 4, - (__mmask32) -1); + return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } -extern __inline __mmask32 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmplt_epu16_mask (__m512i __X, __m512i __Y) +_mm512_xor_ps (__m512 __A, __m512 __B) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 1, - (__mmask32) -1); + return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); } -extern __inline __mmask32 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpge_epu16_mask (__m512i __X, __m512i __Y) +_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 5, - (__mmask32) -1); + return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline __mmask32 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmple_epu16_mask (__m512i __X, __m512i __Y) +_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 2, - (__mmask32) -1); + return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -extern __inline __mmask64 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpneq_epi8_mask (__m512i __X, __m512i __Y) +_mm512_or_pd (__m512d __A, __m512d __B) { - return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 4, - (__mmask64) -1); + return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); } -extern __inline __mmask64 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmplt_epi8_mask (__m512i __X, __m512i __Y) +_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 1, - (__mmask64) -1); + return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __mmask64 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpge_epi8_mask (__m512i __X, __m512i __Y) +_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 5, - (__mmask64) -1); + return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } -extern __inline __mmask64 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmple_epi8_mask (__m512i __X, __m512i __Y) +_mm512_or_ps (__m512 __A, __m512 __B) { - return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, - (__v64qi) __Y, 2, - (__mmask64) -1); + return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); } -extern __inline __mmask32 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpneq_epi16_mask (__m512i __X, __m512i __Y) +_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 4, - (__mmask32) -1); + return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline __mmask32 +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmplt_epi16_mask (__m512i __X, __m512i __Y) +_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 1, - (__mmask32) -1); + return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -extern __inline __mmask32 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmpge_epi16_mask (__m512i __X, __m512i __Y) +_mm512_and_pd (__m512d __A, __m512d __B) { - return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 5, - (__mmask32) -1); + return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); } -extern __inline __mmask32 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cmple_epi16_mask (__m512i __X, __m512i __Y) +_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { - return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, - (__v32hi) __Y, 2, - (__mmask32) -1); + return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_packs_epi32 (__m512i __A, __m512i __B) +_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B) +_mm512_and_ps (__m512 __A, __m512 __B) { - return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) - _mm512_setzero_si512 (), - __M); + return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) __W, - __M); + return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline __m512i +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_packus_epi32 (__m512i __A, __m512i __B) +_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) -1); + return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B) +_mm512_andnot_pd (__m512d __A, __m512d __B) { - return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) - _mm512_setzero_si512 (), - __M); + return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { - return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) __W, - __M); + return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); } -#define _kshiftli_mask32(X,Y) ((__mmask32) __builtin_ia32_kshiftlisi ((__mmask32)(X), (__mmask8)(Y))) -#define _kshiftli_mask64(X,Y) ((__mmask64) __builtin_ia32_kshiftlidi ((__mmask64)(X), (__mmask8)(Y))) -#define _kshiftri_mask32(X,Y) ((__mmask32) __builtin_ia32_kshiftrisi ((__mmask32)(X), (__mmask8)(Y))) -#define _kshiftri_mask64(X,Y) ((__mmask64) __builtin_ia32_kshiftridi ((__mmask64)(X), (__mmask8)(Y))) -#define _mm512_alignr_epi8(X,Y,N) ((__m512i) __builtin_ia32_palignr512 ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(N * 8))) -#define _mm512_mask_alignr_epi8(W,U,X,Y,N) ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(N * 8), (__v8di)(__m512i)(W), (__mmask64)(U))) -#define _mm512_maskz_alignr_epi8(U,X,Y,N) ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(N * 8), (__v8di)(__m512i) _mm512_setzero_si512 (), (__mmask64)(U))) -#define _mm512_dbsad_epu8(X,Y,C) ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), (__v64qi)(__m512i) (Y), (int) (C), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)-1)) -#define _mm512_mask_dbsad_epu8(W,U,X,Y,C) ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), (__v64qi)(__m512i) (Y), (int) (C), (__v32hi)(__m512i)(W), (__mmask32)(U))) -#define _mm512_maskz_dbsad_epu8(U,X,Y,C) ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), (__v64qi)(__m512i) (Y), (int) (C), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)(U))) -#define _mm512_srli_epi16(A,B) ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1)) -#define _mm512_mask_srli_epi16(W,U,A,B) ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) -#define _mm512_maskz_srli_epi16(U,A,B) ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U))) -#define _mm512_slli_epi16(X,C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)-1)) -#define _mm512_mask_slli_epi16(W,U,X,C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C), (__v32hi)(__m512i)(W), (__mmask32)(U))) -#define _mm512_maskz_slli_epi16(U,X,C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(U))) -#define _mm512_shufflehi_epi16(A,B) ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)-1)) -#define _mm512_mask_shufflehi_epi16(W,U,A,B) ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) -#define _mm512_maskz_shufflehi_epi16(U,A,B) ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)(U))) -#define _mm512_shufflelo_epi16(A,B) ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)-1)) -#define _mm512_mask_shufflelo_epi16(W,U,A,B) ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) -#define _mm512_maskz_shufflelo_epi16(U,A,B) ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)(U))) -#define _mm512_srai_epi16(A,B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1)) -#define _mm512_mask_srai_epi16(W,U,A,B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) -#define _mm512_maskz_srai_epi16(U,A,B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U))) -#define _mm512_mask_blend_epi16(__U,__A,__W) ((__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) (__A), (__v32hi) (__W), (__mmask32) (__U))) -#define _mm512_mask_blend_epi8(__U,__A,__W) ((__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) (__A), (__v64qi) (__W), (__mmask64) (__U))) -#define _mm512_cmp_epi16_mask(X,Y,P) ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(-1))) -#define _mm512_cmp_epi8_mask(X,Y,P) ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(-1))) -#define _mm512_cmp_epu16_mask(X,Y,P) ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(-1))) -#define _mm512_cmp_epu8_mask(X,Y,P) ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(-1))) -#define _mm512_mask_cmp_epi16_mask(M,X,Y,P) ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(M))) -#define _mm512_mask_cmp_epi8_mask(M,X,Y,P) ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(M))) -#define _mm512_mask_cmp_epu16_mask(M,X,Y,P) ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(M))) -#define _mm512_mask_cmp_epu8_mask(M,X,Y,P) ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(M))) -#define _mm512_bslli_epi128(A,N) ((__m512i)__builtin_ia32_pslldq512 ((__m512i)(A), (int)(N) * 8)) -#define _mm512_bsrli_epi128(A,N) ((__m512i)__builtin_ia32_psrldq512 ((__m512i)(A), (int)(N) * 8)) -#undef __DISABLE_AVX512BW__ -#pragma GCC pop_options -#define _AVX512DQINTRIN_H_INCLUDED -#pragma GCC push_options -#pragma GCC target("avx512dq") -#define __DISABLE_AVX512DQ__ -extern __inline unsigned char +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktest_mask8_u8 (__mmask8 __A, __mmask8 __B, unsigned char *__CF) +_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) { - *__CF = (unsigned char) __builtin_ia32_ktestcqi (__A, __B); - return (unsigned char) __builtin_ia32_ktestzqi (__A, __B); + return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } -extern __inline unsigned char +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktestz_mask8_u8 (__mmask8 __A, __mmask8 __B) +_mm512_andnot_ps (__m512 __A, __m512 __B) { - return (unsigned char) __builtin_ia32_ktestzqi (__A, __B); + return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); } -extern __inline unsigned char +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktestc_mask8_u8 (__mmask8 __A, __mmask8 __B) +_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) { - return (unsigned char) __builtin_ia32_ktestcqi (__A, __B); + return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); } -extern __inline unsigned char +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF) +_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) { - *__CF = (unsigned char) __builtin_ia32_ktestchi (__A, __B); - return (unsigned char) __builtin_ia32_ktestzhi (__A, __B); + return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } -extern __inline unsigned char +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktestz_mask16_u8 (__mmask16 __A, __mmask16 __B) +_mm512_movepi32_mask (__m512i __A) { - return (unsigned char) __builtin_ia32_ktestzhi (__A, __B); + return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A); } -extern __inline unsigned char +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktestc_mask16_u8 (__mmask16 __A, __mmask16 __B) +_mm512_movepi64_mask (__m512i __A) { - return (unsigned char) __builtin_ia32_ktestchi (__A, __B); + return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortest_mask8_u8 (__mmask8 __A, __mmask8 __B, unsigned char *__CF) +_mm512_movm_epi32 (__mmask16 __A) { - *__CF = (unsigned char) __builtin_ia32_kortestcqi (__A, __B); - return (unsigned char) __builtin_ia32_kortestzqi (__A, __B); + return (__m512i) __builtin_ia32_cvtmask2d512 (__A); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestz_mask8_u8 (__mmask8 __A, __mmask8 __B) +_mm512_movm_epi64 (__mmask8 __A) { - return (unsigned char) __builtin_ia32_kortestzqi (__A, __B); + return (__m512i) __builtin_ia32_cvtmask2q512 (__A); } -extern __inline unsigned char +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestc_mask8_u8 (__mmask8 __A, __mmask8 __B) +_mm512_cvttpd_epi64 (__m512d __A) { - return (unsigned char) __builtin_ia32_kortestcqi (__A, __B); + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + 0x04); } -extern __inline __mmask8 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kadd_mask8 (__mmask8 __A, __mmask8 __B) +_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { - return (__mmask8) __builtin_ia32_kaddqi ((__mmask8) __A, (__mmask8) __B); + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + 0x04); } -extern __inline __mmask16 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kadd_mask16 (__mmask16 __A, __mmask16 __B) +_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) { - return (__mmask16) __builtin_ia32_kaddhi ((__mmask16) __A, (__mmask16) __B); + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + 0x04); } -extern __inline unsigned int +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_cvtmask8_u32 (__mmask8 __A) +_mm512_cvttpd_epu64 (__m512d __A) { - return (unsigned int) __builtin_ia32_kmovb ((__mmask8 ) __A); + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + 0x04); } -extern __inline __mmask8 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_cvtu32_mask8 (unsigned int __A) +_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { - return (__mmask8) __builtin_ia32_kmovb ((__mmask8) __A); + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + 0x04); } -extern __inline __mmask8 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_load_mask8 (__mmask8 *__A) +_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) { - return (__mmask8) __builtin_ia32_kmovb (*(__mmask8 *) __A); + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + 0x04); } -extern __inline void +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_store_mask8 (__mmask8 *__A, __mmask8 __B) +_mm512_cvttps_epi64 (__m256 __A) { - *(__mmask8 *) __A = __builtin_ia32_kmovb (__B); + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + 0x04); } -extern __inline __mmask8 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_knot_mask8 (__mmask8 __A) +_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { - return (__mmask8) __builtin_ia32_knotqi ((__mmask8) __A); + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + 0x04); } -extern __inline __mmask8 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kor_mask8 (__mmask8 __A, __mmask8 __B) +_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) { - return (__mmask8) __builtin_ia32_korqi ((__mmask8) __A, (__mmask8) __B); + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + 0x04); } -extern __inline __mmask8 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kxnor_mask8 (__mmask8 __A, __mmask8 __B) +_mm512_cvttps_epu64 (__m256 __A) { - return (__mmask8) __builtin_ia32_kxnorqi ((__mmask8) __A, (__mmask8) __B); + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + 0x04); } -extern __inline __mmask8 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kxor_mask8 (__mmask8 __A, __mmask8 __B) +_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { - return (__mmask8) __builtin_ia32_kxorqi ((__mmask8) __A, (__mmask8) __B); + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + 0x04); } -extern __inline __mmask8 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kand_mask8 (__mmask8 __A, __mmask8 __B) +_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) { - return (__mmask8) __builtin_ia32_kandqi ((__mmask8) __A, (__mmask8) __B); + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + 0x04); } -extern __inline __mmask8 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kandn_mask8 (__mmask8 __A, __mmask8 __B) +_mm512_cvtpd_epi64 (__m512d __A) { - return (__mmask8) __builtin_ia32_kandnqi ((__mmask8) __A, (__mmask8) __B); + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + 0x04); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_f64x2 (__m128d __A) +_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { - return (__m512d) - __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A, - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A) +_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) { - return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) - __A, - (__v8df) - __O, __M); + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + 0x04); } -extern __inline __m512d +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) +_mm512_cvtpd_epu64 (__m512d __A) { - return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) - __A, - (__v8df) - _mm512_setzero_ps (), - __M); + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + 0x04); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_i64x2 (__m128i __A) +_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { - return (__m512i) - __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A, - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + 0x04); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A) +_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) { - return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) - __A, - (__v8di) - __O, __M); + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + 0x04); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) +_mm512_cvtps_epi64 (__m256 __A) { - return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) - __A, + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512 (), - __M); -} -extern __inline __m512 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_f32x2 (__m128 __A) -{ - return (__m512) - __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf)_mm512_undefined_ps (), - (__mmask16) -1); + (__mmask8) -1, + 0x04); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) +_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { - return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf) - __O, __M); + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m512 +extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) +_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) { - return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf) - _mm512_setzero_ps (), - __M); + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + 0x04); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_i32x2 (__m128i __A) +_mm512_cvtps_epu64 (__m256 __A) { - return (__m512i) - __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + 0x04); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) +_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { - return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) - __A, - (__v16si) - __O, __M); + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + 0x04); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) +_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) { - return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) - __A, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + 0x04); } -extern __inline __m512 +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_f32x8 (__m256 __A) +_mm512_cvtepi64_ps (__m512i __A) { - return (__m512) - __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1, + 0x04); } -extern __inline __m512 +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A) +_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) { - return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, - (__v16sf)__O, - __M); + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m512 +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A) +_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) { - return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, - (__v16sf) - _mm512_setzero_ps (), - __M); + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + 0x04); } -extern __inline __m512i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_i32x8 (__m256i __A) +_mm512_cvtepu64_ps (__m512i __A) { - return (__m512i) - __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1, + 0x04); } -extern __inline __m512i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A) +_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) - __A, - (__v16si)__O, - __M); + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m512i +extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A) +_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) - __A, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + 0x04); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mullo_epi64 (__m512i __A, __m512i __B) +_mm512_cvtepi64_pd (__m512i __A) { - return (__m512i) ((__v8du) __A * (__v8du) __B); + return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + 0x04); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, - __m512i __B) +_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m512i +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_xor_pd (__m512d __A, __m512d __B) +_mm512_cvtepu64_pd (__m512i __A) { - return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + 0x04); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B) +_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) { - return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) __W, + (__mmask8) __U, + 0x04); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) { - return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m512 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_xor_ps (__m512 __A, __m512 __B) +_kshiftli_mask8 (__mmask8 __A, unsigned int __B) { - return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1); + return (__mmask8) __builtin_ia32_kshiftliqi ((__mmask8) __A, (__mmask8) __B); } -extern __inline __m512 +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_kshiftri_mask8 (__mmask8 __A, unsigned int __B) { - return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U); + return (__mmask8) __builtin_ia32_kshiftriqi ((__mmask8) __A, (__mmask8) __B); } -extern __inline __m512 +extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm512_range_pd (__m512d __A, __m512d __B, int __C) { - return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + 0x04); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_or_pd (__m512d __A, __m512d __B) +_mm512_mask_range_pd (__m512d __W, __mmask8 __U, + __m512d __A, __m512d __B, int __C) { - return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1); + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) __W, + (__mmask8) __U, + 0x04); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +_mm512_maskz_range_pd (__mmask8 __U, __m512d __A, __m512d __B, int __C) { - return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U); + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m512d +extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm512_range_ps (__m512 __A, __m512 __B, int __C) { - return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + 0x04); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_or_ps (__m512 __A, __m512 __B) +_mm512_mask_range_ps (__m512 __W, __mmask16 __U, + __m512 __A, __m512 __B, int __C) { - return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1); + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) __W, + (__mmask16) __U, + 0x04); } extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_mm512_maskz_range_ps (__mmask16 __U, __m512 __A, __m512 __B, int __C) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 0x04); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_sd (__m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C) { - return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U); + return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) _mm_setzero_pd (), + (__mmask8) __U); } -extern __inline __m512 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm_reduce_ss (__m128 __A, __m128 __B, int __C) { - return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) _mm_setzero_ps (), + (__mmask8) -1); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_and_pd (__m512d __A, __m512d __B) +_mm_mask_reduce_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __C) { - return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1); + return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + (__mmask8) __U); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B) +_mm_maskz_reduce_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C) { - return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); } -extern __inline __m512d +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm_range_sd (__m128d __A, __m128d __B, int __C) { - return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, + 0x04); } -extern __inline __m512 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_and_ps (__m512 __A, __m512 __B) +_mm_mask_range_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, int __C) { - return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1); + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m512 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +_mm_maskz_range_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C) { - return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U); + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + 0x04); } -extern __inline __m512 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm_range_ss (__m128 __A, __m128 __B, int __C) { - return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, + 0x04); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_andnot_pd (__m512d __A, __m512d __B) +_mm_mask_range_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C) { - return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1); + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + (__mmask8) __U, + 0x04); } -extern __inline __m512d +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B) +_mm_maskz_range_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C) { - return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U); + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + 0x04); } -extern __inline __m512d +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) +_mm_range_round_sd (__m128d __A, __m128d __B, int __C, const int __R) { - return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); } -extern __inline __m512 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_andnot_ps (__m512 __A, __m512 __B) +_mm_mask_range_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + int __C, const int __R) { - return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1); + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + (__mmask8) __U, __R); } -extern __inline __m512 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B) +_mm_maskz_range_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C, + const int __R) { - return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U); + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } -extern __inline __m512 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) +_mm_range_round_ss (__m128 __A, __m128 __B, int __C, const int __R) { - return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, __R); } -extern __inline __mmask16 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_movepi32_mask (__m512i __A) +_mm_mask_range_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + int __C, const int __R) { - return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A); + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + (__mmask8) __U, __R); } -extern __inline __mmask8 +extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_movepi64_mask (__m512i __A) +_mm_maskz_range_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C, + const int __R) { - return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A); + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); } -extern __inline __m512i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_movm_epi32 (__mmask16 __A) +_mm_fpclass_ss_mask (__m128 __A, const int __imm) { - return (__m512i) __builtin_ia32_cvtmask2d512 (__A); + return (__mmask8) __builtin_ia32_fpclassss ((__v4sf) __A, __imm); } -extern __inline __m512i +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_movm_epi64 (__mmask8 __A) +_mm_fpclass_sd_mask (__m128d __A, const int __imm) { - return (__m512i) __builtin_ia32_cvtmask2q512 (__A); + return (__mmask8) __builtin_ia32_fpclasssd ((__v2df) __A, __imm); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvttpd_epi64 (__m512d __A) +_mm512_cvtt_roundpd_epi64 (__m512d __A, const int __R) { return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) +_mm512_mask_cvtt_roundpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A, + const int __R) { return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, (__v8di) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) +_mm512_maskz_cvtt_roundpd_epi64 (__mmask8 __U, __m512d __A, + const int __R) { return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvttpd_epu64 (__m512d __A) +_mm512_cvtt_roundpd_epu64 (__m512d __A, const int __R) { return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) +_mm512_mask_cvtt_roundpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A, + const int __R) { return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, (__v8di) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) +_mm512_maskz_cvtt_roundpd_epu64 (__mmask8 __U, __m512d __A, + const int __R) { return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvttps_epi64 (__m256 __A) +_mm512_cvtt_roundps_epi64 (__m256 __A, const int __R) { return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) +_mm512_mask_cvtt_roundps_epi64 (__m512i __W, __mmask8 __U, __m256 __A, + const int __R) { return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, (__v8di) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) +_mm512_maskz_cvtt_roundps_epi64 (__mmask8 __U, __m256 __A, + const int __R) { return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvttps_epu64 (__m256 __A) +_mm512_cvtt_roundps_epu64 (__m256 __A, const int __R) { return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) +_mm512_mask_cvtt_roundps_epu64 (__m512i __W, __mmask8 __U, __m256 __A, + const int __R) { return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, (__v8di) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) +_mm512_maskz_cvtt_roundps_epu64 (__mmask8 __U, __m256 __A, + const int __R) { return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtpd_epi64 (__m512d __A) +_mm512_cvt_roundpd_epi64 (__m512d __A, const int __R) { return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) +_mm512_mask_cvt_roundpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A, + const int __R) { return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, (__v8di) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) +_mm512_maskz_cvt_roundpd_epi64 (__mmask8 __U, __m512d __A, + const int __R) { return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtpd_epu64 (__m512d __A) +_mm512_cvt_roundpd_epu64 (__m512d __A, const int __R) { return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) +_mm512_mask_cvt_roundpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A, + const int __R) { return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, (__v8di) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) +_mm512_maskz_cvt_roundpd_epu64 (__mmask8 __U, __m512d __A, + const int __R) { return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtps_epi64 (__m256 __A) +_mm512_cvt_roundps_epi64 (__m256 __A, const int __R) { return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) +_mm512_mask_cvt_roundps_epi64 (__m512i __W, __mmask8 __U, __m256 __A, + const int __R) { return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, (__v8di) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) +_mm512_maskz_cvt_roundps_epi64 (__mmask8 __U, __m256 __A, + const int __R) { return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtps_epu64 (__m256 __A) +_mm512_cvt_roundps_epu64 (__m256 __A, const int __R) { return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) +_mm512_mask_cvt_roundps_epu64 (__m512i __W, __mmask8 __U, __m256 __A, + const int __R) { return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, (__v8di) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) +_mm512_maskz_cvt_roundps_epu64 (__mmask8 __U, __m256 __A, + const int __R) { return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, (__v8di) _mm512_setzero_si512 (), (__mmask8) __U, - 0x04); + __R); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepi64_ps (__m512i __A) +_mm512_cvt_roundepi64_ps (__m512i __A, const int __R) { return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, (__v8sf) _mm256_setzero_ps (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) +_mm512_mask_cvt_roundepi64_ps (__m256 __W, __mmask8 __U, __m512i __A, + const int __R) { return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, (__v8sf) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) +_mm512_maskz_cvt_roundepi64_ps (__mmask8 __U, __m512i __A, + const int __R) { return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, (__v8sf) _mm256_setzero_ps (), (__mmask8) __U, - 0x04); + __R); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepu64_ps (__m512i __A) +_mm512_cvt_roundepu64_ps (__m512i __A, const int __R) { return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, (__v8sf) _mm256_setzero_ps (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) +_mm512_mask_cvt_roundepu64_ps (__m256 __W, __mmask8 __U, __m512i __A, + const int __R) { return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, (__v8sf) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) +_mm512_maskz_cvt_roundepu64_ps (__mmask8 __U, __m512i __A, + const int __R) { return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, (__v8sf) _mm256_setzero_ps (), (__mmask8) __U, - 0x04); + __R); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepi64_pd (__m512i __A) +_mm512_cvt_roundepi64_pd (__m512i __A, const int __R) { return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, (__v8df) _mm512_setzero_pd (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) +_mm512_mask_cvt_roundepi64_pd (__m512d __W, __mmask8 __U, __m512i __A, + const int __R) { return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, (__v8df) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) +_mm512_maskz_cvt_roundepi64_pd (__mmask8 __U, __m512i __A, + const int __R) { return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, (__v8df) _mm512_setzero_pd (), (__mmask8) __U, - 0x04); + __R); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_cvtepu64_pd (__m512i __A) +_mm512_cvt_roundepu64_pd (__m512i __A, const int __R) { return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, (__v8df) _mm512_setzero_pd (), (__mmask8) -1, - 0x04); + __R); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) +_mm512_mask_cvt_roundepu64_pd (__m512d __W, __mmask8 __U, __m512i __A, + const int __R) { return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, (__v8df) __W, (__mmask8) __U, - 0x04); + __R); } extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) +_mm512_maskz_cvt_roundepu64_pd (__mmask8 __U, __m512i __A, + const int __R) { return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, (__v8df) _mm512_setzero_pd (), (__mmask8) __U, - 0x04); + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_pd (__m512d __A, int __B) +{ + return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_pd (__m512d __W, __mmask8 __U, __m512d __A, int __B) +{ + return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, + (__v8df) __W, + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_pd (__mmask8 __U, __m512d __A, int __B) +{ + return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_ps (__m512 __A, int __B) +{ + return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B) +{ + return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B, + (__v16sf) __W, + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_ps (__mmask16 __U, __m512 __A, int __B) +{ + return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf32x8_ps (__m512 __A, const int __imm) +{ + return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf32x8_ps (__m256 __W, __mmask8 __U, __m512 __A, + const int __imm) +{ + return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf32x8_ps (__mmask8 __U, __m512 __A, + const int __imm) +{ + return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf64x2_pd (__m512d __A, const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf64x2_pd (__m128d __W, __mmask8 __U, __m512d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A, + __imm, + (__v2df) __W, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf64x2_pd (__mmask8 __U, __m512d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti32x8_epi32 (__m512i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti32x8_epi32 (__m256i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A, + __imm, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti32x8_epi32 (__mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti64x2_epi64 (__m512i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A, + __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti64x2_epi64 (__m128i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A, + __imm, + (__v2di) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti64x2_epi64 (__mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A, + __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_range_round_pd (__m512d __A, __m512d __B, int __C, + const int __R) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_range_round_pd (__m512d __W, __mmask8 __U, + __m512d __A, __m512d __B, int __C, + const int __R) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) __W, + (__mmask8) __U, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_range_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + int __C, const int __R) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_range_round_ps (__m512 __A, __m512 __B, int __C, const int __R) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_range_round_ps (__m512 __W, __mmask16 __U, + __m512 __A, __m512 __B, int __C, + const int __R) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) __W, + (__mmask16) __U, + __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_range_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + int __C, const int __R) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti32x8 (__m512i __A, __m256i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A, + (__v8si) __B, + __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti32x8 (__m512i __W, __mmask16 __U, __m512i __A, + __m256i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A, + (__v8si) __B, + __imm, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti32x8 (__mmask16 __U, __m512i __A, __m256i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A, + (__v8si) __B, + __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf32x8 (__m512 __A, __m256 __B, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A, + (__v8sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf32x8 (__m512 __W, __mmask16 __U, __m512 __A, + __m256 __B, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A, + (__v8sf) __B, + __imm, + (__v16sf) __W, + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf32x8 (__mmask16 __U, __m512 __A, __m256 __B, + const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A, + (__v8sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti64x2 (__m512i __A, __m128i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A, + (__v2di) __B, + __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti64x2 (__m512i __W, __mmask8 __U, __m512i __A, + __m128i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A, + (__v2di) __B, + __imm, + (__v8di) __W, + (__mmask8) + __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti64x2 (__mmask8 __U, __m512i __A, __m128i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A, + (__v2di) __B, + __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) + __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf64x2 (__m512d __A, __m128d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A, + (__v2df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf64x2 (__m512d __W, __mmask8 __U, __m512d __A, + __m128d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A, + (__v2df) __B, + __imm, + (__v8df) __W, + (__mmask8) + __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf64x2 (__mmask8 __U, __m512d __A, __m128d __B, + const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A, + (__v2df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) + __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fpclass_pd_mask (__mmask8 __U, __m512d __A, + const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) __A, + __imm, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fpclass_pd_mask (__m512d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) __A, + __imm, + (__mmask8) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fpclass_ps_mask (__mmask16 __U, __m512 __A, + const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) __A, + __imm, __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fpclass_ps_mask (__m512 __A, const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) __A, + __imm, + (__mmask16) -1); } -#define _kshiftli_mask8(X,Y) ((__mmask8) __builtin_ia32_kshiftliqi ((__mmask8)(X), (__mmask8)(Y))) -#define _kshiftri_mask8(X,Y) ((__mmask8) __builtin_ia32_kshiftriqi ((__mmask8)(X), (__mmask8)(Y))) -#define _mm_range_sd(A,B,C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION)) -#define _mm_mask_range_sd(W,U,A,B,C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_maskz_range_sd(U,A,B,C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_range_ss(A,B,C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION)) -#define _mm_mask_range_ss(W,U,A,B,C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_maskz_range_ss(U,A,B,C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_range_round_sd(A,B,C,R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8) -1, (R))) -#define _mm_mask_range_round_sd(W,U,A,B,C,R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U), (R))) -#define _mm_maskz_range_round_sd(U,A,B,C,R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)(U), (R))) -#define _mm_range_round_ss(A,B,C,R) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8) -1, (R))) -#define _mm_mask_range_round_ss(W,U,A,B,C,R) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U), (R))) -#define _mm_maskz_range_round_ss(U,A,B,C,R) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)(U), (R))) -#define _mm512_cvtt_roundpd_epi64(A,B) ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di) _mm512_setzero_si512 (), -1, (B))) -#define _mm512_mask_cvtt_roundpd_epi64(W,U,A,B) ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)(W), (U), (B))) -#define _mm512_maskz_cvtt_roundpd_epi64(U,A,B) ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) -#define _mm512_cvtt_roundpd_epu64(A,B) ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) -#define _mm512_mask_cvtt_roundpd_epu64(W,U,A,B) ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)(W), (U), (B))) -#define _mm512_maskz_cvtt_roundpd_epu64(U,A,B) ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) -#define _mm512_cvtt_roundps_epi64(A,B) ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) -#define _mm512_mask_cvtt_roundps_epi64(W,U,A,B) ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)(W), (U), (B))) -#define _mm512_maskz_cvtt_roundps_epi64(U,A,B) ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) -#define _mm512_cvtt_roundps_epu64(A,B) ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) -#define _mm512_mask_cvtt_roundps_epu64(W,U,A,B) ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)(W), (U), (B))) -#define _mm512_maskz_cvtt_roundps_epu64(U,A,B) ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) -#define _mm512_cvt_roundpd_epi64(A,B) ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) -#define _mm512_mask_cvt_roundpd_epi64(W,U,A,B) ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)(W), (U), (B))) -#define _mm512_maskz_cvt_roundpd_epi64(U,A,B) ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) -#define _mm512_cvt_roundpd_epu64(A,B) ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) -#define _mm512_mask_cvt_roundpd_epu64(W,U,A,B) ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)(W), (U), (B))) -#define _mm512_maskz_cvt_roundpd_epu64(U,A,B) ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) -#define _mm512_cvt_roundps_epi64(A,B) ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) -#define _mm512_mask_cvt_roundps_epi64(W,U,A,B) ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)(W), (U), (B))) -#define _mm512_maskz_cvt_roundps_epi64(U,A,B) ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) -#define _mm512_cvt_roundps_epu64(A,B) ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) -#define _mm512_mask_cvt_roundps_epu64(W,U,A,B) ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)(W), (U), (B))) -#define _mm512_maskz_cvt_roundps_epu64(U,A,B) ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) -#define _mm512_cvt_roundepi64_ps(A,B) ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), -1, (B))) -#define _mm512_mask_cvt_roundepi64_ps(W,U,A,B) ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (W), (U), (B))) -#define _mm512_maskz_cvt_roundepi64_ps(U,A,B) ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), (U), (B))) -#define _mm512_cvt_roundepu64_ps(A,B) ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), -1, (B))) -#define _mm512_mask_cvt_roundepu64_ps(W,U,A,B) ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (W), (U), (B))) -#define _mm512_maskz_cvt_roundepu64_ps(U,A,B) ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), (U), (B))) -#define _mm512_cvt_roundepi64_pd(A,B) ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), -1, (B))) -#define _mm512_mask_cvt_roundepi64_pd(W,U,A,B) ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (W), (U), (B))) -#define _mm512_maskz_cvt_roundepi64_pd(U,A,B) ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), (U), (B))) -#define _mm512_cvt_roundepu64_pd(A,B) ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), -1, (B))) -#define _mm512_mask_cvt_roundepu64_pd(W,U,A,B) ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (W), (U), (B))) -#define _mm512_maskz_cvt_roundepu64_pd(U,A,B) ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), (U), (B))) -#define _mm512_reduce_pd(A,B) ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1)) -#define _mm512_mask_reduce_pd(W,U,A,B) ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), (int)(B), (__v8df)(__m512d)(W), (__mmask8)(U))) -#define _mm512_maskz_reduce_pd(U,A,B) ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)(U))) -#define _mm512_reduce_ps(A,B) ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1)) -#define _mm512_mask_reduce_ps(W,U,A,B) ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), (int)(B), (__v16sf)(__m512)(W), (__mmask16)(U))) -#define _mm512_maskz_reduce_ps(U,A,B) ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U))) -#define _mm512_extractf32x8_ps(X,C) ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8)-1)) -#define _mm512_mask_extractf32x8_ps(W,U,X,C) ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), (int) (C), (__v8sf)(__m256) (W), (__mmask8) (U))) -#define _mm512_maskz_extractf32x8_ps(U,X,C) ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8) (U))) -#define _mm512_extractf64x2_pd(X,C) ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X), (int) (C), (__v2df)(__m128d) _mm_setzero_pd (), (__mmask8)-1)) -#define _mm512_mask_extractf64x2_pd(W,U,X,C) ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X), (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U))) -#define _mm512_maskz_extractf64x2_pd(U,X,C) ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X), (int) (C), (__v2df)(__m128d) _mm_setzero_pd (), (__mmask8) (U))) -#define _mm512_extracti32x8_epi32(X,C) ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), (int) (C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)-1)) -#define _mm512_mask_extracti32x8_epi32(W,U,X,C) ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), (int) (C), (__v8si)(__m256i) (W), (__mmask8) (U))) -#define _mm512_maskz_extracti32x8_epi32(U,X,C) ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), (int) (C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8) (U))) -#define _mm512_extracti64x2_epi64(X,C) ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X), (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1)) -#define _mm512_mask_extracti64x2_epi64(W,U,X,C) ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X), (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U))) -#define _mm512_maskz_extracti64x2_epi64(U,X,C) ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X), (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U))) -#define _mm512_range_pd(A,B,C) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd (), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_range_pd(W,U,A,B,C) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_range_pd(U,A,B,C) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_range_ps(A,B,C) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_range_ps(W,U,A,B,C) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_range_ps(U,A,B,C) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_range_round_pd(A,B,C,R) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd (), (__mmask8)-1, (R))) -#define _mm512_mask_range_round_pd(W,U,A,B,C,R) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U), (R))) -#define _mm512_maskz_range_round_pd(U,A,B,C,R) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd (), (__mmask8)(U), (R))) -#define _mm512_range_round_ps(A,B,C,R) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, (R))) -#define _mm512_mask_range_round_ps(W,U,A,B,C,R) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U), (R))) -#define _mm512_maskz_range_round_ps(U,A,B,C,R) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), (R))) -#define _mm512_insertf64x2(X,Y,C) ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) (X), (__mmask8)-1)) -#define _mm512_mask_insertf64x2(W,U,X,Y,C) ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) (W), (__mmask8) (U))) -#define _mm512_maskz_insertf64x2(U,X,Y,C) ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) _mm512_setzero_pd (), (__mmask8) (U))) -#define _mm512_inserti64x2(X,Y,C) ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) (X), (__mmask8)-1)) -#define _mm512_mask_inserti64x2(W,U,X,Y,C) ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) (W), (__mmask8) (U))) -#define _mm512_maskz_inserti64x2(U,X,Y,C) ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) _mm512_setzero_si512 (), (__mmask8) (U))) -#define _mm512_insertf32x8(X,Y,C) ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X), (__v8sf)(__m256) (Y), (int) (C), (__v16sf)(__m512)_mm512_setzero_ps (), (__mmask16)-1)) -#define _mm512_mask_insertf32x8(W,U,X,Y,C) ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X), (__v8sf)(__m256) (Y), (int) (C), (__v16sf)(__m512)(W), (__mmask16)(U))) -#define _mm512_maskz_insertf32x8(U,X,Y,C) ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X), (__v8sf)(__m256) (Y), (int) (C), (__v16sf)(__m512)_mm512_setzero_ps (), (__mmask16)(U))) -#define _mm512_inserti32x8(X,Y,C) ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), (__v8si)(__m256i) (Y), (int) (C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)-1)) -#define _mm512_mask_inserti32x8(W,U,X,Y,C) ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), (__v8si)(__m256i) (Y), (int) (C), (__v16si)(__m512i)(W), (__mmask16)(U))) -#define _mm512_maskz_inserti32x8(U,X,Y,C) ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), (__v8si)(__m256i) (Y), (int) (C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) -#define _mm_fpclass_ss_mask(X,C) ((__mmask8) __builtin_ia32_fpclassss ((__v4sf) (__m128) (X), (int) (C))) -#define _mm_fpclass_sd_mask(X,C) ((__mmask8) __builtin_ia32_fpclasssd ((__v2df) (__m128d) (X), (int) (C))) -#define _mm512_mask_fpclass_pd_mask(u,X,C) ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), (int) (C), (__mmask8)(u))) -#define _mm512_mask_fpclass_ps_mask(u,x,c) ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x), (int) (c),(__mmask8)(u))) -#define _mm512_fpclass_pd_mask(X,C) ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), (int) (C), (__mmask8)-1)) -#define _mm512_fpclass_ps_mask(x,c) ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x), (int) (c),(__mmask8)-1)) -#define _mm_reduce_sd(A,B,C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)-1)) -#define _mm_mask_reduce_sd(W,U,A,B,C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) -#define _mm_maskz_reduce_sd(U,A,B,C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)(U))) -#define _mm_reduce_ss(A,B,C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)-1)) -#define _mm_mask_reduce_ss(W,U,A,B,C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) -#define _mm_maskz_reduce_ss(U,A,B,C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)(U))) #undef __DISABLE_AVX512DQ__ #pragma GCC pop_options #define _AVX512VLBWINTRIN_H_INCLUDED @@ -32286,56 +41646,473 @@ _mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A, (__v8hi) __W, (__mmask8) __M); } -#define _mm256_mask_alignr_epi8(W,U,X,Y,N) ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(N * 8), (__v4di)(__m256i)(X), (__mmask32)(U))) -#define _mm256_mask_srli_epi16(W,U,A,B) ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) -#define _mm256_maskz_srli_epi16(U,A,B) ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U))) -#define _mm_mask_srli_epi16(W,U,A,B) ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_srli_epi16(U,A,B) ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U))) -#define _mm256_mask_srai_epi16(W,U,A,B) ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) -#define _mm256_maskz_srai_epi16(U,A,B) ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U))) -#define _mm_mask_srai_epi16(W,U,A,B) ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_srai_epi16(U,A,B) ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U))) -#define _mm256_mask_shufflehi_epi16(W,U,A,B) ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) -#define _mm256_maskz_shufflehi_epi16(U,A,B) ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(U))) -#define _mm_mask_shufflehi_epi16(W,U,A,B) ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_shufflehi_epi16(U,A,B) ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_mask_shufflelo_epi16(W,U,A,B) ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) -#define _mm256_maskz_shufflelo_epi16(U,A,B) ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(U))) -#define _mm_mask_shufflelo_epi16(W,U,A,B) ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_shufflelo_epi16(U,A,B) ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_maskz_alignr_epi8(U,X,Y,N) ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(N * 8), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask32)(U))) -#define _mm_mask_alignr_epi8(W,U,X,Y,N) ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(N * 8), (__v2di)(__m128i)(X), (__mmask16)(U))) -#define _mm_maskz_alignr_epi8(U,X,Y,N) ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(N * 8), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask16)(U))) -#define _mm_mask_slli_epi16(W,U,X,C) ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C), (__v8hi)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_slli_epi16(U,X,C) ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) -#define _mm256_dbsad_epu8(X,Y,C) ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), (__v32qi)(__m256i) (Y), (int) (C), (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)-1)) -#define _mm256_mask_slli_epi16(W,U,X,C) ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C), (__v16hi)(__m256i)(W), (__mmask16)(U))) -#define _mm256_maskz_slli_epi16(U,X,C) ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(U))) -#define _mm256_mask_dbsad_epu8(W,U,X,Y,C) ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), (__v32qi)(__m256i) (Y), (int) (C), (__v16hi)(__m256i)(W), (__mmask16)(U))) -#define _mm256_maskz_dbsad_epu8(U,X,Y,C) ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), (__v32qi)(__m256i) (Y), (int) (C), (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)(U))) -#define _mm_dbsad_epu8(X,Y,C) ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), (__v16qi)(__m128i) (Y), (int) (C), (__v8hi)(__m128i)_mm_setzero_si128(), (__mmask8)-1)) -#define _mm_mask_dbsad_epu8(W,U,X,Y,C) ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), (__v16qi)(__m128i) (Y), (int) (C), (__v8hi)(__m128i)(W), (__mmask8)(U))) -#define _mm_maskz_dbsad_epu8(U,X,Y,C) ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), (__v16qi)(__m128i) (Y), (int) (C), (__v8hi)(__m128i)_mm_setzero_si128(), (__mmask8)(U))) -#define _mm_mask_blend_epi16(__U,__A,__W) ((__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) (__A), (__v8hi) (__W), (__mmask8) (__U))) -#define _mm_mask_blend_epi8(__U,__A,__W) ((__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) (__A), (__v16qi) (__W), (__mmask16) (__U))) -#define _mm256_mask_blend_epi16(__U,__A,__W) ((__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) (__A), (__v16hi) (__W), (__mmask16) (__U))) -#define _mm256_mask_blend_epi8(__U,__A,__W) ((__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) (__A), (__v32qi) (__W), (__mmask32) (__U))) -#define _mm_cmp_epi16_mask(X,Y,P) ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(-1))) -#define _mm_cmp_epi8_mask(X,Y,P) ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(-1))) -#define _mm256_cmp_epi16_mask(X,Y,P) ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(-1))) -#define _mm256_cmp_epi8_mask(X,Y,P) ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)(-1))) -#define _mm_cmp_epu16_mask(X,Y,P) ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(-1))) -#define _mm_cmp_epu8_mask(X,Y,P) ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(-1))) -#define _mm256_cmp_epu16_mask(X,Y,P) ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(-1))) -#define _mm256_cmp_epu8_mask(X,Y,P) ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)-1)) -#define _mm_mask_cmp_epi16_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask16)(M))) -#define _mm_mask_cmp_epi8_mask(M,X,Y,P) ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(M))) -#define _mm256_mask_cmp_epi16_mask(M,X,Y,P) ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(M))) -#define _mm256_mask_cmp_epi8_mask(M,X,Y,P) ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)(M))) -#define _mm_mask_cmp_epu16_mask(M,X,Y,P) ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(M))) -#define _mm_mask_cmp_epu8_mask(M,X,Y,P) ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(M))) -#define _mm256_mask_cmp_epu16_mask(M,X,Y,P) ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(M))) -#define _mm256_mask_cmp_epu8_mask(M,X,Y,P) ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)M)) +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_alignr_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B, const int __N) +{ + return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A, + (__v4di) __B, + __N * 8, + (__v4di) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_alignr_epi8 (__mmask32 __U, __m256i __A, __m256i __B, + const int __N) +{ + return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A, + (__v4di) __B, + __N * 8, + (__v4di) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_alignr_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B, const int __N) +{ + return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A, + (__v2di) __B, + __N * 8, + (__v2di) __W, + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_alignr_epi8 (__mmask16 __U, __m128i __A, __m128i __B, + const int __N) +{ + return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A, + (__v2di) __B, + __N * 8, + (__v2di) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dbsad_epu8 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A, + (__v32qi) __B, + __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dbsad_epu8 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A, + (__v32qi) __B, + __imm, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dbsad_epu8 (__mmask16 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A, + (__v32qi) __B, + __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dbsad_epu8 (__m128i __A, __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A, + (__v16qi) __B, + __imm, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dbsad_epu8 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A, + (__v16qi) __B, + __imm, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dbsad_epu8 (__mmask8 __U, __m128i __A, __m128i __B, + const int __imm) +{ + return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A, + (__v16qi) __B, + __imm, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W) +{ + return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W) +{ + return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A, + (__v16qi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W) +{ + return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W) +{ + return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epi16_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epi16_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, __P, + (__mmask8) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epi16_mask (__mmask16 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, __P, + (__mmask16) __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epi16_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, __P, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epi8_mask (__mmask16 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, __P, + (__mmask16) __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epi8_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, __P, + (__mmask16) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epi8_mask (__mmask32 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, __P, + (__mmask32) __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epi8_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, __P, + (__mmask32) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epu16_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, __P, + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epu16_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, __P, + (__mmask8) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epu16_mask (__mmask16 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, __P, + (__mmask16) __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epu16_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, __P, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epu8_mask (__mmask16 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, __P, + (__mmask16) __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epu8_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, __P, + (__mmask16) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epu8_mask (__mmask32 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, __P, + (__mmask32) __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epu8_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, __P, + (__mmask32) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srli_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srli_epi16 (__mmask16 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shufflehi_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A, + __imm, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shufflehi_epi16 (__mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A, + __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shufflehi_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shufflehi_epi16 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shufflelo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A, + __imm, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shufflelo_epi16 (__mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A, + __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shufflelo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shufflelo_epi16 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srai_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srai_epi16 (__mmask16 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srai_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srai_epi16 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_slli_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + int __B) +{ + return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_slli_epi16 (__mmask16 __U, __m256i __A, int __B) +{ + return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_slli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_cmpneq_epi8_mask (__m256i __X, __m256i __Y) @@ -32690,7 +42467,7 @@ _mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) +_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask16 __U, __m128i __A) { return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A, (__v16hi) __W, @@ -32707,7 +42484,7 @@ _mm256_maskz_cvtepi8_epi16 (__mmask16 __U, __m128i __A) } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) +_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A, (__v8hi) __W, @@ -32724,7 +42501,7 @@ _mm_maskz_cvtepi8_epi16 (__mmask8 __U, __m128i __A) } extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) +_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask16 __U, __m128i __A) { return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A, (__v16hi) __W, @@ -32741,7 +42518,7 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) } extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) +_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A, (__v8hi) __W, @@ -34592,133 +44369,133 @@ _mm_mask_cmple_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y) (__v8hi) __Y, 2, (__mmask8) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epu8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpneq_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 4, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 4, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epu8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmplt_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 1, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 1, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epu8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpge_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 5, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 5, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epu8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmple_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 2, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 2, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epu16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpneq_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 4, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 4, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epu16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmplt_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 1, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 1, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epu16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpge_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 5, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 5, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epu16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmple_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 2, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 2, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epi8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpneq_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 4, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 4, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epi8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmplt_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 1, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 1, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epi8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpge_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 5, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 5, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epi8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmple_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 2, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 2, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epi16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpneq_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 4, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 4, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epi16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmplt_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 1, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 1, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epi16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpge_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 5, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 5, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epi16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmple_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 2, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 2, + (__mmask16) __M); } #undef __DISABLE_AVX512VLBW__ #pragma GCC pop_options @@ -35896,50 +45673,429 @@ _mm256_movepi64_mask (__m256i __A) { return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A); } -#define _mm256_insertf64x2(X,Y,C) ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)-1)) -#define _mm256_mask_insertf64x2(W,U,X,Y,C) ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v4df)(__m256d)(W), (__mmask8)(U))) -#define _mm256_maskz_insertf64x2(U,X,Y,C) ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U))) -#define _mm256_inserti64x2(X,Y,C) ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) -#define _mm256_mask_inserti64x2(W,U,X,Y,C) ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v4di)(__m256i)(W), (__mmask8)(U))) -#define _mm256_maskz_inserti64x2(U,X,Y,C) ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) -#define _mm256_extractf64x2_pd(X,C) ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X), (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8)-1)) -#define _mm256_mask_extractf64x2_pd(W,U,X,C) ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X), (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U))) -#define _mm256_maskz_extractf64x2_pd(U,X,C) ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X), (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8) (U))) -#define _mm256_extracti64x2_epi64(X,C) ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X), (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1)) -#define _mm256_mask_extracti64x2_epi64(W,U,X,C) ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X), (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U))) -#define _mm256_maskz_extracti64x2_epi64(U,X,C) ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X), (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U))) -#define _mm256_reduce_pd(A,B) ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)-1)) -#define _mm256_mask_reduce_pd(W,U,A,B) ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) -#define _mm256_maskz_reduce_pd(U,A,B) ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)(U))) -#define _mm_reduce_pd(A,B) ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)-1)) -#define _mm_mask_reduce_pd(W,U,A,B) ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U))) -#define _mm_maskz_reduce_pd(U,A,B) ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)(U))) -#define _mm256_reduce_ps(A,B) ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1)) -#define _mm256_mask_reduce_ps(W,U,A,B) ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U))) -#define _mm256_maskz_reduce_ps(U,A,B) ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U))) -#define _mm_reduce_ps(A,B) ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) -#define _mm_mask_reduce_ps(W,U,A,B) ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U))) -#define _mm_maskz_reduce_ps(U,A,B) ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) -#define _mm256_range_pd(A,B,C) ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)_mm256_setzero_pd(), (__mmask8)-1)) -#define _mm256_maskz_range_pd(U,A,B,C) ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)_mm256_setzero_pd(), (__mmask8)(U))) -#define _mm_range_pd(A,B,C) ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)_mm_setzero_pd(), (__mmask8)-1)) -#define _mm256_range_ps(A,B,C) ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1)) -#define _mm256_mask_range_ps(W,U,A,B,C) ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U))) -#define _mm256_maskz_range_ps(U,A,B,C) ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U))) -#define _mm_range_ps(A,B,C) ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) -#define _mm_mask_range_ps(W,U,A,B,C) ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) -#define _mm_maskz_range_ps(U,A,B,C) ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) -#define _mm256_mask_range_pd(W,U,A,B,C) ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) -#define _mm_mask_range_pd(W,U,A,B,C) ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) -#define _mm_maskz_range_pd(U,A,B,C) ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)_mm_setzero_pd(), (__mmask8)(U))) -#define _mm256_mask_fpclass_pd_mask(u,X,C) ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), (int) (C),(__mmask8)(u))) -#define _mm256_mask_fpclass_ps_mask(u,X,C) ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X), (int) (C),(__mmask8)(u))) -#define _mm_mask_fpclass_pd_mask(u,X,C) ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), (int) (C),(__mmask8)(u))) -#define _mm_mask_fpclass_ps_mask(u,X,C) ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X), (int) (C),(__mmask8)(u))) -#define _mm256_fpclass_pd_mask(X,C) ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), (int) (C),(__mmask8)-1)) -#define _mm256_fpclass_ps_mask(X,C) ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X), (int) (C),(__mmask8)-1)) -#define _mm_fpclass_pd_mask(X,C) ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), (int) (C),(__mmask8)-1)) -#define _mm_fpclass_ps_mask(X,C) ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X), (int) (C),(__mmask8)-1)) +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf64x2_pd (__m256d __A, const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_extractf64x2_pd (__m128d __W, __mmask8 __U, __m256d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A, + __imm, + (__v2df) __W, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_extractf64x2_pd (__mmask8 __U, __m256d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extracti64x2_epi64 (__m256i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A, + __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_extracti64x2_epi64 (__m128i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A, + __imm, + (__v2di) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_extracti64x2_epi64 (__mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A, + __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_pd (__m256d __A, int __B) +{ + return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_pd (__m256d __W, __mmask8 __U, __m256d __A, int __B) +{ + return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_reduce_pd (__mmask8 __U, __m256d __A, int __B) +{ + return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_pd (__m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_pd (__m128d __W, __mmask8 __U, __m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_pd (__mmask8 __U, __m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_ps (__m256 __A, int __B) +{ + return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_ps (__m256 __W, __mmask8 __U, __m256 __A, int __B) +{ + return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_reduce_ps (__mmask8 __U, __m256 __A, int __B) +{ + return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_ps (__m128 __A, int __B) +{ + return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_ps (__m128 __W, __mmask8 __U, __m128 __A, int __B) +{ + return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_ps (__mmask8 __U, __m128 __A, int __B) +{ + return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_range_pd (__m256d __A, __m256d __B, int __C) +{ + return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, + (__v4df) __B, __C, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_range_pd (__m256d __W, __mmask8 __U, + __m256d __A, __m256d __B, int __C) +{ + return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, + (__v4df) __B, __C, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_range_pd (__mmask8 __U, __m256d __A, __m256d __B, int __C) +{ + return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, + (__v4df) __B, __C, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_range_pd (__m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_range_pd (__m128d __W, __mmask8 __U, + __m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_range_pd (__mmask8 __U, __m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_range_ps (__m256 __A, __m256 __B, int __C) +{ + return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, + (__v8sf) __B, __C, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_range_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B, + int __C) +{ + return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, + (__v8sf) __B, __C, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_range_ps (__mmask8 __U, __m256 __A, __m256 __B, int __C) +{ + return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, + (__v8sf) __B, __C, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_range_ps (__m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_range_ps (__m128 __W, __mmask8 __U, + __m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_range_ps (__mmask8 __U, __m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fpclass_pd_mask (__mmask8 __U, __m256d __A, + const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A, + __imm, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fpclass_pd_mask (__m256d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A, + __imm, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fpclass_ps_mask (__mmask8 __U, __m256 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A, + __imm, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fpclass_ps_mask (__m256 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A, + __imm, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fpclass_pd_mask (__mmask8 __U, __m128d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A, + __imm, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fpclass_pd_mask (__m128d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A, + __imm, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fpclass_ps_mask (__mmask8 __U, __m128 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A, + __imm, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fpclass_ps_mask (__m128 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A, + __imm, + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_inserti64x2 (__m256i __A, __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A, + (__v2di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_inserti64x2 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A, + (__v2di) __B, + __imm, + (__v4di) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_inserti64x2 (__mmask8 __U, __m256i __A, __m128i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A, + (__v2di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf64x2 (__m256d __A, __m128d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A, + (__v2df) __B, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_insertf64x2 (__m256d __W, __mmask8 __U, __m256d __A, + __m128d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A, + (__v2df) __B, + __imm, + (__v4df) __W, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_insertf64x2 (__mmask8 __U, __m256d __A, __m128d __B, + const int __imm) +{ + return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A, + (__v2df) __B, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) + __U); +} #undef __DISABLE_AVX512VLDQ__ #pragma GCC pop_options #define _AVX512IFMAINTRIN_H_INCLUDED @@ -36763,20 +46919,106 @@ _mm512_maskz_popcnt_epi64 (__mmask8 __U, __m512i __A) #pragma GCC push_options #pragma GCC target("avx512vbmi2") #define __DISABLE_AVX512VBMI2__ -#define _mm512_shrdi_epi16(A,B,C) ((__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)(__m512i)(A), (__v32hi)(__m512i)(B),(int)(C)) -#define _mm512_shrdi_epi32(A,B,C) ((__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)(__m512i)(A), (__v16si)(__m512i)(B),(int)(C)) -#define _mm512_mask_shrdi_epi32(A,B,C,D,E) ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(C), (__v16si)(__m512i)(D), (int)(E), (__v16si)(__m512i)(A),(__mmask16)(B)) -#define _mm512_maskz_shrdi_epi32(A,B,C,D) ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(B), (__v16si)(__m512i)(C),(int)(D), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A)) -#define _mm512_shrdi_epi64(A,B,C) ((__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B),(int)(C)) -#define _mm512_mask_shrdi_epi64(A,B,C,D,E) ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(C), (__v8di)(__m512i)(D), (int)(E), (__v8di)(__m512i)(A),(__mmask8)(B)) -#define _mm512_maskz_shrdi_epi64(A,B,C,D) ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(B), (__v8di)(__m512i)(C),(int)(D), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A)) -#define _mm512_shldi_epi16(A,B,C) ((__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)(__m512i)(A), (__v32hi)(__m512i)(B),(int)(C)) -#define _mm512_shldi_epi32(A,B,C) ((__m512i) __builtin_ia32_vpshld_v16si ((__v16si)(__m512i)(A), (__v16si)(__m512i)(B),(int)(C)) -#define _mm512_mask_shldi_epi32(A,B,C,D,E) ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(C), (__v16si)(__m512i)(D), (int)(E), (__v16si)(__m512i)(A),(__mmask16)(B)) -#define _mm512_maskz_shldi_epi32(A,B,C,D) ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(B), (__v16si)(__m512i)(C),(int)(D), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A)) -#define _mm512_shldi_epi64(A,B,C) ((__m512i) __builtin_ia32_vpshld_v8di ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B),(int)(C)) -#define _mm512_mask_shldi_epi64(A,B,C,D,E) ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(C), (__v8di)(__m512i)(D), (int)(E), (__v8di)(__m512i)(A),(__mmask8)(B)) -#define _mm512_maskz_shldi_epi64(A,B,C,D) ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(B), (__v8di)(__m512i)(C),(int)(D), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A)) +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdi_epi16 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)__A, (__v32hi) __B, + __C); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdi_epi32 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)__A, (__v16si) __B, + __C); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__C, + (__v16si) __D, __E, (__v16si) __A, (__mmask16)__B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__B, + (__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdi_epi64 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)__A, (__v8di) __B, __C); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__C, (__v8di) __D, + __E, (__v8di) __A, (__mmask8)__B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__B, (__v8di) __C, + __D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldi_epi16 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)__A, (__v32hi) __B, + __C); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldi_epi32 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshld_v16si ((__v16si)__A, (__v16si) __B, + __C); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__C, + (__v16si) __D, __E, (__v16si) __A, (__mmask16)__B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__B, + (__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldi_epi64 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshld_v8di ((__v8di)__A, (__v8di) __B, __C); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__C, (__v8di) __D, + __E, (__v8di) __A, (__mmask8)__B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__B, (__v8di) __C, + __D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A); +} extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm512_shrdv_epi16 (__m512i __A, __m512i __B, __m512i __C) @@ -36980,10 +47222,36 @@ _mm512_maskz_expandloadu_epi16 (__mmask32 __A, const void * __B) return (__m512i) __builtin_ia32_expandloadhi512_maskz ((const __v32hi *) __B, (__v32hi) _mm512_setzero_si512 (), (__mmask32) __A); } -#define _mm512_mask_shrdi_epi16(A,B,C,D,E) ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(C), (__v32hi)(__m512i)(D), (int)(E), (__v32hi)(__m512i)(A),(__mmask32)(B)) -#define _mm512_maskz_shrdi_epi16(A,B,C,D) ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(B), (__v32hi)(__m512i)(C),(int)(D), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A)) -#define _mm512_mask_shldi_epi16(A,B,C,D,E) ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(C), (__v32hi)(__m512i)(D), (int)(E), (__v32hi)(__m512i)(A),(__mmask32)(B)) -#define _mm512_maskz_shldi_epi16(A,B,C,D) ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(B), (__v32hi)(__m512i)(C),(int)(D), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A)) +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__C, + (__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__B, + (__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__C, + (__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__B, + (__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A); +} extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_shrdv_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D) @@ -37168,42 +47436,260 @@ _mm256_maskz_expandloadu_epi16 (__mmask16 __A, const void * __B) return (__m256i) __builtin_ia32_expandloadhi256_maskz ((const __v16hi *) __B, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A); } -#define _mm256_shrdi_epi16(A,B,C) ((__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)(__m256i)(A), (__v16hi)(__m256i)(B),(int)(C)) -#define _mm256_mask_shrdi_epi16(A,B,C,D,E) ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(C), (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A),(__mmask16)(B)) -#define _mm256_maskz_shrdi_epi16(A,B,C,D) ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(B), (__v16hi)(__m256i)(C),(int)(D), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A)) -#define _mm256_shrdi_epi32(A,B,C) ((__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B),(int)(C)) -#define _mm256_mask_shrdi_epi32(A,B,C,D,E) ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(C), (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A),(__mmask8)(B)) -#define _mm256_maskz_shrdi_epi32(A,B,C,D) ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(B), (__v8si)(__m256i)(C),(int)(D), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)) -#define _mm256_shrdi_epi64(A,B,C) ((__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B),(int)(C)) -#define _mm256_mask_shrdi_epi64(A,B,C,D,E) ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(C), (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A),(__mmask8)(B)) -#define _mm256_maskz_shrdi_epi64(A,B,C,D) ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(B), (__v4di)(__m256i)(C),(int)(D), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)) -#define _mm_shrdi_epi16(A,B,C) ((__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)(__m128i)(A), (__v8hi)(__m128i)(B),(int)(C)) -#define _mm_mask_shrdi_epi16(A,B,C,D,E) ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(C), (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A),(__mmask8)(B)) -#define _mm_maskz_shrdi_epi16(A,B,C,D) ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(B), (__v8hi)(__m128i)(C),(int)(D), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) -#define _mm_shrdi_epi32(A,B,C) ((__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B),(int)(C)) -#define _mm_mask_shrdi_epi32(A,B,C,D,E) ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(C), (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask16)(B)) -#define _mm_maskz_shrdi_epi32(A,B,C,D) ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B), (__v4si)(__m128i)(C),(int)(D), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) -#define _mm_shrdi_epi64(A,B,C) ((__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B),(int)(C)) -#define _mm_mask_shrdi_epi64(A,B,C,D,E) ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(C), (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A),(__mmask8)(B)) -#define _mm_maskz_shrdi_epi64(A,B,C,D) ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(B), (__v2di)(__m128i)(C),(int)(D), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) -#define _mm256_shldi_epi16(A,B,C) ((__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)(__m256i)(A), (__v16hi)(__m256i)(B),(int)(C)) -#define _mm256_mask_shldi_epi16(A,B,C,D,E) ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(C), (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A),(__mmask16)(B)) -#define _mm256_maskz_shldi_epi16(A,B,C,D) ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(B), (__v16hi)(__m256i)(C),(int)(D), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A)) -#define _mm256_shldi_epi32(A,B,C) ((__m256i) __builtin_ia32_vpshld_v8si ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B),(int)(C)) -#define _mm256_mask_shldi_epi32(A,B,C,D,E) ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(C), (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A),(__mmask8)(B)) -#define _mm256_maskz_shldi_epi32(A,B,C,D) ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(B), (__v8si)(__m256i)(C),(int)(D), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)) -#define _mm256_shldi_epi64(A,B,C) ((__m256i) __builtin_ia32_vpshld_v4di ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B),(int)(C)) -#define _mm256_mask_shldi_epi64(A,B,C,D,E) ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(C), (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A),(__mmask8)(B)) -#define _mm256_maskz_shldi_epi64(A,B,C,D) ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(B), (__v4di)(__m256i)(C),(int)(D), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)) -#define _mm_shldi_epi16(A,B,C) ((__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)(__m128i)(A), (__v8hi)(__m128i)(B),(int)(C)) -#define _mm_mask_shldi_epi16(A,B,C,D,E) ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(C), (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A),(__mmask8)(B)) -#define _mm_maskz_shldi_epi16(A,B,C,D) ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(B), (__v8hi)(__m128i)(C),(int)(D), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) -#define _mm_shldi_epi32(A,B,C) ((__m128i) __builtin_ia32_vpshld_v4si ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B),(int)(C)) -#define _mm_mask_shldi_epi32(A,B,C,D,E) ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(C), (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask16)(B)) -#define _mm_maskz_shldi_epi32(A,B,C,D) ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B), (__v4si)(__m128i)(C),(int)(D), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) -#define _mm_shldi_epi64(A,B,C) ((__m128i) __builtin_ia32_vpshld_v2di ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B),(int)(C)) -#define _mm_mask_shldi_epi64(A,B,C,D,E) ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(C), (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A),(__mmask8)(B)) -#define _mm_maskz_shldi_epi64(A,B,C,D) ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(B), (__v2di)(__m128i)(C),(int)(D), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdi_epi16 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)__A, (__v16hi) __B, + __C); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__C, + (__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__B, + (__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__C, (__v8si) __D, + __E, (__v8si) __A, (__mmask8)__B); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__B, (__v8si) __C, + __D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdi_epi32 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)__A, (__v8si) __B, __C); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__C, (__v4di) __D, + __E, (__v4di) __A, (__mmask8)__B); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__B, (__v4di) __C, + __D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdi_epi64 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)__A, (__v4di) __B, __C); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__C, (__v8hi) __D, + __E, (__v8hi) __A, (__mmask8)__B); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__B, (__v8hi) __C, + __D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdi_epi16 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)__A, (__v8hi) __B, __C); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__C, (__v4si) __D, + __E, (__v4si) __A, (__mmask8)__B); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__B, (__v4si) __C, + __D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdi_epi32 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)__A, (__v4si) __B, __C); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__C, (__v2di) __D, + __E, (__v2di) __A, (__mmask8)__B); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__B, (__v2di) __C, + __D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdi_epi64 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)__A, (__v2di) __B, __C); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldi_epi16 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)__A, (__v16hi) __B, + __C); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__C, + (__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__B, + (__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__C, (__v8si) __D, + __E, (__v8si) __A, (__mmask8)__B); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__B, (__v8si) __C, + __D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldi_epi32 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshld_v8si ((__v8si)__A, (__v8si) __B, __C); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__C, (__v4di) __D, + __E, (__v4di) __A, (__mmask8)__B); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__B, (__v4di) __C, + __D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldi_epi64 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshld_v4di ((__v4di)__A, (__v4di) __B, __C); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__C, (__v8hi) __D, + __E, (__v8hi) __A, (__mmask8)__B); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__B, (__v8hi) __C, + __D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldi_epi16 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)__A, (__v8hi) __B, __C); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__C, (__v4si) __D, + __E, (__v4si) __A, (__mmask8)__B); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__B, (__v4si) __C, + __D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldi_epi32 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshld_v4si ((__v4si)__A, (__v4si) __B, __C); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__C, (__v2di) __D, + __E, (__v2di) __A, (__mmask8)__B); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__B, (__v2di) __C, + __D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldi_epi64 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshld_v2di ((__v2di)__A, (__v2di) __B, __C); +} extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_shrdv_epi16 (__m256i __A, __m256i __B, __m256i __C) @@ -37948,7 +48434,7 @@ _mm512_bitshuffle_epi64_mask (__m512i __A, __m512i __B) } extern __inline __mmask64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_bitshuffle_epi64_mask (__mmask8 __M, __m512i __A, __m512i __B) +_mm512_mask_bitshuffle_epi64_mask (__mmask64 __M, __m512i __A, __m512i __B) { return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A, (__v64qi) __B, @@ -38112,7 +48598,12 @@ _mm_sha1nexte_epu32 (__m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_sha1nexte ((__v4si) __A, (__v4si) __B); } -#define _mm_sha1rnds4_epu32(A,B,I) ((__m128i) __builtin_ia32_sha1rnds4 ((__v4si)(__m128i)A, (__v4si)(__m128i)B, (int)I)) +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1rnds4_epu32 (__m128i __A, __m128i __B, const int __I) +{ + return (__m128i) __builtin_ia32_sha1rnds4 ((__v4si) __A, (__v4si) __B, __I); +} extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_sha256msg1_epu32 (__m128i __A, __m128i __B) @@ -38135,6 +48626,9 @@ _mm_sha256rnds2_epu32 (__m128i __A, __m128i __B, __m128i __C) #undef __DISABLE_SHA__ #pragma GCC pop_options #define _LZCNTINTRIN_H_INCLUDED +#pragma GCC push_options +#pragma GCC target("lzcnt") +#define __DISABLE_LZCNT__ extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __lzcnt16 (unsigned short __X) { @@ -38160,6 +48654,8 @@ _lzcnt_u64 (unsigned long long __X) { return __builtin_ia32_lzcnt_u64 (__X); } +#undef __DISABLE_LZCNT__ +#pragma GCC pop_options #define _BMIINTRIN_H_INCLUDED #pragma GCC push_options #pragma GCC target("bmi") @@ -38587,9 +49083,23 @@ _mm256_cvtph_ps (__m128i __A) { return (__m256) __builtin_ia32_vcvtph2ps256 ((__v8hi) __A); } -#define _cvtss_sh(__F,__I) (__extension__ ({ __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 }; __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I); (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0); })) -#define _mm_cvtps_ph(A,I) ((__m128i) __builtin_ia32_vcvtps2ph ((__v4sf)(__m128) A, (int) (I))) -#define _mm256_cvtps_ph(A,I) ((__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf)(__m256) A, (int) (I))) +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_cvtss_sh (float __F, const int __I) +{ + __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 }; + __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I); + return (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_ph (__m128 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph ((__v4sf) __A, __I); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_ph (__m256 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf) __A, __I); +} #undef __DISABLE_F16C__ #pragma GCC pop_options #define _RTMINTRIN_H_INCLUDED @@ -38616,7 +49126,12 @@ _xend (void) { __builtin_ia32_xend (); } -#define _xabort(N) __builtin_ia32_xabort (N) +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xabort (const unsigned int __imm) +{ + __builtin_ia32_xabort (__imm); +} #undef __DISABLE_RTM__ #pragma GCC pop_options #define _XTESTINTRIN_H_INCLUDED @@ -38708,8 +49223,21 @@ _mm_gf2p8mul_epi8 (__m128i __A, __m128i __B) return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A, (__v16qi) __B); } -#define _mm_gf2p8affineinv_epi64_epi8(A,B,C) ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(C))) -#define _mm_gf2p8affine_epi64_epi8(A,B,C) ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(C))) +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_gf2p8affineinv_epi64_epi8 (__m128i __A, __m128i __B, const int __C) +{ + return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi ((__v16qi) __A, + (__v16qi) __B, + __C); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_gf2p8affine_epi64_epi8 (__m128i __A, __m128i __B, const int __C) +{ + return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi) __A, + (__v16qi) __B, __C); +} #undef __DISABLE_GFNI__ #pragma GCC pop_options #pragma GCC push_options @@ -38722,8 +49250,21 @@ _mm256_gf2p8mul_epi8 (__m256i __A, __m256i __B) return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi ((__v32qi) __A, (__v32qi) __B); } -#define _mm256_gf2p8affineinv_epi64_epi8(A,B,C) ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), (__v32qi)(__m256i)(B), (int)(C))) -#define _mm256_gf2p8affine_epi64_epi8(A,B,C) ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi)(__m256i)(A), ( __v32qi)(__m256i)(B), (int)(C))) +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_gf2p8affineinv_epi64_epi8 (__m256i __A, __m256i __B, const int __C) +{ + return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi ((__v32qi) __A, + (__v32qi) __B, + __C); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_gf2p8affine_epi64_epi8 (__m256i __A, __m256i __B, const int __C) +{ + return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi) __A, + (__v32qi) __B, __C); +} #undef __DISABLE_GFNIAVX__ #pragma GCC pop_options #pragma GCC push_options @@ -38744,10 +49285,43 @@ _mm_maskz_gf2p8mul_epi8 (__mmask16 __A, __m128i __B, __m128i __C) return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __B, (__v16qi) __C, (__v16qi) _mm_setzero_si128 (), __A); } -#define _mm_mask_gf2p8affineinv_epi64_epi8(A,B,C,D,E) ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask( (__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B))) -#define _mm_maskz_gf2p8affineinv_epi64_epi8(A,B,C,D) ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask( (__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), (__v16qi)(__m128i) _mm_setzero_si128 (), (__mmask16)(A))) -#define _mm_mask_gf2p8affine_epi64_epi8(A,B,C,D,E) ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B))) -#define _mm_maskz_gf2p8affine_epi64_epi8(A,B,C,D) ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), (__v16qi)(__m128i) _mm_setzero_si128 (), (__mmask16)(A))) +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_gf2p8affineinv_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C, + __m128i __D, const int __E) +{ + return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __C, + (__v16qi) __D, + __E, + (__v16qi)__A, + __B); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_gf2p8affineinv_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C, + const int __D) +{ + return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __B, + (__v16qi) __C, __D, + (__v16qi) _mm_setzero_si128 (), + __A); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_gf2p8affine_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C, + __m128i __D, const int __E) +{ + return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __C, + (__v16qi) __D, __E, (__v16qi)__A, __B); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_gf2p8affine_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C, + const int __D) +{ + return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __B, + (__v16qi) __C, __D, (__v16qi) _mm_setzero_si128 (), __A); +} #undef __DISABLE_GFNIAVX512VL__ #pragma GCC pop_options #pragma GCC push_options @@ -38769,10 +49343,45 @@ _mm256_maskz_gf2p8mul_epi8 (__mmask32 __A, __m256i __B, __m256i __C) return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __B, (__v32qi) __C, (__v32qi) _mm256_setzero_si256 (), __A); } -#define _mm256_mask_gf2p8affineinv_epi64_epi8(A,B,C,D,E) ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask( (__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), (__v32qi)(__m256i)(A), (__mmask32)(B))) -#define _mm256_maskz_gf2p8affineinv_epi64_epi8(A,B,C,D) ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask( (__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A))) -#define _mm256_mask_gf2p8affine_epi64_epi8(A,B,C,D,E) ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), (__v32qi)(__m256i)(A), (__mmask32)(B))) -#define _mm256_maskz_gf2p8affine_epi64_epi8(A,B,C,D) ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A))) +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_gf2p8affineinv_epi64_epi8 (__m256i __A, __mmask32 __B, + __m256i __C, __m256i __D, const int __E) +{ + return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __C, + (__v32qi) __D, + __E, + (__v32qi)__A, + __B); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_gf2p8affineinv_epi64_epi8 (__mmask32 __A, __m256i __B, + __m256i __C, const int __D) +{ + return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __B, + (__v32qi) __C, __D, + (__v32qi) _mm256_setzero_si256 (), __A); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_gf2p8affine_epi64_epi8 (__m256i __A, __mmask32 __B, __m256i __C, + __m256i __D, const int __E) +{ + return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __C, + (__v32qi) __D, + __E, + (__v32qi)__A, + __B); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_gf2p8affine_epi64_epi8 (__mmask32 __A, __m256i __B, + __m256i __C, const int __D) +{ + return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __B, + (__v32qi) __C, __D, (__v32qi)_mm256_setzero_si256 (), __A); +} #undef __DISABLE_GFNIAVX512VLBW__ #pragma GCC pop_options #pragma GCC push_options @@ -38800,12 +49409,56 @@ _mm512_gf2p8mul_epi8 (__m512i __A, __m512i __B) return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi ((__v64qi) __A, (__v64qi) __B); } -#define _mm512_mask_gf2p8affineinv_epi64_epi8(A,B,C,D,E) ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask( (__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), (__v64qi)(__m512i)(A), (__mmask64)(B))) -#define _mm512_maskz_gf2p8affineinv_epi64_epi8(A,B,C,D) ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask( (__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A))) -#define _mm512_gf2p8affineinv_epi64_epi8(A,B,C) ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ( (__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C))) -#define _mm512_mask_gf2p8affine_epi64_epi8(A,B,C,D,E) ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), (__v64qi)(__m512i)(A), (__mmask64)(B))) -#define _mm512_maskz_gf2p8affine_epi64_epi8(A,B,C,D) ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A))) -#define _mm512_gf2p8affine_epi64_epi8(A,B,C) ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C))) +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_gf2p8affineinv_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C, + __m512i __D, const int __E) +{ + return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __C, + (__v64qi) __D, + __E, + (__v64qi)__A, + __B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_gf2p8affineinv_epi64_epi8 (__mmask64 __A, __m512i __B, + __m512i __C, const int __D) +{ + return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __B, + (__v64qi) __C, __D, + (__v64qi) _mm512_setzero_si512 (), __A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_gf2p8affineinv_epi64_epi8 (__m512i __A, __m512i __B, const int __C) +{ + return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ((__v64qi) __A, + (__v64qi) __B, __C); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_gf2p8affine_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C, + __m512i __D, const int __E) +{ + return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __C, + (__v64qi) __D, __E, (__v64qi)__A, __B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_gf2p8affine_epi64_epi8 (__mmask64 __A, __m512i __B, __m512i __C, + const int __D) +{ + return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __B, + (__v64qi) __C, __D, (__v64qi) _mm512_setzero_si512 (), __A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_gf2p8affine_epi64_epi8 (__m512i __A, __m512i __B, const int __C) +{ + return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi) __A, + (__v64qi) __B, __C); +} #undef __DISABLE_GFNIAVX512FBW__ #pragma GCC pop_options #define __VAESINTRIN_H_INCLUDED @@ -38906,19 +49559,37 @@ _mm_aesenclast_epi128 (__m128i __A, __m128i __B) #pragma GCC push_options #pragma GCC target("vpclmulqdq,avx512f") #define __DISABLE_VPCLMULQDQF__ -#define _mm512_clmulepi64_epi128(A,B,C) ((__m512i) __builtin_ia32_vpclmulqdq_v8di ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (int)(C))) +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_clmulepi64_epi128 (__m512i __A, __m512i __B, const int __C) +{ + return (__m512i) __builtin_ia32_vpclmulqdq_v8di ((__v8di)__A, + (__v8di) __B, __C); +} #undef __DISABLE_VPCLMULQDQF__ #pragma GCC pop_options #pragma GCC push_options #pragma GCC target("vpclmulqdq,avx512vl") #define __DISABLE_VPCLMULQDQVL__ -#define _mm_clmulepi64_epi128(A,B,C) ((__m128i) __builtin_ia32_vpclmulqdq_v2di ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (int)(C))) +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clmulepi64_epi128 (__m128i __A, __m128i __B, const int __C) +{ + return (__m128i) __builtin_ia32_vpclmulqdq_v2di ((__v2di)__A, + (__v2di) __B, __C); +} #undef __DISABLE_VPCLMULQDQVL__ #pragma GCC pop_options #pragma GCC push_options #pragma GCC target("vpclmulqdq,avx512vl") #define __DISABLE_VPCLMULQDQ__ -#define _mm256_clmulepi64_epi128(A,B,C) ((__m256i) __builtin_ia32_vpclmulqdq_v4di ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (int)(C))) +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_clmulepi64_epi128 (__m256i __A, __m256i __B, const int __C) +{ + return (__m256i) __builtin_ia32_vpclmulqdq_v4di ((__v4di)__A, + (__v4di) __B, __C); +} #undef __DISABLE_VPCLMULQDQ__ #pragma GCC pop_options #define _MOVDIRINTRIN_H_INCLUDED @@ -39055,6 +49726,9 @@ _m_prefetchw (void *__P) { __builtin_prefetch (__P, 1, 3 ); } +#pragma GCC push_options +#pragma GCC target("sse,3dnow") +#define __DISABLE_3dNOW__ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _m_femms (void) { @@ -39172,6 +49846,11 @@ _m_to_float (__m64 __A) __tmp.v = (__v2sf)__A; return __tmp.a[0]; } +#undef __DISABLE_3dNOW__ +#pragma GCC pop_options +#pragma GCC push_options +#pragma GCC target("sse,3dnowa") +#define __DISABLE_3dNOW_A__ extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _m_pf2iw (__m64 __A) { @@ -39197,6 +49876,8 @@ _m_pswapd (__m64 __A) { return (__m64)__builtin_ia32_pswapdsf ((__v2sf)__A); } +#undef __DISABLE_3dNOW_A__ +#pragma GCC pop_options #define _FMA4INTRIN_H_INCLUDED #pragma GCC push_options #pragma GCC target("fma4") @@ -39532,10 +50213,26 @@ _mm_rot_epi64(__m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vprotq ((__v2di)__A, (__v2di)__B); } -#define _mm_roti_epi8(A,N) ((__m128i) __builtin_ia32_vprotbi ((__v16qi)(__m128i)(A), (int)(N))) -#define _mm_roti_epi16(A,N) ((__m128i) __builtin_ia32_vprotwi ((__v8hi)(__m128i)(A), (int)(N))) -#define _mm_roti_epi32(A,N) ((__m128i) __builtin_ia32_vprotdi ((__v4si)(__m128i)(A), (int)(N))) -#define _mm_roti_epi64(A,N) ((__m128i) __builtin_ia32_vprotqi ((__v2di)(__m128i)(A), (int)(N))) +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi8(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotbi ((__v16qi)__A, __B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi16(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotwi ((__v8hi)__A, __B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi32(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotdi ((__v4si)__A, __B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi64(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotqi ((__v2di)__A, __B); +} extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_shl_epi8(__m128i __A, __m128i __B) { @@ -39930,10 +50627,38 @@ _mm256_frcz_pd (__m256d __A) { return (__m256d) __builtin_ia32_vfrczpd256 ((__v4df)__A); } -#define _mm_permute2_pd(X,Y,C,I) ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128d)(C), (int)(I))) -#define _mm256_permute2_pd(X,Y,C,I) ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (__v4di)(__m256d)(C), (int)(I))) -#define _mm_permute2_ps(X,Y,C,I) ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128)(C), (int)(I))) -#define _mm256_permute2_ps(X,Y,C,I) ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (__v8si)(__m256)(C), (int)(I))) +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I) +{ + return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X, + (__v2df)__Y, + (__v2di)__C, + __I); +} +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I) +{ + return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X, + (__v4df)__Y, + (__v4di)__C, + __I); +} +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I) +{ + return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X, + (__v4sf)__Y, + (__v4si)__C, + __I); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I) +{ + return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X, + (__v8sf)__Y, + (__v8si)__C, + __I); +} #undef __DISABLE_XOP__ #pragma GCC pop_options #define _LWPINTRIN_H_INCLUDED @@ -39950,17 +50675,39 @@ __slwpcb (void) { return __builtin_ia32_slwpcb (); } -#define __lwpval32(D2,D1,F) (__builtin_ia32_lwpval32 ((unsigned int) (D2), (unsigned int) (D1), (unsigned int) (F))) -#define __lwpval64(D2,D1,F) (__builtin_ia32_lwpval64 ((unsigned long long) (D2), (unsigned int) (D1), (unsigned int) (F))) -#define __lwpins32(D2,D1,F) (__builtin_ia32_lwpins32 ((unsigned int) (D2), (unsigned int) (D1), (unsigned int) (F))) -#define __lwpins64(D2,D1,F) (__builtin_ia32_lwpins64 ((unsigned long long) (D2), (unsigned int) (D1), (unsigned int) (F))) +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpval32 (unsigned int __data2, unsigned int __data1, unsigned int __flags) +{ + __builtin_ia32_lwpval32 (__data2, __data1, __flags); +} +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpval64 (unsigned long long __data2, unsigned int __data1, + unsigned int __flags) +{ + __builtin_ia32_lwpval64 (__data2, __data1, __flags); +} +extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpins32 (unsigned int __data2, unsigned int __data1, unsigned int __flags) +{ + return __builtin_ia32_lwpins32 (__data2, __data1, __flags); +} +extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpins64 (unsigned long long __data2, unsigned int __data1, + unsigned int __flags) +{ + return __builtin_ia32_lwpins64 (__data2, __data1, __flags); +} #undef __DISABLE_LWP__ #pragma GCC pop_options #define _TBMINTRIN_H_INCLUDED #pragma GCC push_options #pragma GCC target("tbm") #define __DISABLE_TBM__ -#define __bextri_u32(X,I) ((unsigned int)__builtin_ia32_bextri_u32 ((unsigned int)(X), (unsigned int)(I))) +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextri_u32 (unsigned int __X, const unsigned int __I) +{ + return __builtin_ia32_bextri_u32 (__X, __I); +} extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __blcfill_u32 (unsigned int __X) { @@ -40006,7 +50753,11 @@ __tzmsk_u32 (unsigned int __X) { return ~__X & (__X - 1); } -#define __bextri_u64(X,I) ((unsigned long long)__builtin_ia32_bextri_u64 ((unsigned long long)(X), (unsigned long long)(I))) +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextri_u64 (unsigned long long __X, const unsigned int __I) +{ + return __builtin_ia32_bextri_u64 (__X, __I); +} extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __blcfill_u64 (unsigned long long __X) { @@ -40129,7 +50880,7 @@ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _xgetbv (unsigned int __A) { - __builtin_ia32_xgetbv (__A); + return __builtin_ia32_xgetbv (__A); } extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -40553,6 +51304,22 @@ _wrpkru (unsigned int __key) DWORD64 _umul128(DWORD64 Multiplier,DWORD64 Multiplicand,DWORD64 *HighProduct); LONG64 MultiplyExtract128(LONG64 Multiplier,LONG64 Multiplicand,BYTE Shift); DWORD64 UnsignedMultiplyExtract128(DWORD64 Multiplier,DWORD64 Multiplicand,BYTE Shift); + extern inline __attribute__((__gnu_inline__)) LONG64 MultiplyExtract128(LONG64 Multiplier,LONG64 Multiplicand,BYTE Shift) { + LONG64 extractedProduct; + LONG64 highProduct; + LONG64 lowProduct; + lowProduct = _mul128(Multiplier,Multiplicand,&highProduct); + extractedProduct = (LONG64)__shiftright128((LONG64)lowProduct,(LONG64)highProduct,Shift); + return extractedProduct; + } + extern inline __attribute__((__gnu_inline__)) DWORD64 UnsignedMultiplyExtract128(DWORD64 Multiplier,DWORD64 Multiplicand,BYTE Shift) { + DWORD64 extractedProduct; + DWORD64 highProduct; + DWORD64 lowProduct; + lowProduct = _umul128(Multiplier,Multiplicand,&highProduct); + extractedProduct = __shiftright128(lowProduct,highProduct,Shift); + return extractedProduct; + } #define EXCEPTION_READ_FAULT 0 #define EXCEPTION_WRITE_FAULT 1 #define EXCEPTION_EXECUTE_FAULT 8 @@ -44860,6 +55627,11 @@ typedef DWORD ( *PRTL_RUN_ONCE_INIT_FN)(PRTL_RUN_ONCE, PVOID, PVOID *); #define RtlFillMemory(Destination,Length,Fill) memset((Destination),(Fill),(Length)) #define RtlZeroMemory(Destination,Length) memset((Destination),0,(Length)) PVOID RtlSecureZeroMemory(PVOID ptr,SIZE_T cnt); + extern inline __attribute__((__gnu_inline__)) PVOID RtlSecureZeroMemory(PVOID ptr,SIZE_T cnt) { + volatile char *vptr =(volatile char *)ptr; + __stosb((PBYTE)((DWORD64)vptr),0,cnt); + return ptr; + } typedef struct _MESSAGE_RESOURCE_ENTRY { WORD Length; WORD Flags; @@ -47073,7 +57845,6 @@ __attribute__((dllimport)) WINBOOL TerminateProcess (HANDLE hProcess, UINT uExit __attribute__((dllimport)) void GetSystemTimeAsFileTime (LPFILETIME lpSystemTimeAsFileTime); __attribute__((dllimport)) void GetLocalTime (LPSYSTEMTIME lpSystemTime); __attribute__((dllimport)) void GetNativeSystemInfo (LPSYSTEM_INFO lpSystemInfo); - __attribute__((dllimport)) DWORD GetVersion (void); typedef struct _MEMORYSTATUSEX { DWORD dwLength; DWORD dwMemoryLoad; @@ -47085,6 +57856,12 @@ __attribute__((dllimport)) WINBOOL TerminateProcess (HANDLE hProcess, UINT uExit DWORDLONG ullAvailVirtual; DWORDLONG ullAvailExtendedVirtual; } MEMORYSTATUSEX,*LPMEMORYSTATUSEX; + __attribute__((dllimport)) void GetSystemInfo (LPSYSTEM_INFO lpSystemInfo); + __attribute__((dllimport)) WINBOOL GlobalMemoryStatusEx (LPMEMORYSTATUSEX lpBuffer); + __attribute__((dllimport)) DWORD GetTickCount (void); + __attribute__((dllimport)) void GetSystemTimePreciseAsFileTime (LPFILETIME lpSystemTimeAsFileTime); + __attribute__((dllimport)) WINBOOL GetVersionExA (LPOSVERSIONINFOA lpVersionInformation); + __attribute__((dllimport)) WINBOOL GetVersionExW (LPOSVERSIONINFOW lpVersionInformation); typedef enum _COMPUTER_NAME_FORMAT { ComputerNameNetBIOS, ComputerNameDnsHostname, @@ -47096,10 +57873,8 @@ __attribute__((dllimport)) WINBOOL TerminateProcess (HANDLE hProcess, UINT uExit ComputerNamePhysicalDnsFullyQualified, ComputerNameMax } COMPUTER_NAME_FORMAT; - __attribute__((dllimport)) WINBOOL GlobalMemoryStatusEx (LPMEMORYSTATUSEX lpBuffer); + __attribute__((dllimport)) DWORD GetVersion (void); __attribute__((dllimport)) WINBOOL SetLocalTime (const SYSTEMTIME *lpSystemTime); - __attribute__((dllimport)) void GetSystemInfo (LPSYSTEM_INFO lpSystemInfo); - __attribute__((dllimport)) DWORD GetTickCount (void); __attribute__((dllimport)) WINBOOL GetSystemTimeAdjustment (PDWORD lpTimeAdjustment, PDWORD lpTimeIncrement, PBOOL lpTimeAdjustmentDisabled); __attribute__((dllimport)) UINT GetSystemDirectoryA (LPSTR lpBuffer, UINT uSize); __attribute__((dllimport)) UINT GetSystemDirectoryW (LPWSTR lpBuffer, UINT uSize); @@ -47111,11 +57886,8 @@ __attribute__((dllimport)) WINBOOL TerminateProcess (HANDLE hProcess, UINT uExit __attribute__((dllimport)) WINBOOL GetComputerNameExW (COMPUTER_NAME_FORMAT NameType, LPWSTR lpBuffer, LPDWORD nSize); __attribute__((dllimport)) WINBOOL SetComputerNameExW (COMPUTER_NAME_FORMAT NameType, LPCWSTR lpBuffer); __attribute__((dllimport)) WINBOOL SetSystemTime (const SYSTEMTIME *lpSystemTime); - __attribute__((dllimport)) WINBOOL GetVersionExA (LPOSVERSIONINFOA lpVersionInformation); - __attribute__((dllimport)) WINBOOL GetVersionExW (LPOSVERSIONINFOW lpVersionInformation); __attribute__((dllimport)) WINBOOL GetLogicalProcessorInformation (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION Buffer, PDWORD ReturnedLength); __attribute__((dllimport)) ULONGLONG VerSetConditionMask (ULONGLONG ConditionMask, ULONG TypeMask, UCHAR Condition); - __attribute__((dllimport)) void GetSystemTimePreciseAsFileTime (LPFILETIME lpSystemTimeAsFileTime); __attribute__((dllimport)) UINT EnumSystemFirmwareTables (DWORD FirmwareTableProviderSignature, PVOID pFirmwareTableEnumBuffer, DWORD BufferSize); __attribute__((dllimport)) UINT GetSystemFirmwareTable (DWORD FirmwareTableProviderSignature, DWORD FirmwareTableID, PVOID pFirmwareTableBuffer, DWORD BufferSize); #define GetSystemDirectory __MINGW_NAME_AW(GetSystemDirectory) @@ -47780,6 +58552,7 @@ __attribute__((dllimport)) WINBOOL TerminateProcess (HANDLE hProcess, UINT uExit __attribute__((dllimport)) WINBOOL DebugSetProcessKillOnExit (WINBOOL KillOnExit); __attribute__((dllimport)) WINBOOL DebugBreakProcess (HANDLE Process); #define CRITICAL_SECTION_NO_DEBUG_INFO RTL_CRITICAL_SECTION_FLAG_NO_DEBUG_INFO + __attribute__((dllimport)) DWORD WaitForMultipleObjects (DWORD nCount, const HANDLE *lpHandles, WINBOOL bWaitAll, DWORD dwMilliseconds); typedef enum _DEP_SYSTEM_POLICY_TYPE { DEPPolicyAlwaysOff = 0, DEPPolicyAlwaysOn, @@ -47795,7 +58568,6 @@ __attribute__((dllimport)) WINBOOL TerminateProcess (HANDLE hProcess, UINT uExit #define SET_TAPE_MEDIA_INFORMATION 0 #define SET_TAPE_DRIVE_INFORMATION 1 __attribute__((dllimport)) WINBOOL PulseEvent (HANDLE hEvent); - __attribute__((dllimport)) DWORD WaitForMultipleObjects (DWORD nCount, const HANDLE *lpHandles, WINBOOL bWaitAll, DWORD dwMilliseconds); __attribute__((dllimport)) ATOM GlobalDeleteAtom (ATOM nAtom); __attribute__((dllimport)) WINBOOL InitAtomTable (DWORD nSize); __attribute__((dllimport)) ATOM DeleteAtom (ATOM nAtom); @@ -48985,6 +59757,15 @@ __attribute__((dllimport)) WINBOOL TerminateProcess (HANDLE hProcess, UINT uExit #define ERROR_TIMEOUT __MSABI_LONG(1460) #define ERROR_INVALID_MONITOR_HANDLE __MSABI_LONG(1461) #define ERROR_INCORRECT_SIZE __MSABI_LONG(1462) +#define ERROR_SYMLINK_CLASS_DISABLED __MSABI_LONG(1463) +#define ERROR_SYMLINK_NOT_SUPPORTED __MSABI_LONG(1464) +#define ERROR_XML_PARSE_ERROR __MSABI_LONG(1465) +#define ERROR_XMLDSIG_ERROR __MSABI_LONG(1466) +#define ERROR_RESTART_APPLICATION __MSABI_LONG(1467) +#define ERROR_WRONG_COMPARTMENT __MSABI_LONG(1468) +#define ERROR_AUTHIP_FAILURE __MSABI_LONG(1469) +#define ERROR_NO_NVRAM_RESOURCES __MSABI_LONG(1470) +#define ERROR_NOT_GUI_PROCESS __MSABI_LONG(1471) #define ERROR_EVENTLOG_FILE_CORRUPT __MSABI_LONG(1500) #define ERROR_EVENTLOG_CANT_START __MSABI_LONG(1501) #define ERROR_LOG_FILE_FULL __MSABI_LONG(1502) @@ -95844,15 +106625,55 @@ extern const GUID IID_IPrintDialogServices; size_t __attribute__((__cdecl__)) uaw_wcslen(PCUWSTR String); PUWSTR __attribute__((__cdecl__)) uaw_wcsrchr(PCUWSTR String,WCHAR Character); LPUWSTR ua_CharUpperW(LPUWSTR String); + extern inline __attribute__((__gnu_inline__)) LPUWSTR ua_CharUpperW(LPUWSTR String) { + if(1) return CharUpperW((PWSTR)String); + return uaw_CharUpperW(String); + } int ua_lstrcmpW(LPCUWSTR String1,LPCUWSTR String2); int ua_lstrcmpiW(LPCUWSTR String1,LPCUWSTR String2); int ua_lstrlenW(LPCUWSTR String); + extern inline __attribute__((__gnu_inline__)) int ua_lstrcmpW(LPCUWSTR String1,LPCUWSTR String2) { + if(1 && 1) + return lstrcmpW((LPCWSTR)String1,(LPCWSTR)String2); + return uaw_lstrcmpW(String1,String2); + } + extern inline __attribute__((__gnu_inline__)) int ua_lstrcmpiW(LPCUWSTR String1,LPCUWSTR String2) { + if(1 && 1) + return lstrcmpiW((LPCWSTR)String1,(LPCWSTR)String2); + return uaw_lstrcmpiW(String1,String2); + } + extern inline __attribute__((__gnu_inline__)) int ua_lstrlenW(LPCUWSTR String) { + if(1) return lstrlenW((PCWSTR)String); + return uaw_lstrlenW(String); + } typedef WCHAR *PUWSTR_C; PUWSTR_C ua_wcschr(PCUWSTR String,WCHAR Character); PUWSTR_C ua_wcsrchr(PCUWSTR String,WCHAR Character); PUWSTR ua_wcscpy(PUWSTR Destination,PCUWSTR Source); size_t ua_wcslen(PCUWSTR String); + extern inline __attribute__((__gnu_inline__)) PUWSTR_C ua_wcschr(PCUWSTR String,WCHAR Character) { + if(1) return (PUWSTR_C)wcschr((PCWSTR)String,Character); + return (PUWSTR_C)uaw_wcschr(String,Character); + } + extern inline __attribute__((__gnu_inline__)) PUWSTR_C ua_wcsrchr(PCUWSTR String,WCHAR Character) { + if(1) return (PUWSTR_C)wcsrchr((PCWSTR)String,Character); + return (PUWSTR_C)uaw_wcsrchr(String,Character); + } + extern inline __attribute__((__gnu_inline__)) PUWSTR ua_wcscpy(PUWSTR Destination,PCUWSTR Source) { + if(1 && 1) + return wcscpy((PWSTR)Destination,(PCWSTR)Source); + return uaw_wcscpy(Destination,Source); + } + extern inline __attribute__((__gnu_inline__)) size_t ua_wcslen(PCUWSTR String) { + if(1) return wcslen((PCWSTR)String); + return uaw_wcslen(String); + } int ua_wcsicmp(LPCUWSTR String1,LPCUWSTR String2); + extern inline __attribute__((__gnu_inline__)) int ua_wcsicmp(LPCUWSTR String1,LPCUWSTR String2) { + if(1 && 1) + return _wcsicmp((LPCWSTR)String1,(LPCWSTR)String2); + return uaw_wcsicmp(String1,String2); + } #define __UA_WCSLEN ua_wcslen #define __UA_WSTRSIZE(s) ((__UA_WCSLEN(s)+1)*sizeof(WCHAR)) #define __UA_STACKCOPY(p,s) memcpy(_alloca(s),p,s) @@ -102487,7 +113308,7 @@ typedef struct in6_addr RASIPV6ADDR; #define LPRASDEVINFOW RASDEVINFOW* #define LPRASDEVINFOA RASDEVINFOA* #define LPRASDEVINFO RASDEVINFO* -#define RASCTRYINFO struct RASCTRYINFO + struct RASCTRYINFO { DWORD dwSize; DWORD dwCountryID; @@ -102500,7 +113321,7 @@ typedef struct in6_addr RASIPV6ADDR; #define LPRASCTRYINFOW RASCTRYINFOW* #define LPRASCTRYINFOA RASCTRYINFOW* #define LPRASCTRYINFO RASCTRYINFO* -#define RASIPADDR struct RASIPADDR + struct RASIPADDR { BYTE a; BYTE b; @@ -104642,8 +115463,10 @@ typedef struct _IPV6_ADDRESS_EX { __attribute__((dllimport)) errno_t __attribute__((__cdecl__)) _wctime64_s (wchar_t *_Buf,size_t _SizeInWords,const __time64_t *_Time); #define _INC_WTIME_INL wchar_t *__attribute__((__cdecl__)) _wctime(const time_t *) ; + extern inline __attribute__((__gnu_inline__)) wchar_t *__attribute__((__cdecl__)) _wctime(const time_t *_Time) { return _wctime64(_Time); } #define _INC_WTIME_S_INL errno_t __attribute__((__cdecl__)) _wctime_s(wchar_t *, size_t, const time_t *); + extern inline __attribute__((__gnu_inline__)) errno_t __attribute__((__cdecl__)) _wctime_s (wchar_t *_Buffer,size_t _SizeInWords,const time_t *_Time) { return _wctime64_s (_Buffer,_SizeInWords,_Time); } #define _WTIME_DEFINED double __attribute__((__cdecl__)) difftime(time_t _Time1,time_t _Time2); char *__attribute__((__cdecl__)) ctime(const time_t *_Time) ; @@ -104652,6 +115475,15 @@ struct tm *__attribute__((__cdecl__)) localtime(const time_t *_Time) ; time_t __attribute__((__cdecl__)) mktime(struct tm *_Tm); time_t __attribute__((__cdecl__)) _mkgmtime(struct tm *_Tm); time_t __attribute__((__cdecl__)) time(time_t *_Time); +#define __TIME_INLINE __CRT_INLINE +extern inline __attribute__((__gnu_inline__)) double __attribute__((__cdecl__)) difftime(time_t _Time1,time_t _Time2) + { return _difftime64(_Time1,_Time2); } +extern inline __attribute__((__gnu_inline__)) char *__attribute__((__cdecl__)) ctime(const time_t *_Time) { return _ctime64(_Time); } +extern inline __attribute__((__gnu_inline__)) struct tm *__attribute__((__cdecl__)) gmtime(const time_t *_Time) { return _gmtime64(_Time); } +extern inline __attribute__((__gnu_inline__)) struct tm *__attribute__((__cdecl__)) localtime(const time_t *_Time) { return _localtime64(_Time); } +extern inline __attribute__((__gnu_inline__)) time_t __attribute__((__cdecl__)) mktime(struct tm *_Tm) { return _mktime64(_Tm); } +extern inline __attribute__((__gnu_inline__)) time_t __attribute__((__cdecl__)) _mkgmtime(struct tm *_Tm) { return _mkgmtime64(_Tm); } +extern inline __attribute__((__gnu_inline__)) time_t __attribute__((__cdecl__)) time(time_t *_Time) { return _time64(_Time); } extern __inline__ __attribute__((__always_inline__,__gnu_inline__)) errno_t __attribute__((__cdecl__)) localtime_s(struct tm *_Tm,const time_t *_Time) { return _localtime64_s(_Tm,_Time); } extern __inline__ __attribute__((__always_inline__,__gnu_inline__)) errno_t __attribute__((__cdecl__)) gmtime_s(struct tm *_Tm, const time_t *_Time) { return _gmtime64_s(_Tm, _Time); } extern __inline__ __attribute__((__always_inline__,__gnu_inline__)) errno_t __attribute__((__cdecl__)) ctime_s(char *_Buf,size_t _SizeInBytes,const time_t *_Time) { return _ctime64_s(_Buf,_SizeInBytes,_Time); } @@ -104705,6 +115537,9 @@ extern __inline__ __attribute__((__always_inline__,__gnu_inline__)) char *__attr #define _timeb __timeb64 __attribute__ ((__dllimport__)) void __attribute__((__cdecl__)) _ftime(struct __timeb64 *); void __attribute__((__cdecl__)) ftime (struct timeb *); + extern inline __attribute__((__gnu_inline__)) void __attribute__((__cdecl__)) ftime(struct timeb *_Tmb) { + _ftime64((struct __timeb64 *)_Tmb); + } #pragma pack(pop) #define _TIMEB_H_S __attribute__ ((__dllimport__)) errno_t __attribute__((__cdecl__)) _ftime_s(struct __timeb32 *_Time); @@ -105280,7 +116115,19 @@ typedef IF_PHYSICAL_ADDRESS *PIF_PHYSICAL_ADDRESS; int __attribute__((__cdecl__)) _utime(const char *,struct _utimbuf *); int __attribute__((__cdecl__)) _futime(int,struct _utimbuf *); int __attribute__((__cdecl__)) _wutime(const wchar_t *,struct _utimbuf *); +extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) _utime(const char *_Filename,struct _utimbuf *_Utimbuf) { + return _utime64(_Filename,(struct __utimbuf64 *)_Utimbuf); +} +extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) _futime(int _Desc,struct _utimbuf *_Utimbuf) { + return _futime64(_Desc,(struct __utimbuf64 *)_Utimbuf); +} +extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) _wutime(const wchar_t *_Filename,struct _utimbuf *_Utimbuf) { + return _wutime64(_Filename,(struct __utimbuf64 *)_Utimbuf); +} int __attribute__((__cdecl__)) utime(const char *, struct utimbuf *); +extern inline __attribute__((__gnu_inline__)) int __attribute__((__cdecl__)) utime(const char *_Filename,struct utimbuf *_Utimbuf) { + return _utime64(_Filename,(struct __utimbuf64 *)_Utimbuf); +} #pragma pack(pop) #define rb_w32_iswinnt() TRUE #define rb_w32_iswin95() FALSE @@ -105876,7 +116723,6 @@ rb_w32_pow(double x, double y) #pragma GCC visibility pop #define ASSUME(x) (RB_LIKELY(!!(x)) ? (void)0 : UNREACHABLE) -#define UNREACHABLE_RETURN(val) UNREACHABLE #define RUBY_MACRO_SELECT(base,n) TOKEN_PASTE(base, n) #pragma GCC visibility push(default) @@ -105902,14 +116748,12 @@ typedef char ruby_check_sizeof_voidp[8 == sizeof(void*) ? 1 : -1]; #define PRIxVALUE PRI_VALUE_PREFIX"x" #define PRIXVALUE PRI_VALUE_PREFIX"X" #define PRIsVALUE PRI_VALUE_PREFIX"i" RUBY_PRI_VALUE_MARK -#define PRI_PTRDIFF_PREFIX PRI_LL_PREFIX #define PRIdPTRDIFF PRI_PTRDIFF_PREFIX"d" #define PRIiPTRDIFF PRI_PTRDIFF_PREFIX"i" #define PRIoPTRDIFF PRI_PTRDIFF_PREFIX"o" #define PRIuPTRDIFF PRI_PTRDIFF_PREFIX"u" #define PRIxPTRDIFF PRI_PTRDIFF_PREFIX"x" #define PRIXPTRDIFF PRI_PTRDIFF_PREFIX"X" -#define PRI_SIZE_PREFIX PRI_LL_PREFIX #define PRIdSIZE PRI_SIZE_PREFIX"d" #define PRIiSIZE PRI_SIZE_PREFIX"i" #define PRIoSIZE PRI_SIZE_PREFIX"o" @@ -106231,7 +117075,7 @@ enum ruby_fl_type { RUBY_FL_USER16 = (1<<(RUBY_FL_USHIFT+16)), RUBY_FL_USER17 = (1<<(RUBY_FL_USHIFT+17)), RUBY_FL_USER18 = (1<<(RUBY_FL_USHIFT+18)), -#define RUBY_FL_USER19 (((VALUE)1)<<(RUBY_FL_USHIFT+19)) + RUBY_FL_USER19 = (1<<(RUBY_FL_USHIFT+19)), RUBY_ELTS_SHARED = RUBY_FL_USER2, RUBY_FL_DUPPED = (RUBY_T_MASK|RUBY_FL_EXIVAR|RUBY_FL_TAINT), RUBY_FL_SINGLETON = RUBY_FL_USER0 @@ -106550,12 +117394,25 @@ rb_obj_freeze_inline(VALUE x) } } #define RUBY_UNTYPED_DATA_FUNC(func) func __attribute__((warning("untyped Data is unsafe; use TypedData instead"))) +static inline VALUE rb_data_object_wrap_warning(VALUE,void*,RUBY_DATA_FUNC,RUBY_DATA_FUNC) __attribute__((warning("untyped Data is unsafe; use TypedData instead"))); +static inline void *rb_data_object_get_warning(VALUE) __attribute__((warning("untyped Data is unsafe; use TypedData instead"))); +static inline VALUE +rb_data_object_wrap_warning(VALUE klass, void *ptr, RUBY_DATA_FUNC mark, RUBY_DATA_FUNC free) +{ + return rb_data_object_wrap(klass, ptr, mark, free); +} +#define rb_data_object_wrap_warning(klass,ptr,mark,free) __extension__( __builtin_choose_expr( __builtin_constant_p(klass) && !(klass), rb_data_object_wrap(klass, ptr, mark, free), rb_data_object_wrap_warning(klass, ptr, mark, free))) static inline void * rb_data_object_get(VALUE obj) { rb_check_type((VALUE)(obj),(RUBY_T_DATA)); return ((struct RData *)obj)->data; } +static inline void * +rb_data_object_get_warning(VALUE obj) +{ + return rb_data_object_get(obj); +} static inline VALUE rb_data_object_make(VALUE klass, RUBY_DATA_FUNC mark_func, RUBY_DATA_FUNC free_func, void **datap, size_t size) { @@ -106580,6 +117437,15 @@ rb_data_typed_object_alloc(VALUE klass, void *datap, const rb_data_type_t *type) { return rb_data_typed_object_wrap(klass, datap, type); } +#define rb_data_object_wrap_0 rb_data_object_wrap +#define rb_data_object_wrap_1 rb_data_object_wrap_warning +#define rb_data_object_wrap RUBY_MACRO_SELECT(rb_data_object_wrap_, RUBY_UNTYPED_DATA_WARNING) +#define rb_data_object_get_0 rb_data_object_get +#define rb_data_object_get_1 rb_data_object_get_warning +#define rb_data_object_get RUBY_MACRO_SELECT(rb_data_object_get_, RUBY_UNTYPED_DATA_WARNING) +#define rb_data_object_make_0 rb_data_object_make +#define rb_data_object_make_1 rb_data_object_make_warning +#define rb_data_object_make RUBY_MACRO_SELECT(rb_data_object_make_, RUBY_UNTYPED_DATA_WARNING) #define RB_OBJ_PROMOTED_RAW(x) RB_FL_ALL_RAW(x, RUBY_FL_PROMOTED) #define RB_OBJ_PROMOTED(x) (RB_SPECIAL_CONST_P(x) ? 0 : RB_OBJ_PROMOTED_RAW(x)) #define RB_OBJ_WB_UNPROTECT(x) rb_obj_wb_unprotect(x, __FILE__, __LINE__) @@ -107209,6 +118075,7 @@ VALUE rb_class_protected_instance_methods(int, const VALUE*, VALUE); VALUE rb_class_private_instance_methods(int, const VALUE*, VALUE); VALUE rb_obj_singleton_methods(int, const VALUE*, VALUE); void rb_define_method_id(VALUE, ID, VALUE (*)(), int); +void rb_frozen_class_p(VALUE); void rb_undef(VALUE, ID); void rb_define_protected_method(VALUE, const char*, VALUE (*)(), int); void rb_define_private_method(VALUE, const char*, VALUE (*)(), int); @@ -107243,7 +118110,7 @@ __attribute__ ((__noreturn__)) void rb_error_frozen_object(VALUE); void rb_error_untrusted(VALUE); void rb_check_frozen(VALUE); void rb_check_trusted(VALUE); -#define rb_check_frozen_internal(obj) do { VALUE frozen_obj = (obj); if (RB_UNLIKELY(RB_OBJ_FROZEN(frozen_obj))) { rb_error_frozen_object(frozen_obj); } } while (0) +#define rb_check_frozen_internal(obj) do { VALUE frozen_obj = (obj); if (RB_OBJ_FROZEN(frozen_obj)) { rb_error_frozen_object(frozen_obj); } } while (0) #define rb_check_trusted_internal(obj) ((void) 0) #define rb_check_frozen(obj) __extension__({rb_check_frozen_internal(obj);}) #define rb_check_trusted(obj) __extension__({rb_check_trusted_internal(obj);}) @@ -107889,6 +118756,175 @@ unsigned long ruby_strtoul(const char *str, char **endptr, int base); #define InitVM(ext) {void InitVM_ ##ext(void);InitVM_ ##ext();} int ruby_snprintf(char *str, size_t n, char const *fmt, ...) __attribute__((format(printf, 3, 4))); int ruby_vsnprintf(char *str, size_t n, char const *fmt, va_list ap); +#define rb_scan_args(argc,argvp,fmt,...) __builtin_choose_expr(__builtin_constant_p(fmt), rb_scan_args0(argc,argvp,fmt, (sizeof((VALUE*[]){__VA_ARGS__})/sizeof(VALUE*)), ((VALUE*[]){__VA_ARGS__})), rb_scan_args(argc,argvp,fmt,__VA_ARGS__)) +__attribute__ ((__error__ ("bad scan arg format"))) int rb_scan_args_bad_format(const char*); +__attribute__ ((__error__ ("variable argument length doesn't match"))) int rb_scan_args_length_mismatch(const char*,int); +#define rb_scan_args_isdigit(c) ((unsigned char)((c)-'0')<10) +#define rb_scan_args_count_end(fmt,ofs,varc,vari) ((vari)/(!fmt[ofs] || rb_scan_args_bad_format(fmt))) +#define rb_scan_args_count_block(fmt,ofs,varc,vari) (fmt[ofs]!='&' ? rb_scan_args_count_end(fmt, ofs, varc, vari) : rb_scan_args_count_end(fmt, ofs+1, varc, vari+1)) +#define rb_scan_args_count_hash(fmt,ofs,varc,vari) (fmt[ofs]!=':' ? rb_scan_args_count_block(fmt, ofs, varc, vari) : rb_scan_args_count_block(fmt, ofs+1, varc, vari+1)) +#define rb_scan_args_count_trail(fmt,ofs,varc,vari) (!rb_scan_args_isdigit(fmt[ofs]) ? rb_scan_args_count_hash(fmt, ofs, varc, vari) : rb_scan_args_count_hash(fmt, ofs+1, varc, vari+(fmt[ofs]-'0'))) +#define rb_scan_args_count_var(fmt,ofs,varc,vari) (fmt[ofs]!='*' ? rb_scan_args_count_trail(fmt, ofs, varc, vari) : rb_scan_args_count_trail(fmt, ofs+1, varc, vari+1)) +#define rb_scan_args_count_opt(fmt,ofs,varc,vari) (!rb_scan_args_isdigit(fmt[1]) ? rb_scan_args_count_var(fmt, ofs, varc, vari) : rb_scan_args_count_var(fmt, ofs+1, varc, vari+fmt[ofs]-'0')) +#define rb_scan_args_count(fmt,varc) ((!rb_scan_args_isdigit(fmt[0]) ? rb_scan_args_count_var(fmt, 0, varc, 0) : rb_scan_args_count_opt(fmt, 1, varc, fmt[0]-'0')) == (varc) || rb_scan_args_length_mismatch(fmt, varc)) +#define rb_scan_args_verify_count(fmt,varc) ((varc)/(rb_scan_args_count(fmt, varc))) +#define rb_scan_args_verify(fmt,varc) __extension__ ({ int verify; _Pragma("GCC diagnostic push"); _Pragma("GCC diagnostic ignored \"-Warray-bounds\""); verify = rb_scan_args_verify_count(fmt, varc); _Pragma("GCC diagnostic pop"); verify; }) +__attribute__ ((__always_inline__)) static int rb_scan_args_lead_p(const char *fmt); +static inline int +rb_scan_args_lead_p(const char *fmt) +{ + return ((unsigned char)((fmt[0])-'0')<10); +} +__attribute__ ((__always_inline__)) static int rb_scan_args_n_lead(const char *fmt); +static inline int +rb_scan_args_n_lead(const char *fmt) +{ + return (rb_scan_args_lead_p(fmt) ? fmt[0]-'0' : 0); +} +__attribute__ ((__always_inline__)) static int rb_scan_args_opt_p(const char *fmt); +static inline int +rb_scan_args_opt_p(const char *fmt) +{ + return (rb_scan_args_lead_p(fmt) && ((unsigned char)((fmt[1])-'0')<10)); +} +__attribute__ ((__always_inline__)) static int rb_scan_args_n_opt(const char *fmt); +static inline int +rb_scan_args_n_opt(const char *fmt) +{ + return (rb_scan_args_opt_p(fmt) ? fmt[1]-'0' : 0); +} +__attribute__ ((__always_inline__)) static int rb_scan_args_var_idx(const char *fmt); +static inline int +rb_scan_args_var_idx(const char *fmt) +{ + return (!rb_scan_args_lead_p(fmt) ? 0 : !((unsigned char)((fmt[1])-'0')<10) ? 1 : 2); +} +__attribute__ ((__always_inline__)) static int rb_scan_args_f_var(const char *fmt); +static inline int +rb_scan_args_f_var(const char *fmt) +{ + return (fmt[rb_scan_args_var_idx(fmt)]=='*'); +} +__attribute__ ((__always_inline__)) static int rb_scan_args_trail_idx(const char *fmt); +static inline int +rb_scan_args_trail_idx(const char *fmt) +{ + const int idx = rb_scan_args_var_idx(fmt); + return idx+(fmt[idx]=='*'); +} +__attribute__ ((__always_inline__)) static int rb_scan_args_n_trail(const char *fmt); +static inline int +rb_scan_args_n_trail(const char *fmt) +{ + const int idx = rb_scan_args_trail_idx(fmt); + return (((unsigned char)((fmt[idx])-'0')<10) ? fmt[idx]-'0' : 0); +} +__attribute__ ((__always_inline__)) static int rb_scan_args_hash_idx(const char *fmt); +static inline int +rb_scan_args_hash_idx(const char *fmt) +{ + const int idx = rb_scan_args_trail_idx(fmt); + return idx+((unsigned char)((fmt[idx])-'0')<10); +} +__attribute__ ((__always_inline__)) static int rb_scan_args_f_hash(const char *fmt); +static inline int +rb_scan_args_f_hash(const char *fmt) +{ + return (fmt[rb_scan_args_hash_idx(fmt)]==':'); +} +__attribute__ ((__always_inline__)) static int rb_scan_args_block_idx(const char *fmt); +static inline int +rb_scan_args_block_idx(const char *fmt) +{ + const int idx = rb_scan_args_hash_idx(fmt); + return idx+(fmt[idx]==':'); +} +__attribute__ ((__always_inline__)) static int rb_scan_args_f_block(const char *fmt); +static inline int +rb_scan_args_f_block(const char *fmt) +{ + return (fmt[rb_scan_args_block_idx(fmt)]=='&'); +} +#define rb_scan_args0(argc,argv,fmt,varc,vars) rb_scan_args_set(argc, argv, rb_scan_args_n_lead(fmt), rb_scan_args_n_opt(fmt), rb_scan_args_n_trail(fmt), rb_scan_args_f_var(fmt), rb_scan_args_f_hash(fmt), rb_scan_args_f_block(fmt), (rb_scan_args_verify(fmt, varc), vars)) +__attribute__ ((__always_inline__)) static int rb_scan_args_set(int argc, const VALUE *argv, int n_lead, int n_opt, int n_trail, int f_var, int f_hash, int f_block, VALUE *vars[]); +inline int +rb_scan_args_set(int argc, const VALUE *argv, + int n_lead, int n_opt, int n_trail, + int f_var, int f_hash, int f_block, + VALUE *vars[]) +{ + int i, argi = 0, vari = 0, last_idx = -1; + VALUE *var, hash = ((VALUE)RUBY_Qnil), last_hash = 0; + const int n_mand = n_lead + n_trail; + if (f_hash && n_mand < argc) { + VALUE last = argv[argc - 1]; + if (!((VALUE)(last) != ((VALUE)RUBY_Qnil))) { + if (!f_var && n_mand + n_opt < argc) + argc--; + } + else { + hash = rb_check_hash_type(last); + if (!!((VALUE)(hash) != ((VALUE)RUBY_Qnil))) { + VALUE opts = rb_extract_keywords(&hash); + if (!(last_hash = hash)) argc--; + else last_idx = argc - 1; + hash = opts ? opts : ((VALUE)RUBY_Qnil); + } + } + } + rb_check_arity(argc, n_mand, f_var ? (-1) : n_mand + n_opt); + for (i = n_lead; i-- > 0; ) { + var = vars[vari++]; + if (var) *var = (argi == last_idx) ? last_hash : argv[argi]; + argi++; + } + for (i = n_opt; i-- > 0; ) { + var = vars[vari++]; + if (argi < argc - n_trail) { + if (var) *var = (argi == last_idx) ? last_hash : argv[argi]; + argi++; + } + else { + if (var) *var = ((VALUE)RUBY_Qnil); + } + } + if (f_var) { + int n_var = argc - argi - n_trail; + var = vars[vari++]; + if (0 < n_var) { + if (var) { + int f_last = (last_idx + 1 == argc - n_trail); + *var = rb_ary_new_from_values(n_var-f_last, &argv[argi]); + if (f_last) rb_ary_push(*var, last_hash); + } + argi += n_var; + } + else { + if (var) *var = rb_ary_new(); + } + } + for (i = n_trail; i-- > 0; ) { + var = vars[vari++]; + if (var) *var = (argi == last_idx) ? last_hash : argv[argi]; + argi++; + } + if (f_hash) { + var = vars[vari++]; + if (var) *var = hash; + } + if (f_block) { + var = vars[vari++]; + if (rb_block_given_p()) { + *var = rb_block_proc(); + } + else { + *var = ((VALUE)RUBY_Qnil); + } + } + return argc; +} +#define rb_yield_values(argc,...) __extension__({ const int rb_yield_values_argc = (argc); const VALUE rb_yield_values_args[] = {__VA_ARGS__}; const int rb_yield_values_nargs = (int)(sizeof(rb_yield_values_args) / sizeof(VALUE)); rb_yield_values2( rb_varargs_argc_check(rb_yield_values_argc, rb_yield_values_nargs), rb_yield_values_nargs ? rb_yield_values_args : NULL); }) +#define rb_funcall(recv,mid,argc,...) __extension__({ const int rb_funcall_argc = (argc); const VALUE rb_funcall_args[] = {__VA_ARGS__}; const int rb_funcall_nargs = (int)(sizeof(rb_funcall_args) / sizeof(VALUE)); rb_funcallv(recv, mid, rb_varargs_argc_check(rb_funcall_argc, rb_funcall_nargs), rb_funcall_nargs ? rb_funcall_args : NULL); }) #define RUBY_SUBST_H 1 #undef snprintf #undef vsnprintf @@ -107936,7 +118972,6 @@ void ruby_sig_finalize(void); #define numberof(array) ((int)(sizeof(array) / sizeof((array)[0]))) #define __has_feature(x) 0 #define __has_extension __has_feature -#define ACCESS_ONCE(type,x) (*((volatile type *)&(x))) #define STATIC_ASSERT(name,expr) _Static_assert(expr, #name ": " #expr) #define SIGNED_INTEGER_TYPE_P(int_type) (0 > ((int_type)0)-1) #define SIGNED_INTEGER_MAX(sint_type) (sint_type) ((((sint_type)1) << (sizeof(sint_type) * CHAR_BIT - 2)) | ((((sint_type)1) << (sizeof(sint_type) * CHAR_BIT - 2)) - 1)) @@ -108264,7 +119299,7 @@ int rb_singleton_class_internal_p(VALUE sklass); static inline void RCLASS_SET_ORIGIN(VALUE klass, VALUE origin) { - rb_obj_write((VALUE)(klass), (VALUE *)(&((((struct RClass*)(klass))->ptr)->origin_)), (VALUE)(origin), "../ruby_2_5/internal.h", 826); + rb_obj_write((VALUE)(klass), (VALUE *)(&((((struct RClass*)(klass))->ptr)->origin_)), (VALUE)(origin), "../snapshot/internal.h", 823); if (klass != origin) ((!(((VALUE)(origin) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(origin) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(origin))->flags & RUBY_T_MASK) != RUBY_T_NODE) ? (void)(((struct RBasic*)(origin))->flags |= (((VALUE)RUBY_FL_USER5))) : (void)0); } #undef RCLASS_SUPER @@ -108280,7 +119315,7 @@ RCLASS_SET_SUPER(VALUE klass, VALUE super) rb_class_remove_from_super_subclasses(klass); rb_class_subclass_add(super, klass); } - rb_obj_write((VALUE)(klass), (VALUE *)(&((struct RClass*)(klass))->super), (VALUE)(super), "../ruby_2_5/internal.h", 844); + rb_obj_write((VALUE)(klass), (VALUE *)(&((struct RClass*)(klass))->super), (VALUE)(super), "../snapshot/internal.h", 841); return super; } #define IMEMO_DEBUG 0 @@ -108556,7 +119591,6 @@ VALUE rb_warning_warn(VALUE mod, VALUE str); VALUE rb_warning_string(const char *fmt, ...) __attribute__((format(printf, 1, 2))); VALUE rb_refinement_module_get_refined_class(VALUE module); extern ID ruby_static_id_signo, ruby_static_id_status; -void rb_class_modify_check(VALUE); #define id_signo ruby_static_id_signo #define id_status ruby_static_id_status VALUE rb_get_backtrace(VALUE info); @@ -109053,6 +120087,10 @@ VALUE rb_big2str_generic(VALUE x, int base); VALUE rb_str2big_poweroftwo(VALUE arg, int base, int badcheck); VALUE rb_str2big_normal(VALUE arg, int base, int badcheck); VALUE rb_str2big_karatsuba(VALUE arg, int base, int badcheck); +VALUE rb_big_mul_gmp(VALUE x, VALUE y); +VALUE rb_big_divrem_gmp(VALUE x, VALUE y); +VALUE rb_big2str_gmp(VALUE x, int base); +VALUE rb_str2big_gmp(VALUE arg, int base, int badcheck); enum rb_int_parse_flags { RB_INT_PARSE_SIGN = 0x01, RB_INT_PARSE_UNDERSCORE = 0x02, @@ -109084,6 +120122,7 @@ VALUE rb_execarg_extract_options(VALUE execarg_obj, VALUE opthash); void rb_execarg_setenv(VALUE execarg_obj, VALUE env); VALUE rb_gcd(VALUE x, VALUE y); VALUE rb_gcd_normal(VALUE self, VALUE other); +VALUE rb_gcd_gmp(VALUE x, VALUE y); int rb_grantpt(int fd); VALUE rb_str_upto_each(VALUE, VALUE, int, int (*each)(VALUE, VALUE), VALUE); VALUE rb_str_upto_endless_each(VALUE, int (*each)(VALUE, VALUE), VALUE); @@ -109114,13 +120153,6 @@ VALUE rb_imemo_new(enum imemo_type type, VALUE v1, VALUE v2, VALUE v3, VALUE v0) #define rb_obj_builtin_type(obj) __extension__({ VALUE arg_obj = (obj); RB_SPECIAL_CONST_P(arg_obj) ? -1 : RB_BUILTIN_TYPE(arg_obj); }) #define FLEX_ARY_LEN #define BITFIELD(type) type -#define COMPILER_WARNING_PUSH _Pragma("GCC diagnostic push") -#define COMPILER_WARNING_POP _Pragma("GCC diagnostic pop") -#define COMPILER_WARNING_SPECIFIER(kind,msg) GCC diagnostic kind #msg -#define COMPILER_WARNING_ERROR(flag) COMPILER_WARNING_PRAGMA(COMPILER_WARNING_SPECIFIER(error, flag)) -#define COMPILER_WARNING_IGNORED(flag) COMPILER_WARNING_PRAGMA(COMPILER_WARNING_SPECIFIER(ignored, flag)) -#define COMPILER_WARNING_PRAGMA(str) COMPILER_WARNING_PRAGMA_(str) -#define COMPILER_WARNING_PRAGMA_(str) _Pragma(#str) #define RUBY_VM_H 1 #pragma GCC visibility push(default) @@ -109131,7 +120163,6 @@ void ruby_vm_at_exit(void(*func)(ruby_vm_t *)); #pragma GCC visibility pop -#define vm_exec rb_vm_exec #define RUBY_GC_H 1 #define SET_MACHINE_STACK_END(p) __asm__ __volatile__ ("movq\t%%rsp, %0" : "=r" (*(p))) #define RUBY_MARK_FREE_DEBUG 0 @@ -109492,6 +120523,7 @@ typedef struct RNode { #define nd_alias u1.id #define nd_orig u2.id #define nd_undef u2.node +#define nd_compile_option u3.value #define NEW_NODE(t,a0,a1,a2,loc) rb_node_newnode((t),(VALUE)(a0),(VALUE)(a1),(VALUE)(a2),loc) #define NEW_DEFN(i,a,d,loc) NEW_NODE(NODE_DEFN,0,i,NEW_SCOPE(a,d,loc),loc) #define NEW_DEFS(r,i,a,d,loc) NEW_NODE(NODE_DEFS,r,i,NEW_SCOPE(a,d,loc),loc) @@ -110178,8 +121210,8 @@ static inline void list_del_init_(struct list_node *n, const char *abortstr) } static inline void list_del_from(struct list_head *h, struct list_node *n) { - (void) ((!!(!list_empty_(h, "../ruby_2_5/ccan/list/list.h" ":" "328"))) || (_assert("!list_empty(h)","../ruby_2_5/ccan/list/list.h",328),0)); - list_del_(n, "../ruby_2_5/ccan/list/list.h" ":" "329"); + (void) ((!!(!list_empty_(h, "../snapshot/ccan/list/list.h" ":" "328"))) || (_assert("!list_empty(h)","../snapshot/ccan/list/list.h",328),0)); + list_del_(n, "../snapshot/ccan/list/list.h" ":" "329"); } #define list_swap(o,n) list_swap_(o, n, LIST_LOC) static inline void list_swap_(struct list_node *o, @@ -110195,7 +121227,7 @@ static inline void list_swap_(struct list_node *o, #define list_top(h,type,member) ((type *)list_top_((h), list_off_(type, member))) static inline const void *list_top_(const struct list_head *h, size_t off) { - if (list_empty_(h, "../ruby_2_5/ccan/list/list.h" ":" "399")) + if (list_empty_(h, "../snapshot/ccan/list/list.h" ":" "399")) return ((void *)0); return (const char *)h->n.next - off; } @@ -110203,16 +121235,16 @@ static inline const void *list_top_(const struct list_head *h, size_t off) static inline const void *list_pop_(const struct list_head *h, size_t off) { struct list_node *n; - if (list_empty_(h, "../ruby_2_5/ccan/list/list.h" ":" "425")) + if (list_empty_(h, "../snapshot/ccan/list/list.h" ":" "425")) return ((void *)0); n = h->n.next; - list_del_(n, "../ruby_2_5/ccan/list/list.h" ":" "428"); + list_del_(n, "../snapshot/ccan/list/list.h" ":" "428"); return (const char *)n - off; } #define list_tail(h,type,member) ((type *)list_tail_((h), list_off_(type, member))) static inline const void *list_tail_(const struct list_head *h, size_t off) { - if (list_empty_(h, "../ruby_2_5/ccan/list/list.h" ":" "451")) + if (list_empty_(h, "../snapshot/ccan/list/list.h" ":" "451")) return ((void *)0); return (const char *)h->n.prev - off; } @@ -110233,7 +121265,7 @@ static inline void list_append_list_(struct list_head *to, from_tail->next = &to->n; to_tail->next = &from->n; from->n.prev = to_tail; - list_del_(&from->n, "../ruby_2_5/ccan/list/list.h" ":" "600"); + list_del_(&from->n, "../snapshot/ccan/list/list.h" ":" "600"); list_head_init(from); } #define list_prepend_list(t,f) list_prepend_list_(t, f, LIST_LOC) @@ -110247,7 +121279,7 @@ static inline void list_prepend_list_(struct list_head *to, from->n.prev = &to->n; to_head->prev = from_tail; from_tail->next = to_head; - list_del_(&from->n, "../ruby_2_5/ccan/list/list.h" ":" "632"); + list_del_(&from->n, "../snapshot/ccan/list/list.h" ":" "632"); list_head_init(from); } #define list_for_each_off_dir_(h,i,off,dir) for (i = list_node_to_off_(list_debug(h, LIST_LOC)->n.dir, (off)); list_node_from_off_((void *)i, (off)) != &(h)->n; i = list_node_to_off_(list_node_from_off_((void *)i, (off))->dir, (off))) @@ -110318,7 +121350,6 @@ typedef struct rb_global_vm_lock_struct { #define va_init_list(a,b) va_start((a),(b)) #define RB_ALTSTACK_INIT(var) #define RB_ALTSTACK_FREE(var) -#define RB_ALTSTACK(var) (0) void rb_addr2insn_init(void); typedef unsigned long rb_num_t; typedef signed long rb_snum_t; @@ -110589,8 +121620,9 @@ typedef struct rb_hook_list_struct { typedef struct rb_vm_struct { VALUE self; rb_global_vm_lock_t gvl; + rb_nativethread_lock_t thread_destruct_lock; struct rb_thread_struct *main_thread; - const struct rb_thread_struct *running_thread; + struct rb_thread_struct *running_thread; rb_serial_t fork_gen; rb_nativethread_lock_t waitpid_lock; struct list_head waiting_pids; @@ -110753,6 +121785,7 @@ typedef struct rb_execution_context_struct { rb_control_frame_t *cfp; struct rb_vm_tag *tag; struct rb_vm_protect_tag *protect_tag; + int raised_flag; rb_atomic_t interrupt_flag; rb_atomic_t interrupt_mask; rb_fiber_t *fiber_ptr; @@ -110767,7 +121800,6 @@ typedef struct rb_execution_context_struct { VALUE errinfo; VALUE passed_block_handler; const rb_callable_method_entry_t *passed_bmethod_me; - int raised_flag; enum method_missing_reason method_missing_reason; VALUE private_const_reference; struct { @@ -110797,6 +121829,7 @@ typedef struct rb_thread_struct { VALUE value; VALUE pending_interrupt_queue; VALUE pending_interrupt_mask_stack; + int pending_interrupt_queue_checked; rb_nativethread_lock_t interrupt_lock; struct rb_unblock_callback unblock; VALUE locking_mutex; @@ -110808,11 +121841,10 @@ typedef struct rb_thread_struct { VALUE stat_insn_usage; rb_fiber_t *root_fiber; rb_jmpbuf_t root_jmpbuf; - VALUE name; - uint32_t running_time_us; unsigned int abort_on_exception: 1; unsigned int report_on_exception: 1; - unsigned int pending_interrupt_queue_checked: 1; + uint32_t running_time_us; + VALUE name; } rb_thread_t; typedef enum { VM_DEFINECLASS_TYPE_CLASS = 0x00, @@ -111310,7 +122342,7 @@ void rb_vm_pop_frame(rb_execution_context_t *ec); void rb_thread_start_timer_thread(void); void rb_thread_stop_timer_thread(void); void rb_thread_reset_timer_thread(void); -void rb_thread_wakeup_timer_thread(int); +void rb_thread_wakeup_timer_thread(void); static inline void rb_vm_living_threads_init(rb_vm_t *vm) { @@ -111323,13 +122355,13 @@ rb_vm_living_threads_init(rb_vm_t *vm) static inline void rb_vm_living_threads_insert(rb_vm_t *vm, rb_thread_t *th) { - list_add_tail_(&vm->living_threads, &th->vmlt_node, "../ruby_2_5/vm_core.h" ":" "1601"); + list_add_tail_(&vm->living_threads, &th->vmlt_node, "../snapshot/vm_core.h" ":" "1595"); vm->living_thread_num++; } static inline void rb_vm_living_threads_remove(rb_vm_t *vm, rb_thread_t *th) { - list_del_(&th->vmlt_node, "../ruby_2_5/vm_core.h" ":" "1608"); + list_del_(&th->vmlt_node, "../snapshot/vm_core.h" ":" "1602"); vm->living_thread_num--; } typedef int rb_backtrace_iter_func(void *, VALUE, int, VALUE); @@ -111585,9 +122617,9 @@ struct iseq_compile_data { VALUE ensure_node; VALUE for_iseq; struct iseq_compile_data_ensure_node_stack *ensure_node_stack; + int loopval_popped; struct iseq_compile_data_storage *storage_head; struct iseq_compile_data_storage *storage_current; - int loopval_popped; int last_line; int label_no; int node_level; @@ -111808,7 +122840,7 @@ CREF_REFINEMENTS(const rb_cref_t *cref) static inline void CREF_REFINEMENTS_SET(rb_cref_t *cref, VALUE refs) { - rb_obj_write((VALUE)(cref), (VALUE *)(&cref->refinements), (VALUE)(refs), "../ruby_2_5/eval_intern.h", 237); + rb_obj_write((VALUE)(cref), (VALUE *)(&cref->refinements), (VALUE)(refs), "../snapshot/eval_intern.h", 237); } static inline int CREF_PUSHED_BY_EVAL(const rb_cref_t *cref) @@ -112101,24 +123133,24 @@ void rb_vm_block_ep_update(VALUE obj, const struct rb_block *dst, const VALUE *ep) { *((const VALUE **)&dst->as.captured.ep) = ep; - rb_obj_written((VALUE)(obj), (VALUE)(((VALUE)RUBY_Qundef)), (VALUE)(VM_ENV_ENVVAL(ep)), "../ruby_2_5/vm.c", 284); + rb_obj_written((VALUE)(obj), (VALUE)(((VALUE)RUBY_Qundef)), (VALUE)(VM_ENV_ENVVAL(ep)), "../snapshot/vm.c", 282); } static void vm_bind_update_env(VALUE bindval, rb_binding_t *bind, VALUE envval) { const rb_env_t *env = (rb_env_t *)envval; - rb_obj_write((VALUE)(bindval), (VALUE *)(&bind->block.as.captured.code.iseq), (VALUE)(env->iseq), "../ruby_2_5/vm.c", 291); + rb_obj_write((VALUE)(bindval), (VALUE *)(&bind->block.as.captured.code.iseq), (VALUE)(env->iseq), "../snapshot/vm.c", 289); rb_vm_block_ep_update(bindval, &bind->block, env->ep); } static VALUE vm_make_env_object(const rb_execution_context_t *ec, rb_control_frame_t *cfp); -extern VALUE rb_vm_invoke_bmethod(rb_execution_context_t *ec, rb_proc_t *proc, VALUE self, int argc, const VALUE *argv, VALUE block_handler); +extern VALUE vm_invoke_bmethod(rb_execution_context_t *ec, rb_proc_t *proc, VALUE self, int argc, const VALUE *argv, VALUE block_handler); static VALUE vm_invoke_proc(rb_execution_context_t *ec, rb_proc_t *proc, VALUE self, int argc, const VALUE *argv, VALUE block_handler); static VALUE rb_block_param_proxy; #define RUBY_MJIT_H 1 enum rb_mjit_iseq_func { NOT_ADDED_JIT_ISEQ_FUNC = 0, NOT_READY_JIT_ISEQ_FUNC = 1, - NOT_COMPILED_JIT_ISEQ_FUNC = 2, + NOT_COMPILABLE_JIT_ISEQ_FUNC = 2, LAST_JIT_ISEQ_FUNC = 3 }; struct mjit_options { @@ -112186,7 +123218,7 @@ mjit_exec(rb_execution_context_t *ec) } return ((VALUE)RUBY_Qundef); case NOT_READY_JIT_ISEQ_FUNC: - case NOT_COMPILED_JIT_ISEQ_FUNC: + case NOT_COMPILABLE_JIT_ISEQ_FUNC: return ((VALUE)RUBY_Qundef); default: break; @@ -112259,7 +123291,6 @@ enum vm_regan_acttype { #define INC_SP(x) (VM_REG_SP += (COLLECT_USAGE_REGISTER_HELPER(SP, SET, (x)))) #define DEC_SP(x) (VM_REG_SP -= (COLLECT_USAGE_REGISTER_HELPER(SP, SET, (x)))) #define SET_SV(x) (*GET_SP() = (x)) -#define ADJ_SP(x) INC_SP(x) #define GET_ISEQ() (GET_CFP()->iseq) #define GET_PREV_EP(ep) ((VALUE *)((ep)[VM_ENV_DATA_INDEX_SPECVAL] & ~0x03)) #define GET_GLOBAL(entry) rb_gvar_get((struct rb_global_entry*)(entry)) @@ -112839,7 +123870,7 @@ lep_svar_write(const rb_execution_context_t *ec, const VALUE *lep, const struct vm_env_write(lep, (-2), (VALUE)svar); } else { - rb_obj_write((VALUE)(rb_ec_thread_ptr(ec)->self), (VALUE *)(&ec->root_svar), (VALUE)(svar), "../ruby_2_5/vm_insnhelper.c", 413); + rb_obj_write((VALUE)(rb_ec_thread_ptr(ec)->self), (VALUE *)(&ec->root_svar), (VALUE)(svar), "../snapshot/vm_insnhelper.c", 413); } } static VALUE @@ -112877,15 +123908,15 @@ lep_svar_set(const rb_execution_context_t *ec, const VALUE *lep, rb_num_t key, V } switch (key) { case VM_SVAR_LASTLINE: - rb_obj_write((VALUE)(svar), (VALUE *)(&svar->lastline), (VALUE)(val), "../ruby_2_5/vm_insnhelper.c", 459); + rb_obj_write((VALUE)(svar), (VALUE *)(&svar->lastline), (VALUE)(val), "../snapshot/vm_insnhelper.c", 459); return; case VM_SVAR_BACKREF: - rb_obj_write((VALUE)(svar), (VALUE *)(&svar->backref), (VALUE)(val), "../ruby_2_5/vm_insnhelper.c", 462); + rb_obj_write((VALUE)(svar), (VALUE *)(&svar->backref), (VALUE)(val), "../snapshot/vm_insnhelper.c", 462); return; default: { VALUE ary = svar->others; if (!((VALUE)(ary) != ((VALUE)RUBY_Qnil))) { - rb_obj_write((VALUE)(svar), (VALUE *)(&svar->others), (VALUE)(ary = rb_ary_new()), "../ruby_2_5/vm_insnhelper.c", 468); + rb_obj_write((VALUE)(svar), (VALUE *)(&svar->others), (VALUE)(ary = rb_ary_new()), "../snapshot/vm_insnhelper.c", 468); } rb_ary_store(ary, key - VM_SVAR_EXTRA_START, val); } @@ -113026,7 +124057,7 @@ cref_replace_with_duplicated_cref_each_frame(const VALUE *vptr, int can_be_svar, cref = (rb_cref_t *)v; new_cref = vm_cref_dup(cref); if (parent) { - rb_obj_write((VALUE)(parent), (VALUE *)(vptr), (VALUE)(new_cref), "../ruby_2_5/vm_insnhelper.c", 643); + rb_obj_write((VALUE)(parent), (VALUE *)(vptr), (VALUE)(new_cref), "../snapshot/vm_insnhelper.c", 643); } else { VM_FORCE_WRITE(vptr, (VALUE)new_cref); @@ -113151,7 +124182,7 @@ static inline void vm_check_if_namespace(VALUE klass) { if (!( ((RUBY_T_CLASS) == RUBY_T_FIXNUM) ? (((int)(long long)(klass))&RUBY_FIXNUM_FLAG) : ((RUBY_T_CLASS) == RUBY_T_TRUE) ? ((klass) == ((VALUE)RUBY_Qtrue)) : ((RUBY_T_CLASS) == RUBY_T_FALSE) ? ((klass) == ((VALUE)RUBY_Qfalse)) : ((RUBY_T_CLASS) == RUBY_T_NIL) ? ((klass) == ((VALUE)RUBY_Qnil)) : ((RUBY_T_CLASS) == RUBY_T_UNDEF) ? ((klass) == ((VALUE)RUBY_Qundef)) : ((RUBY_T_CLASS) == RUBY_T_SYMBOL) ? ((((VALUE)(klass)&~((~(VALUE)0)<flags & RUBY_T_MASK) == (RUBY_T_SYMBOL))) : ((RUBY_T_CLASS) == RUBY_T_FLOAT) ? ( ((((int)(long long)(klass))&RUBY_FLONUM_MASK) == RUBY_FLONUM_FLAG) || (!(((VALUE)(klass) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(klass) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(klass))->flags & RUBY_T_MASK) == RUBY_T_FLOAT)) : (!(((VALUE)(klass) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(klass) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(klass))->flags & RUBY_T_MASK) == (RUBY_T_CLASS))) && !( ((RUBY_T_MODULE) == RUBY_T_FIXNUM) ? (((int)(long long)(klass))&RUBY_FIXNUM_FLAG) : ((RUBY_T_MODULE) == RUBY_T_TRUE) ? ((klass) == ((VALUE)RUBY_Qtrue)) : ((RUBY_T_MODULE) == RUBY_T_FALSE) ? ((klass) == ((VALUE)RUBY_Qfalse)) : ((RUBY_T_MODULE) == RUBY_T_NIL) ? ((klass) == ((VALUE)RUBY_Qnil)) : ((RUBY_T_MODULE) == RUBY_T_UNDEF) ? ((klass) == ((VALUE)RUBY_Qundef)) : ((RUBY_T_MODULE) == RUBY_T_SYMBOL) ? ((((VALUE)(klass)&~((~(VALUE)0)<flags & RUBY_T_MASK) == (RUBY_T_SYMBOL))) : ((RUBY_T_MODULE) == RUBY_T_FLOAT) ? ( ((((int)(long long)(klass))&RUBY_FLONUM_MASK) == RUBY_FLONUM_FLAG) || (!(((VALUE)(klass) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(klass) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(klass))->flags & RUBY_T_MASK) == RUBY_T_FLOAT)) : (!(((VALUE)(klass) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(klass) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(klass))->flags & RUBY_T_MASK) == (RUBY_T_MODULE)))) { - rb_raise(rb_eTypeError, "%+""I64""i" "\v"" is not a class/module", klass); + rb_raise(rb_eTypeError, "%+""ll""i" "\v"" is not a class/module", klass); } } static inline void @@ -113307,7 +124338,7 @@ vm_getivar(VALUE obj, ID id, IC ic, struct rb_call_cache *cc, st_index_t index, } if ((__builtin_expect(!!(val == ((VALUE)RUBY_Qundef)), 0))) { if (!is_attr && !(((VALUE)((*rb_ruby_verbose_ptr())) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) - rb_warning("instance variable %""I64""i" "\v"" not initialized", rb_id_quote_unprintable(id)); + rb_warning("instance variable %""ll""i" "\v"" not initialized", rb_id_quote_unprintable(id)); val = ((VALUE)RUBY_Qnil); } ((void)0); @@ -113324,7 +124355,7 @@ vm_getivar(VALUE obj, ID id, IC ic, struct rb_call_cache *cc, st_index_t index, static inline VALUE vm_setivar(VALUE obj, ID id, VALUE val, IC ic, struct rb_call_cache *cc, int is_attr) { - __extension__({do { VALUE frozen_obj = (obj); if ((__builtin_expect(!!((!(!(((VALUE)(frozen_obj) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(frozen_obj) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(frozen_obj))->flags & RUBY_T_MASK) != RUBY_T_NODE) || (((struct RBasic*)(frozen_obj))->flags&RUBY_FL_FREEZE))), 0))) { rb_error_frozen_object(frozen_obj); } } while (0);}); + __extension__({do { VALUE frozen_obj = (obj); if ((!(!(((VALUE)(frozen_obj) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(frozen_obj) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(frozen_obj))->flags & RUBY_T_MASK) != RUBY_T_NODE) || (((struct RBasic*)(frozen_obj))->flags&RUBY_FL_FREEZE))) { rb_error_frozen_object(frozen_obj); } } while (0);}); if ((__builtin_expect(!!(( ((RUBY_T_OBJECT) == RUBY_T_FIXNUM) ? (((int)(long long)(obj))&RUBY_FIXNUM_FLAG) : ((RUBY_T_OBJECT) == RUBY_T_TRUE) ? ((obj) == ((VALUE)RUBY_Qtrue)) : ((RUBY_T_OBJECT) == RUBY_T_FALSE) ? ((obj) == ((VALUE)RUBY_Qfalse)) : ((RUBY_T_OBJECT) == RUBY_T_NIL) ? ((obj) == ((VALUE)RUBY_Qnil)) : ((RUBY_T_OBJECT) == RUBY_T_UNDEF) ? ((obj) == ((VALUE)RUBY_Qundef)) : ((RUBY_T_OBJECT) == RUBY_T_SYMBOL) ? ((((VALUE)(obj)&~((~(VALUE)0)<flags & RUBY_T_MASK) == (RUBY_T_SYMBOL))) : ((RUBY_T_OBJECT) == RUBY_T_FLOAT) ? ( ((((int)(long long)(obj))&RUBY_FLONUM_MASK) == RUBY_FLONUM_FLAG) || (!(((VALUE)(obj) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(obj) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(obj))->flags & RUBY_T_MASK) == RUBY_T_FLOAT)) : (!(((VALUE)(obj) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(obj) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(obj))->flags & RUBY_T_MASK) == (RUBY_T_OBJECT)))), 1))) { VALUE klass = ((struct RBasic*)(obj))->klass; st_data_t index; @@ -113332,7 +124363,7 @@ vm_setivar(VALUE obj, ID id, VALUE val, IC ic, struct rb_call_cache *cc, int is_ VALUE *ptr = ((((struct RBasic*)(obj))->flags & ROBJECT_EMBED) ? ((struct RObject*)(obj))->as.ary : ((struct RObject*)(obj))->as.heap.ivptr); index = !is_attr ? ic->ic_value.index : cc->aux.index-1; if ((index < ((((struct RBasic*)(obj))->flags & ROBJECT_EMBED) ? ROBJECT_EMBED_LEN_MAX : ((struct RObject*)(obj))->as.heap.numiv))) { - rb_obj_write((VALUE)(obj), (VALUE *)(&ptr[index]), (VALUE)(val), "../ruby_2_5/vm_insnhelper.c", 1006); + rb_obj_write((VALUE)(obj), (VALUE *)(&ptr[index]), (VALUE)(val), "../snapshot/vm_insnhelper.c", 1006); ((void)0); return val; } @@ -113530,11 +124561,11 @@ vm_throw(const rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, } } static inline void -vm_expandarray(VALUE *sp, VALUE ary, rb_num_t num, int flag) +vm_expandarray(rb_control_frame_t *cfp, VALUE ary, rb_num_t num, int flag) { int is_splat = flag & 0x01; rb_num_t space_size = num + is_splat; - VALUE *base = sp - 1; + VALUE *base = cfp->sp; const VALUE *ptr; rb_num_t len; const VALUE obj = ary; @@ -113547,6 +124578,7 @@ vm_expandarray(VALUE *sp, VALUE ary, rb_num_t num, int flag) ptr = rb_array_const_ptr(ary); len = (rb_num_t)rb_array_len(ary); } + cfp->sp += space_size; if (flag & 0x02) { rb_num_t i = 0, j; if (len < num) { @@ -113587,7 +124619,7 @@ vm_expandarray(VALUE *sp, VALUE ary, rb_num_t num, int flag) } static VALUE vm_call_general(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling, const struct rb_call_info *ci, struct rb_call_cache *cc); __attribute__ ((__visibility__("default"))) extern void -rb_vm_search_method_slowpath(const struct rb_call_info *ci, struct rb_call_cache *cc, VALUE klass) +vm_search_method_slowpath(const struct rb_call_info *ci, struct rb_call_cache *cc, VALUE klass) { cc->me = rb_callable_method_entry(klass, ci->mid); ((void)0); @@ -113607,7 +124639,7 @@ vm_search_method(const struct rb_call_info *ci, struct rb_call_cache *cc, VALUE return; } ((void)0); - rb_vm_search_method_slowpath(ci, cc, klass); + vm_search_method_slowpath(ci, cc, klass); } static inline int check_cfunc(const rb_callable_method_entry_t *me, VALUE (*func)()) @@ -113727,7 +124759,7 @@ rb_eql_opt(VALUE obj1, VALUE obj2) cc.me = ((void *)0); return opt_eql_func(obj1, obj2, &ci, &cc); } -extern VALUE rb_vm_call0(rb_execution_context_t *ec, VALUE, ID, int, const VALUE*, const rb_callable_method_entry_t *); +extern VALUE vm_call0(rb_execution_context_t *ec, VALUE, ID, int, const VALUE*, const rb_callable_method_entry_t *); static VALUE check_match(rb_execution_context_t *ec, VALUE pattern, VALUE target, enum vm_check_match_type type) { @@ -113742,7 +124774,7 @@ check_match(rb_execution_context_t *ec, VALUE pattern, VALUE target, enum vm_che const rb_callable_method_entry_t *me = rb_callable_method_entry_with_refinements(rb_class_of((VALUE)(pattern)), idEqq, ((void *)0)); if (me) { - return rb_vm_call0(ec, pattern, idEqq, 1, &target, me); + return vm_call0(ec, pattern, idEqq, 1, &target, me); } else { return rb_funcallv(pattern, idEqq, 1, &target); @@ -113801,8 +124833,8 @@ static VALUE method_missing(VALUE obj, ID id, int argc, const VALUE *argv, struct args_info { VALUE *argv; int argc; - int rest_index; const struct rb_call_info_kw_arg *kw_arg; + int rest_index; VALUE *kw_argv; VALUE rest; }; @@ -113884,7 +124916,7 @@ args_copy(struct args_info *args) args->argc = 0; args->rest = rb_ary_dup(args->rest); while (args->rest_index > 0 && argc > 0) { - do { const VALUE _ary = (args->rest); VALUE *ptr = (VALUE *)((VALUE *)rb_array_const_ptr(_ary)); rb_obj_write((VALUE)(_ary), (VALUE *)(&ptr[--args->rest_index]), (VALUE)((args->argv[--argc])), "../ruby_2_5/vm_args.c", 140); ; } while (0); + do { const VALUE _ary = (args->rest); VALUE *ptr = (VALUE *)((VALUE *)rb_array_const_ptr(_ary)); rb_obj_write((VALUE)(_ary), (VALUE *)(&ptr[--args->rest_index]), (VALUE)((args->argv[--argc])), "../snapshot/vm_args.c", 140); ; } while (0); } while (argc > 0) { rb_ary_unshift(args->rest, args->argv[--argc]); @@ -113953,7 +124985,7 @@ args_pop_keyword_hash(struct args_info *args, VALUE *kw_hash_ptr) *kw_hash_ptr = (rb_array_const_ptr(args->rest)[len - 1]); if (keyword_hash_p(kw_hash_ptr, &rest_hash)) { if (rest_hash) { - do { const VALUE _ary = (args->rest); VALUE *ptr = (VALUE *)((VALUE *)rb_array_const_ptr(_ary)); rb_obj_write((VALUE)(_ary), (VALUE *)(&ptr[len - 1]), (VALUE)((rest_hash)), "../ruby_2_5/vm_args.c", 219); ; } while (0); + do { const VALUE _ary = (args->rest); VALUE *ptr = (VALUE *)((VALUE *)rb_array_const_ptr(_ary)); rb_obj_write((VALUE)(_ary), (VALUE *)(&ptr[len - 1]), (VALUE)((rest_hash)), "../snapshot/vm_args.c", 219); ; } while (0); } else { args->rest = rb_ary_dup(args->rest); @@ -114443,7 +125475,7 @@ vm_to_proc(VALUE proc) const rb_callable_method_entry_t *me = rb_callable_method_entry_with_refinements(rb_class_of((VALUE)(proc)), idTo_proc, ((void *)0)); if (me) { - b = rb_vm_call0(rb_current_execution_context(), proc, idTo_proc, 0, ((void *)0), me); + b = vm_call0(rb_current_execution_context(), proc, idTo_proc, 0, ((void *)0), me); } else { b = rb_check_convert_type_with_id(proc, RUBY_T_DATA, "Proc", idTo_proc); @@ -114479,7 +125511,7 @@ refine_sym_proc_call(VALUE yielded_arg, VALUE callback_arg, int argc, const VALU if (!me) { return method_missing(obj, mid, argc, argv, MISSING_NOENTRY); } - return rb_vm_call0(ec, obj, mid, argc, argv, me); + return vm_call0(ec, obj, mid, argc, argv, me); } static void vm_caller_setup_arg_block(const rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, @@ -114814,7 +125846,7 @@ vm_call_bmethod_body(rb_execution_context_t *ec, struct rb_calling_info *calling VALUE val; ec->passed_bmethod_me = cc->me; (((proc)) = (rb_proc_t*)(((struct RData*)(((cc->me->def->body.proc))))->data)); - val = rb_vm_invoke_bmethod(ec, proc, calling->recv, calling->argc, argv, calling->block_handler); + val = vm_invoke_bmethod(ec, proc, calling->recv, calling->argc, argv, calling->block_handler); return val; } static VALUE @@ -115008,7 +126040,7 @@ aliased_callable_method_entry(const rb_callable_method_entry_t *me) ((void)0); cme = rb_method_entry_complement_defined_class(orig_me, me->called_id, defined_class); if (me->def->alias_count + me->def->complemented_count == 0) { - rb_obj_write((VALUE)(me), (VALUE *)(&me->def->body.alias.original_me), (VALUE)(cme), "../ruby_2_5/vm_insnhelper.c", 2220); + rb_obj_write((VALUE)(me), (VALUE *)(&me->def->body.alias.original_me), (VALUE)(cme), "../snapshot/vm_insnhelper.c", 2222); } else { rb_method_definition_t *def = @@ -115238,7 +126270,7 @@ vm_search_super_method(const rb_execution_context_t *ec, rb_control_frame_t *reg ((struct RBasic*)(current_defined_class))->klass : current_defined_class; rb_raise(rb_eTypeError, "self has wrong type to call super in this context: " - "%""I64""i" "\v"" (expected %""I64""i" "\v"")", + "%""ll""i" "\v"" (expected %""ll""i" "\v"")", rb_obj_class(calling->recv), m); } if (me->def->type == VM_METHOD_TYPE_BMETHOD && !sigval) { @@ -115758,13 +126790,13 @@ static VALUE vm_check_if_class(ID id, rb_num_t flags, VALUE super, VALUE klass) { if (!( ((RUBY_T_CLASS) == RUBY_T_FIXNUM) ? (((int)(long long)(klass))&RUBY_FIXNUM_FLAG) : ((RUBY_T_CLASS) == RUBY_T_TRUE) ? ((klass) == ((VALUE)RUBY_Qtrue)) : ((RUBY_T_CLASS) == RUBY_T_FALSE) ? ((klass) == ((VALUE)RUBY_Qfalse)) : ((RUBY_T_CLASS) == RUBY_T_NIL) ? ((klass) == ((VALUE)RUBY_Qnil)) : ((RUBY_T_CLASS) == RUBY_T_UNDEF) ? ((klass) == ((VALUE)RUBY_Qundef)) : ((RUBY_T_CLASS) == RUBY_T_SYMBOL) ? ((((VALUE)(klass)&~((~(VALUE)0)<flags & RUBY_T_MASK) == (RUBY_T_SYMBOL))) : ((RUBY_T_CLASS) == RUBY_T_FLOAT) ? ( ((((int)(long long)(klass))&RUBY_FLONUM_MASK) == RUBY_FLONUM_FLAG) || (!(((VALUE)(klass) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(klass) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(klass))->flags & RUBY_T_MASK) == RUBY_T_FLOAT)) : (!(((VALUE)(klass) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(klass) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(klass))->flags & RUBY_T_MASK) == (RUBY_T_CLASS)))) { - rb_raise(rb_eTypeError, "%""I64""i" "\v"" is not a class", rb_id2str(id)); + rb_raise(rb_eTypeError, "%""ll""i" "\v"" is not a class", rb_id2str(id)); } else if (((flags) & 0x10)) { VALUE tmp = rb_class_real(RCLASS_SUPER(klass)); if (tmp != super) { rb_raise(rb_eTypeError, - "superclass mismatch for class %""I64""i" "\v""", + "superclass mismatch for class %""ll""i" "\v""", rb_id2str(id)); } else { @@ -115779,7 +126811,7 @@ static VALUE vm_check_if_module(ID id, VALUE mod) { if (!( ((RUBY_T_MODULE) == RUBY_T_FIXNUM) ? (((int)(long long)(mod))&RUBY_FIXNUM_FLAG) : ((RUBY_T_MODULE) == RUBY_T_TRUE) ? ((mod) == ((VALUE)RUBY_Qtrue)) : ((RUBY_T_MODULE) == RUBY_T_FALSE) ? ((mod) == ((VALUE)RUBY_Qfalse)) : ((RUBY_T_MODULE) == RUBY_T_NIL) ? ((mod) == ((VALUE)RUBY_Qnil)) : ((RUBY_T_MODULE) == RUBY_T_UNDEF) ? ((mod) == ((VALUE)RUBY_Qundef)) : ((RUBY_T_MODULE) == RUBY_T_SYMBOL) ? ((((VALUE)(mod)&~((~(VALUE)0)<flags & RUBY_T_MASK) == (RUBY_T_SYMBOL))) : ((RUBY_T_MODULE) == RUBY_T_FLOAT) ? ( ((((int)(long long)(mod))&RUBY_FLONUM_MASK) == RUBY_FLONUM_FLAG) || (!(((VALUE)(mod) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(mod) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(mod))->flags & RUBY_T_MASK) == RUBY_T_FLOAT)) : (!(((VALUE)(mod) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(mod) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(mod))->flags & RUBY_T_MASK) == (RUBY_T_MODULE)))) { - rb_raise(rb_eTypeError, "%""I64""i" "\v"" is not a module", rb_id2str(id)); + rb_raise(rb_eTypeError, "%""ll""i" "\v"" is not a module", rb_id2str(id)); } else { return mod; @@ -115809,7 +126841,7 @@ vm_define_class(ID id, rb_num_t flags, VALUE cbase, VALUE super) VALUE klass; if (((flags) & 0x10) && !( ((RUBY_T_CLASS) == RUBY_T_FIXNUM) ? (((int)(long long)(super))&RUBY_FIXNUM_FLAG) : ((RUBY_T_CLASS) == RUBY_T_TRUE) ? ((super) == ((VALUE)RUBY_Qtrue)) : ((RUBY_T_CLASS) == RUBY_T_FALSE) ? ((super) == ((VALUE)RUBY_Qfalse)) : ((RUBY_T_CLASS) == RUBY_T_NIL) ? ((super) == ((VALUE)RUBY_Qnil)) : ((RUBY_T_CLASS) == RUBY_T_UNDEF) ? ((super) == ((VALUE)RUBY_Qundef)) : ((RUBY_T_CLASS) == RUBY_T_SYMBOL) ? ((((VALUE)(super)&~((~(VALUE)0)<flags & RUBY_T_MASK) == (RUBY_T_SYMBOL))) : ((RUBY_T_CLASS) == RUBY_T_FLOAT) ? ( ((((int)(long long)(super))&RUBY_FLONUM_MASK) == RUBY_FLONUM_FLAG) || (!(((VALUE)(super) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(super) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(super))->flags & RUBY_T_MASK) == RUBY_T_FLOAT)) : (!(((VALUE)(super) & RUBY_IMMEDIATE_MASK) || !!(((VALUE)(super) & (VALUE)~((VALUE)RUBY_Qnil)) == 0)) && (int)(((struct RBasic*)(super))->flags & RUBY_T_MASK) == (RUBY_T_CLASS)))) { rb_raise(rb_eTypeError, - "superclass must be a Class (%""I64""i" "\v"" given)", + "superclass must be a Class (%""ll""i" "\v"" given)", rb_obj_class(super)); } vm_check_if_namespace(cbase); @@ -115858,7 +126890,7 @@ vm_opt_str_freeze(VALUE str, int bop, ID id) return str; } else { - return rb_funcall(rb_str_resurrect(str), id, 0); + return __extension__({ const int rb_funcall_argc = (0); const VALUE rb_funcall_args[] = {}; const int rb_funcall_nargs = (int)(sizeof(rb_funcall_args) / sizeof(VALUE)); rb_funcallv(rb_str_resurrect(str), id, __builtin_choose_expr(__builtin_constant_p(rb_funcall_argc), (((rb_funcall_argc) == 0 ? (rb_funcall_nargs) <= 1 : (rb_funcall_argc) == (rb_funcall_nargs)) ? (rb_funcall_argc) : rb_varargs_bad_length(rb_funcall_argc, rb_funcall_nargs)), (((rb_funcall_argc) <= (rb_funcall_nargs)) ? (rb_funcall_argc) : (rb_fatal("argc(%d) exceeds actual arguments(%d)", rb_funcall_argc, rb_funcall_nargs), 0))), rb_funcall_nargs ? rb_funcall_args : ((void *)0)); }); } } #define id_cmp idCmp @@ -115884,7 +126916,7 @@ vm_opt_newarray_max(rb_num_t num, const VALUE *ptr) } else { VALUE ary = rb_ary_new_from_values(num, ptr); - return rb_funcall(ary, idMax, 0); + return __extension__({ const int rb_funcall_argc = (0); const VALUE rb_funcall_args[] = {}; const int rb_funcall_nargs = (int)(sizeof(rb_funcall_args) / sizeof(VALUE)); rb_funcallv(ary, idMax, __builtin_choose_expr(__builtin_constant_p(rb_funcall_argc), (((rb_funcall_argc) == 0 ? (rb_funcall_nargs) <= 1 : (rb_funcall_argc) == (rb_funcall_nargs)) ? (rb_funcall_argc) : rb_varargs_bad_length(rb_funcall_argc, rb_funcall_nargs)), (((rb_funcall_argc) <= (rb_funcall_nargs)) ? (rb_funcall_argc) : (rb_fatal("argc(%d) exceeds actual arguments(%d)", rb_funcall_argc, rb_funcall_nargs), 0))), rb_funcall_nargs ? rb_funcall_args : ((void *)0)); }); } } static VALUE @@ -115909,7 +126941,7 @@ vm_opt_newarray_min(rb_num_t num, const VALUE *ptr) } else { VALUE ary = rb_ary_new_from_values(num, ptr); - return rb_funcall(ary, idMin, 0); + return __extension__({ const int rb_funcall_argc = (0); const VALUE rb_funcall_args[] = {}; const int rb_funcall_nargs = (int)(sizeof(rb_funcall_args) / sizeof(VALUE)); rb_funcallv(ary, idMin, __builtin_choose_expr(__builtin_constant_p(rb_funcall_argc), (((rb_funcall_argc) == 0 ? (rb_funcall_nargs) <= 1 : (rb_funcall_argc) == (rb_funcall_nargs)) ? (rb_funcall_argc) : rb_varargs_bad_length(rb_funcall_argc, rb_funcall_nargs)), (((rb_funcall_argc) <= (rb_funcall_nargs)) ? (rb_funcall_argc) : (rb_fatal("argc(%d) exceeds actual arguments(%d)", rb_funcall_argc, rb_funcall_nargs), 0))), rb_funcall_nargs ? rb_funcall_args : ((void *)0)); }); } } #undef id_cmp @@ -115943,7 +126975,7 @@ vm_once_dispatch(rb_execution_context_t *ec, ISEQ iseq, ISE is) VALUE val; is->once.running_thread = th; val = rb_ensure(vm_once_exec, (VALUE)iseq, vm_once_clear, (VALUE)is); - rb_obj_write((VALUE)(ec->cfp->iseq), (VALUE *)(&is->once.value), (VALUE)(val), "../ruby_2_5/vm_insnhelper.c", 3317); + rb_obj_write((VALUE)(ec->cfp->iseq), (VALUE *)(&is->once.value), (VALUE)(val), "../snapshot/vm_insnhelper.c", 3319); is->once.running_thread = RUNNING_THREAD_ONCE_DONE; return val; } @@ -115992,7 +127024,7 @@ vm_stack_consistency_error(const rb_execution_context_t *ec, const ptrdiff_t nsp = ((cfp->sp) - (ec)->vm_stack); const ptrdiff_t nbp = ((bp) - (ec)->vm_stack); static const char stack_consistency_error[] = - "Stack consistency error (sp: %""I64""d"", bp: %""I64""d"")"; + "Stack consistency error (sp: %""t""d"", bp: %""t""d"")"; VALUE mesg = rb_sprintf(stack_consistency_error, nsp, nbp); __extension__ ({ (__builtin_constant_p("\n")) ? rb_str_cat((mesg), ("\n"), (long)strlen("\n")) : rb_str_cat_cstr((mesg), ("\n")); }); rb_str_append(mesg, rb_iseq_disasm(cfp->iseq)); @@ -116393,7 +127425,7 @@ vm_opt_regexpmatch1(VALUE recv, VALUE obj) return rb_reg_match(recv, obj); } else { - return rb_funcall(recv, idEqTilde, 1, obj); + return __extension__({ const int rb_funcall_argc = (1); const VALUE rb_funcall_args[] = {obj}; const int rb_funcall_nargs = (int)(sizeof(rb_funcall_args) / sizeof(VALUE)); rb_funcallv(recv, idEqTilde, __builtin_choose_expr(__builtin_constant_p(rb_funcall_argc), (((rb_funcall_argc) == 0 ? (rb_funcall_nargs) <= 1 : (rb_funcall_argc) == (rb_funcall_nargs)) ? (rb_funcall_argc) : rb_varargs_bad_length(rb_funcall_argc, rb_funcall_nargs)), (((rb_funcall_argc) <= (rb_funcall_nargs)) ? (rb_funcall_argc) : (rb_fatal("argc(%d) exceeds actual arguments(%d)", rb_funcall_argc, rb_funcall_nargs), 0))), rb_funcall_nargs ? rb_funcall_args : ((void *)0)); }); } } static VALUE @@ -116466,7 +127498,7 @@ static inline VALUE vm_yield_with_cref(rb_execution_context_t *ec, int argc, con static inline VALUE vm_yield(rb_execution_context_t *ec, int argc, const VALUE *argv); static inline VALUE vm_yield_with_block(rb_execution_context_t *ec, int argc, const VALUE *argv, VALUE block_handler); static inline VALUE vm_yield_force_blockarg(rb_execution_context_t *ec, VALUE args); -VALUE rb_vm_exec(rb_execution_context_t *ec, int mjit_enable_p); +VALUE vm_exec(rb_execution_context_t *ec, int mjit_enable_p); static void vm_set_eval_stack(rb_execution_context_t * th, const rb_iseq_t *iseq, const rb_cref_t *cref, const struct rb_block *base_block); static int vm_collect_local_variables_in_heap(const VALUE *dfp, const struct local_var_list *vars); static VALUE rb_eUncaughtThrow; @@ -116500,7 +127532,7 @@ raise_method_missing(rb_execution_context_t *ec, int argc, const VALUE *argv, VA } else if ((__builtin_expect(!!(!((((VALUE)(argv[0])&~((~(VALUE)0)<flags & RUBY_T_MASK) == (RUBY_T_SYMBOL)))), 0))) { const VALUE e = rb_eArgError; - rb_raise(e, "method name must be a Symbol but %""I64""i" "\v"" is given", + rb_raise(e, "method name must be a Symbol but %""ll""i" "\v"" is given", rb_obj_class(argv[0])); } stack_check(ec); @@ -116555,7 +127587,7 @@ method_missing(VALUE obj, ID id, int argc, const VALUE *argv, enum method_missin me = rb_callable_method_entry(klass, idMethodMissing); if (!me || (int) (((me)->flags & (((VALUE)RUBY_FL_USER6) )) >> ((((VALUE)RUBY_FL_USHIFT) + 4)+2))) goto missing; vm_passed_block_handler_set(ec, block_handler); - result = rb_vm_call0(ec, obj, idMethodMissing, argc, argv, me); + result = vm_call0(ec, obj, idMethodMissing, argc, argv, me); if (work) rb_free_tmp_buffer(&(work)); return result; }