IPU C++ intrinsics
Functions that target single IPU instructions.
#include <ipu_intrinsics>
These intrinsic functions target single IPU instructions and may be used in C++ IPU code. Each function is named after the instruction it targets. Float instructions that have a type prefix (for example, f16v2) often omit this prefix in their function name. For example, use cmpeq() to target any of the cmpeq instructions, such as f16v2cmpeq.
Refer to the Tile Vertex Instruction Set Architecture for Mk2 IPUs for more detailed information on the instructions targeted by these intrinsics.
Defines
-
IPU_INTRINSICS_INCLUDED
Variables
-
namespace ipu
IPU intrinsic functions.
Functions
-
template<typename T0, typename T1, typename = std::enable_if_t<both_integral<T0, T1>>>
inline unsigned andc(T0 src0, T1 src1) Targets the
andc
instruction.- Parameters
src0 – An integer value.
src1 – An integer value, can be a 12-bit constant.
- Returns
The bitwise logical
and
ofsrc0
and the negated value ofsrc1
of typeunsigned
.
-
inline float andc(float src0, float src1)
Targets the
andc
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float
.
- Returns
The bitwise logical
and
ofsrc0
and the negated value ofsrc1
of typefloat
.
-
inline float2 andc(float2 src0, float2 src1)
Targets the
andc64
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
The bitwise logical
and
ofsrc0
and the negated value ofsrc1
of typefloat2
.
-
inline unsigned bitrev8(unsigned src)
Targets the
bitrev8
instruction.- Parameters
src – A value of type
unsigned
.- Returns
A result of type
unsigned
that is equivalent to the value ofsrc
with the bit order of each byte reversed.
-
inline unsigned cms(unsigned src)
Targets the
cms
instruction.- Parameters
src – A value of type
unsigned
.- Returns
The number of higher order bits in
src
that match the sign bit (bit 31), as anunsigned
.
-
inline float2 roll32(float2 src0, float2 src1)
Targets the
roll32
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
The result of a SIMD roll permutation on the 4 32-bit float values across
src0
andsrc1
, as afloat2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 2 | 1 |
-
template<typename T0, typename T1, typename = std::enable_if_t<both_integral<T0, T1>>>
inline unsigned roll8l(T0 src0, T1 src1) Targets the
roll8l
instruction.- Parameters
src0 – An integer value.
src1 – An integer value.
- Returns
The result of a SIMD roll-left permutation on the 8 8-bit values across
src0
andsrc1
, as anunsigned
. src0 src1 -> Result | 7 | 6 | 5 | 4 | | 3 | 2 | 1 | 0 | | 6 | 5 | 4 | 3 |
-
template<typename T0, typename T1, typename = std::enable_if_t<both_integral<T0, T1>>>
inline unsigned roll8r(T0 src0, T1 src1) Targets the
roll8r
instruction.- Parameters
src0 – An integer value.
src1 – An integer value.
- Returns
The result of a SIMD roll-right permutation on the 8 8-bit values across
src0
andsrc1
, as anunsigned
. src0 src1 -> Result | 7 | 6 | 5 | 4 | | 3 | 2 | 1 | 0 | | 4 | 3 | 2 | 1 |
-
template<typename T0, typename T1, typename = std::enable_if_t<both_integral<T0, T1>>>
inline unsigned shuf8x8hi(T0 src0, T1 src1) Targets the
shuf8x8hi
instruction.- Parameters
src0 – An integer value.
src1 – An integer value.
- Returns
The upper word of a SIMD shuffle permutation on the 8 8-bit values across
src0
andsrc1
, as anunsigned
. src0 src1 -> Result | 7 | 6 | 5 | 4 | | 3 | 2 | 1 | 0 | | 7 | 3 | 6 | 2 |
-
template<typename T0, typename T1, typename = std::enable_if_t<both_integral<T0, T1>>>
inline unsigned shuf8x8lo(T0 src0, T1 src1) Targets the
shuf8x8lo
instruction.- Parameters
src0 – An integer value.
src1 – An integer value.
- Returns
The lower word of a SIMD shuffle permutation on the 8 8-bit values across
src0
andsrc1
, as anunsigned
. src0 src1 -> Result | 7 | 6 | 5 | 4 | | 3 | 2 | 1 | 0 | | 5 | 1 | 4 | 0 |
-
inline float2 sort4x32hi(float2 src0, float2 src1)
Targets the
sort4x32hi
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
The result of a SIMD sort permutation on the 4 32-bit float values across
src0
andsrc1
, as afloat2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 3 | 1 |
-
inline float2 sort4x32lo(float2 src0, float2 src1)
Targets the
sort4x32lo
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
The result of a SIMD sort permutation on the 4 32-bit float values across
src0
andsrc1
, as afloat2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 2 | 0 |
-
inline unsigned sort8(unsigned src)
Targets the
sort8
instruction.- Parameters
src – A value of type
unsigned
.- Returns
The result of a SIMD sort8 permutation on the 4 8-bit values in
src
, as anunsigned
. src -> Result | 3 | 2 | 1 | 0 | | 3 | 1 | 2 | 0 |
-
template<typename T0, typename T1, typename = std::enable_if_t<both_integral<T0, T1>>>
inline unsigned sort8x8hi(T0 src0, T1 src1) Targets the
sort8x8hi
instruction.- Parameters
src0 – An integer value.
src1 – An integer value.
- Returns
The upper word of the result of a SIMD sort8 permutation on the 8 8-bit values across
src0
andsrc1
, as anunsigned
. src0 src1 -> Result | 7 | 6 | 5 | 4 | | 3 | 2 | 1 | 0 | | 7 | 5 | 3 | 1 |
-
template<typename T0, typename T1, typename = std::enable_if_t<both_integral<T0, T1>>>
inline unsigned sort8x8lo(T0 src0, T1 src1) Targets the
sort8x8lo
instruction.- Parameters
src0 – An integer value.
src1 – An integer value.
- Returns
The lower word of the result of a SIMD sort8 permutation on the 8 8-bit values across
src0
andsrc1
, as anunsigned
. src0 src1 -> Result | 7 | 6 | 5 | 4 | | 3 | 2 | 1 | 0 | | 6 | 4 | 2 | 0 |
-
inline unsigned swap8(unsigned src)
Targets the
swap8
instruction.- Parameters
src – A value of type
unsigned
.- Returns
The result of a SIMD swap permutation on the 4 8-bit values in
src
, as anunsigned
. src -> Result | 3 | 2 | 1 | 0 | | 2 | 3 | 0 | 1 |
-
inline half2 absadd(half2 src0, half2 src1)
Targets the
f16v2absadd
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
The result of an element-wise addition of absolute values in
src0
andsrc1
.
-
inline half4 absadd(half4 src0, half4 src1)
Targets the
f16v4absadd
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half4
.
- Returns
The result of an element-wise addition of absolute values in
src0
andsrc1
.
-
inline float2 absadd(float2 src0, float2 src1)
Targets the
f32v2absadd
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
The result of an element-wise addition of absolute values in
src0
andsrc1
.
-
inline float absadd(float src0, float src1)
Targets the
f32absadd
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float
.
- Returns
The result of a scalar addition of absolute values
src0
andsrc1
.
-
inline half2 absmax(half2 src0, half2 src1)
Targets the
f16v2absmax
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
The element-wise maximum of absolute values in
src0
andsrc1
.
-
inline half4 absmax(half4 src0, half4 src1)
Targets the
f16v4absmax
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half4
.
- Returns
The element-wise maximum of absolute values in
src0
andsrc1
.
-
inline float2 absmax(float2 src0, float2 src1)
Targets the
f32v2absmax
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
The element-wise maximum of absolute values in
src0
andsrc1
.
-
inline float absmax(float src0, float src1)
Targets the
f32absmax
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float
.
- Returns
The maximum of absolute values
src0
andsrc1
.
-
inline half2 max(half2 src0, half2 src1)
Targets the
f16v2max
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
The element-wise maximum of
src0
andsrc1
.
-
inline half4 max(half4 src0, half4 src1)
Targets the
f16v4max
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half4
.
- Returns
The element-wise maximum of
src0
andsrc1
.
-
inline float2 max(float2 src0, float2 src1)
Targets the
f32v2max
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
The element-wise maximum of
src0
andsrc1
.
-
inline float max(float src0, float src1)
Targets the
f32max
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float
.
- Returns
The maximum of
src0
andsrc1
.
-
inline half2 maxc(half4 src)
Targets the
f16v4maxc
instruction.- Parameters
src – A value of type
half4
.- Returns
The 2x2 lateral maximum of
src
. The 0th element in the result vector is the maximum of src[0] and src[1], and the 1st element is the maximum of src[2] and src[3].
-
inline half2 min(half2 src0, half2 src1)
Targets the
f16v2min
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
The element-wise minimum of
src0
andsrc1
.
-
inline half4 min(half4 src0, half4 src1)
Targets the
f16v4min
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half4
.
- Returns
The element-wise minimum of
src0
andsrc1
.
-
inline float2 min(float2 src0, float2 src1)
Targets the
f32v2min
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
The element-wise minimum of
src0
andsrc1
.
-
inline float min(float src0, float src1)
Targets the
f32min
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float
.
- Returns
The minimum of
src0
andsrc1
.
-
inline half2 clamp(half2 src0, half2 src1)
Targets the
f16v2clamp
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
The min-of-maximum result of
src0
andsrc1
, of typehalf2
. The first element is the median value of the first element ofsrc0
and the two elements insrc1
. The second element is the median of the second element ofsrc0
and the two elements insrc1
.
-
inline half4 clamp(half4 src0, half2 src1)
Targets the
f16v4clamp
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half2
.
- Returns
The min-of-maximum result of
src0
andsrc1
, of typehalf4
. Each element is the median of the element insrc0
at the same index, and the two values insrc1
.
-
inline float2 clamp(float2 src0, float2 src1)
Targets the
f32v2clamp
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
The min-of-maximum result of
src0
andsrc1
, of typefloat2
. The first element is the median of the first element ofsrc0
and the two elements insrc1
. The second element is the median of the second element ofsrc0
and the two elements insrc1
.
-
inline float clamp(float src0, float2 src1)
Targets the
f32clamp
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float2
.
- Returns
The median of
src0
and the two elements insrc1
.
-
inline void cmac(half2 src0, half2 src1)
Targets the
f16v2cmac
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
-
inline void cmac(half4 src0, half4 src1)
Targets the
f16v4cmac
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half4
.
-
inline half2 exp(half2 src)
Targets the
f16v2exp
instruction.- Parameters
src – A value of type
half2
.- Returns
A vector of the results of
e^X
of the two elements insrc
.
-
inline float exp(float src)
Targets the
f32exp
instruction.- Parameters
src – A value of type
float
.- Returns
The result of
e^{src}
.
-
inline half2 exp2(half2 src)
Targets the
f16v2exp2
instruction.- Parameters
src – A value of type
half2
.- Returns
A vector of the results of 2^X of the two elements in
src
.
-
inline float exp2(float src)
Targets the
f32exp2
instruction.- Parameters
src – A value of type
float
.- Returns
The result of
2^{src}
.
-
inline half2 log2(half2 src)
Targets the
f16v2log2
instruction.- Parameters
src – A value of type
half2
.- Returns
A vector of the results of the log (base 2) of the two elements in
src
.
-
inline float log2(float src)
Targets the
f32log2
instruction.- Parameters
src – A value of type
float
.- Returns
The result of the log (base 2) of
src
.
-
inline half2 tanh(half2 src)
Targets the
f16v2tanh
instruction.- Parameters
src – A value of type
half2
.- Returns
The result of tanh(src).
-
inline float tanh(float src)
Targets the
f32tanh
instruction.- Parameters
src – A value of type
float
.- Returns
The result of tanh(src).
-
inline half2 ln(half2 src)
Targets the
f16v2ln
instruction.- Parameters
src – A value of type
half2
.- Returns
A vector of the results of the natural log of the two elements in
src
.
-
inline float ln(float src)
Targets the
f32ln
instruction.- Parameters
src – A value of type
float
.- Returns
The result of the natural log of
src
.
-
inline float2 axpy(float2 src0, float2 src1)
Targets the
f32v2axpy
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
The single-precision two-element vector res = a*src0 + src1. The scalar multiplicand a is provided by the internal state element $TAS.
-
inline half2 f16v2grand()
Targets the
f16v2grand
instruction.- Returns
Gaussian distribution, two-element half-precision random vector.
-
inline float2 f32v2grand()
Targets the
f32v2grand
instruction.- Returns
Gaussian distribution, two-element single-precision random vector.
-
inline half4 rmask(half4 src0, float src1)
Targets the
f16v4rmask
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
float
.
- Returns
The result is a masked version of src0, with each element of the input being individually masked with the probability specified by the bottom 17 bits of src1: if src1[16] == 1, no masking is applied; if src1[16:0] == 0, the result is a zero vector; otherwise each element is individually unmasked with probability src1[15:0] / 65536. The PRNG is used by this instruction to generate 4 x 16-bit random values from the discrete uniform distribution.
-
inline float2 rmask(float2 src0, float src1)
Targets the
f32v2rmask
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float
.
- Returns
The result is a masked version of src0, with each element of the input being individually masked with the probability specified by the bottom 17 bits of src1: if src1[16] == 1, no masking is applied; if src1[16:0] == 0, the result is a zero vector; otherwise each element is individually unmasked with probability src1[15:0] / 65536. The PRNG is used by this instruction to generate 2 x 16-bit random values from the discrete uniform distribution.
-
inline half2 sigm(half2 src)
Targets the
f16v2sigm
instruction.- Parameters
src – A value of type
half2
.- Returns
The result of an element-wise application of the sigmoid function on
src
.
-
inline float sigm(float src)
Targets the
f32sigm
instruction.- Parameters
src – A value of type
float
.- Returns
The result of applying the sigmoid function to src.
-
inline float sum(half2 src)
Targets the
f16v2sum
instruction.- Parameters
src – A value of type
half2
.- Returns
The sum of the two elements in
src
as afloat
.
-
inline float2 sum(half4 src)
Targets the
f16v4sum
instruction.- Parameters
src – A value of type
half4
.- Returns
The 2x2 lateral summation of the elements in
src
as afloat2
. The first element is the sum ofsrc
[0] andsrc
[1], the second element is the sum ofsrc
[2] andsrc
[3].
-
inline half2 cmpeq(half2 src0, half2 src1)
Targets the
f16v2cmpeq
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
Element-wise equality test of
src0
andsrc1
. If src0[i] == src1[i], the result vector element at indexi
will be0xffff
, and0x0000
otherwise.
-
inline half4 cmpeq(half4 src0, half4 src1)
Targets the
f16v4cmpeq
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half4
.
- Returns
Element-wise equality test of
src0
andsrc1
. If src0[i] == src1[i], the result vector element at indexi
will be0xffff
, and0x0000
otherwise.
-
inline float2 cmpeq(float2 src0, float2 src1)
Targets the
f32v2cmpeq
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
Element-wise equality test of
src0
andsrc1
. If src0[i] == src1[i], the result vector element at indexi
will be0xffff
, and0x0000
otherwise.
-
inline float cmpeq(float src0, float src1)
Targets the
f32cmpeq
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float
.
- Returns
Equality test of
src0
andsrc1
. Ifsrc0
==src1
the result will be0xffff
, and0x0000
otherwise.
-
inline half2 cmpge(half2 src0, half2 src1)
Targets the
f16v2cmpge
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
Element-wise greater-than-or-equal-to test of
src0
andsrc1
. Ifsrc0
[i] >=src1
[i] the result vector element at indexi
will be0xffff
, and0x0000
otherwise.
-
inline half4 cmpge(half4 src0, half4 src1)
Targets the
f16v4cmpge
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half4
.
- Returns
Element-wise greater-than-or-equal-to test of
src0
andsrc1
. Ifsrc0
[i] >=src1
[i] the result vector element at indexi
will be0xffff
, and0x0000
otherwise.
-
inline float2 cmpge(float2 src0, float2 src1)
Targets the
f32v2cmpge
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
Element-wise greater-than-or-equal-to test of
src0
andsrc1
. Ifsrc0
[i] >=src1
[i] the result vector element at indexi
will be0xffff
, and0x0000
otherwise.
-
inline float cmpge(float src0, float src1)
Targets the
f32cmpge
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float
.
- Returns
Greater-than-or-equal-to test of
src0
andsrc1
. Ifsrc0
>=src1
the result will be0xffff
, and0x0000
otherwise.
-
inline half2 cmpgt(half2 src0, half2 src1)
Targets the
f16v2cmpgt
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
Element-wise greater-than test of src0 and src1. If src0[i] > src1[i], the result vector element at index i will be 0xffff, and 0x0000 otherwise.
-
inline half4 cmpgt(half4 src0, half4 src1)
Targets the
f16v4cmpgt
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half4
.
- Returns
Element-wise greater-than test of src0 and src1. If src0[i] > src1[i], the result vector element at index i will be 0xffff, and 0x0000 otherwise.
-
inline float2 cmpgt(float2 src0, float2 src1)
Targets the
f32v2cmpgt
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
Element-wise greater-than test of src0 and src1. If src0[i] > src1[i], the result vector element at index i will be 0xffff, and 0x0000 otherwise.
-
inline float cmpgt(float src0, float src1)
Targets the
f32cmpgt
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float
.
- Returns
Greater-than test of
src0
andsrc1
. Ifsrc0
>src1
the result will be0xffff
, and0x0000
otherwise.
-
inline half2 cmple(half2 src0, half2 src1)
Targets the
f16v2cmple
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
Element-wise less-than-or-equal-to test of src0 and src1. If src0[i] <= src1[i], the result vector element at index i will be 0xffff, and 0x0000 otherwise.
-
inline half4 cmple(half4 src0, half4 src1)
Targets the
f16v4cmple
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half4
.
- Returns
Element-wise less-than-or-equal-to test of src0 and src1. If src0[i] <= src1[i], the result vector element at index i will be 0xffff, and 0x0000 otherwise.
-
inline float2 cmple(float2 src0, float2 src1)
Targets the
f32v2cmple
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
Element-wise less-than-or-equal-to test of src0 and src1. If src0[i] <= src1[i], the result vector element at index i will be 0xffff, and 0x0000 otherwise.
-
inline float cmple(float src0, float src1)
Targets the
f32cmple
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float
.
- Returns
Less-than-or-equal-to test of
src0
andsrc1
. Ifsrc0
<=src1
the result will be0xffff
, and0x0000
otherwise.
-
inline half2 cmplt(half2 src0, half2 src1)
Targets the
f16v2cmplt
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
Element-wise less-than test of src0 and src1. If src0[i] < src1[i], the result vector element at index i will be 0xffff, and 0x0000 otherwise.
-
inline half4 cmplt(half4 src0, half4 src1)
Targets the
f16v4cmplt
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half4
.
- Returns
Element-wise less-than test of src0 and src1. If src0[i] < src1[i], the result vector element at index i will be 0xffff, and 0x0000 otherwise.
-
inline float2 cmplt(float2 src0, float2 src1)
Targets the
f32v2cmplt
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
Element-wise less-than test of src0 and src1. If src0[i] < src1[i], the result vector element at index i will be 0xffff, and 0x0000 otherwise.
-
inline float cmplt(float src0, float src1)
Targets the
f32cmplt
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float
.
- Returns
Less-than test of
src0
andsrc1
. Ifsrc0
<src1
the result will be0xffff
, and0x0000
otherwise.
-
inline half2 cmpne(half2 src0, half2 src1)
Targets the
f16v2cmpne
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
Element-wise inequality test of
src0
andsrc1
. If src0[i] != src1[i], the result vector element at indexi
will be0xffff
, and0x0000
otherwise.
-
inline half4 cmpne(half4 src0, half4 src1)
Targets the
f16v4cmpne
instruction.- Parameters
src0 – A value of type
half4
.src1 – A value of type
half4
.
- Returns
Element-wise inequality test of
src0
andsrc1
. If src0[i] != src1[i], the result vector element at indexi
will be0xffff
, and0x0000
otherwise.
-
inline float2 cmpne(float2 src0, float2 src1)
Targets the
f32v2cmpne
instruction.- Parameters
src0 – A value of type
float2
.src1 – A value of type
float2
.
- Returns
Element-wise inequality test of
src0
andsrc1
. If src0[i] != src1[i], the result vector element at indexi
will be0xffff
, and0x0000
otherwise.
-
inline float cmpne(float src0, float src1)
Targets the
f32cmpne
instruction.- Parameters
src0 – A value of type
float
.src1 – A value of type
float
.
- Returns
Inequality test of
src0
andsrc1
. Ifsrc0
!=src1
the result will be0xffff
, and0x0000
otherwise.
-
inline float to_float(half src)
Conversion from a half-precision float to single-precision.
Targets the
f16tof32
instruction.- Parameters
src – A value of type
half
.- Returns
The half-precision float value
src
converted to single-precision.
-
inline float2 to_float(half2 src)
Two-element vector conversion from half-precision to single-precision.
Targets the
f16v2tof32
instruction.- Parameters
src – A vector of type
half2
.- Returns
The vector of two half-precision float values
src
converted to single-precision.
-
inline float to_float(unsigned src)
Conversion from an unsigned integer to a single-precision float.
Targets the
f32fromui32
instruction.- Parameters
src – An unsigned integer.
- Returns
The unsigned integer
src
converted to a single-precision float value.
-
inline float to_float(int src)
Conversion from a signed integer to a single-precision float.
Targets the
f32fromi32
instruction.- Parameters
src – A signed integer.
- Returns
The signed integer
src
converted to a single-precision float value.
-
inline float to_float_su(unsigned src)
Symmetric, unbiased conversion from an unsigned integer to a single-precision float.
Targets the
f32sufromui
instruction.- Parameters
src – An unsigned integer.
- Returns
The unsigned integer
src
converted to a single-precision float value via symmetric, unbiased conversion.
-
inline float2 to_float_su(uint2 src)
Symmetric, unbiased conversion from a vector of two unsigned integers to single-precision floats.
Targets the
f32v2sufromui
instruction.- Parameters
src – A vector of two unsigned integers.
- Returns
The vector of two unsigned integers
src
converted to single-precision floats via symmetric, unbiased conversion.
-
inline half to_half(float src)
Conversion from a single-precision float to half-precision.
Targets the
f32tof16
instruction.- Parameters
src – A value of type
float
.- Returns
The single-precision float value
src
converted to half-precision.
-
inline half2 to_half(float2 src)
Conversion from a vector of two single-precision floats to half-precision.
Targets the
f32v2tof16
instruction.- Parameters
src – A vector of two single-precision float values.
- Returns
The vector of two single-precision float values
src
converted to half-precision.
-
inline half2 to_half_su(ushort2 src)
Symmetric, unbiased conversion from a vector of two unsigned 16-bit integers to half-precision floats.
Targets the
f16v2sufromui
instruction.- Parameters
src – A two-element vector of unsigned 16-bit integers.
- Returns
The vector of two unsigned 16-bit integers
src
converted to half precision floats via symmetric, unbiased conversion.
-
inline half4 to_half_su(ushort4 src)
Symmetric, unbiased conversion from a vector of four unsigned 16-bit integers to half-precision floats.
Targets the
f16v4sufromui
instruction.- Parameters
src – A four-element vector of unsigned 16-bit integers.
- Returns
The vector of four unsigned 16-bit integers
src
converted to half precision floats via symmetric, unbiased conversion.
-
inline unsigned to_uint(float src)
Conversion from a single-precision float to an unsigned integer.
Targets the
f32toui32
instruction.- Parameters
src – A float value.
- Returns
The single-precision float
src
converted to an unsigned integer.
-
inline int to_int(float src)
Conversion from a single-precision float to a signed integer.
Targets the
f32toi32
instruction.- Parameters
src – A float value.
- Returns
The single-precision float
src
converted to a signed integer.
-
inline float f16tof32(half src)
Targets the
f16tof32
instruction.- Parameters
src – A value of type
half
.- Returns
The half-precision float value
src
converted to single-precision.
-
inline float2 f16v2tof32(half2 src)
Targets the
f16v2tof32
instruction.- Parameters
src – A vector of type
half2
.- Returns
The vector of two half-precision float values
src
converted to single-precision.
-
inline half2 f16v2sufromui(half2 src)
Targets the
f16v2sufromui
instruction.- Parameters
src – A two-element vector of unsigned 16-bit integers, as a variable of type
half2
. Note: this builtin directly targets thef16v2sufromui
instruction, whose operand and result are both in the floating point register file. To achieve this conversion from aushort2
type, see the to_half_su
function in this header.- Returns
The vector of two unsigned 16-bit integers
src
converted to half precision floats via symmetric, unbiased conversion.
-
inline half4 f16v4sufromui(half4 src)
Targets the
f16v4sufromui
instruction.- Parameters
src – A four-element vector of unsigned 16-bit integers, as a variable of type
half4
. Note: this builtin directly targets thef16v4sufromui
instruction, whose operand and result are both in the floating point register file. To achieve this conversion from aushort4
type, see the to_half_su
function in this header.- Returns
The vector of four unsigned 16-bit integers
src
converted to half precision floats via symmetric, unbiased conversion.
-
inline float f32fromi32(float src)
Targets the
f32fromi32
instruction.- Parameters
src – A signed integer as a float variable. Note: this builtin directly targets the
f32fromi32
instruction, whose operand and result are both in the floating point register file. To achieve this conversion from anint
type, see theto_float
function in this header.- Returns
The signed integer
src
converted to a single-precision float value.
-
inline float f32fromui32(float src)
Targets the
f32fromui32
instruction.- Parameters
src – An unsigned integer as a float variable. Note: this builtin directly targets the
f32fromui32
instruction, whose operand and result are both in the floating point register file. To achieve this conversion from anunsigned
type, see theto_float
function in this header.- Returns
The unsigned integer
src
converted to a single-precision float value.
-
inline float f32sufromui(float src)
Targets the
f32sufromui
instruction.- Parameters
src – An unsigned integer as a float variable. Note: this builtin directly targets the
f32sufromui
instruction, whose operand and result are both in the floating point register file. To achieve this conversion from anunsigned
type, see theto_float_su
function in this header.- Returns
The unsigned integer
src
converted to a single-precision float value via symmetric, unbiased conversion.
-
inline half f32tof16(float src)
Targets the
f32tof16
instruction.- Parameters
src – A value of type
float
.- Returns
The single-precision float value
src
converted to half-precision.
-
inline float f32toi32(float src)
Targets the
f32toi32
instruction.- Parameters
src – A float value.
- Returns
The single-precision float
src
converted to an integer, as a float-type variable. Note: this builtin directly targets thef32toi32
instruction, whose operand and result are both in the floating point register file. To achieve this conversion to anint
type, see theto_int
function in this header.
-
inline float f32toui32(float src)
Targets the
f32toui32
instruction.- Parameters
src – A float value.
- Returns
The single-precision float
src
converted to an unsigned integer, as a float-type variable. Note: this builtin directly targets thef32toui32
instruction, whose operand and result are both in the floating point register file. To achieve this conversion to anunsigned
type, see theto_uint
function in this header.
-
inline float2 f32v2sufromui(float2 src)
Targets the
f32v2sufromui
instruction.- Parameters
src – A two-element vector of unsigned integers, as a variable of type
float2
. Note: this builtin directly targets thef32v2sufromui
instruction, whose operand and result are both in the floating point register file. To achieve this conversion from auint2
type, see the to_float_su
function in this header.- Returns
The vector of two unsigned integers
src
converted to single precision floats via symmetric, unbiased conversion.
-
inline half2 f32v2tof16(float2 src)
Targets the
f32v2tof16
instruction.- Parameters
src – A vector of two single-precision float values.
- Returns
The vector of two single-precision float values
src
converted to half-precision.
-
inline unsigned clz(int src)
Targets the
clz
instruction.- Parameters
src – A value of type
int
.- Returns
The number of higher bits in
src
that are zero.
-
inline unsigned popc(int src)
Targets the
popc
instruction.- Parameters
src – A value of type
int
.- Returns
The number of set bits in
src
.
-
inline short2 roll16(short2 src0, short2 src1)
Targets the
roll16
instruction.- Parameters
src0 – A value of type
short2
.src1 – A value of type
short2
.
- Returns
The result of a SIMD roll permutation on the 4 16-bit values across
src0
andsrc1
, as ashort2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 2 | 1 |
-
inline ushort2 roll16(ushort2 src0, ushort2 src1)
Targets the
roll16
instruction.- Parameters
src0 – A value of type
ushort2
.src1 – A value of type
ushort2
.
- Returns
The result of a SIMD roll permutation on the 4 16-bit values across
src0
andsrc1
, as aushort2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 2 | 1 |
-
inline half2 roll16(half2 src0, half2 src1)
Targets the
roll16
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
The result of a SIMD roll permutation on the 4 16-bit values across
src0
andsrc1
, as ahalf2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 2 | 1 |
-
inline short2 sort4x16hi(short2 src0, short2 src1)
Targets the
sort4x16hi
instruction.- Parameters
src0 – A value of type
short2
.src1 – A value of type
short2
.
- Returns
The result of a SIMD sort permutation on the 4 16-bit values across
src0
andsrc1
, as ashort2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 3 | 1 |
-
inline ushort2 sort4x16hi(ushort2 src0, ushort2 src1)
Targets the
sort4x16hi
instruction.- Parameters
src0 – A value of type
ushort2
.src1 – A value of type
ushort2
.
- Returns
The result of a SIMD sort permutation on the 4 16-bit values across
src0
andsrc1
, as aushort2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 3 | 1 |
-
inline half2 sort4x16hi(half2 src0, half2 src1)
Targets the
sort4x16hi
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
The result of a SIMD sort permutation on the 4 16-bit values across
src0
andsrc1
, as ahalf2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 3 | 1 |
-
inline short2 sort4x16lo(short2 src0, short2 src1)
Targets the
sort4x16lo
instruction.- Parameters
src0 – A value of type
short2
.src1 – A value of type
short2
.
- Returns
The result of a SIMD sort permutation on the 4 16-bit values across
src0
andsrc1
, as ashort2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 2 | 0 |
-
inline ushort2 sort4x16lo(ushort2 src0, ushort2 src1)
Targets the
sort4x16lo
instruction.- Parameters
src0 – A value of type
ushort2
.src1 – A value of type
ushort2
.
- Returns
The result of a SIMD sort permutation on the 4 16-bit values across
src0
andsrc1
, as aushort2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 2 | 0 |
-
inline half2 sort4x16lo(half2 src0, half2 src1)
Targets the
sort4x16lo
instruction.- Parameters
src0 – A value of type
half2
.src1 – A value of type
half2
.
- Returns
The result of a SIMD sort permutation on the 4 16-bit values across
src0
andsrc1
, as ahalf2
. src0 src1 -> Result | 3 | 2 | | 1 | 0 | | 2 | 0 |
-
template<typename T0, typename T1, typename = std::enable_if_t<both_integral<T0, T1>>>