Permutation

Generated from include/msa2.h. This page contains 57 intrinsics.

__m128i __msa2_n2x_rnd_sx_nc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_nc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.nc.b
Builtin: __builtin_msa2_n2x_rnd_sx_nc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:260

Description

Narrow i16 elements into u8 elements using a rounding shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = truncate(rounding_shift(a.i16[0], imm), 8);
dst.u8[1] = truncate(rounding_shift(a.i16[1], imm), 8);
dst.u8[2] = truncate(rounding_shift(a.i16[2], imm), 8);
dst.u8[3] = truncate(rounding_shift(a.i16[3], imm), 8);
dst.u8[4] = truncate(rounding_shift(a.i16[4], imm), 8);
dst.u8[5] = truncate(rounding_shift(a.i16[5], imm), 8);
dst.u8[6] = truncate(rounding_shift(a.i16[6], imm), 8);
dst.u8[7] = truncate(rounding_shift(a.i16[7], imm), 8);
dst.u8[8] = truncate(rounding_shift(a.i16[8], imm), 8);
dst.u8[9] = truncate(rounding_shift(a.i16[9], imm), 8);
dst.u8[10] = truncate(rounding_shift(a.i16[10], imm), 8);
dst.u8[11] = truncate(rounding_shift(a.i16[11], imm), 8);
dst.u8[12] = truncate(rounding_shift(a.i16[12], imm), 8);
dst.u8[13] = truncate(rounding_shift(a.i16[13], imm), 8);
dst.u8[14] = truncate(rounding_shift(a.i16[14], imm), 8);
dst.u8[15] = truncate(rounding_shift(a.i16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_sx_nc_b(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_nc_b ((v16i8) a, imm);

__m128i __msa2_n2x_rnd_sx_nc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_nc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.nc.d
Builtin: __builtin_msa2_n2x_rnd_sx_nc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:269

Description

Narrow i128 elements into u64 elements using a rounding shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = truncate(rounding_shift(a.i128[0], imm), 64);
dst.u64[1] = truncate(rounding_shift(a.i128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_rnd_sx_nc_d(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_nc_d ((v2i64) a, imm);

__m128i __msa2_n2x_rnd_sx_nc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_nc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.nc.h
Builtin: __builtin_msa2_n2x_rnd_sx_nc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:263

Description

Narrow i32 elements into u16 elements using a rounding shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = truncate(rounding_shift(a.i32[0], imm), 16);
dst.u16[1] = truncate(rounding_shift(a.i32[1], imm), 16);
dst.u16[2] = truncate(rounding_shift(a.i32[2], imm), 16);
dst.u16[3] = truncate(rounding_shift(a.i32[3], imm), 16);
dst.u16[4] = truncate(rounding_shift(a.i32[4], imm), 16);
dst.u16[5] = truncate(rounding_shift(a.i32[5], imm), 16);
dst.u16[6] = truncate(rounding_shift(a.i32[6], imm), 16);
dst.u16[7] = truncate(rounding_shift(a.i32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_sx_nc_h(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_nc_h ((v8i16) a, imm);

__m128i __msa2_n2x_rnd_sx_nc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_nc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.nc.w
Builtin: __builtin_msa2_n2x_rnd_sx_nc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:266

Description

Narrow i64 elements into u32 elements using a rounding shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = truncate(rounding_shift(a.i64[0], imm), 32);
dst.u32[1] = truncate(rounding_shift(a.i64[1], imm), 32);
dst.u32[2] = truncate(rounding_shift(a.i64[2], imm), 32);
dst.u32[3] = truncate(rounding_shift(a.i64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_sx_nc_w(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_nc_w ((v4i32) a, imm);

__m128i __msa2_n2x_rnd_sx_sc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_sc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.sc.b
Builtin: __builtin_msa2_n2x_rnd_sx_sc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:236

Description

Narrow i16 elements into u8 elements using a rounding shift by imm followed by a signed-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = signed_saturate(rounding_shift(a.i16[0], imm), 8);
dst.u8[1] = signed_saturate(rounding_shift(a.i16[1], imm), 8);
dst.u8[2] = signed_saturate(rounding_shift(a.i16[2], imm), 8);
dst.u8[3] = signed_saturate(rounding_shift(a.i16[3], imm), 8);
dst.u8[4] = signed_saturate(rounding_shift(a.i16[4], imm), 8);
dst.u8[5] = signed_saturate(rounding_shift(a.i16[5], imm), 8);
dst.u8[6] = signed_saturate(rounding_shift(a.i16[6], imm), 8);
dst.u8[7] = signed_saturate(rounding_shift(a.i16[7], imm), 8);
dst.u8[8] = signed_saturate(rounding_shift(a.i16[8], imm), 8);
dst.u8[9] = signed_saturate(rounding_shift(a.i16[9], imm), 8);
dst.u8[10] = signed_saturate(rounding_shift(a.i16[10], imm), 8);
dst.u8[11] = signed_saturate(rounding_shift(a.i16[11], imm), 8);
dst.u8[12] = signed_saturate(rounding_shift(a.i16[12], imm), 8);
dst.u8[13] = signed_saturate(rounding_shift(a.i16[13], imm), 8);
dst.u8[14] = signed_saturate(rounding_shift(a.i16[14], imm), 8);
dst.u8[15] = signed_saturate(rounding_shift(a.i16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_sx_sc_b(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_sc_b ((v16i8) a, imm);

__m128i __msa2_n2x_rnd_sx_sc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_sc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.sc.d
Builtin: __builtin_msa2_n2x_rnd_sx_sc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:245

Description

Narrow i128 elements into u64 elements using a rounding shift by imm followed by a signed-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = signed_saturate(rounding_shift(a.i128[0], imm), 64);
dst.u64[1] = signed_saturate(rounding_shift(a.i128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_rnd_sx_sc_d(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_sc_d ((v2i64) a, imm);

__m128i __msa2_n2x_rnd_sx_sc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_sc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.sc.h
Builtin: __builtin_msa2_n2x_rnd_sx_sc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:239

Description

Narrow i32 elements into u16 elements using a rounding shift by imm followed by a signed-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = signed_saturate(rounding_shift(a.i32[0], imm), 16);
dst.u16[1] = signed_saturate(rounding_shift(a.i32[1], imm), 16);
dst.u16[2] = signed_saturate(rounding_shift(a.i32[2], imm), 16);
dst.u16[3] = signed_saturate(rounding_shift(a.i32[3], imm), 16);
dst.u16[4] = signed_saturate(rounding_shift(a.i32[4], imm), 16);
dst.u16[5] = signed_saturate(rounding_shift(a.i32[5], imm), 16);
dst.u16[6] = signed_saturate(rounding_shift(a.i32[6], imm), 16);
dst.u16[7] = signed_saturate(rounding_shift(a.i32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_sx_sc_h(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_sc_h ((v8i16) a, imm);

__m128i __msa2_n2x_rnd_sx_sc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_sc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.sc.w
Builtin: __builtin_msa2_n2x_rnd_sx_sc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:242

Description

Narrow i64 elements into u32 elements using a rounding shift by imm followed by a signed-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = signed_saturate(rounding_shift(a.i64[0], imm), 32);
dst.u32[1] = signed_saturate(rounding_shift(a.i64[1], imm), 32);
dst.u32[2] = signed_saturate(rounding_shift(a.i64[2], imm), 32);
dst.u32[3] = signed_saturate(rounding_shift(a.i64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_sx_sc_w(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_sc_w ((v4i32) a, imm);

__m128i __msa2_n2x_rnd_sx_uc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_uc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.uc.b
Builtin: __builtin_msa2_n2x_rnd_sx_uc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:248

Description

Narrow i16 elements into u8 elements using a rounding shift by imm followed by an unsigned-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = unsigned_saturate(rounding_shift(a.i16[0], imm), 8);
dst.u8[1] = unsigned_saturate(rounding_shift(a.i16[1], imm), 8);
dst.u8[2] = unsigned_saturate(rounding_shift(a.i16[2], imm), 8);
dst.u8[3] = unsigned_saturate(rounding_shift(a.i16[3], imm), 8);
dst.u8[4] = unsigned_saturate(rounding_shift(a.i16[4], imm), 8);
dst.u8[5] = unsigned_saturate(rounding_shift(a.i16[5], imm), 8);
dst.u8[6] = unsigned_saturate(rounding_shift(a.i16[6], imm), 8);
dst.u8[7] = unsigned_saturate(rounding_shift(a.i16[7], imm), 8);
dst.u8[8] = unsigned_saturate(rounding_shift(a.i16[8], imm), 8);
dst.u8[9] = unsigned_saturate(rounding_shift(a.i16[9], imm), 8);
dst.u8[10] = unsigned_saturate(rounding_shift(a.i16[10], imm), 8);
dst.u8[11] = unsigned_saturate(rounding_shift(a.i16[11], imm), 8);
dst.u8[12] = unsigned_saturate(rounding_shift(a.i16[12], imm), 8);
dst.u8[13] = unsigned_saturate(rounding_shift(a.i16[13], imm), 8);
dst.u8[14] = unsigned_saturate(rounding_shift(a.i16[14], imm), 8);
dst.u8[15] = unsigned_saturate(rounding_shift(a.i16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_sx_uc_b(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_uc_b ((v16i8) a, imm);

__m128i __msa2_n2x_rnd_sx_uc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_uc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.uc.d
Builtin: __builtin_msa2_n2x_rnd_sx_uc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:257

Description

Narrow i128 elements into u64 elements using a rounding shift by imm followed by an unsigned-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = unsigned_saturate(rounding_shift(a.i128[0], imm), 64);
dst.u64[1] = unsigned_saturate(rounding_shift(a.i128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_rnd_sx_uc_d(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_uc_d ((v2i64) a, imm);

__m128i __msa2_n2x_rnd_sx_uc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_uc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.uc.h
Builtin: __builtin_msa2_n2x_rnd_sx_uc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:251

Description

Narrow i32 elements into u16 elements using a rounding shift by imm followed by an unsigned-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = unsigned_saturate(rounding_shift(a.i32[0], imm), 16);
dst.u16[1] = unsigned_saturate(rounding_shift(a.i32[1], imm), 16);
dst.u16[2] = unsigned_saturate(rounding_shift(a.i32[2], imm), 16);
dst.u16[3] = unsigned_saturate(rounding_shift(a.i32[3], imm), 16);
dst.u16[4] = unsigned_saturate(rounding_shift(a.i32[4], imm), 16);
dst.u16[5] = unsigned_saturate(rounding_shift(a.i32[5], imm), 16);
dst.u16[6] = unsigned_saturate(rounding_shift(a.i32[6], imm), 16);
dst.u16[7] = unsigned_saturate(rounding_shift(a.i32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_sx_uc_h(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_uc_h ((v8i16) a, imm);

__m128i __msa2_n2x_rnd_sx_uc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_sx_uc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.sx.uc.w
Builtin: __builtin_msa2_n2x_rnd_sx_uc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:254

Description

Narrow i64 elements into u32 elements using a rounding shift by imm followed by an unsigned-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = unsigned_saturate(rounding_shift(a.i64[0], imm), 32);
dst.u32[1] = unsigned_saturate(rounding_shift(a.i64[1], imm), 32);
dst.u32[2] = unsigned_saturate(rounding_shift(a.i64[2], imm), 32);
dst.u32[3] = unsigned_saturate(rounding_shift(a.i64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_sx_uc_w(a, imm) (__m128i) __builtin_msa2_n2x_rnd_sx_uc_w ((v4i32) a, imm);

__m128i __msa2_n2x_rnd_ux_nc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_nc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.nc.b
Builtin: __builtin_msa2_n2x_rnd_ux_nc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:296

Description

Narrow u16 elements into u8 elements using a rounding shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = truncate(rounding_shift(a.u16[0], imm), 8);
dst.u8[1] = truncate(rounding_shift(a.u16[1], imm), 8);
dst.u8[2] = truncate(rounding_shift(a.u16[2], imm), 8);
dst.u8[3] = truncate(rounding_shift(a.u16[3], imm), 8);
dst.u8[4] = truncate(rounding_shift(a.u16[4], imm), 8);
dst.u8[5] = truncate(rounding_shift(a.u16[5], imm), 8);
dst.u8[6] = truncate(rounding_shift(a.u16[6], imm), 8);
dst.u8[7] = truncate(rounding_shift(a.u16[7], imm), 8);
dst.u8[8] = truncate(rounding_shift(a.u16[8], imm), 8);
dst.u8[9] = truncate(rounding_shift(a.u16[9], imm), 8);
dst.u8[10] = truncate(rounding_shift(a.u16[10], imm), 8);
dst.u8[11] = truncate(rounding_shift(a.u16[11], imm), 8);
dst.u8[12] = truncate(rounding_shift(a.u16[12], imm), 8);
dst.u8[13] = truncate(rounding_shift(a.u16[13], imm), 8);
dst.u8[14] = truncate(rounding_shift(a.u16[14], imm), 8);
dst.u8[15] = truncate(rounding_shift(a.u16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_ux_nc_b(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_nc_b ((v16i8) a, imm);

__m128i __msa2_n2x_rnd_ux_nc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_nc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.nc.d
Builtin: __builtin_msa2_n2x_rnd_ux_nc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:305

Description

Narrow u128 elements into u64 elements using a rounding shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = truncate(rounding_shift(a.u128[0], imm), 64);
dst.u64[1] = truncate(rounding_shift(a.u128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_rnd_ux_nc_d(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_nc_d ((v2i64) a, imm);

__m128i __msa2_n2x_rnd_ux_nc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_nc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.nc.h
Builtin: __builtin_msa2_n2x_rnd_ux_nc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:299

Description

Narrow u32 elements into u16 elements using a rounding shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = truncate(rounding_shift(a.u32[0], imm), 16);
dst.u16[1] = truncate(rounding_shift(a.u32[1], imm), 16);
dst.u16[2] = truncate(rounding_shift(a.u32[2], imm), 16);
dst.u16[3] = truncate(rounding_shift(a.u32[3], imm), 16);
dst.u16[4] = truncate(rounding_shift(a.u32[4], imm), 16);
dst.u16[5] = truncate(rounding_shift(a.u32[5], imm), 16);
dst.u16[6] = truncate(rounding_shift(a.u32[6], imm), 16);
dst.u16[7] = truncate(rounding_shift(a.u32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_ux_nc_h(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_nc_h ((v8i16) a, imm);

__m128i __msa2_n2x_rnd_ux_nc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_nc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.nc.w
Builtin: __builtin_msa2_n2x_rnd_ux_nc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:302

Description

Narrow u64 elements into u32 elements using a rounding shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = truncate(rounding_shift(a.u64[0], imm), 32);
dst.u32[1] = truncate(rounding_shift(a.u64[1], imm), 32);
dst.u32[2] = truncate(rounding_shift(a.u64[2], imm), 32);
dst.u32[3] = truncate(rounding_shift(a.u64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_ux_nc_w(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_nc_w ((v4i32) a, imm);

__m128i __msa2_n2x_rnd_ux_sc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_sc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.sc.b
Builtin: __builtin_msa2_n2x_rnd_ux_sc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:272

Description

Narrow u16 elements into u8 elements using a rounding shift by imm followed by a signed-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = signed_saturate(rounding_shift(a.u16[0], imm), 8);
dst.u8[1] = signed_saturate(rounding_shift(a.u16[1], imm), 8);
dst.u8[2] = signed_saturate(rounding_shift(a.u16[2], imm), 8);
dst.u8[3] = signed_saturate(rounding_shift(a.u16[3], imm), 8);
dst.u8[4] = signed_saturate(rounding_shift(a.u16[4], imm), 8);
dst.u8[5] = signed_saturate(rounding_shift(a.u16[5], imm), 8);
dst.u8[6] = signed_saturate(rounding_shift(a.u16[6], imm), 8);
dst.u8[7] = signed_saturate(rounding_shift(a.u16[7], imm), 8);
dst.u8[8] = signed_saturate(rounding_shift(a.u16[8], imm), 8);
dst.u8[9] = signed_saturate(rounding_shift(a.u16[9], imm), 8);
dst.u8[10] = signed_saturate(rounding_shift(a.u16[10], imm), 8);
dst.u8[11] = signed_saturate(rounding_shift(a.u16[11], imm), 8);
dst.u8[12] = signed_saturate(rounding_shift(a.u16[12], imm), 8);
dst.u8[13] = signed_saturate(rounding_shift(a.u16[13], imm), 8);
dst.u8[14] = signed_saturate(rounding_shift(a.u16[14], imm), 8);
dst.u8[15] = signed_saturate(rounding_shift(a.u16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_ux_sc_b(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_sc_b ((v16i8) a, imm);

__m128i __msa2_n2x_rnd_ux_sc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_sc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.sc.d
Builtin: __builtin_msa2_n2x_rnd_ux_sc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:281

Description

Narrow u128 elements into u64 elements using a rounding shift by imm followed by a signed-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = signed_saturate(rounding_shift(a.u128[0], imm), 64);
dst.u64[1] = signed_saturate(rounding_shift(a.u128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_rnd_ux_sc_d(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_sc_d ((v2i64) a, imm);

__m128i __msa2_n2x_rnd_ux_sc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_sc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.sc.h
Builtin: __builtin_msa2_n2x_rnd_ux_sc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:275

Description

Narrow u32 elements into u16 elements using a rounding shift by imm followed by a signed-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = signed_saturate(rounding_shift(a.u32[0], imm), 16);
dst.u16[1] = signed_saturate(rounding_shift(a.u32[1], imm), 16);
dst.u16[2] = signed_saturate(rounding_shift(a.u32[2], imm), 16);
dst.u16[3] = signed_saturate(rounding_shift(a.u32[3], imm), 16);
dst.u16[4] = signed_saturate(rounding_shift(a.u32[4], imm), 16);
dst.u16[5] = signed_saturate(rounding_shift(a.u32[5], imm), 16);
dst.u16[6] = signed_saturate(rounding_shift(a.u32[6], imm), 16);
dst.u16[7] = signed_saturate(rounding_shift(a.u32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_ux_sc_h(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_sc_h ((v8i16) a, imm);

__m128i __msa2_n2x_rnd_ux_sc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_sc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.sc.w
Builtin: __builtin_msa2_n2x_rnd_ux_sc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:278

Description

Narrow u64 elements into u32 elements using a rounding shift by imm followed by a signed-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = signed_saturate(rounding_shift(a.u64[0], imm), 32);
dst.u32[1] = signed_saturate(rounding_shift(a.u64[1], imm), 32);
dst.u32[2] = signed_saturate(rounding_shift(a.u64[2], imm), 32);
dst.u32[3] = signed_saturate(rounding_shift(a.u64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_ux_sc_w(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_sc_w ((v4i32) a, imm);

__m128i __msa2_n2x_rnd_ux_uc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_uc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.uc.b
Builtin: __builtin_msa2_n2x_rnd_ux_uc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:284

Description

Narrow u16 elements into u8 elements using a rounding shift by imm followed by an unsigned-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = unsigned_saturate(rounding_shift(a.u16[0], imm), 8);
dst.u8[1] = unsigned_saturate(rounding_shift(a.u16[1], imm), 8);
dst.u8[2] = unsigned_saturate(rounding_shift(a.u16[2], imm), 8);
dst.u8[3] = unsigned_saturate(rounding_shift(a.u16[3], imm), 8);
dst.u8[4] = unsigned_saturate(rounding_shift(a.u16[4], imm), 8);
dst.u8[5] = unsigned_saturate(rounding_shift(a.u16[5], imm), 8);
dst.u8[6] = unsigned_saturate(rounding_shift(a.u16[6], imm), 8);
dst.u8[7] = unsigned_saturate(rounding_shift(a.u16[7], imm), 8);
dst.u8[8] = unsigned_saturate(rounding_shift(a.u16[8], imm), 8);
dst.u8[9] = unsigned_saturate(rounding_shift(a.u16[9], imm), 8);
dst.u8[10] = unsigned_saturate(rounding_shift(a.u16[10], imm), 8);
dst.u8[11] = unsigned_saturate(rounding_shift(a.u16[11], imm), 8);
dst.u8[12] = unsigned_saturate(rounding_shift(a.u16[12], imm), 8);
dst.u8[13] = unsigned_saturate(rounding_shift(a.u16[13], imm), 8);
dst.u8[14] = unsigned_saturate(rounding_shift(a.u16[14], imm), 8);
dst.u8[15] = unsigned_saturate(rounding_shift(a.u16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_ux_uc_b(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_uc_b ((v16i8) a, imm);

__m128i __msa2_n2x_rnd_ux_uc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_uc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.uc.d
Builtin: __builtin_msa2_n2x_rnd_ux_uc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:293

Description

Narrow u128 elements into u64 elements using a rounding shift by imm followed by an unsigned-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = unsigned_saturate(rounding_shift(a.u128[0], imm), 64);
dst.u64[1] = unsigned_saturate(rounding_shift(a.u128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_rnd_ux_uc_d(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_uc_d ((v2i64) a, imm);

__m128i __msa2_n2x_rnd_ux_uc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_uc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.uc.h
Builtin: __builtin_msa2_n2x_rnd_ux_uc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:287

Description

Narrow u32 elements into u16 elements using a rounding shift by imm followed by an unsigned-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = unsigned_saturate(rounding_shift(a.u32[0], imm), 16);
dst.u16[1] = unsigned_saturate(rounding_shift(a.u32[1], imm), 16);
dst.u16[2] = unsigned_saturate(rounding_shift(a.u32[2], imm), 16);
dst.u16[3] = unsigned_saturate(rounding_shift(a.u32[3], imm), 16);
dst.u16[4] = unsigned_saturate(rounding_shift(a.u32[4], imm), 16);
dst.u16[5] = unsigned_saturate(rounding_shift(a.u32[5], imm), 16);
dst.u16[6] = unsigned_saturate(rounding_shift(a.u32[6], imm), 16);
dst.u16[7] = unsigned_saturate(rounding_shift(a.u32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_ux_uc_h(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_uc_h ((v8i16) a, imm);

__m128i __msa2_n2x_rnd_ux_uc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_rnd_ux_uc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.rnd.ux.uc.w
Builtin: __builtin_msa2_n2x_rnd_ux_uc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:290

Description

Narrow u64 elements into u32 elements using a rounding shift by imm followed by an unsigned-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = unsigned_saturate(rounding_shift(a.u64[0], imm), 32);
dst.u32[1] = unsigned_saturate(rounding_shift(a.u64[1], imm), 32);
dst.u32[2] = unsigned_saturate(rounding_shift(a.u64[2], imm), 32);
dst.u32[3] = unsigned_saturate(rounding_shift(a.u64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_rnd_ux_uc_w(a, imm) (__m128i) __builtin_msa2_n2x_rnd_ux_uc_w ((v4i32) a, imm);

__m128i __msa2_n2x_sx_nc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_nc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.nc.b
Builtin: __builtin_msa2_n2x_sx_nc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:188

Description

Narrow i16 elements into u8 elements using a shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = truncate(shift(a.i16[0], imm), 8);
dst.u8[1] = truncate(shift(a.i16[1], imm), 8);
dst.u8[2] = truncate(shift(a.i16[2], imm), 8);
dst.u8[3] = truncate(shift(a.i16[3], imm), 8);
dst.u8[4] = truncate(shift(a.i16[4], imm), 8);
dst.u8[5] = truncate(shift(a.i16[5], imm), 8);
dst.u8[6] = truncate(shift(a.i16[6], imm), 8);
dst.u8[7] = truncate(shift(a.i16[7], imm), 8);
dst.u8[8] = truncate(shift(a.i16[8], imm), 8);
dst.u8[9] = truncate(shift(a.i16[9], imm), 8);
dst.u8[10] = truncate(shift(a.i16[10], imm), 8);
dst.u8[11] = truncate(shift(a.i16[11], imm), 8);
dst.u8[12] = truncate(shift(a.i16[12], imm), 8);
dst.u8[13] = truncate(shift(a.i16[13], imm), 8);
dst.u8[14] = truncate(shift(a.i16[14], imm), 8);
dst.u8[15] = truncate(shift(a.i16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_sx_nc_b(a, imm) (__m128i) __builtin_msa2_n2x_sx_nc_b ((v16i8) a, imm);

__m128i __msa2_n2x_sx_nc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_nc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.nc.d
Builtin: __builtin_msa2_n2x_sx_nc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:197

Description

Narrow i128 elements into u64 elements using a shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = truncate(shift(a.i128[0], imm), 64);
dst.u64[1] = truncate(shift(a.i128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_sx_nc_d(a, imm) (__m128i) __builtin_msa2_n2x_sx_nc_d ((v2i64) a, imm);

__m128i __msa2_n2x_sx_nc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_nc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.nc.h
Builtin: __builtin_msa2_n2x_sx_nc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:191

Description

Narrow i32 elements into u16 elements using a shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = truncate(shift(a.i32[0], imm), 16);
dst.u16[1] = truncate(shift(a.i32[1], imm), 16);
dst.u16[2] = truncate(shift(a.i32[2], imm), 16);
dst.u16[3] = truncate(shift(a.i32[3], imm), 16);
dst.u16[4] = truncate(shift(a.i32[4], imm), 16);
dst.u16[5] = truncate(shift(a.i32[5], imm), 16);
dst.u16[6] = truncate(shift(a.i32[6], imm), 16);
dst.u16[7] = truncate(shift(a.i32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_sx_nc_h(a, imm) (__m128i) __builtin_msa2_n2x_sx_nc_h ((v8i16) a, imm);

__m128i __msa2_n2x_sx_nc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_nc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.nc.w
Builtin: __builtin_msa2_n2x_sx_nc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:194

Description

Narrow i64 elements into u32 elements using a shift by imm followed by a truncating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = truncate(shift(a.i64[0], imm), 32);
dst.u32[1] = truncate(shift(a.i64[1], imm), 32);
dst.u32[2] = truncate(shift(a.i64[2], imm), 32);
dst.u32[3] = truncate(shift(a.i64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_sx_nc_w(a, imm) (__m128i) __builtin_msa2_n2x_sx_nc_w ((v4i32) a, imm);

__m128i __msa2_n2x_sx_sc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_sc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.sc.b
Builtin: __builtin_msa2_n2x_sx_sc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:164

Description

Narrow i16 elements into u8 elements using a shift by imm followed by a signed-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = signed_saturate(shift(a.i16[0], imm), 8);
dst.u8[1] = signed_saturate(shift(a.i16[1], imm), 8);
dst.u8[2] = signed_saturate(shift(a.i16[2], imm), 8);
dst.u8[3] = signed_saturate(shift(a.i16[3], imm), 8);
dst.u8[4] = signed_saturate(shift(a.i16[4], imm), 8);
dst.u8[5] = signed_saturate(shift(a.i16[5], imm), 8);
dst.u8[6] = signed_saturate(shift(a.i16[6], imm), 8);
dst.u8[7] = signed_saturate(shift(a.i16[7], imm), 8);
dst.u8[8] = signed_saturate(shift(a.i16[8], imm), 8);
dst.u8[9] = signed_saturate(shift(a.i16[9], imm), 8);
dst.u8[10] = signed_saturate(shift(a.i16[10], imm), 8);
dst.u8[11] = signed_saturate(shift(a.i16[11], imm), 8);
dst.u8[12] = signed_saturate(shift(a.i16[12], imm), 8);
dst.u8[13] = signed_saturate(shift(a.i16[13], imm), 8);
dst.u8[14] = signed_saturate(shift(a.i16[14], imm), 8);
dst.u8[15] = signed_saturate(shift(a.i16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_sx_sc_b(a, imm) (__m128i) __builtin_msa2_n2x_sx_sc_b ((v16i8) a, imm);

__m128i __msa2_n2x_sx_sc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_sc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.sc.d
Builtin: __builtin_msa2_n2x_sx_sc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:173

Description

Narrow i128 elements into u64 elements using a shift by imm followed by a signed-saturating conversion. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = signed_saturate(shift(a.i128[0], imm), 64);
dst.u64[1] = signed_saturate(shift(a.i128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_sx_sc_d(a, imm) (__m128i) __builtin_msa2_n2x_sx_sc_d ((v2i64) a, imm);

__m128i __msa2_n2x_sx_sc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_sc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.sc.h
Builtin: __builtin_msa2_n2x_sx_sc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:167

Description

Narrow i32 elements into 16-bit elements: each element is shifted by imm and then converted with signed saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = signed_saturate(shift(a.i32[0], imm), 16);
dst.u16[1] = signed_saturate(shift(a.i32[1], imm), 16);
dst.u16[2] = signed_saturate(shift(a.i32[2], imm), 16);
dst.u16[3] = signed_saturate(shift(a.i32[3], imm), 16);
dst.u16[4] = signed_saturate(shift(a.i32[4], imm), 16);
dst.u16[5] = signed_saturate(shift(a.i32[5], imm), 16);
dst.u16[6] = signed_saturate(shift(a.i32[6], imm), 16);
dst.u16[7] = signed_saturate(shift(a.i32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_sx_sc_h(a, imm) (__m128i) __builtin_msa2_n2x_sx_sc_h ((v8i16) a, imm);

__m128i __msa2_n2x_sx_sc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_sc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.sc.w
Builtin: __builtin_msa2_n2x_sx_sc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:170

Description

Narrow i64 elements into 32-bit elements: each element is shifted by imm and then converted with signed saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = signed_saturate(shift(a.i64[0], imm), 32);
dst.u32[1] = signed_saturate(shift(a.i64[1], imm), 32);
dst.u32[2] = signed_saturate(shift(a.i64[2], imm), 32);
dst.u32[3] = signed_saturate(shift(a.i64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_sx_sc_w(a, imm) (__m128i) __builtin_msa2_n2x_sx_sc_w ((v4i32) a, imm);

__m128i __msa2_n2x_sx_uc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_uc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.uc.b
Builtin: __builtin_msa2_n2x_sx_uc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:176

Description

Narrow i16 elements into u8 elements: each element is shifted by imm and then converted with unsigned saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = unsigned_saturate(shift(a.i16[0], imm), 8);
dst.u8[1] = unsigned_saturate(shift(a.i16[1], imm), 8);
dst.u8[2] = unsigned_saturate(shift(a.i16[2], imm), 8);
dst.u8[3] = unsigned_saturate(shift(a.i16[3], imm), 8);
dst.u8[4] = unsigned_saturate(shift(a.i16[4], imm), 8);
dst.u8[5] = unsigned_saturate(shift(a.i16[5], imm), 8);
dst.u8[6] = unsigned_saturate(shift(a.i16[6], imm), 8);
dst.u8[7] = unsigned_saturate(shift(a.i16[7], imm), 8);
dst.u8[8] = unsigned_saturate(shift(a.i16[8], imm), 8);
dst.u8[9] = unsigned_saturate(shift(a.i16[9], imm), 8);
dst.u8[10] = unsigned_saturate(shift(a.i16[10], imm), 8);
dst.u8[11] = unsigned_saturate(shift(a.i16[11], imm), 8);
dst.u8[12] = unsigned_saturate(shift(a.i16[12], imm), 8);
dst.u8[13] = unsigned_saturate(shift(a.i16[13], imm), 8);
dst.u8[14] = unsigned_saturate(shift(a.i16[14], imm), 8);
dst.u8[15] = unsigned_saturate(shift(a.i16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_sx_uc_b(a, imm) (__m128i) __builtin_msa2_n2x_sx_uc_b ((v16i8) a, imm);

__m128i __msa2_n2x_sx_uc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_uc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.uc.d
Builtin: __builtin_msa2_n2x_sx_uc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:185

Description

Narrow i128 elements into u64 elements: each element is shifted by imm and then converted with unsigned saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = unsigned_saturate(shift(a.i128[0], imm), 64);
dst.u64[1] = unsigned_saturate(shift(a.i128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_sx_uc_d(a, imm) (__m128i) __builtin_msa2_n2x_sx_uc_d ((v2i64) a, imm);

__m128i __msa2_n2x_sx_uc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_uc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.uc.h
Builtin: __builtin_msa2_n2x_sx_uc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:179

Description

Narrow i32 elements into u16 elements: each element is shifted by imm and then converted with unsigned saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = unsigned_saturate(shift(a.i32[0], imm), 16);
dst.u16[1] = unsigned_saturate(shift(a.i32[1], imm), 16);
dst.u16[2] = unsigned_saturate(shift(a.i32[2], imm), 16);
dst.u16[3] = unsigned_saturate(shift(a.i32[3], imm), 16);
dst.u16[4] = unsigned_saturate(shift(a.i32[4], imm), 16);
dst.u16[5] = unsigned_saturate(shift(a.i32[5], imm), 16);
dst.u16[6] = unsigned_saturate(shift(a.i32[6], imm), 16);
dst.u16[7] = unsigned_saturate(shift(a.i32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_sx_uc_h(a, imm) (__m128i) __builtin_msa2_n2x_sx_uc_h ((v8i16) a, imm);

__m128i __msa2_n2x_sx_uc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_sx_uc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.sx.uc.w
Builtin: __builtin_msa2_n2x_sx_uc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:182

Description

Narrow i64 elements into u32 elements: each element is shifted by imm and then converted with unsigned saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = unsigned_saturate(shift(a.i64[0], imm), 32);
dst.u32[1] = unsigned_saturate(shift(a.i64[1], imm), 32);
dst.u32[2] = unsigned_saturate(shift(a.i64[2], imm), 32);
dst.u32[3] = unsigned_saturate(shift(a.i64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_sx_uc_w(a, imm) (__m128i) __builtin_msa2_n2x_sx_uc_w ((v4i32) a, imm);

__m128i __msa2_n2x_ux_nc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_nc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.nc.b
Builtin: __builtin_msa2_n2x_ux_nc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:224

Description

Narrow u16 elements into u8 elements: each element is shifted by imm and then truncated to 8 bits without saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = truncate(shift(a.u16[0], imm), 8);
dst.u8[1] = truncate(shift(a.u16[1], imm), 8);
dst.u8[2] = truncate(shift(a.u16[2], imm), 8);
dst.u8[3] = truncate(shift(a.u16[3], imm), 8);
dst.u8[4] = truncate(shift(a.u16[4], imm), 8);
dst.u8[5] = truncate(shift(a.u16[5], imm), 8);
dst.u8[6] = truncate(shift(a.u16[6], imm), 8);
dst.u8[7] = truncate(shift(a.u16[7], imm), 8);
dst.u8[8] = truncate(shift(a.u16[8], imm), 8);
dst.u8[9] = truncate(shift(a.u16[9], imm), 8);
dst.u8[10] = truncate(shift(a.u16[10], imm), 8);
dst.u8[11] = truncate(shift(a.u16[11], imm), 8);
dst.u8[12] = truncate(shift(a.u16[12], imm), 8);
dst.u8[13] = truncate(shift(a.u16[13], imm), 8);
dst.u8[14] = truncate(shift(a.u16[14], imm), 8);
dst.u8[15] = truncate(shift(a.u16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_ux_nc_b(a, imm) (__m128i) __builtin_msa2_n2x_ux_nc_b ((v16i8) a, imm);

__m128i __msa2_n2x_ux_nc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_nc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.nc.d
Builtin: __builtin_msa2_n2x_ux_nc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:233

Description

Narrow u128 elements into u64 elements: each element is shifted by imm and then truncated to 64 bits without saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = truncate(shift(a.u128[0], imm), 64);
dst.u64[1] = truncate(shift(a.u128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_ux_nc_d(a, imm) (__m128i) __builtin_msa2_n2x_ux_nc_d ((v2i64) a, imm);

__m128i __msa2_n2x_ux_nc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_nc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.nc.h
Builtin: __builtin_msa2_n2x_ux_nc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:227

Description

Narrow u32 elements into u16 elements: each element is shifted by imm and then truncated to 16 bits without saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = truncate(shift(a.u32[0], imm), 16);
dst.u16[1] = truncate(shift(a.u32[1], imm), 16);
dst.u16[2] = truncate(shift(a.u32[2], imm), 16);
dst.u16[3] = truncate(shift(a.u32[3], imm), 16);
dst.u16[4] = truncate(shift(a.u32[4], imm), 16);
dst.u16[5] = truncate(shift(a.u32[5], imm), 16);
dst.u16[6] = truncate(shift(a.u32[6], imm), 16);
dst.u16[7] = truncate(shift(a.u32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_ux_nc_h(a, imm) (__m128i) __builtin_msa2_n2x_ux_nc_h ((v8i16) a, imm);

__m128i __msa2_n2x_ux_nc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_nc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.nc.w
Builtin: __builtin_msa2_n2x_ux_nc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:230

Description

Narrow u64 elements into u32 elements: each element is shifted by imm and then truncated to 32 bits without saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = truncate(shift(a.u64[0], imm), 32);
dst.u32[1] = truncate(shift(a.u64[1], imm), 32);
dst.u32[2] = truncate(shift(a.u64[2], imm), 32);
dst.u32[3] = truncate(shift(a.u64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_ux_nc_w(a, imm) (__m128i) __builtin_msa2_n2x_ux_nc_w ((v4i32) a, imm);

__m128i __msa2_n2x_ux_sc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_sc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.sc.b
Builtin: __builtin_msa2_n2x_ux_sc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:200

Description

Narrow u16 elements into 8-bit elements: each element is shifted by imm and then converted with signed saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = signed_saturate(shift(a.u16[0], imm), 8);
dst.u8[1] = signed_saturate(shift(a.u16[1], imm), 8);
dst.u8[2] = signed_saturate(shift(a.u16[2], imm), 8);
dst.u8[3] = signed_saturate(shift(a.u16[3], imm), 8);
dst.u8[4] = signed_saturate(shift(a.u16[4], imm), 8);
dst.u8[5] = signed_saturate(shift(a.u16[5], imm), 8);
dst.u8[6] = signed_saturate(shift(a.u16[6], imm), 8);
dst.u8[7] = signed_saturate(shift(a.u16[7], imm), 8);
dst.u8[8] = signed_saturate(shift(a.u16[8], imm), 8);
dst.u8[9] = signed_saturate(shift(a.u16[9], imm), 8);
dst.u8[10] = signed_saturate(shift(a.u16[10], imm), 8);
dst.u8[11] = signed_saturate(shift(a.u16[11], imm), 8);
dst.u8[12] = signed_saturate(shift(a.u16[12], imm), 8);
dst.u8[13] = signed_saturate(shift(a.u16[13], imm), 8);
dst.u8[14] = signed_saturate(shift(a.u16[14], imm), 8);
dst.u8[15] = signed_saturate(shift(a.u16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_ux_sc_b(a, imm) (__m128i) __builtin_msa2_n2x_ux_sc_b ((v16i8) a, imm);

__m128i __msa2_n2x_ux_sc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_sc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.sc.d
Builtin: __builtin_msa2_n2x_ux_sc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:209

Description

Narrow u128 elements into 64-bit elements: each element is shifted by imm and then converted with signed saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = signed_saturate(shift(a.u128[0], imm), 64);
dst.u64[1] = signed_saturate(shift(a.u128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_ux_sc_d(a, imm) (__m128i) __builtin_msa2_n2x_ux_sc_d ((v2i64) a, imm);

__m128i __msa2_n2x_ux_sc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_sc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.sc.h
Builtin: __builtin_msa2_n2x_ux_sc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:203

Description

Narrow u32 elements into 16-bit elements: each element is shifted by imm and then converted with signed saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = signed_saturate(shift(a.u32[0], imm), 16);
dst.u16[1] = signed_saturate(shift(a.u32[1], imm), 16);
dst.u16[2] = signed_saturate(shift(a.u32[2], imm), 16);
dst.u16[3] = signed_saturate(shift(a.u32[3], imm), 16);
dst.u16[4] = signed_saturate(shift(a.u32[4], imm), 16);
dst.u16[5] = signed_saturate(shift(a.u32[5], imm), 16);
dst.u16[6] = signed_saturate(shift(a.u32[6], imm), 16);
dst.u16[7] = signed_saturate(shift(a.u32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_ux_sc_h(a, imm) (__m128i) __builtin_msa2_n2x_ux_sc_h ((v8i16) a, imm);

__m128i __msa2_n2x_ux_sc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_sc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.sc.w
Builtin: __builtin_msa2_n2x_ux_sc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:206

Description

Narrow u64 elements into 32-bit elements: each element is shifted by imm and then converted with signed saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = signed_saturate(shift(a.u64[0], imm), 32);
dst.u32[1] = signed_saturate(shift(a.u64[1], imm), 32);
dst.u32[2] = signed_saturate(shift(a.u64[2], imm), 32);
dst.u32[3] = signed_saturate(shift(a.u64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_ux_sc_w(a, imm) (__m128i) __builtin_msa2_n2x_ux_sc_w ((v4i32) a, imm);

__m128i __msa2_n2x_ux_uc_b (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_uc_b (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.uc.b
Builtin: __builtin_msa2_n2x_ux_uc_b
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:212

Description

Narrow u16 elements into u8 elements: each element is shifted by imm and then converted with unsigned saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u8[0] = unsigned_saturate(shift(a.u16[0], imm), 8);
dst.u8[1] = unsigned_saturate(shift(a.u16[1], imm), 8);
dst.u8[2] = unsigned_saturate(shift(a.u16[2], imm), 8);
dst.u8[3] = unsigned_saturate(shift(a.u16[3], imm), 8);
dst.u8[4] = unsigned_saturate(shift(a.u16[4], imm), 8);
dst.u8[5] = unsigned_saturate(shift(a.u16[5], imm), 8);
dst.u8[6] = unsigned_saturate(shift(a.u16[6], imm), 8);
dst.u8[7] = unsigned_saturate(shift(a.u16[7], imm), 8);
dst.u8[8] = unsigned_saturate(shift(a.u16[8], imm), 8);
dst.u8[9] = unsigned_saturate(shift(a.u16[9], imm), 8);
dst.u8[10] = unsigned_saturate(shift(a.u16[10], imm), 8);
dst.u8[11] = unsigned_saturate(shift(a.u16[11], imm), 8);
dst.u8[12] = unsigned_saturate(shift(a.u16[12], imm), 8);
dst.u8[13] = unsigned_saturate(shift(a.u16[13], imm), 8);
dst.u8[14] = unsigned_saturate(shift(a.u16[14], imm), 8);
dst.u8[15] = unsigned_saturate(shift(a.u16[15], imm), 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_ux_uc_b(a, imm) (__m128i) __builtin_msa2_n2x_ux_uc_b ((v16i8) a, imm);

__m128i __msa2_n2x_ux_uc_d (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_uc_d (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.uc.d
Builtin: __builtin_msa2_n2x_ux_uc_d
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:221

Description

Narrow u128 elements into u64 elements: each element is shifted by imm and then converted with unsigned saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u64[0] = unsigned_saturate(shift(a.u128[0], imm), 64);
dst.u64[1] = unsigned_saturate(shift(a.u128[1], imm), 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

#define __msa2_n2x_ux_uc_d(a, imm) (__m128i) __builtin_msa2_n2x_ux_uc_d ((v2i64) a, imm);

__m128i __msa2_n2x_ux_uc_h (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_uc_h (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.uc.h
Builtin: __builtin_msa2_n2x_ux_uc_h
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:215

Description

Narrow u32 elements into u16 elements: each element is shifted by imm and then converted with unsigned saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u16[0] = unsigned_saturate(shift(a.u32[0], imm), 16);
dst.u16[1] = unsigned_saturate(shift(a.u32[1], imm), 16);
dst.u16[2] = unsigned_saturate(shift(a.u32[2], imm), 16);
dst.u16[3] = unsigned_saturate(shift(a.u32[3], imm), 16);
dst.u16[4] = unsigned_saturate(shift(a.u32[4], imm), 16);
dst.u16[5] = unsigned_saturate(shift(a.u32[5], imm), 16);
dst.u16[6] = unsigned_saturate(shift(a.u32[6], imm), 16);
dst.u16[7] = unsigned_saturate(shift(a.u32[7], imm), 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_ux_uc_h(a, imm) (__m128i) __builtin_msa2_n2x_ux_uc_h ((v8i16) a, imm);

__m128i __msa2_n2x_ux_uc_w (__m128i a, int imm)

Synopsis

__m128i __msa2_n2x_ux_uc_w (__m128i a, int imm)
#include <msa2.h>
Instruction: n2x.ux.uc.w
Builtin: __builtin_msa2_n2x_ux_uc_w
CPU Flags: __mips_msa
Kind: macro
Source: include/msa2.h:218

Description

Narrow u64 elements into u32 elements: each element is shifted by imm and then converted with unsigned saturation. This is the usual final step after wider intermediate arithmetic.

Operation

dst.u32[0] = unsigned_saturate(shift(a.u64[0], imm), 32);
dst.u32[1] = unsigned_saturate(shift(a.u64[1], imm), 32);
dst.u32[2] = unsigned_saturate(shift(a.u64[2], imm), 32);
dst.u32[3] = unsigned_saturate(shift(a.u64[3], imm), 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Header Mapping

#define __msa2_n2x_ux_uc_w(a, imm) (__m128i) __builtin_msa2_n2x_ux_uc_w ((v4i32) a, imm);

__m128i __msa2_vperm_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vperm_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vperm.b
Builtin: __builtin_msa2_vperm_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:392

Description

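Build each result byte from the matching selector byte in c. If selector bit 0x40 is set, the result byte is 0. Otherwise selector bit 0x10 chooses the source vector (set: a, clear: b) and the low four selector bits give the index of the byte to copy.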

Operation

dst.u8[0] = (c.u8[0] & 0x40) ? 0 : ((c.u8[0] & 0x10) ? a.u8[(c.u8[0] & 15)] : b.u8[(c.u8[0] & 15)]);
dst.u8[1] = (c.u8[1] & 0x40) ? 0 : ((c.u8[1] & 0x10) ? a.u8[(c.u8[1] & 15)] : b.u8[(c.u8[1] & 15)]);
dst.u8[2] = (c.u8[2] & 0x40) ? 0 : ((c.u8[2] & 0x10) ? a.u8[(c.u8[2] & 15)] : b.u8[(c.u8[2] & 15)]);
dst.u8[3] = (c.u8[3] & 0x40) ? 0 : ((c.u8[3] & 0x10) ? a.u8[(c.u8[3] & 15)] : b.u8[(c.u8[3] & 15)]);
dst.u8[4] = (c.u8[4] & 0x40) ? 0 : ((c.u8[4] & 0x10) ? a.u8[(c.u8[4] & 15)] : b.u8[(c.u8[4] & 15)]);
dst.u8[5] = (c.u8[5] & 0x40) ? 0 : ((c.u8[5] & 0x10) ? a.u8[(c.u8[5] & 15)] : b.u8[(c.u8[5] & 15)]);
dst.u8[6] = (c.u8[6] & 0x40) ? 0 : ((c.u8[6] & 0x10) ? a.u8[(c.u8[6] & 15)] : b.u8[(c.u8[6] & 15)]);
dst.u8[7] = (c.u8[7] & 0x40) ? 0 : ((c.u8[7] & 0x10) ? a.u8[(c.u8[7] & 15)] : b.u8[(c.u8[7] & 15)]);
dst.u8[8] = (c.u8[8] & 0x40) ? 0 : ((c.u8[8] & 0x10) ? a.u8[(c.u8[8] & 15)] : b.u8[(c.u8[8] & 15)]);
dst.u8[9] = (c.u8[9] & 0x40) ? 0 : ((c.u8[9] & 0x10) ? a.u8[(c.u8[9] & 15)] : b.u8[(c.u8[9] & 15)]);
dst.u8[10] = (c.u8[10] & 0x40) ? 0 : ((c.u8[10] & 0x10) ? a.u8[(c.u8[10] & 15)] : b.u8[(c.u8[10] & 15)]);
dst.u8[11] = (c.u8[11] & 0x40) ? 0 : ((c.u8[11] & 0x10) ? a.u8[(c.u8[11] & 15)] : b.u8[(c.u8[11] & 15)]);
dst.u8[12] = (c.u8[12] & 0x40) ? 0 : ((c.u8[12] & 0x10) ? a.u8[(c.u8[12] & 15)] : b.u8[(c.u8[12] & 15)]);
dst.u8[13] = (c.u8[13] & 0x40) ? 0 : ((c.u8[13] & 0x10) ? a.u8[(c.u8[13] & 15)] : b.u8[(c.u8[13] & 15)]);
dst.u8[14] = (c.u8[14] & 0x40) ? 0 : ((c.u8[14] & 0x10) ? a.u8[(c.u8[14] & 15)] : b.u8[(c.u8[14] & 15)]);
dst.u8[15] = (c.u8[15] & 0x40) ? 0 : ((c.u8[15] & 0x10) ? a.u8[(c.u8[15] & 15)] : b.u8[(c.u8[15] & 15)]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i) __builtin_msa2_vperm_b ((v16i8) a, (v16i8) b, (v16i8) c);

__m128i __msa2_w2x_hi_s_b (__m128i a)

Synopsis

__m128i __msa2_w2x_hi_s_b (__m128i a)
#include <msa2.h>
Instruction: w2x.hi.s.b
Builtin: __builtin_msa2_w2x_hi_s_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:428

Description

Take the upper-half i8 lanes of a, sign-extend each one into an i16 lane, and write the widened vector. This prepares narrow data for wider arithmetic without losing sign information.

Operation

dst.i16[0] = widen(a.i8[8]);
dst.i16[1] = widen(a.i8[9]);
dst.i16[2] = widen(a.i8[10]);
dst.i16[3] = widen(a.i8[11]);
dst.i16[4] = widen(a.i8[12]);
dst.i16[5] = widen(a.i8[13]);
dst.i16[6] = widen(a.i8[14]);
dst.i16[7] = widen(a.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i) __builtin_msa2_w2x_hi_s_b ((v16i8) a);

__m128i __msa2_w2x_hi_s_d (__m128i a)

Synopsis

__m128i __msa2_w2x_hi_s_d (__m128i a)
#include <msa2.h>
Instruction: w2x.hi.s.d
Builtin: __builtin_msa2_w2x_hi_s_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:446

Description

Take the upper-half i64 lane of a, sign-extend it into an i128 lane, and write the widened vector. This prepares narrow data for wider arithmetic without losing sign information.

Operation

dst.i128[0] = widen(a.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i) __builtin_msa2_w2x_hi_s_d ((v2i64) a);

__m128i __msa2_w2x_hi_s_h (__m128i a)

Synopsis

__m128i __msa2_w2x_hi_s_h (__m128i a)
#include <msa2.h>
Instruction: w2x.hi.s.h
Builtin: __builtin_msa2_w2x_hi_s_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:434

Description

Take the upper-half i16 lanes of a, sign-extend each one into an i32 lane, and write the widened vector. This prepares narrow data for wider arithmetic without losing sign information.

Operation

dst.i32[0] = widen(a.i16[4]);
dst.i32[1] = widen(a.i16[5]);
dst.i32[2] = widen(a.i16[6]);
dst.i32[3] = widen(a.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i) __builtin_msa2_w2x_hi_s_h ((v8i16) a);

__m128i __msa2_w2x_hi_s_w (__m128i a)

Synopsis

__m128i __msa2_w2x_hi_s_w (__m128i a)
#include <msa2.h>
Instruction: w2x.hi.s.w
Builtin: __builtin_msa2_w2x_hi_s_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:440

Description

Take the upper-half i32 lanes of a, sign-extend each one into an i64 lane, and write the widened vector. This prepares narrow data for wider arithmetic without losing sign information.

Operation

dst.i64[0] = widen(a.i32[2]);
dst.i64[1] = widen(a.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i) __builtin_msa2_w2x_hi_s_w ((v4i32) a);

__m128i __msa2_w2x_lo_s_b (__m128i a)

Synopsis

__m128i __msa2_w2x_lo_s_b (__m128i a)
#include <msa2.h>
Instruction: w2x.lo.s.b
Builtin: __builtin_msa2_w2x_lo_s_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:404

Description

Take the lower-half i8 lanes of a, sign-extend each one into an i16 lane, and write the widened vector. This prepares narrow data for wider arithmetic without losing sign information.

Operation

dst.i16[0] = widen(a.i8[0]);
dst.i16[1] = widen(a.i8[1]);
dst.i16[2] = widen(a.i8[2]);
dst.i16[3] = widen(a.i8[3]);
dst.i16[4] = widen(a.i8[4]);
dst.i16[5] = widen(a.i8[5]);
dst.i16[6] = widen(a.i8[6]);
dst.i16[7] = widen(a.i8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i) __builtin_msa2_w2x_lo_s_b ((v16i8) a);

__m128i __msa2_w2x_lo_s_d (__m128i a)

Synopsis

__m128i __msa2_w2x_lo_s_d (__m128i a)
#include <msa2.h>
Instruction: w2x.lo.s.d
Builtin: __builtin_msa2_w2x_lo_s_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:422

Description

Take the lower-half i64 lane of a, sign-extend it into an i128 lane, and write the widened vector. This prepares narrow data for wider arithmetic without losing sign information.

Operation

dst.i128[0] = widen(a.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i) __builtin_msa2_w2x_lo_s_d ((v2i64) a);

__m128i __msa2_w2x_lo_s_h (__m128i a)

Synopsis

__m128i __msa2_w2x_lo_s_h (__m128i a)
#include <msa2.h>
Instruction: w2x.lo.s.h
Builtin: __builtin_msa2_w2x_lo_s_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:410

Description

Take the lower-half i16 lanes of a, sign-extend each one into an i32 lane, and write the widened vector. This prepares narrow data for wider arithmetic without losing sign information.

Operation

dst.i32[0] = widen(a.i16[0]);
dst.i32[1] = widen(a.i16[1]);
dst.i32[2] = widen(a.i16[2]);
dst.i32[3] = widen(a.i16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i) __builtin_msa2_w2x_lo_s_h ((v8i16) a);

__m128i __msa2_w2x_lo_s_w (__m128i a)

Synopsis

__m128i __msa2_w2x_lo_s_w (__m128i a)
#include <msa2.h>
Instruction: w2x.lo.s.w
Builtin: __builtin_msa2_w2x_lo_s_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:416

Description

Take the lower-half i32 lanes of a, sign-extend each one into an i64 lane, and write the widened vector. This prepares narrow data for wider arithmetic without losing sign information.

Operation

dst.i64[0] = widen(a.i32[0]);
dst.i64[1] = widen(a.i32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i) __builtin_msa2_w2x_lo_s_w ((v4i32) a);