Permutation

Generated from include/loongson-mmiintrin.h. This page contains 27 intrinsics.

int8x8_t packsshb (int16x4_t a, int16x4_t b)

Synopsis

int8x8_t packsshb (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: packsshb
Builtin: __builtin_loongson_packsshb
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:63

Description

Pack wider i16 elements from the concatenated sources into narrower i8 elements with signed saturation. This is used before storing or continuing with narrower packed data.

Operation

dst.i8[0] = signed_saturate(concatenate_lanes(a, b).i16[0], bit_width(i8));
dst.i8[1] = signed_saturate(concatenate_lanes(a, b).i16[1], bit_width(i8));
dst.i8[2] = signed_saturate(concatenate_lanes(a, b).i16[2], bit_width(i8));
dst.i8[3] = signed_saturate(concatenate_lanes(a, b).i16[3], bit_width(i8));
dst.i8[4] = signed_saturate(concatenate_lanes(a, b).i16[4], bit_width(i8));
dst.i8[5] = signed_saturate(concatenate_lanes(a, b).i16[5], bit_width(i8));
dst.i8[6] = signed_saturate(concatenate_lanes(a, b).i16[6], bit_width(i8));
dst.i8[7] = signed_saturate(concatenate_lanes(a, b).i16[7], bit_width(i8));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_packsshb (a, b);

int16x4_t packsswh (int32x2_t a, int32x2_t b)

Synopsis

int16x4_t packsswh (int32x2_t a, int32x2_t b)
#include <loongson-mmiintrin.h>
Instruction: packsswh
Builtin: __builtin_loongson_packsswh
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:57

Description

Pack wider i32 elements from the concatenated sources into narrower i16 elements with signed saturation. This is used before storing or continuing with narrower packed data.

Operation

dst.i16[0] = signed_saturate(concatenate_lanes(a, b).i32[0], bit_width(i16));
dst.i16[1] = signed_saturate(concatenate_lanes(a, b).i32[1], bit_width(i16));
dst.i16[2] = signed_saturate(concatenate_lanes(a, b).i32[2], bit_width(i16));
dst.i16[3] = signed_saturate(concatenate_lanes(a, b).i32[3], bit_width(i16));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_packsswh (a, b);

uint8x8_t packushb (uint16x4_t a, uint16x4_t b)

Synopsis

uint8x8_t packushb (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: packushb
Builtin: __builtin_loongson_packushb
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:70

Description

Pack wider u16 elements from the concatenated sources into narrower u8 elements with unsigned saturation. This is used before storing or continuing with narrower packed data.

Operation

dst.u8[0] = unsigned_saturate(concatenate_lanes(a, b).u16[0], bit_width(u8));
dst.u8[1] = unsigned_saturate(concatenate_lanes(a, b).u16[1], bit_width(u8));
dst.u8[2] = unsigned_saturate(concatenate_lanes(a, b).u16[2], bit_width(u8));
dst.u8[3] = unsigned_saturate(concatenate_lanes(a, b).u16[3], bit_width(u8));
dst.u8[4] = unsigned_saturate(concatenate_lanes(a, b).u16[4], bit_width(u8));
dst.u8[5] = unsigned_saturate(concatenate_lanes(a, b).u16[5], bit_width(u8));
dst.u8[6] = unsigned_saturate(concatenate_lanes(a, b).u16[6], bit_width(u8));
dst.u8[7] = unsigned_saturate(concatenate_lanes(a, b).u16[7], bit_width(u8));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_packushb (a, b);

int16x4_t pextrh_s (int16x4_t a, int field)

Synopsis

int16x4_t pextrh_s (int16x4_t a, int field)
#include <loongson-mmiintrin.h>
Instruction: pextrh.s
Builtin: __builtin_loongson_pextrh_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:295

Description

Extract the 16-bit lane of a selected by field, useful at vector/scalar boundaries. Note that the declared return type is int16x4_t, so the extracted value is delivered in a vector rather than as a C scalar.

Operation

return extract_u16_lane(a, field);

Header Mapping

return __builtin_loongson_pextrh_s (a, field);

uint16x4_t pextrh_u (uint16x4_t a, int field)

Synopsis

uint16x4_t pextrh_u (uint16x4_t a, int field)
#include <loongson-mmiintrin.h>
Instruction: pextrh.u
Builtin: __builtin_loongson_pextrh_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:289

Description

Extract the 16-bit lane of a selected by field, useful at vector/scalar boundaries. Note that the declared return type is uint16x4_t, so the extracted value is delivered in a vector rather than as a C scalar.

Operation

return extract_u16_lane(a, field);

Header Mapping

return __builtin_loongson_pextrh_u (a, field);

int16x4_t pinsrh_0_s (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t pinsrh_0_s (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pinsrh.0.s
Builtin: __builtin_loongson_pinsrh_0_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:326

Description

Insert the low 16-bit lane of b (lane 0) into lane 0 of a, preserving the other lanes of a.

Operation

dst = a;
dst.u16[0] = b.u16[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_pinsrh_0_s (a, b);

uint16x4_t pinsrh_0_u (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t pinsrh_0_u (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pinsrh.0.u
Builtin: __builtin_loongson_pinsrh_0_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:302

Description

Insert the low 16-bit lane of b (lane 0) into lane 0 of a, preserving the other lanes of a.

Operation

dst = a;
dst.u16[0] = b.u16[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_pinsrh_0_u (a, b);

int16x4_t pinsrh_1_s (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t pinsrh_1_s (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pinsrh.1.s
Builtin: __builtin_loongson_pinsrh_1_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:332

Description

Insert the low 16-bit lane of b (lane 0) into lane 1 of a, preserving the other lanes of a.

Operation

dst = a;
dst.u16[1] = b.u16[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_pinsrh_1_s (a, b);

uint16x4_t pinsrh_1_u (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t pinsrh_1_u (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pinsrh.1.u
Builtin: __builtin_loongson_pinsrh_1_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:308

Description

Insert the low 16-bit lane of b (lane 0) into lane 1 of a, preserving the other lanes of a.

Operation

dst = a;
dst.u16[1] = b.u16[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_pinsrh_1_u (a, b);

int16x4_t pinsrh_2_s (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t pinsrh_2_s (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pinsrh.2.s
Builtin: __builtin_loongson_pinsrh_2_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:338

Description

Insert the low 16-bit lane of b (lane 0) into lane 2 of a, preserving the other lanes of a.

Operation

dst = a;
dst.u16[2] = b.u16[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_pinsrh_2_s (a, b);

uint16x4_t pinsrh_2_u (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t pinsrh_2_u (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pinsrh.2.u
Builtin: __builtin_loongson_pinsrh_2_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:314

Description

Insert the low 16-bit lane of b (lane 0) into lane 2 of a, preserving the other lanes of a.

Operation

dst = a;
dst.u16[2] = b.u16[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_pinsrh_2_u (a, b);

int16x4_t pinsrh_3_s (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t pinsrh_3_s (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pinsrh.3.s
Builtin: __builtin_loongson_pinsrh_3_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:344

Description

Insert the low 16-bit lane of b (lane 0) into lane 3 of a, preserving the other lanes of a.

Operation

dst = a;
dst.u16[3] = b.u16[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_pinsrh_3_s (a, b);

uint16x4_t pinsrh_3_u (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t pinsrh_3_u (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pinsrh.3.u
Builtin: __builtin_loongson_pinsrh_3_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:320

Description

Insert the low 16-bit lane of b (lane 0) into lane 3 of a, preserving the other lanes of a.

Operation

dst = a;
dst.u16[3] = b.u16[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_pinsrh_3_u (a, b);

int16x4_t pshufh_s (int16x4_t a, int16x4_t b, uint8_t order)

Synopsis

int16x4_t pshufh_s (int16x4_t a, int16x4_t b, uint8_t order)
#include <loongson-mmiintrin.h>
Instruction: pshufh.s
Builtin: __builtin_loongson_pshufh_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:456

Description

Use the two-bit fields of order to select, for each of the four destination lanes, one of the four i16 lanes of b. The a argument is not used: the header forwards only b and order to the builtin.

Operation

dst.u16[0] = b.u16[shuffle_selector(order, 0)];
dst.u16[1] = b.u16[shuffle_selector(order, 1)];
dst.u16[2] = b.u16[shuffle_selector(order, 2)];
dst.u16[3] = b.u16[shuffle_selector(order, 3)];

Header Mapping

return __builtin_loongson_pshufh_s (b, order);

uint16x4_t pshufh_u (uint16x4_t a, uint16x4_t b, uint8_t order)

Synopsis

uint16x4_t pshufh_u (uint16x4_t a, uint16x4_t b, uint8_t order)
#include <loongson-mmiintrin.h>
Instruction: pshufh.u
Builtin: __builtin_loongson_pshufh_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:450

Description

Use the two-bit fields of order to select, for each of the four destination lanes, one of the four u16 lanes of b. The a argument is not used: the header forwards only b and order to the builtin.

Operation

dst.u16[0] = b.u16[shuffle_selector(order, 0)];
dst.u16[1] = b.u16[shuffle_selector(order, 1)];
dst.u16[2] = b.u16[shuffle_selector(order, 2)];
dst.u16[3] = b.u16[shuffle_selector(order, 3)];

Header Mapping

return __builtin_loongson_pshufh_u (b, order);

int8x8_t punpckhbh_s (int8x8_t a, int8x8_t b)

Synopsis

int8x8_t punpckhbh_s (int8x8_t a, int8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: punpckhbh.s
Builtin: __builtin_loongson_punpckhbh_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:644

Description

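Unpack by interleaving the high four i8 lanes of a and b into alternating destination lanes, with a supplying the even-numbered lanes and b the odd-numbered lanes. This separates packed streams into a wider interleaved layout.

Operation

dst.u8[0] = a.u8[4];
dst.u8[1] = b.u8[4];
dst.u8[2] = a.u8[5];
dst.u8[3] = b.u8[5];
dst.u8[4] = a.u8[6];
dst.u8[5] = b.u8[6];
dst.u8[6] = a.u8[7];
dst.u8[7] = b.u8[7];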

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpckhbh_s (a, b);

uint8x8_t punpckhbh_u (uint8x8_t a, uint8x8_t b)

Synopsis

uint8x8_t punpckhbh_u (uint8x8_t a, uint8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: punpckhbh.u
Builtin: __builtin_loongson_punpckhbh_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:626

Description

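Unpack by interleaving the high four u8 lanes of a and b into alternating destination lanes, with a supplying the even-numbered lanes and b the odd-numbered lanes. This separates packed streams into a wider interleaved layout.

Operation

dst.u8[0] = a.u8[4];
dst.u8[1] = b.u8[4];
dst.u8[2] = a.u8[5];
dst.u8[3] = b.u8[5];
dst.u8[4] = a.u8[6];
dst.u8[5] = b.u8[6];
dst.u8[6] = a.u8[7];
dst.u8[7] = b.u8[7];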

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpckhbh_u (a, b);

int16x4_t punpckhhw_s (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t punpckhhw_s (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: punpckhhw.s
Builtin: __builtin_loongson_punpckhhw_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:638

Description

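Unpack by interleaving the high two i16 lanes of a and b into alternating destination lanes, with a supplying the even-numbered lanes and b the odd-numbered lanes. This separates packed streams into a wider interleaved layout.

Operation

dst.u16[0] = a.u16[2];
dst.u16[1] = b.u16[2];
dst.u16[2] = a.u16[3];
dst.u16[3] = b.u16[3];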

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpckhhw_s (a, b);

uint16x4_t punpckhhw_u (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t punpckhhw_u (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: punpckhhw.u
Builtin: __builtin_loongson_punpckhhw_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:620

Description

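Unpack by interleaving the high two u16 lanes of a and b into alternating destination lanes, with a supplying the even-numbered lanes and b the odd-numbered lanes. This separates packed streams into a wider interleaved layout.

Operation

dst.u16[0] = a.u16[2];
dst.u16[1] = b.u16[2];
dst.u16[2] = a.u16[3];
dst.u16[3] = b.u16[3];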

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpckhhw_u (a, b);

int32x2_t punpckhwd_s (int32x2_t a, int32x2_t b)

Synopsis

int32x2_t punpckhwd_s (int32x2_t a, int32x2_t b)
#include <loongson-mmiintrin.h>
Instruction: punpckhwd.s
Builtin: __builtin_loongson_punpckhwd_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:632

Description

Unpack by combining the high i32 lane of a with the high i32 lane of b, with a supplying destination lane 0 and b destination lane 1. This separates packed streams into a wider interleaved layout.

Operation

dst.u32[0] = a.u32[1];
dst.u32[1] = b.u32[1];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpckhwd_s (a, b);

uint32x2_t punpckhwd_u (uint32x2_t a, uint32x2_t b)

Synopsis

uint32x2_t punpckhwd_u (uint32x2_t a, uint32x2_t b)
#include <loongson-mmiintrin.h>
Instruction: punpckhwd.u
Builtin: __builtin_loongson_punpckhwd_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:614

Description

Unpack by combining the high u32 lane of a with the high u32 lane of b, with a supplying destination lane 0 and b destination lane 1. This separates packed streams into a wider interleaved layout.

Operation

dst.u32[0] = a.u32[1];
dst.u32[1] = b.u32[1];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpckhwd_u (a, b);

int8x8_t punpcklbh_s (int8x8_t a, int8x8_t b)

Synopsis

int8x8_t punpcklbh_s (int8x8_t a, int8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: punpcklbh.s
Builtin: __builtin_loongson_punpcklbh_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:681

Description

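Unpack by interleaving the low four i8 lanes of a and b into alternating destination lanes, with a supplying the even-numbered lanes and b the odd-numbered lanes. This separates packed streams into a wider interleaved layout.

Operation

dst.u8[0] = a.u8[0];
dst.u8[1] = b.u8[0];
dst.u8[2] = a.u8[1];
dst.u8[3] = b.u8[1];
dst.u8[4] = a.u8[2];
dst.u8[5] = b.u8[2];
dst.u8[6] = a.u8[3];
dst.u8[7] = b.u8[3];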

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpcklbh_s (a, b);

uint8x8_t punpcklbh_u (uint8x8_t a, uint8x8_t b)

Synopsis

uint8x8_t punpcklbh_u (uint8x8_t a, uint8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: punpcklbh.u
Builtin: __builtin_loongson_punpcklbh_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:663

Description

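Unpack by interleaving the low four u8 lanes of a and b into alternating destination lanes, with a supplying the even-numbered lanes and b the odd-numbered lanes. This separates packed streams into a wider interleaved layout.

Operation

dst.u8[0] = a.u8[0];
dst.u8[1] = b.u8[0];
dst.u8[2] = a.u8[1];
dst.u8[3] = b.u8[1];
dst.u8[4] = a.u8[2];
dst.u8[5] = b.u8[2];
dst.u8[6] = a.u8[3];
dst.u8[7] = b.u8[3];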

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpcklbh_u (a, b);

int16x4_t punpcklhw_s (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t punpcklhw_s (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: punpcklhw.s
Builtin: __builtin_loongson_punpcklhw_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:675

Description

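Unpack by interleaving the low two i16 lanes of a and b into alternating destination lanes, with a supplying the even-numbered lanes and b the odd-numbered lanes. This separates packed streams into a wider interleaved layout.

Operation

dst.u16[0] = a.u16[0];
dst.u16[1] = b.u16[0];
dst.u16[2] = a.u16[1];
dst.u16[3] = b.u16[1];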

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpcklhw_s (a, b);

uint16x4_t punpcklhw_u (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t punpcklhw_u (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: punpcklhw.u
Builtin: __builtin_loongson_punpcklhw_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:657

Description

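Unpack by interleaving the low two u16 lanes of a and b into alternating destination lanes, with a supplying the even-numbered lanes and b the odd-numbered lanes. This separates packed streams into a wider interleaved layout.

Operation

dst.u16[0] = a.u16[0];
dst.u16[1] = b.u16[0];
dst.u16[2] = a.u16[1];
dst.u16[3] = b.u16[1];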

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpcklhw_u (a, b);

int32x2_t punpcklwd_s (int32x2_t a, int32x2_t b)

Synopsis

int32x2_t punpcklwd_s (int32x2_t a, int32x2_t b)
#include <loongson-mmiintrin.h>
Instruction: punpcklwd.s
Builtin: __builtin_loongson_punpcklwd_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:669

Description

Unpack by combining the low i32 lane of a with the low i32 lane of b, with a supplying destination lane 0 and b destination lane 1. This separates packed streams into a wider interleaved layout.

Operation

dst.u32[0] = a.u32[0];
dst.u32[1] = b.u32[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpcklwd_s (a, b);

uint32x2_t punpcklwd_u (uint32x2_t a, uint32x2_t b)

Synopsis

uint32x2_t punpcklwd_u (uint32x2_t a, uint32x2_t b)
#include <loongson-mmiintrin.h>
Instruction: punpcklwd.u
Builtin: __builtin_loongson_punpcklwd_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:651

Description

Unpack by combining the low u32 lane of a with the low u32 lane of b, with a supplying destination lane 0 and b destination lane 1. This separates packed streams into a wider interleaved layout.

Operation

dst.u32[0] = a.u32[0];
dst.u32[1] = b.u32[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return __builtin_loongson_punpcklwd_u (a, b);