Permutation

Generated from include/loongson-sxintrin.h. This page contains 19 intrinsics.

__m128i __lsx_shf_d (__m128i a, __m128i b, unsigned int imm)

Synopsis

__m128i __lsx_shf_d (__m128i a, __m128i b, unsigned int imm)
#include <loongson-sxintrin.h>
Instruction: shf.d
Builtin: __builtin_lsx_shf_d
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:894

Description

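Use two-bit fields from the immediate to select each u64 lane of the result from the concatenated a/b inputs: field values 0-1 pick a lane of a, and field values 2-3 pick a lane of b. Bits 1:0 of imm control lane 0, and bits 3:2 control lane 1.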

Operation

dst.u64[0] = (((imm >> 0) & 3) < 2) ? a.u64[((imm >> 0) & 3)] : b.u64[((imm >> 0) & 3) - 2];
dst.u64[1] = (((imm >> 2) & 3) < 2) ? a.u64[((imm >> 2) & 3)] : b.u64[((imm >> 2) & 3) - 2];

Header Mapping

#define __lsx_shf_d(a, b, imm) ((__m128i)__builtin_lsx_shf_d((v2i64)(a), (v2i64)(b), (imm)))

__m128i __lsx_vextr_v (__m128i a, __m128i b, unsigned char imm)

Synopsis

__m128i __lsx_vextr_v (__m128i a, __m128i b, unsigned char imm)
#include <loongson-sxintrin.h>
Instruction: vextr.v
Builtin: __builtin_lsx_vextr_v
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:908

Description

Concatenate b (byte positions 0-15) followed by a (byte positions 16-31) and extract a 16-byte window starting at byte offset imm.

Operation

dst.u8[0] = (imm < 16) ? b.u8[imm] : a.u8[imm - 16];
dst.u8[1] = (imm + 1 < 16) ? b.u8[imm + 1] : a.u8[imm + 1 - 16];
dst.u8[2] = (imm + 2 < 16) ? b.u8[imm + 2] : a.u8[imm + 2 - 16];
dst.u8[3] = (imm + 3 < 16) ? b.u8[imm + 3] : a.u8[imm + 3 - 16];
dst.u8[4] = (imm + 4 < 16) ? b.u8[imm + 4] : a.u8[imm + 4 - 16];
dst.u8[5] = (imm + 5 < 16) ? b.u8[imm + 5] : a.u8[imm + 5 - 16];
dst.u8[6] = (imm + 6 < 16) ? b.u8[imm + 6] : a.u8[imm + 6 - 16];
dst.u8[7] = (imm + 7 < 16) ? b.u8[imm + 7] : a.u8[imm + 7 - 16];
dst.u8[8] = (imm + 8 < 16) ? b.u8[imm + 8] : a.u8[imm + 8 - 16];
dst.u8[9] = (imm + 9 < 16) ? b.u8[imm + 9] : a.u8[imm + 9 - 16];
dst.u8[10] = (imm + 10 < 16) ? b.u8[imm + 10] : a.u8[imm + 10 - 16];
dst.u8[11] = (imm + 11 < 16) ? b.u8[imm + 11] : a.u8[imm + 11 - 16];
dst.u8[12] = (imm + 12 < 16) ? b.u8[imm + 12] : a.u8[imm + 12 - 16];
dst.u8[13] = (imm + 13 < 16) ? b.u8[imm + 13] : a.u8[imm + 13 - 16];
dst.u8[14] = (imm + 14 < 16) ? b.u8[imm + 14] : a.u8[imm + 14 - 16];
dst.u8[15] = (imm + 15 < 16) ? b.u8[imm + 15] : a.u8[imm + 15 - 16];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

#define __lsx_vextr_v(a, b, imm) ((__m128i)__builtin_lsx_vextr_v((v16i8)(a), (v16i8)(b), (imm)))

__m128i __lsx_vextrins_b (__m128i a, __m128i b, unsigned char dst_lane, unsigned char src_lane)

Synopsis

__m128i __lsx_vextrins_b (__m128i a, __m128i b, unsigned char dst_lane, unsigned char src_lane)
#include <loongson-sxintrin.h>
Instruction: vextrins.b
Builtin: __builtin_lsx_vextrins_b
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:907

Description

Copy a to the result, then overwrite the u8 lane selected by dst_lane with the u8 lane of b selected by src_lane.

Operation

dst = a;
dst.u8[dst_lane] = b.u8[src_lane];

Header Mapping

#define __lsx_vextrins_b(a, b, dst_lane, src_lane) ((__m128i)__builtin_lsx_vextrins_b((v16i8)(a), (v16i8)(b), (dst_lane), (src_lane)))

__m128i __lsx_vextrins_d (__m128i a, __m128i b, unsigned char dst_lane, unsigned char src_lane)

Synopsis

__m128i __lsx_vextrins_d (__m128i a, __m128i b, unsigned char dst_lane, unsigned char src_lane)
#include <loongson-sxintrin.h>
Instruction: vextrins.d
Builtin: __builtin_lsx_vextrins_d
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:904

Description

Copy a to the result, then overwrite the u64 lane selected by dst_lane with the u64 lane of b selected by src_lane.

Operation

dst = a;
dst.u64[dst_lane] = b.u64[src_lane];

Header Mapping

#define __lsx_vextrins_d(a, b, dst_lane, src_lane) ((__m128i)__builtin_lsx_vextrins_d((v2i64)(a), (v2i64)(b), (dst_lane), (src_lane)))

__m128i __lsx_vextrins_h (__m128i a, __m128i b, unsigned char dst_lane, unsigned char src_lane)

Synopsis

__m128i __lsx_vextrins_h (__m128i a, __m128i b, unsigned char dst_lane, unsigned char src_lane)
#include <loongson-sxintrin.h>
Instruction: vextrins.h
Builtin: __builtin_lsx_vextrins_h
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:906

Description

Copy a to the result, then overwrite the u16 lane selected by dst_lane with the u16 lane of b selected by src_lane.

Operation

dst = a;
dst.u16[dst_lane] = b.u16[src_lane];

Header Mapping

#define __lsx_vextrins_h(a, b, dst_lane, src_lane) ((__m128i)__builtin_lsx_vextrins_h((v8i16)(a), (v8i16)(b), (dst_lane), (src_lane)))

__m128i __lsx_vextrins_w (__m128i a, __m128i b, unsigned char dst_lane, unsigned char src_lane)

Synopsis

__m128i __lsx_vextrins_w (__m128i a, __m128i b, unsigned char dst_lane, unsigned char src_lane)
#include <loongson-sxintrin.h>
Instruction: vextrins.w
Builtin: __builtin_lsx_vextrins_w
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:905

Description

Copy a to the result, then overwrite the u32 lane selected by dst_lane with the u32 lane of b selected by src_lane.

Operation

dst = a;
dst.u32[dst_lane] = b.u32[src_lane];

Header Mapping

#define __lsx_vextrins_w(a, b, dst_lane, src_lane) ((__m128i)__builtin_lsx_vextrins_w((v4i32)(a), (v4i32)(b), (dst_lane), (src_lane)))

__m128i __lsx_vperml_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vperml_d (__m128i a, __m128i b)
#include <loongson-sxintrin.h>
Instruction: vperml.d
Builtin: __builtin_lsx_vperml_d
CPU Flags: __mips_loongson_sx
Kind: function
Source: include/loongson-sxintrin.h:1397

Description

Use the low bit of each u64 lane in b as an index that selects a u64 lane from a.

Operation

dst.u64[0] = a.u64[(b.u64[0] & 1)];
dst.u64[1] = a.u64[(b.u64[1] & 1)];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i)__builtin_lsx_vperml_d((v2i64)a, (v2i64)b);

__m128i __lsx_vperml_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vperml_w (__m128i a, __m128i b)
#include <loongson-sxintrin.h>
Instruction: vperml.w
Builtin: __builtin_lsx_vperml_w
CPU Flags: __mips_loongson_sx
Kind: function
Source: include/loongson-sxintrin.h:1404

Description

Use the low two bits of each u32 lane in b as an index that selects a u32 lane from a.

Operation

dst.u32[0] = a.u32[(b.u32[0] & 3)];
dst.u32[1] = a.u32[(b.u32[1] & 3)];
dst.u32[2] = a.u32[(b.u32[2] & 3)];
dst.u32[3] = a.u32[(b.u32[3] & 3)];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i)__builtin_lsx_vperml_w((v4i32)a, (v4i32)b);

__m128i __lsx_vsel_d (__m128i a, __m128i b, unsigned int imm)

Synopsis

__m128i __lsx_vsel_d (__m128i a, __m128i b, unsigned int imm)
#include <loongson-sxintrin.h>
Instruction: vsel.d
Builtin: __builtin_lsx_vsel_d
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:909

Description

Use one immediate bit per u64 lane: a 0 bit chooses a, and a 1 bit chooses b.

Operation

dst.u64[0] = ((imm >> 0) & 1) ? b.u64[0] : a.u64[0];
dst.u64[1] = ((imm >> 1) & 1) ? b.u64[1] : a.u64[1];

Header Mapping

#define __lsx_vsel_d(a, b, imm) ((__m128i)__builtin_lsx_vsel_d((v2i64)(a), (v2i64)(b), (imm)))

__m128i __lsx_vsel_h (__m128i a, __m128i b, unsigned int imm)

Synopsis

__m128i __lsx_vsel_h (__m128i a, __m128i b, unsigned int imm)
#include <loongson-sxintrin.h>
Instruction: vsel.h
Builtin: __builtin_lsx_vsel_h
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:911

Description

Use one immediate bit per u16 lane: a 0 bit chooses a, and a 1 bit chooses b.

Operation

dst.u16[0] = ((imm >> 0) & 1) ? b.u16[0] : a.u16[0];
dst.u16[1] = ((imm >> 1) & 1) ? b.u16[1] : a.u16[1];
dst.u16[2] = ((imm >> 2) & 1) ? b.u16[2] : a.u16[2];
dst.u16[3] = ((imm >> 3) & 1) ? b.u16[3] : a.u16[3];
dst.u16[4] = ((imm >> 4) & 1) ? b.u16[4] : a.u16[4];
dst.u16[5] = ((imm >> 5) & 1) ? b.u16[5] : a.u16[5];
dst.u16[6] = ((imm >> 6) & 1) ? b.u16[6] : a.u16[6];
dst.u16[7] = ((imm >> 7) & 1) ? b.u16[7] : a.u16[7];

Header Mapping

#define __lsx_vsel_h(a, b, imm) ((__m128i)__builtin_lsx_vsel_h((v8i16)(a), (v8i16)(b), (imm)))

__m128i __lsx_vsel_w (__m128i a, __m128i b, unsigned int imm)

Synopsis

__m128i __lsx_vsel_w (__m128i a, __m128i b, unsigned int imm)
#include <loongson-sxintrin.h>
Instruction: vsel.w
Builtin: __builtin_lsx_vsel_w
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:910

Description

Use one immediate bit per u32 lane: a 0 bit chooses a, and a 1 bit chooses b.

Operation

dst.u32[0] = ((imm >> 0) & 1) ? b.u32[0] : a.u32[0];
dst.u32[1] = ((imm >> 1) & 1) ? b.u32[1] : a.u32[1];
dst.u32[2] = ((imm >> 2) & 1) ? b.u32[2] : a.u32[2];
dst.u32[3] = ((imm >> 3) & 1) ? b.u32[3] : a.u32[3];

Header Mapping

#define __lsx_vsel_w(a, b, imm) ((__m128i)__builtin_lsx_vsel_w((v4i32)(a), (v4i32)(b), (imm)))

__m128i __lsx_vselr_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __lsx_vselr_d (__m128i a, __m128i b, __m128i c)
#include <loongson-sxintrin.h>
Instruction: vselr.d
Builtin: __builtin_lsx_vselr_d
CPU Flags: __mips_loongson_sx
Kind: function
Source: include/loongson-sxintrin.h:1383

Description

Use the sign bit of each lane in a as the selector: non-negative chooses b, negative chooses c.

Operation

dst.i64[0] = (a.i64[0] < 0) ? c.i64[0] : b.i64[0];
dst.i64[1] = (a.i64[1] < 0) ? c.i64[1] : b.i64[1];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i)__builtin_lsx_vselr_d((v2i64)a, (v2i64)b, (v2i64)c);

__m128i __lsx_vselr_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __lsx_vselr_w (__m128i a, __m128i b, __m128i c)
#include <loongson-sxintrin.h>
Instruction: vselr.w
Builtin: __builtin_lsx_vselr_w
CPU Flags: __mips_loongson_sx
Kind: function
Source: include/loongson-sxintrin.h:1390

Description

Use the sign bit of each lane in a as the selector: non-negative chooses b, negative chooses c.

Operation

dst.i32[0] = (a.i32[0] < 0) ? c.i32[0] : b.i32[0];
dst.i32[1] = (a.i32[1] < 0) ? c.i32[1] : b.i32[1];
dst.i32[2] = (a.i32[2] < 0) ? c.i32[2] : b.i32[2];
dst.i32[3] = (a.i32[3] < 0) ? c.i32[3] : b.i32[3];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 1 2

Header Mapping

return (__m128i)__builtin_lsx_vselr_w((v4i32)a, (v4i32)b, (v4i32)c);

__m128i __lsx_vshufil_h (__m128i a, __m128i b, unsigned int imm)

Synopsis

__m128i __lsx_vshufil_h (__m128i a, __m128i b, unsigned int imm)
#include <loongson-sxintrin.h>
Instruction: vshufil.h
Builtin: __builtin_lsx_vshufil_h
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:900

Description

Start from a and replace u16 lanes 4-7 with an immediate-controlled shuffle of u16 lanes 4-7 from b; each two-bit field of imm selects one source lane within that group. This is useful for small in-place table rearrangements inside a 128-bit vector.

Operation

dst = a;
dst.u16[4] = b.u16[4 + ((imm >> 0) & 3)];
dst.u16[5] = b.u16[4 + ((imm >> 2) & 3)];
dst.u16[6] = b.u16[4 + ((imm >> 4) & 3)];
dst.u16[7] = b.u16[4 + ((imm >> 6) & 3)];

Header Mapping

#define __lsx_vshufil_h(a, b, imm) ((__m128i)__builtin_lsx_vshufil_h((v8i16)(a), (v8i16)(b), (imm)))

__m128i __lsx_vshufill_b (__m128i a, __m128i b, unsigned int imm)

Synopsis

__m128i __lsx_vshufill_b (__m128i a, __m128i b, unsigned int imm)
#include <loongson-sxintrin.h>
Instruction: vshufill.b
Builtin: __builtin_lsx_vshufill_b
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:898

Description

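Start from a and replace u8 lanes 12-15 with an immediate-controlled shuffle of u8 lanes 12-15 from b; each two-bit field of imm selects one source lane within that group. This is useful for small in-place table rearrangements inside a 128-bit vector.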

Operation

dst = a;
dst.u8[12] = b.u8[12 + ((imm >> 0) & 3)];
dst.u8[13] = b.u8[12 + ((imm >> 2) & 3)];
dst.u8[14] = b.u8[12 + ((imm >> 4) & 3)];
dst.u8[15] = b.u8[12 + ((imm >> 6) & 3)];

Header Mapping

#define __lsx_vshufill_b(a, b, imm) ((__m128i)__builtin_lsx_vshufill_b((v16i8)(a), (v16i8)(b), (imm)))

__m128i __lsx_vshufilr_b (__m128i a, __m128i b, unsigned int imm)

Synopsis

__m128i __lsx_vshufilr_b (__m128i a, __m128i b, unsigned int imm)
#include <loongson-sxintrin.h>
Instruction: vshufilr.b
Builtin: __builtin_lsx_vshufilr_b
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:897

Description

Start from a and replace u8 lanes 8-11 with an immediate-controlled shuffle of u8 lanes 8-11 from b; each two-bit field of imm selects one source lane within that group. This is useful for small in-place table rearrangements inside a 128-bit vector.

Operation

dst = a;
dst.u8[8] = b.u8[8 + ((imm >> 0) & 3)];
dst.u8[9] = b.u8[8 + ((imm >> 2) & 3)];
dst.u8[10] = b.u8[8 + ((imm >> 4) & 3)];
dst.u8[11] = b.u8[8 + ((imm >> 6) & 3)];

Header Mapping

#define __lsx_vshufilr_b(a, b, imm) ((__m128i)__builtin_lsx_vshufilr_b((v16i8)(a), (v16i8)(b), (imm)))

__m128i __lsx_vshufir_h (__m128i a, __m128i b, unsigned int imm)

Synopsis

__m128i __lsx_vshufir_h (__m128i a, __m128i b, unsigned int imm)
#include <loongson-sxintrin.h>
Instruction: vshufir.h
Builtin: __builtin_lsx_vshufir_h
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:899

Description

Start from a and replace u16 lanes 0-3 with an immediate-controlled shuffle of u16 lanes 0-3 from b; each two-bit field of imm selects one source lane within that group. This is useful for small in-place table rearrangements inside a 128-bit vector.

Operation

dst = a;
dst.u16[0] = b.u16[0 + ((imm >> 0) & 3)];
dst.u16[1] = b.u16[0 + ((imm >> 2) & 3)];
dst.u16[2] = b.u16[0 + ((imm >> 4) & 3)];
dst.u16[3] = b.u16[0 + ((imm >> 6) & 3)];

Header Mapping

#define __lsx_vshufir_h(a, b, imm) ((__m128i)__builtin_lsx_vshufir_h((v8i16)(a), (v8i16)(b), (imm)))

__m128i __lsx_vshufirl_b (__m128i a, __m128i b, unsigned int imm)

Synopsis

__m128i __lsx_vshufirl_b (__m128i a, __m128i b, unsigned int imm)
#include <loongson-sxintrin.h>
Instruction: vshufirl.b
Builtin: __builtin_lsx_vshufirl_b
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:896

Description

Start from a and replace u8 lanes 4-7 with an immediate-controlled shuffle of u8 lanes 4-7 from b; each two-bit field of imm selects one source lane within that group. This is useful for small in-place table rearrangements inside a 128-bit vector.

Operation

dst = a;
dst.u8[4] = b.u8[4 + ((imm >> 0) & 3)];
dst.u8[5] = b.u8[4 + ((imm >> 2) & 3)];
dst.u8[6] = b.u8[4 + ((imm >> 4) & 3)];
dst.u8[7] = b.u8[4 + ((imm >> 6) & 3)];

Header Mapping

#define __lsx_vshufirl_b(a, b, imm) ((__m128i)__builtin_lsx_vshufirl_b((v16i8)(a), (v16i8)(b), (imm)))

__m128i __lsx_vshufirr_b (__m128i a, __m128i b, unsigned int imm)

Synopsis

__m128i __lsx_vshufirr_b (__m128i a, __m128i b, unsigned int imm)
#include <loongson-sxintrin.h>
Instruction: vshufirr.b
Builtin: __builtin_lsx_vshufirr_b
CPU Flags: __mips_loongson_sx
Kind: macro
Source: include/loongson-sxintrin.h:895

Description

Start from a and replace u8 lanes 0-3 with an immediate-controlled shuffle of u8 lanes 0-3 from b; each two-bit field of imm selects one source lane within that group. This is useful for small in-place table rearrangements inside a 128-bit vector.

Operation

dst = a;
dst.u8[0] = b.u8[0 + ((imm >> 0) & 3)];
dst.u8[1] = b.u8[0 + ((imm >> 2) & 3)];
dst.u8[2] = b.u8[0 + ((imm >> 4) & 3)];
dst.u8[3] = b.u8[0 + ((imm >> 6) & 3)];

Header Mapping

#define __lsx_vshufirr_b(a, b, imm) ((__m128i)__builtin_lsx_vshufirr_b((v16i8)(a), (v16i8)(b), (imm)))