Integer Computation

Generated from include/loongson-mmiintrin.h. This page contains 34 intrinsics.

uint16x4_t biadd (uint8x8_t a)

Synopsis

uint16x4_t biadd (uint8x8_t a)
#include <loongson-mmiintrin.h>
Instruction: biadd
Builtin: __builtin_loongson_biadd
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:434

Description

Sum all eight unsigned 8-bit lanes of a into a single 16-bit total stored in lane 0 of the result; the remaining 16-bit lanes are zero. This intrinsic takes one vector operand and has no immediate form.

Operation

dst.u16[0] = a.u8[0] + a.u8[1] + a.u8[2] + a.u8[3] + a.u8[4] + a.u8[5] + a.u8[6] + a.u8[7];
dst.u16[1] = 0;
dst.u16[2] = 0;
dst.u16[3] = 0;

Header Mapping

return __builtin_loongson_biadd (a);

int8x8_t paddb_s (int8x8_t a, int8x8_t b)

Synopsis

int8x8_t paddb_s (int8x8_t a, int8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: paddb.s
Builtin: __builtin_loongson_paddb_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:107

Description

Add corresponding signed 8-bit lanes of a and b on 8 x i8 lanes; each sum wraps around on overflow (no saturation).

Operation

dst.i8[0] = a.i8[0] + b.i8[0];
dst.i8[1] = a.i8[1] + b.i8[1];
dst.i8[2] = a.i8[2] + b.i8[2];
dst.i8[3] = a.i8[3] + b.i8[3];
dst.i8[4] = a.i8[4] + b.i8[4];
dst.i8[5] = a.i8[5] + b.i8[5];
dst.i8[6] = a.i8[6] + b.i8[6];
dst.i8[7] = a.i8[7] + b.i8[7];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddb_s (a, b);

uint8x8_t paddb_u (uint8x8_t a, uint8x8_t b)

Synopsis

uint8x8_t paddb_u (uint8x8_t a, uint8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: paddb.u
Builtin: __builtin_loongson_paddb_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:89

Description

Add corresponding unsigned 8-bit lanes of a and b on 8 x u8 lanes; each sum wraps around on overflow (no saturation).

Operation

dst.u8[0] = a.u8[0] + b.u8[0];
dst.u8[1] = a.u8[1] + b.u8[1];
dst.u8[2] = a.u8[2] + b.u8[2];
dst.u8[3] = a.u8[3] + b.u8[3];
dst.u8[4] = a.u8[4] + b.u8[4];
dst.u8[5] = a.u8[5] + b.u8[5];
dst.u8[6] = a.u8[6] + b.u8[6];
dst.u8[7] = a.u8[7] + b.u8[7];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddb_u (a, b);

int64_t paddd_s (int64_t a, int64_t b)

Synopsis

int64_t paddd_s (int64_t a, int64_t b)
#include <loongson-mmiintrin.h>
Instruction: paddd.s
Builtin: __builtin_loongson_paddd_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:120

Description

Add the signed 64-bit values a and b (a single i64 lane); the sum wraps around on overflow (no saturation).

Operation

dst.i64[0] = a.i64[0] + b.i64[0];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddd_s (a, b);

uint64_t paddd_u (uint64_t a, uint64_t b)

Synopsis

uint64_t paddd_u (uint64_t a, uint64_t b)
#include <loongson-mmiintrin.h>
Instruction: paddd.u
Builtin: __builtin_loongson_paddd_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:114

Description

Add the unsigned 64-bit values a and b (a single u64 lane); the sum wraps around on overflow (no saturation).

Operation

dst.u64[0] = a.u64[0] + b.u64[0];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddd_u (a, b);

int16x4_t paddh_s (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t paddh_s (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: paddh.s
Builtin: __builtin_loongson_paddh_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:101

Description

Add corresponding signed 16-bit lanes of a and b on 4 x i16 lanes; each sum wraps around on overflow (no saturation).

Operation

dst.i16[0] = a.i16[0] + b.i16[0];
dst.i16[1] = a.i16[1] + b.i16[1];
dst.i16[2] = a.i16[2] + b.i16[2];
dst.i16[3] = a.i16[3] + b.i16[3];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddh_s (a, b);

uint16x4_t paddh_u (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t paddh_u (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: paddh.u
Builtin: __builtin_loongson_paddh_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:83

Description

Add corresponding unsigned 16-bit lanes of a and b on 4 x u16 lanes; each sum wraps around on overflow (no saturation).

Operation

dst.u16[0] = a.u16[0] + b.u16[0];
dst.u16[1] = a.u16[1] + b.u16[1];
dst.u16[2] = a.u16[2] + b.u16[2];
dst.u16[3] = a.u16[3] + b.u16[3];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddh_u (a, b);

int8x8_t paddsb (int8x8_t a, int8x8_t b)

Synopsis

int8x8_t paddsb (int8x8_t a, int8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: paddsb
Builtin: __builtin_loongson_paddsb
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:133

Description

Add corresponding signed 8-bit lanes of a and b on 8 x i8 lanes with signed saturation: each sum is clamped to the range [-128, 127] instead of wrapping.

Operation

dst.i8[0] = saturate(a.i8[0] + b.i8[0], 8, signed);
dst.i8[1] = saturate(a.i8[1] + b.i8[1], 8, signed);
dst.i8[2] = saturate(a.i8[2] + b.i8[2], 8, signed);
dst.i8[3] = saturate(a.i8[3] + b.i8[3], 8, signed);
dst.i8[4] = saturate(a.i8[4] + b.i8[4], 8, signed);
dst.i8[5] = saturate(a.i8[5] + b.i8[5], 8, signed);
dst.i8[6] = saturate(a.i8[6] + b.i8[6], 8, signed);
dst.i8[7] = saturate(a.i8[7] + b.i8[7], 8, signed);

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddsb (a, b);

int16x4_t paddsh (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t paddsh (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: paddsh
Builtin: __builtin_loongson_paddsh
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:127

Description

Add corresponding signed 16-bit lanes of a and b on 4 x i16 lanes with signed saturation: each sum is clamped to the range [-32768, 32767] instead of wrapping.

Operation

dst.i16[0] = saturate(a.i16[0] + b.i16[0], 16, signed);
dst.i16[1] = saturate(a.i16[1] + b.i16[1], 16, signed);
dst.i16[2] = saturate(a.i16[2] + b.i16[2], 16, signed);
dst.i16[3] = saturate(a.i16[3] + b.i16[3], 16, signed);

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddsh (a, b);

uint8x8_t paddusb (uint8x8_t a, uint8x8_t b)

Synopsis

uint8x8_t paddusb (uint8x8_t a, uint8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: paddusb
Builtin: __builtin_loongson_paddusb
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:146

Description

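Add corresponding unsigned 8-bit lanes of a and b on 8 x u8 lanes with unsigned saturation: each sum is clamped to the range [0, 255] instead of wrapping.

Operation

dst.u8[0] = saturate(a.u8[0] + b.u8[0], 8, unsigned);
dst.u8[1] = saturate(a.u8[1] + b.u8[1], 8, unsigned);
dst.u8[2] = saturate(a.u8[2] + b.u8[2], 8, unsigned);
dst.u8[3] = saturate(a.u8[3] + b.u8[3], 8, unsigned);
dst.u8[4] = saturate(a.u8[4] + b.u8[4], 8, unsigned);
dst.u8[5] = saturate(a.u8[5] + b.u8[5], 8, unsigned);
dst.u8[6] = saturate(a.u8[6] + b.u8[6], 8, unsigned);
dst.u8[7] = saturate(a.u8[7] + b.u8[7], 8, unsigned);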

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddusb (a, b);

uint16x4_t paddush (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t paddush (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: paddush
Builtin: __builtin_loongson_paddush
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:140

Description

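Add corresponding unsigned 16-bit lanes of a and b on 4 x u16 lanes with unsigned saturation: each sum is clamped to the range [0, 65535] instead of wrapping.

Operation

dst.u16[0] = saturate(a.u16[0] + b.u16[0], 16, unsigned);
dst.u16[1] = saturate(a.u16[1] + b.u16[1], 16, unsigned);
dst.u16[2] = saturate(a.u16[2] + b.u16[2], 16, unsigned);
dst.u16[3] = saturate(a.u16[3] + b.u16[3], 16, unsigned);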

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddush (a, b);

int32x2_t paddw_s (int32x2_t a, int32x2_t b)

Synopsis

int32x2_t paddw_s (int32x2_t a, int32x2_t b)
#include <loongson-mmiintrin.h>
Instruction: paddw.s
Builtin: __builtin_loongson_paddw_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:95

Description

Add corresponding signed 32-bit lanes of a and b on 2 x i32 lanes; each sum wraps around on overflow (no saturation).

Operation

dst.i32[0] = a.i32[0] + b.i32[0];
dst.i32[1] = a.i32[1] + b.i32[1];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddw_s (a, b);

uint32x2_t paddw_u (uint32x2_t a, uint32x2_t b)

Synopsis

uint32x2_t paddw_u (uint32x2_t a, uint32x2_t b)
#include <loongson-mmiintrin.h>
Instruction: paddw.u
Builtin: __builtin_loongson_paddw_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:77

Description

Add corresponding unsigned 32-bit lanes of a and b on 2 x u32 lanes; each sum wraps around on overflow (no saturation).

Operation

dst.u32[0] = a.u32[0] + b.u32[0];
dst.u32[1] = a.u32[1] + b.u32[1];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_paddw_u (a, b);

uint8x8_t pasubub (uint8x8_t a, uint8x8_t b)

Synopsis

uint8x8_t pasubub (uint8x8_t a, uint8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: pasubub
Builtin: __builtin_loongson_pasubub
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:427

Description

Compute the lane-wise absolute difference |a - b| of the unsigned 8-bit lanes of a and b on 8 x u8 lanes.

Operation

dst.u8[0] = abs(a.u8[0] - b.u8[0]);
dst.u8[1] = abs(a.u8[1] - b.u8[1]);
dst.u8[2] = abs(a.u8[2] - b.u8[2]);
dst.u8[3] = abs(a.u8[3] - b.u8[3]);
dst.u8[4] = abs(a.u8[4] - b.u8[4]);
dst.u8[5] = abs(a.u8[5] - b.u8[5]);
dst.u8[6] = abs(a.u8[6] - b.u8[6]);
dst.u8[7] = abs(a.u8[7] - b.u8[7]);

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	2	2

Header Mapping

return __builtin_loongson_pasubub (a, b);

uint8x8_t pavgb (uint8x8_t a, uint8x8_t b)

Synopsis

uint8x8_t pavgb (uint8x8_t a, uint8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: pavgb
Builtin: __builtin_loongson_pavgb
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:208

Description

Compute the lane-wise rounded average of the unsigned 8-bit lanes of a and b, (a + b + 1) >> 1, evaluated without intermediate overflow; useful for blending packed integer samples.

Operation

dst.u8[0] = (a.u8[0] + b.u8[0] + 1) >> 1;
dst.u8[1] = (a.u8[1] + b.u8[1] + 1) >> 1;
dst.u8[2] = (a.u8[2] + b.u8[2] + 1) >> 1;
dst.u8[3] = (a.u8[3] + b.u8[3] + 1) >> 1;
dst.u8[4] = (a.u8[4] + b.u8[4] + 1) >> 1;
dst.u8[5] = (a.u8[5] + b.u8[5] + 1) >> 1;
dst.u8[6] = (a.u8[6] + b.u8[6] + 1) >> 1;
dst.u8[7] = (a.u8[7] + b.u8[7] + 1) >> 1;

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_pavgb (a, b);

uint16x4_t pavgh (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t pavgh (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pavgh
Builtin: __builtin_loongson_pavgh
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:202

Description

Compute the lane-wise rounded average of the unsigned 16-bit lanes of a and b, (a + b + 1) >> 1, evaluated without intermediate overflow; useful for blending packed integer samples.

Operation

dst.u16[0] = (a.u16[0] + b.u16[0] + 1) >> 1;
dst.u16[1] = (a.u16[1] + b.u16[1] + 1) >> 1;
dst.u16[2] = (a.u16[2] + b.u16[2] + 1) >> 1;
dst.u16[3] = (a.u16[3] + b.u16[3] + 1) >> 1;

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_pavgh (a, b);

int32x2_t pmaddhw (int16x4_t a, int16x4_t b)

Synopsis

int32x2_t pmaddhw (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pmaddhw
Builtin: __builtin_loongson_pmaddhw
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:351

Description

Multiply corresponding signed 16-bit lanes of a and b, then add each adjacent pair of 32-bit products, producing 2 x i32 lanes (a packed multiply-add, as used in dot products and filters).

Operation

dst.i32[0] = a.i16[0] * b.i16[0] + a.i16[1] * b.i16[1];
dst.i32[1] = a.i16[2] * b.i16[2] + a.i16[3] * b.i16[3];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	0.50/4	2

Header Mapping

return __builtin_loongson_pmaddhw (a, b);

int16x4_t pmulhh (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t pmulhh (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pmulhh
Builtin: __builtin_loongson_pmulhh
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:406

Description

Multiply signed 16-bit lanes and keep the upper 16 bits of each 32-bit product.

Operation

dst.i16[0] = high_16_bits(a.i16[0] * b.i16[0]);
dst.i16[1] = high_16_bits(a.i16[1] * b.i16[1]);
dst.i16[2] = high_16_bits(a.i16[2] * b.i16[2]);
dst.i16[3] = high_16_bits(a.i16[3] * b.i16[3]);

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	4	2

Header Mapping

return __builtin_loongson_pmulhh (a, b);

uint16x4_t pmulhuh (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t pmulhuh (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pmulhuh
Builtin: __builtin_loongson_pmulhuh
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:399

Description

Multiply unsigned 16-bit lanes and keep the upper 16 bits of each 32-bit product.

Operation

dst.u16[0] = high_16_bits(a.u16[0] * b.u16[0]);
dst.u16[1] = high_16_bits(a.u16[1] * b.u16[1]);
dst.u16[2] = high_16_bits(a.u16[2] * b.u16[2]);
dst.u16[3] = high_16_bits(a.u16[3] * b.u16[3]);

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	4	2

Header Mapping

return __builtin_loongson_pmulhuh (a, b);

int16x4_t pmullh (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t pmullh (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: pmullh
Builtin: __builtin_loongson_pmullh
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:413

Description

Multiply signed 16-bit lanes and keep the lower 16 bits of each 32-bit product.

Operation

dst.i16[0] = low_16_bits(a.i16[0] * b.i16[0]);
dst.i16[1] = low_16_bits(a.i16[1] * b.i16[1]);
dst.i16[2] = low_16_bits(a.i16[2] * b.i16[2]);
dst.i16[3] = low_16_bits(a.i16[3] * b.i16[3]);

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	4	2

Header Mapping

return __builtin_loongson_pmullh (a, b);

int64_t pmuluw (uint32x2_t a, uint32x2_t b)

Synopsis

int64_t pmuluw (uint32x2_t a, uint32x2_t b)
#include <loongson-mmiintrin.h>
Instruction: pmuluw
Builtin: __builtin_loongson_pmuluw
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:420

Description

Multiply the low unsigned 32-bit lanes of a and b and return the full 64-bit product; the upper 32-bit lanes of a and b are ignored.

Operation

dst.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	4	2

Header Mapping

return __builtin_loongson_pmuluw (a, b);

uint16x4_t psadbh (uint8x8_t a, uint8x8_t b)

Synopsis

uint16x4_t psadbh (uint8x8_t a, uint8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: psadbh
Builtin: __builtin_loongson_psadbh
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:443

Description

Compute the sum of absolute differences of the eight unsigned 8-bit lanes of a and b and write the widened 16-bit total to lane 0 of the result; the remaining 16-bit lanes are zero. This is commonly used in image/video matching and distance calculations.

Operation

dst.u16[0] = abs(a.u8[0] - b.u8[0]) + abs(a.u8[1] - b.u8[1]) + abs(a.u8[2] - b.u8[2]) + abs(a.u8[3] - b.u8[3]) + abs(a.u8[4] - b.u8[4]) + abs(a.u8[5] - b.u8[5]) + abs(a.u8[6] - b.u8[6]) + abs(a.u8[7] - b.u8[7]);
dst.u16[1] = 0;
dst.u16[2] = 0;
dst.u16[3] = 0;

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	1

Header Mapping

return __builtin_loongson_psadbh (a, b);

int8x8_t psubb_s (int8x8_t a, int8x8_t b)

Synopsis

int8x8_t psubb_s (int8x8_t a, int8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: psubb.s
Builtin: __builtin_loongson_psubb_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:568

Description

Subtract each signed 8-bit lane of b from the corresponding lane of a on 8 x i8 lanes; each difference wraps around on overflow (no saturation).

Operation

dst.i8[0] = a.i8[0] - b.i8[0];
dst.i8[1] = a.i8[1] - b.i8[1];
dst.i8[2] = a.i8[2] - b.i8[2];
dst.i8[3] = a.i8[3] - b.i8[3];
dst.i8[4] = a.i8[4] - b.i8[4];
dst.i8[5] = a.i8[5] - b.i8[5];
dst.i8[6] = a.i8[6] - b.i8[6];
dst.i8[7] = a.i8[7] - b.i8[7];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubb_s (a, b);

uint8x8_t psubb_u (uint8x8_t a, uint8x8_t b)

Synopsis

uint8x8_t psubb_u (uint8x8_t a, uint8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: psubb.u
Builtin: __builtin_loongson_psubb_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:550

Description

Subtract each unsigned 8-bit lane of b from the corresponding lane of a on 8 x u8 lanes; each difference wraps around on underflow (no saturation).

Operation

dst.u8[0] = a.u8[0] - b.u8[0];
dst.u8[1] = a.u8[1] - b.u8[1];
dst.u8[2] = a.u8[2] - b.u8[2];
dst.u8[3] = a.u8[3] - b.u8[3];
dst.u8[4] = a.u8[4] - b.u8[4];
dst.u8[5] = a.u8[5] - b.u8[5];
dst.u8[6] = a.u8[6] - b.u8[6];
dst.u8[7] = a.u8[7] - b.u8[7];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubb_u (a, b);

int64_t psubd_s (int64_t a, int64_t b)

Synopsis

int64_t psubd_s (int64_t a, int64_t b)
#include <loongson-mmiintrin.h>
Instruction: psubd.s
Builtin: __builtin_loongson_psubd_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:581

Description

Subtract the signed 64-bit value b from a (a single i64 lane); the difference wraps around on overflow (no saturation).

Operation

dst.i64[0] = a.i64[0] - b.i64[0];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubd_s (a, b);

uint64_t psubd_u (uint64_t a, uint64_t b)

Synopsis

uint64_t psubd_u (uint64_t a, uint64_t b)
#include <loongson-mmiintrin.h>
Instruction: psubd.u
Builtin: __builtin_loongson_psubd_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:575

Description

Subtract the unsigned 64-bit value b from a (a single u64 lane); the difference wraps around on underflow (no saturation).

Operation

dst.u64[0] = a.u64[0] - b.u64[0];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubd_u (a, b);

int16x4_t psubh_s (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t psubh_s (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: psubh.s
Builtin: __builtin_loongson_psubh_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:562

Description

Subtract each signed 16-bit lane of b from the corresponding lane of a on 4 x i16 lanes; each difference wraps around on overflow (no saturation).

Operation

dst.i16[0] = a.i16[0] - b.i16[0];
dst.i16[1] = a.i16[1] - b.i16[1];
dst.i16[2] = a.i16[2] - b.i16[2];
dst.i16[3] = a.i16[3] - b.i16[3];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubh_s (a, b);

uint16x4_t psubh_u (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t psubh_u (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: psubh.u
Builtin: __builtin_loongson_psubh_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:544

Description

Subtract each unsigned 16-bit lane of b from the corresponding lane of a on 4 x u16 lanes; each difference wraps around on underflow (no saturation).

Operation

dst.u16[0] = a.u16[0] - b.u16[0];
dst.u16[1] = a.u16[1] - b.u16[1];
dst.u16[2] = a.u16[2] - b.u16[2];
dst.u16[3] = a.u16[3] - b.u16[3];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubh_u (a, b);

int8x8_t psubsb (int8x8_t a, int8x8_t b)

Synopsis

int8x8_t psubsb (int8x8_t a, int8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: psubsb
Builtin: __builtin_loongson_psubsb
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:594

Description

Subtract each signed 8-bit lane of b from the corresponding lane of a on 8 x i8 lanes with signed saturation: each difference is clamped to the range [-128, 127] instead of wrapping.

Operation

dst.i8[0] = saturate(a.i8[0] - b.i8[0], 8, signed);
dst.i8[1] = saturate(a.i8[1] - b.i8[1], 8, signed);
dst.i8[2] = saturate(a.i8[2] - b.i8[2], 8, signed);
dst.i8[3] = saturate(a.i8[3] - b.i8[3], 8, signed);
dst.i8[4] = saturate(a.i8[4] - b.i8[4], 8, signed);
dst.i8[5] = saturate(a.i8[5] - b.i8[5], 8, signed);
dst.i8[6] = saturate(a.i8[6] - b.i8[6], 8, signed);
dst.i8[7] = saturate(a.i8[7] - b.i8[7], 8, signed);

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubsb (a, b);

int16x4_t psubsh (int16x4_t a, int16x4_t b)

Synopsis

int16x4_t psubsh (int16x4_t a, int16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: psubsh
Builtin: __builtin_loongson_psubsh
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:588

Description

Subtract each signed 16-bit lane of b from the corresponding lane of a on 4 x i16 lanes with signed saturation: each difference is clamped to the range [-32768, 32767] instead of wrapping.

Operation

dst.i16[0] = saturate(a.i16[0] - b.i16[0], 16, signed);
dst.i16[1] = saturate(a.i16[1] - b.i16[1], 16, signed);
dst.i16[2] = saturate(a.i16[2] - b.i16[2], 16, signed);
dst.i16[3] = saturate(a.i16[3] - b.i16[3], 16, signed);

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubsh (a, b);

uint8x8_t psubusb (uint8x8_t a, uint8x8_t b)

Synopsis

uint8x8_t psubusb (uint8x8_t a, uint8x8_t b)
#include <loongson-mmiintrin.h>
Instruction: psubusb
Builtin: __builtin_loongson_psubusb
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:607

Description

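Subtract each unsigned 8-bit lane of b from the corresponding lane of a on 8 x u8 lanes with unsigned saturation: each difference is clamped to the range [0, 255], so results that would be negative become 0.

Operation

dst.u8[0] = saturate(a.u8[0] - b.u8[0], 8, unsigned);
dst.u8[1] = saturate(a.u8[1] - b.u8[1], 8, unsigned);
dst.u8[2] = saturate(a.u8[2] - b.u8[2], 8, unsigned);
dst.u8[3] = saturate(a.u8[3] - b.u8[3], 8, unsigned);
dst.u8[4] = saturate(a.u8[4] - b.u8[4], 8, unsigned);
dst.u8[5] = saturate(a.u8[5] - b.u8[5], 8, unsigned);
dst.u8[6] = saturate(a.u8[6] - b.u8[6], 8, unsigned);
dst.u8[7] = saturate(a.u8[7] - b.u8[7], 8, unsigned);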

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubusb (a, b);

uint16x4_t psubush (uint16x4_t a, uint16x4_t b)

Synopsis

uint16x4_t psubush (uint16x4_t a, uint16x4_t b)
#include <loongson-mmiintrin.h>
Instruction: psubush
Builtin: __builtin_loongson_psubush
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:601

Description

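Subtract each unsigned 16-bit lane of b from the corresponding lane of a on 4 x u16 lanes with unsigned saturation: each difference is clamped to the range [0, 65535], so results that would be negative become 0.

Operation

dst.u16[0] = saturate(a.u16[0] - b.u16[0], 16, unsigned);
dst.u16[1] = saturate(a.u16[1] - b.u16[1], 16, unsigned);
dst.u16[2] = saturate(a.u16[2] - b.u16[2], 16, unsigned);
dst.u16[3] = saturate(a.u16[3] - b.u16[3], 16, unsigned);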

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubush (a, b);

int32x2_t psubw_s (int32x2_t a, int32x2_t b)

Synopsis

int32x2_t psubw_s (int32x2_t a, int32x2_t b)
#include <loongson-mmiintrin.h>
Instruction: psubw.s
Builtin: __builtin_loongson_psubw_s
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:556

Description

Subtract each signed 32-bit lane of b from the corresponding lane of a on 2 x i32 lanes; each difference wraps around on overflow (no saturation).

Operation

dst.i32[0] = a.i32[0] - b.i32[0];
dst.i32[1] = a.i32[1] - b.i32[1];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubw_s (a, b);

uint32x2_t psubw_u (uint32x2_t a, uint32x2_t b)

Synopsis

uint32x2_t psubw_u (uint32x2_t a, uint32x2_t b)
#include <loongson-mmiintrin.h>
Instruction: psubw.u
Builtin: __builtin_loongson_psubw_u
CPU Flags: __mips_loongson_mmi
Kind: function
Source: include/loongson-mmiintrin.h:538

Description

Subtract each unsigned 32-bit lane of b from the corresponding lane of a on 2 x u32 lanes; each difference wraps around on underflow (no saturation).

Operation

dst.u32[0] = a.u32[0] - b.u32[0];
dst.u32[1] = a.u32[1] - b.u32[1];

Latency and Throughput

CPU	µarch	Latency	Throughput (IPC)
3A4000	GS464V	1	2

Header Mapping

return __builtin_loongson_psubw_u (a, b);