Integer Computation

Generated from include/msa2.h. This page contains 300 intrinsics.

__m128i __msa2_dotp_s_q (__m128i a, __m128i b)

Synopsis

__m128i __msa2_dotp_s_q (__m128i a, __m128i b)
#include <msa2.h>
Instruction: dotp.s.q
Builtin: __builtin_msa2_dotp_s_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:56

Description

Multiply adjacent pairs of narrower signed lanes, add each pair, and write widened dot-product lanes. This is useful for packed filters, matrix kernels, and sum-of-products code.

Operation

dst.i128[0] = a.i64[0] * b.i64[0] + a.i64[1] * b.i64[1];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i) __builtin_msa2_dotp_s_q ((v2i64) a, (v2i64) b);

__m128i __msa2_dotp_u_q (__m128i a, __m128i b)

Synopsis

__m128i __msa2_dotp_u_q (__m128i a, __m128i b)
#include <msa2.h>
Instruction: dotp.u.q
Builtin: __builtin_msa2_dotp_u_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:62

Description

Multiply adjacent pairs of narrower unsigned lanes, add each pair, and write widened dot-product lanes. This is useful for packed filters, matrix kernels, and sum-of-products code.

Operation

dst.u128[0] = a.u64[0] * b.u64[0] + a.u64[1] * b.u64[1];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i) __builtin_msa2_dotp_u_q ((v2u64) a, (v2u64) b);

__m128i __msa2_dotp_us_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_dotp_us_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: dotp.us.d
Builtin: __builtin_msa2_dotp_us_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:80

Description

Multiply adjacent pairs of narrower unsigned/signed mixed lanes, add each pair, and write widened dot-product lanes. This is useful for packed filters, matrix kernels, and sum-of-products code.

Operation

dst.i64[0] = a.u32[0] * b.i32[0] + a.u32[1] * b.i32[1];
dst.i64[1] = a.u32[2] * b.i32[2] + a.u32[3] * b.i32[3];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i) __builtin_msa2_dotp_us_d ((v4u32) a, (v4i32) b);

__m128i __msa2_dotp_us_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_dotp_us_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: dotp.us.h
Builtin: __builtin_msa2_dotp_us_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:68

Description

Multiply adjacent pairs of narrower unsigned/signed mixed lanes, add each pair, and write widened dot-product lanes. This is useful for packed filters, matrix kernels, and sum-of-products code.

Operation

dst.i16[0] = a.u8[0] * b.i8[0] + a.u8[1] * b.i8[1];
dst.i16[1] = a.u8[2] * b.i8[2] + a.u8[3] * b.i8[3];
dst.i16[2] = a.u8[4] * b.i8[4] + a.u8[5] * b.i8[5];
dst.i16[3] = a.u8[6] * b.i8[6] + a.u8[7] * b.i8[7];
dst.i16[4] = a.u8[8] * b.i8[8] + a.u8[9] * b.i8[9];
dst.i16[5] = a.u8[10] * b.i8[10] + a.u8[11] * b.i8[11];
dst.i16[6] = a.u8[12] * b.i8[12] + a.u8[13] * b.i8[13];
dst.i16[7] = a.u8[14] * b.i8[14] + a.u8[15] * b.i8[15];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i) __builtin_msa2_dotp_us_h ((v16u8) a, (v16i8) b);

__m128i __msa2_dotp_us_q (__m128i a, __m128i b)

Synopsis

__m128i __msa2_dotp_us_q (__m128i a, __m128i b)
#include <msa2.h>
Instruction: dotp.us.q
Builtin: __builtin_msa2_dotp_us_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:86

Description

Multiply adjacent pairs of narrower unsigned/signed mixed lanes, add each pair, and write widened dot-product lanes. This is useful for packed filters, matrix kernels, and sum-of-products code.

Operation

dst.i128[0] = a.u64[0] * b.i64[0] + a.u64[1] * b.i64[1];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i) __builtin_msa2_dotp_us_q ((v2u64) a, (v2i64) b);

__m128i __msa2_dotp_us_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_dotp_us_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: dotp.us.w
Builtin: __builtin_msa2_dotp_us_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:74

Description

Multiply adjacent pairs of narrower unsigned/signed mixed lanes, add each pair, and write widened dot-product lanes. This is useful for packed filters, matrix kernels, and sum-of-products code.

Operation

dst.i32[0] = a.u16[0] * b.i16[0] + a.u16[1] * b.i16[1];
dst.i32[1] = a.u16[2] * b.i16[2] + a.u16[3] * b.i16[3];
dst.i32[2] = a.u16[4] * b.i16[4] + a.u16[5] * b.i16[5];
dst.i32[3] = a.u16[6] * b.i16[6] + a.u16[7] * b.i16[7];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i) __builtin_msa2_dotp_us_w ((v8u16) a, (v8i16) b);

__m128i __msa2_dpadd_s_q (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_dpadd_s_q (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: dpadd.s.q
Builtin: __builtin_msa2_dpadd_s_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:92

Description

Compute adjacent-pair dot products and add to the accumulator operand in widened lanes. This is a packed multiply-accumulate primitive.

Operation

dst.i128[0] = a.i128[0] + b.i64[0] * c.i64[0] + b.i64[1] * c.i64[1];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i) __builtin_msa2_dpadd_s_q ((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_dpadd_u_q (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_dpadd_u_q (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: dpadd.u.q
Builtin: __builtin_msa2_dpadd_u_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:98

Description

Compute adjacent-pair dot products and add to the accumulator operand in widened lanes. This is a packed multiply-accumulate primitive.

Operation

dst.u128[0] = a.u128[0] + b.u64[0] * c.u64[0] + b.u64[1] * c.u64[1];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i) __builtin_msa2_dpadd_u_q ((v2u64) a, (v2u64) b, (v2u64) c);

__m128i __msa2_dpadd_us_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_dpadd_us_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: dpadd.us.d
Builtin: __builtin_msa2_dpadd_us_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:116

Description

Compute adjacent-pair dot products and add to the accumulator operand in widened lanes. This is a packed multiply-accumulate primitive.

Operation

dst.i64[0] = a.i64[0] + b.u32[0] * c.i32[0] + b.u32[1] * c.i32[1];
dst.i64[1] = a.i64[1] + b.u32[2] * c.i32[2] + b.u32[3] * c.i32[3];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i) __builtin_msa2_dpadd_us_d ((v2u64) a, (v4u32) b, (v4i32) c);

__m128i __msa2_dpadd_us_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_dpadd_us_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: dpadd.us.h
Builtin: __builtin_msa2_dpadd_us_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:104

Description

Compute adjacent-pair dot products and add to the accumulator operand in widened lanes. This is a packed multiply-accumulate primitive.

Operation

dst.i16[0] = a.i16[0] + b.u8[0] * c.i8[0] + b.u8[1] * c.i8[1];
dst.i16[1] = a.i16[1] + b.u8[2] * c.i8[2] + b.u8[3] * c.i8[3];
dst.i16[2] = a.i16[2] + b.u8[4] * c.i8[4] + b.u8[5] * c.i8[5];
dst.i16[3] = a.i16[3] + b.u8[6] * c.i8[6] + b.u8[7] * c.i8[7];
dst.i16[4] = a.i16[4] + b.u8[8] * c.i8[8] + b.u8[9] * c.i8[9];
dst.i16[5] = a.i16[5] + b.u8[10] * c.i8[10] + b.u8[11] * c.i8[11];
dst.i16[6] = a.i16[6] + b.u8[12] * c.i8[12] + b.u8[13] * c.i8[13];
dst.i16[7] = a.i16[7] + b.u8[14] * c.i8[14] + b.u8[15] * c.i8[15];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i) __builtin_msa2_dpadd_us_h ((v8u16) a, (v16u8) b, (v16i8) c);

__m128i __msa2_dpadd_us_q (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_dpadd_us_q (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: dpadd.us.q
Builtin: __builtin_msa2_dpadd_us_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:122

Description

Compute adjacent-pair dot products and add to the accumulator operand in widened lanes. This is a packed multiply-accumulate primitive.

Operation

dst.i128[0] = a.i128[0] + b.u64[0] * c.i64[0] + b.u64[1] * c.i64[1];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i) __builtin_msa2_dpadd_us_q ((v2u64) a, (v2u64) b, (v2i64) c);

__m128i __msa2_dpadd_us_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_dpadd_us_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: dpadd.us.w
Builtin: __builtin_msa2_dpadd_us_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:110

Description

Compute adjacent-pair dot products and add to the accumulator operand in widened lanes. This is a packed multiply-accumulate primitive.

Operation

dst.i32[0] = a.i32[0] + b.u16[0] * c.i16[0] + b.u16[1] * c.i16[1];
dst.i32[1] = a.i32[1] + b.u16[2] * c.i16[2] + b.u16[3] * c.i16[3];
dst.i32[2] = a.i32[2] + b.u16[4] * c.i16[4] + b.u16[5] * c.i16[5];
dst.i32[3] = a.i32[3] + b.u16[6] * c.i16[6] + b.u16[7] * c.i16[7];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i) __builtin_msa2_dpadd_us_w ((v4u32) a, (v8u16) b, (v8i16) c);

__m128i __msa2_dpsub_s_q (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_dpsub_s_q (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: dpsub.s.q
Builtin: __builtin_msa2_dpsub_s_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:128

Description

Compute adjacent-pair dot products and subtract them from the accumulator operand in widened lanes. This is a packed multiply-subtract primitive.

Operation

dst.i128[0] = a.i128[0] - b.i64[0] * c.i64[0] - b.i64[1] * c.i64[1];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i) __builtin_msa2_dpsub_s_q ((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_dpsub_u_q (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_dpsub_u_q (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: dpsub.u.q
Builtin: __builtin_msa2_dpsub_u_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:134

Description

Compute adjacent-pair dot products and subtract them from the accumulator operand in widened lanes. This is a packed multiply-subtract primitive.

Operation

dst.u128[0] = a.u128[0] - b.u64[0] * c.u64[0] - b.u64[1] * c.u64[1];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i) __builtin_msa2_dpsub_u_q ((v2u64) a, (v2u64) b, (v2u64) c);

__m128i __msa2_hadd_s_q (__m128i a, __m128i b)

Synopsis

__m128i __msa2_hadd_s_q (__m128i a, __m128i b)
#include <msa2.h>
Instruction: hadd.s.q
Builtin: __builtin_msa2_hadd_s_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:140

Description

Widen corresponding narrower source lanes and add them pairwise into 1 x i128 lanes.

Operation

dst.i128[0] = widen(a.i64[0]) + widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i) __builtin_msa2_hadd_s_q ((v2i64) a, (v2i64) b);

__m128i __msa2_hadd_u_q (__m128i a, __m128i b)

Synopsis

__m128i __msa2_hadd_u_q (__m128i a, __m128i b)
#include <msa2.h>
Instruction: hadd.u.q
Builtin: __builtin_msa2_hadd_u_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:146

Description

Widen corresponding narrower source lanes and add them pairwise into 1 x u128 lanes.

Operation

dst.u128[0] = widen(a.u64[0]) + widen(b.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i) __builtin_msa2_hadd_u_q ((v2u64) a, (v2u64) b);

__m128i __msa2_hsub_s_q (__m128i a, __m128i b)

Synopsis

__m128i __msa2_hsub_s_q (__m128i a, __m128i b)
#include <msa2.h>
Instruction: hsub.s.q
Builtin: __builtin_msa2_hsub_s_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:152

Description

Widen corresponding narrower source lanes and subtract them pairwise into 1 x i128 lanes.

Operation

dst.i128[0] = widen(a.i64[0]) - widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i) __builtin_msa2_hsub_s_q ((v2i64) a, (v2i64) b);

__m128i __msa2_hsub_u_q (__m128i a, __m128i b)

Synopsis

__m128i __msa2_hsub_u_q (__m128i a, __m128i b)
#include <msa2.h>
Instruction: hsub.u.q
Builtin: __builtin_msa2_hsub_u_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:158

Description

Widen corresponding narrower source lanes and subtract them pairwise into 1 x u128 lanes.

Operation

dst.u128[0] = widen(a.u64[0]) - widen(b.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i) __builtin_msa2_hsub_u_q ((v2u64) a, (v2u64) b);

__m128i __msa2_muhv_s_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_muhv_s_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: muhv.s.d
Builtin: __builtin_msa2_muhv_s_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:479

Description

Multiply signed integer lanes and keep the upper half of each product.

Operation

dst.i64[0] = upper_64_bits(a.i64[0] * b.i64[0]);
dst.i64[1] = upper_64_bits(a.i64[1] * b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_muhv_s_d((v2i64) a, (v2i64) b);

__m128i __msa2_muhv_u_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_muhv_u_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: muhv.u.d
Builtin: __builtin_msa2_muhv_u_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:500

Description

Multiply unsigned integer lanes and keep the upper half of each product.

Operation

dst.u64[0] = upper_64_bits(a.u64[0] * b.u64[0]);
dst.u64[1] = upper_64_bits(a.u64[1] * b.u64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_muhv_u_d((v2u64) a, (v2u64) b);

__m128i __msa2_muhv_us_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_muhv_us_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: muhv.us.d
Builtin: __builtin_msa2_muhv_us_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:521

Description

Multiply unsigned/signed mixed integer lanes and keep the upper half of each product.

Operation

dst.u64[0] = upper_64_bits(a.u64[0] * b.i64[0]);
dst.u64[1] = upper_64_bits(a.u64[1] * b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_muhv_us_d((v2u64) a, (v2i64) b);

__m128i __msa2_sad_adj2_s_acc_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_sad_adj2_s_acc_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: sad.adj2.s.acc.w2x.b
Builtin: __builtin_msa2_sad_adj2_s_acc_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:667

Description

Compute sums of absolute differences for adjacent signed lanes of b and c, and add each widened sum to the corresponding accumulator lane of a. This is commonly used in image/video matching and distance calculations.

Operation

dst.i16[0] = a.i16[0] + abs(widen(b.i8[0]) - widen(c.i8[0])) + abs(widen(b.i8[1]) - widen(c.i8[1]));
dst.i16[1] = a.i16[1] + abs(widen(b.i8[2]) - widen(c.i8[2])) + abs(widen(b.i8[3]) - widen(c.i8[3]));
dst.i16[2] = a.i16[2] + abs(widen(b.i8[4]) - widen(c.i8[4])) + abs(widen(b.i8[5]) - widen(c.i8[5]));
dst.i16[3] = a.i16[3] + abs(widen(b.i8[6]) - widen(c.i8[6])) + abs(widen(b.i8[7]) - widen(c.i8[7]));
dst.i16[4] = a.i16[4] + abs(widen(b.i8[8]) - widen(c.i8[8])) + abs(widen(b.i8[9]) - widen(c.i8[9]));
dst.i16[5] = a.i16[5] + abs(widen(b.i8[10]) - widen(c.i8[10])) + abs(widen(b.i8[11]) - widen(c.i8[11]));
dst.i16[6] = a.i16[6] + abs(widen(b.i8[12]) - widen(c.i8[12])) + abs(widen(b.i8[13]) - widen(c.i8[13]));
dst.i16[7] = a.i16[7] + abs(widen(b.i8[14]) - widen(c.i8[14])) + abs(widen(b.i8[15]) - widen(c.i8[15]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_s_acc_w2x_b((v8i16) a, (v16i8) b, (v16i8) c);

__m128i __msa2_sad_adj2_s_acc_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_sad_adj2_s_acc_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: sad.adj2.s.acc.w2x.h
Builtin: __builtin_msa2_sad_adj2_s_acc_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:687

Description

Compute sums of absolute differences for adjacent signed lanes of b and c, and add each widened sum to the corresponding accumulator lane of a. This is commonly used in image/video matching and distance calculations.

Operation

dst.i32[0] = a.i32[0] + abs(widen(b.i16[0]) - widen(c.i16[0])) + abs(widen(b.i16[1]) - widen(c.i16[1]));
dst.i32[1] = a.i32[1] + abs(widen(b.i16[2]) - widen(c.i16[2])) + abs(widen(b.i16[3]) - widen(c.i16[3]));
dst.i32[2] = a.i32[2] + abs(widen(b.i16[4]) - widen(c.i16[4])) + abs(widen(b.i16[5]) - widen(c.i16[5]));
dst.i32[3] = a.i32[3] + abs(widen(b.i16[6]) - widen(c.i16[6])) + abs(widen(b.i16[7]) - widen(c.i16[7]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_s_acc_w2x_h((v4i32) a, (v8i16) b, (v8i16) c);

__m128i __msa2_sad_adj2_s_acc_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_sad_adj2_s_acc_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: sad.adj2.s.acc.w2x.w
Builtin: __builtin_msa2_sad_adj2_s_acc_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:707

Description

Compute sums of absolute differences for adjacent signed lanes of b and c, and add each widened sum to the corresponding accumulator lane of a. This is commonly used in image/video matching and distance calculations.

Operation

dst.i64[0] = a.i64[0] + abs(widen(b.i32[0]) - widen(c.i32[0])) + abs(widen(b.i32[1]) - widen(c.i32[1]));
dst.i64[1] = a.i64[1] + abs(widen(b.i32[2]) - widen(c.i32[2])) + abs(widen(b.i32[3]) - widen(c.i32[3]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_s_acc_w2x_w((v2i64) a, (v4i32) b, (v4i32) c);

__m128i __msa2_sad_adj2_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_sad_adj2_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: sad.adj2.s.w2x.b
Builtin: __builtin_msa2_sad_adj2_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:542

Description

Compute sums of absolute differences for adjacent signed lanes and write widened results. This is commonly used in image/video matching and distance calculations.

Operation

dst.i16[0] = abs(widen(a.i8[0]) - widen(b.i8[0])) + abs(widen(a.i8[1]) - widen(b.i8[1]));
dst.i16[1] = abs(widen(a.i8[2]) - widen(b.i8[2])) + abs(widen(a.i8[3]) - widen(b.i8[3]));
dst.i16[2] = abs(widen(a.i8[4]) - widen(b.i8[4])) + abs(widen(a.i8[5]) - widen(b.i8[5]));
dst.i16[3] = abs(widen(a.i8[6]) - widen(b.i8[6])) + abs(widen(a.i8[7]) - widen(b.i8[7]));
dst.i16[4] = abs(widen(a.i8[8]) - widen(b.i8[8])) + abs(widen(a.i8[9]) - widen(b.i8[9]));
dst.i16[5] = abs(widen(a.i8[10]) - widen(b.i8[10])) + abs(widen(a.i8[11]) - widen(b.i8[11]));
dst.i16[6] = abs(widen(a.i8[12]) - widen(b.i8[12])) + abs(widen(a.i8[13]) - widen(b.i8[13]));
dst.i16[7] = abs(widen(a.i8[14]) - widen(b.i8[14])) + abs(widen(a.i8[15]) - widen(b.i8[15]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_sad_adj2_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_sad_adj2_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: sad.adj2.s.w2x.h
Builtin: __builtin_msa2_sad_adj2_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:563

Description

Compute sums of absolute differences for adjacent signed lanes and write widened results. This is commonly used in image/video matching and distance calculations.

Operation

dst.i32[0] = abs(widen(a.i16[0]) - widen(b.i16[0])) + abs(widen(a.i16[1]) - widen(b.i16[1]));
dst.i32[1] = abs(widen(a.i16[2]) - widen(b.i16[2])) + abs(widen(a.i16[3]) - widen(b.i16[3]));
dst.i32[2] = abs(widen(a.i16[4]) - widen(b.i16[4])) + abs(widen(a.i16[5]) - widen(b.i16[5]));
dst.i32[3] = abs(widen(a.i16[6]) - widen(b.i16[6])) + abs(widen(a.i16[7]) - widen(b.i16[7]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_sad_adj2_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_sad_adj2_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: sad.adj2.s.w2x.w
Builtin: __builtin_msa2_sad_adj2_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:584

Description

Compute sums of absolute differences for adjacent signed lanes and write widened results. This is commonly used in image/video matching and distance calculations.

Operation

dst.i64[0] = abs(widen(a.i32[0]) - widen(b.i32[0])) + abs(widen(a.i32[1]) - widen(b.i32[1]));
dst.i64[1] = abs(widen(a.i32[2]) - widen(b.i32[2])) + abs(widen(a.i32[3]) - widen(b.i32[3]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_sad_adj2_u_acc_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_sad_adj2_u_acc_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: sad.adj2.u.acc.w2x.b
Builtin: __builtin_msa2_sad_adj2_u_acc_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:727

Description

Compute sums of absolute differences for adjacent unsigned lanes of b and c, and add each widened sum to the corresponding accumulator lane of a. This is commonly used in image/video matching and distance calculations.

Operation

dst.u16[0] = a.u16[0] + abs(widen(b.u8[0]) - widen(c.u8[0])) + abs(widen(b.u8[1]) - widen(c.u8[1]));
dst.u16[1] = a.u16[1] + abs(widen(b.u8[2]) - widen(c.u8[2])) + abs(widen(b.u8[3]) - widen(c.u8[3]));
dst.u16[2] = a.u16[2] + abs(widen(b.u8[4]) - widen(c.u8[4])) + abs(widen(b.u8[5]) - widen(c.u8[5]));
dst.u16[3] = a.u16[3] + abs(widen(b.u8[6]) - widen(c.u8[6])) + abs(widen(b.u8[7]) - widen(c.u8[7]));
dst.u16[4] = a.u16[4] + abs(widen(b.u8[8]) - widen(c.u8[8])) + abs(widen(b.u8[9]) - widen(c.u8[9]));
dst.u16[5] = a.u16[5] + abs(widen(b.u8[10]) - widen(c.u8[10])) + abs(widen(b.u8[11]) - widen(c.u8[11]));
dst.u16[6] = a.u16[6] + abs(widen(b.u8[12]) - widen(c.u8[12])) + abs(widen(b.u8[13]) - widen(c.u8[13]));
dst.u16[7] = a.u16[7] + abs(widen(b.u8[14]) - widen(c.u8[14])) + abs(widen(b.u8[15]) - widen(c.u8[15]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_u_acc_w2x_b((v8u16) a, (v16u8) b, (v16u8) c);

__m128i __msa2_sad_adj2_u_acc_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_sad_adj2_u_acc_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: sad.adj2.u.acc.w2x.h
Builtin: __builtin_msa2_sad_adj2_u_acc_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:747

Description

Compute sums of absolute differences for adjacent unsigned lanes of b and c, and add each widened sum to the corresponding accumulator lane of a. This is commonly used in image/video matching and distance calculations.

Operation

dst.u32[0] = a.u32[0] + abs(widen(b.u16[0]) - widen(c.u16[0])) + abs(widen(b.u16[1]) - widen(c.u16[1]));
dst.u32[1] = a.u32[1] + abs(widen(b.u16[2]) - widen(c.u16[2])) + abs(widen(b.u16[3]) - widen(c.u16[3]));
dst.u32[2] = a.u32[2] + abs(widen(b.u16[4]) - widen(c.u16[4])) + abs(widen(b.u16[5]) - widen(c.u16[5]));
dst.u32[3] = a.u32[3] + abs(widen(b.u16[6]) - widen(c.u16[6])) + abs(widen(b.u16[7]) - widen(c.u16[7]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_u_acc_w2x_h((v4u32) a, (v8u16) b, (v8u16) c);

__m128i __msa2_sad_adj2_u_acc_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_sad_adj2_u_acc_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: sad.adj2.u.acc.w2x.w
Builtin: __builtin_msa2_sad_adj2_u_acc_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:767

Description

Compute sums of absolute differences for adjacent unsigned lanes of b and c, and add each widened sum to the corresponding accumulator lane of a. This is commonly used in image/video matching and distance calculations.

Operation

dst.u64[0] = a.u64[0] + abs(widen(b.u32[0]) - widen(c.u32[0])) + abs(widen(b.u32[1]) - widen(c.u32[1]));
dst.u64[1] = a.u64[1] + abs(widen(b.u32[2]) - widen(c.u32[2])) + abs(widen(b.u32[3]) - widen(c.u32[3]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_u_acc_w2x_w((v2u64) a, (v4u32) b, (v4u32) c);

__m128i __msa2_sad_adj2_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_sad_adj2_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: sad.adj2.u.w2x.b
Builtin: __builtin_msa2_sad_adj2_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:605

Description

Compute sums of absolute differences for adjacent unsigned lanes and write widened results. This is commonly used in image/video matching and distance calculations.

Operation

dst.u16[0] = abs(widen(a.u8[0]) - widen(b.u8[0])) + abs(widen(a.u8[1]) - widen(b.u8[1]));
dst.u16[1] = abs(widen(a.u8[2]) - widen(b.u8[2])) + abs(widen(a.u8[3]) - widen(b.u8[3]));
dst.u16[2] = abs(widen(a.u8[4]) - widen(b.u8[4])) + abs(widen(a.u8[5]) - widen(b.u8[5]));
dst.u16[3] = abs(widen(a.u8[6]) - widen(b.u8[6])) + abs(widen(a.u8[7]) - widen(b.u8[7]));
dst.u16[4] = abs(widen(a.u8[8]) - widen(b.u8[8])) + abs(widen(a.u8[9]) - widen(b.u8[9]));
dst.u16[5] = abs(widen(a.u8[10]) - widen(b.u8[10])) + abs(widen(a.u8[11]) - widen(b.u8[11]));
dst.u16[6] = abs(widen(a.u8[12]) - widen(b.u8[12])) + abs(widen(a.u8[13]) - widen(b.u8[13]));
dst.u16[7] = abs(widen(a.u8[14]) - widen(b.u8[14])) + abs(widen(a.u8[15]) - widen(b.u8[15]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_sad_adj2_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_sad_adj2_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: sad.adj2.u.w2x.h
Builtin: __builtin_msa2_sad_adj2_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:626

Description

Compute sums of absolute differences for adjacent unsigned lanes and write widened results. This is commonly used in image/video matching and distance calculations.

Operation

dst.u32[0] = abs(widen(a.u16[0]) - widen(b.u16[0])) + abs(widen(a.u16[1]) - widen(b.u16[1]));
dst.u32[1] = abs(widen(a.u16[2]) - widen(b.u16[2])) + abs(widen(a.u16[3]) - widen(b.u16[3]));
dst.u32[2] = abs(widen(a.u16[4]) - widen(b.u16[4])) + abs(widen(a.u16[5]) - widen(b.u16[5]));
dst.u32[3] = abs(widen(a.u16[6]) - widen(b.u16[6])) + abs(widen(a.u16[7]) - widen(b.u16[7]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_sad_adj2_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_sad_adj2_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: sad.adj2.u.w2x.w
Builtin: __builtin_msa2_sad_adj2_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:647

Description

Compute sums of absolute differences for adjacent unsigned lanes and write widened results. This is commonly used in image/video matching and distance calculations.

Operation

dst.u64[0] = abs(widen(a.u32[0]) - widen(b.u32[0])) + abs(widen(a.u32[1]) - widen(b.u32[1]));
dst.u64[1] = abs(widen(a.u32[2]) - widen(b.u32[2])) + abs(widen(a.u32[3]) - widen(b.u32[3]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_sad_adj2_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_subssu_u_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_subssu_u_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: subssu.u.b
Builtin: __builtin_msa2_subssu_u_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:362

Description

Subtract the unsigned lanes of b from the signed lanes of a, lane-wise, and saturate each difference to the unsigned range of 16 x u8 lanes, clamping overflow instead of wrapping.

Operation

dst.u8[0] = unsigned_saturate(a.i8[0] - b.u8[0], 8);
dst.u8[1] = unsigned_saturate(a.i8[1] - b.u8[1], 8);
dst.u8[2] = unsigned_saturate(a.i8[2] - b.u8[2], 8);
dst.u8[3] = unsigned_saturate(a.i8[3] - b.u8[3], 8);
dst.u8[4] = unsigned_saturate(a.i8[4] - b.u8[4], 8);
dst.u8[5] = unsigned_saturate(a.i8[5] - b.u8[5], 8);
dst.u8[6] = unsigned_saturate(a.i8[6] - b.u8[6], 8);
dst.u8[7] = unsigned_saturate(a.i8[7] - b.u8[7], 8);
dst.u8[8] = unsigned_saturate(a.i8[8] - b.u8[8], 8);
dst.u8[9] = unsigned_saturate(a.i8[9] - b.u8[9], 8);
dst.u8[10] = unsigned_saturate(a.i8[10] - b.u8[10], 8);
dst.u8[11] = unsigned_saturate(a.i8[11] - b.u8[11], 8);
dst.u8[12] = unsigned_saturate(a.i8[12] - b.u8[12], 8);
dst.u8[13] = unsigned_saturate(a.i8[13] - b.u8[13], 8);
dst.u8[14] = unsigned_saturate(a.i8[14] - b.u8[14], 8);
dst.u8[15] = unsigned_saturate(a.i8[15] - b.u8[15], 8);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i) __builtin_msa2_subssu_u_b ((v16i8) a, (v16u8) b);

__m128i __msa2_subssu_u_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_subssu_u_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: subssu.u.d
Builtin: __builtin_msa2_subssu_u_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:380

Description

Subtract the unsigned lanes of b from the signed lanes of a, lane-wise, and saturate each difference to the unsigned range of 2 x u64 lanes, clamping overflow instead of wrapping.

Operation

dst.u64[0] = unsigned_saturate(a.i64[0] - b.u64[0], 64);
dst.u64[1] = unsigned_saturate(a.i64[1] - b.u64[1], 64);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i) __builtin_msa2_subssu_u_d ((v2i64) a, (v2u64) b);

__m128i __msa2_subssu_u_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_subssu_u_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: subssu.u.h
Builtin: __builtin_msa2_subssu_u_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:368

Description

Subtract the unsigned lanes of b from the signed lanes of a, lane-wise, and saturate each difference to the unsigned range of 8 x u16 lanes, clamping overflow instead of wrapping.

Operation

dst.u16[0] = unsigned_saturate(a.i16[0] - b.u16[0], 16);
dst.u16[1] = unsigned_saturate(a.i16[1] - b.u16[1], 16);
dst.u16[2] = unsigned_saturate(a.i16[2] - b.u16[2], 16);
dst.u16[3] = unsigned_saturate(a.i16[3] - b.u16[3], 16);
dst.u16[4] = unsigned_saturate(a.i16[4] - b.u16[4], 16);
dst.u16[5] = unsigned_saturate(a.i16[5] - b.u16[5], 16);
dst.u16[6] = unsigned_saturate(a.i16[6] - b.u16[6], 16);
dst.u16[7] = unsigned_saturate(a.i16[7] - b.u16[7], 16);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i) __builtin_msa2_subssu_u_h ((v8i16) a, (v8u16) b);

__m128i __msa2_subssu_u_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_subssu_u_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: subssu.u.w
Builtin: __builtin_msa2_subssu_u_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:374

Description

Subtract the unsigned lanes of b from the signed lanes of a, lane-wise, and saturate each difference to the unsigned range of 4 x u32 lanes, clamping overflow instead of wrapping.

Operation

dst.u32[0] = unsigned_saturate(a.i32[0] - b.u32[0], 32);
dst.u32[1] = unsigned_saturate(a.i32[1] - b.u32[1], 32);
dst.u32[2] = unsigned_saturate(a.i32[2] - b.u32[2], 32);
dst.u32[3] = unsigned_saturate(a.i32[3] - b.u32[3], 32);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i) __builtin_msa2_subssu_u_w ((v4i32) a, (v4u32) b);

__m128i __msa2_vadd_el0_q (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_el0_q (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.el0.q
Builtin: __builtin_msa2_vadd_el0_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:386

Description

Add the integer lanes of a and b with modular (wrapping) arithmetic on 1 x u128 lane; immediate forms add the scalar immediate to each lane, but this intrinsic takes two vector operands.

Operation

dst.u128[0] = a.i64[0] + b.i64[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i) __builtin_msa2_vadd_el0_q ((v2i64) a, (v2i64) b);

__m128i __msa2_vadd_el0_s_wx_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_el0_s_wx_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.el0.s.wx.d
Builtin: __builtin_msa2_vadd_el0_s_wx_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:830

Description

Widen lane 0 of each source operand and add the widened values in the wider lane type. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[0]) + widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_el0_s_wx_d((v2i64) a, (v2i64) b);

__m128i __msa2_vadd_el0_s_wx_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_el0_s_wx_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.el0.s.wx.h
Builtin: __builtin_msa2_vadd_el0_s_wx_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:788

Description

Widen lane 0 of each source operand and add the widened values in the wider lane type. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[0]) + widen(b.i16[0]);
dst.i32[1] = widen(a.i16[0]) + widen(b.i16[0]);
dst.i32[2] = widen(a.i16[0]) + widen(b.i16[0]);
dst.i32[3] = widen(a.i16[0]) + widen(b.i16[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_el0_s_wx_h((v8i16) a, (v8i16) b);

__m128i __msa2_vadd_el0_s_wx_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_el0_s_wx_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.el0.s.wx.w
Builtin: __builtin_msa2_vadd_el0_s_wx_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:809

Description

Widen lane 0 of each source operand and add the widened values in the wider lane type. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[0]) + widen(b.i32[0]);
dst.i64[1] = widen(a.i32[0]) + widen(b.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_el0_s_wx_w((v4i32) a, (v4i32) b);

__m128i __msa2_vadd_el0_u_wx_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_el0_u_wx_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.el0.u.wx.d
Builtin: __builtin_msa2_vadd_el0_u_wx_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:893

Description

Widen lane 0 of each source operand and add the widened values in the wider lane type. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) + widen(b.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_el0_u_wx_d((v2u64) a, (v2u64) b);

__m128i __msa2_vadd_el0_u_wx_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_el0_u_wx_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.el0.u.wx.h
Builtin: __builtin_msa2_vadd_el0_u_wx_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:851

Description

Widen lane 0 of each source operand and add the widened values in the wider lane type. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) + widen(b.u16[0]);
dst.u32[1] = widen(a.u16[0]) + widen(b.u16[0]);
dst.u32[2] = widen(a.u16[0]) + widen(b.u16[0]);
dst.u32[3] = widen(a.u16[0]) + widen(b.u16[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_el0_u_wx_h((v8u16) a, (v8u16) b);

__m128i __msa2_vadd_el0_u_wx_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_el0_u_wx_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.el0.u.wx.w
Builtin: __builtin_msa2_vadd_el0_u_wx_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:872

Description

Widen lane 0 of each source operand and add the widened values in the wider lane type. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) + widen(b.u32[0]);
dst.u64[1] = widen(a.u32[0]) + widen(b.u32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_el0_u_wx_w((v4u32) a, (v4u32) b);

__m128i __msa2_vadd_el0_us_wx_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_el0_us_wx_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.el0.us.wx.d
Builtin: __builtin_msa2_vadd_el0_us_wx_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:956

Description

Widen lane 0 of each source operand (unsigned in a, signed in b) and add the widened values in the wider lane type. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) + widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_el0_us_wx_d((v2u64) a, (v2i64) b);

__m128i __msa2_vadd_el0_us_wx_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_el0_us_wx_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.el0.us.wx.h
Builtin: __builtin_msa2_vadd_el0_us_wx_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:914

Description

Widen lane 0 of each source operand (unsigned in a, signed in b) and add the widened values in the wider lane type. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) + widen(b.i16[0]);
dst.u32[1] = widen(a.u16[0]) + widen(b.i16[0]);
dst.u32[2] = widen(a.u16[0]) + widen(b.i16[0]);
dst.u32[3] = widen(a.u16[0]) + widen(b.i16[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_el0_us_wx_h((v8u16) a, (v8i16) b);

__m128i __msa2_vadd_el0_us_wx_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_el0_us_wx_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.el0.us.wx.w
Builtin: __builtin_msa2_vadd_el0_us_wx_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:935

Description

Widen lane 0 of each source operand (unsigned in a, signed in b) and add the widened values in the wider lane type. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) + widen(b.i32[0]);
dst.u64[1] = widen(a.u32[0]) + widen(b.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_el0_us_wx_w((v4u32) a, (v4i32) b);

__m128i __msa2_vadd_even_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.s.w2x.b
Builtin: __builtin_msa2_vadd_even_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:977

Description

Widen even-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[0]) + widen(b.i8[0]);
dst.i16[1] = widen(a.i8[2]) + widen(b.i8[2]);
dst.i16[2] = widen(a.i8[4]) + widen(b.i8[4]);
dst.i16[3] = widen(a.i8[6]) + widen(b.i8[6]);
dst.i16[4] = widen(a.i8[8]) + widen(b.i8[8]);
dst.i16[5] = widen(a.i8[10]) + widen(b.i8[10]);
dst.i16[6] = widen(a.i8[12]) + widen(b.i8[12]);
dst.i16[7] = widen(a.i8[14]) + widen(b.i8[14]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vadd_even_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.s.w2x.d
Builtin: __builtin_msa2_vadd_even_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1040

Description

Widen even-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[0]) + widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vadd_even_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.s.w2x.h
Builtin: __builtin_msa2_vadd_even_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:998

Description

Widen even-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[0]) + widen(b.i16[0]);
dst.i32[1] = widen(a.i16[2]) + widen(b.i16[2]);
dst.i32[2] = widen(a.i16[4]) + widen(b.i16[4]);
dst.i32[3] = widen(a.i16[6]) + widen(b.i16[6]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vadd_even_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.s.w2x.w
Builtin: __builtin_msa2_vadd_even_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1019

Description

Widen even-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[0]) + widen(b.i32[0]);
dst.i64[1] = widen(a.i32[2]) + widen(b.i32[2]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vadd_even_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.u.w2x.b
Builtin: __builtin_msa2_vadd_even_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1061

Description

Widen even-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[0]) + widen(b.u8[0]);
dst.u16[1] = widen(a.u8[2]) + widen(b.u8[2]);
dst.u16[2] = widen(a.u8[4]) + widen(b.u8[4]);
dst.u16[3] = widen(a.u8[6]) + widen(b.u8[6]);
dst.u16[4] = widen(a.u8[8]) + widen(b.u8[8]);
dst.u16[5] = widen(a.u8[10]) + widen(b.u8[10]);
dst.u16[6] = widen(a.u8[12]) + widen(b.u8[12]);
dst.u16[7] = widen(a.u8[14]) + widen(b.u8[14]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vadd_even_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.u.w2x.d
Builtin: __builtin_msa2_vadd_even_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1124

Description

Widen even-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) + widen(b.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vadd_even_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.u.w2x.h
Builtin: __builtin_msa2_vadd_even_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1082

Description

Widen even-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) + widen(b.u16[0]);
dst.u32[1] = widen(a.u16[2]) + widen(b.u16[2]);
dst.u32[2] = widen(a.u16[4]) + widen(b.u16[4]);
dst.u32[3] = widen(a.u16[6]) + widen(b.u16[6]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vadd_even_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.u.w2x.w
Builtin: __builtin_msa2_vadd_even_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1103

Description

Widen even-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) + widen(b.u32[0]);
dst.u64[1] = widen(a.u32[2]) + widen(b.u32[2]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vadd_even_us_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_us_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.us.w2x.b
Builtin: __builtin_msa2_vadd_even_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1145

Description

Widen even-numbered source lanes (unsigned in a, signed in b) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[0]) + widen(b.i8[0]);
dst.u16[1] = widen(a.u8[2]) + widen(b.i8[2]);
dst.u16[2] = widen(a.u8[4]) + widen(b.i8[4]);
dst.u16[3] = widen(a.u8[6]) + widen(b.i8[6]);
dst.u16[4] = widen(a.u8[8]) + widen(b.i8[8]);
dst.u16[5] = widen(a.u8[10]) + widen(b.i8[10]);
dst.u16[6] = widen(a.u8[12]) + widen(b.i8[12]);
dst.u16[7] = widen(a.u8[14]) + widen(b.i8[14]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_us_w2x_b((v16u8) a, (v16i8) b);

__m128i __msa2_vadd_even_us_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_us_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.us.w2x.d
Builtin: __builtin_msa2_vadd_even_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1208

Description

Widen even-numbered source lanes (unsigned in a, signed in b) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) + widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_us_w2x_d((v2u64) a, (v2i64) b);

__m128i __msa2_vadd_even_us_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_us_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.us.w2x.h
Builtin: __builtin_msa2_vadd_even_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1166

Description

Widen even-numbered source lanes (unsigned in a, signed in b) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) + widen(b.i16[0]);
dst.u32[1] = widen(a.u16[2]) + widen(b.i16[2]);
dst.u32[2] = widen(a.u16[4]) + widen(b.i16[4]);
dst.u32[3] = widen(a.u16[6]) + widen(b.i16[6]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_us_w2x_h((v8u16) a, (v8i16) b);

__m128i __msa2_vadd_even_us_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_even_us_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.even.us.w2x.w
Builtin: __builtin_msa2_vadd_even_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1187

Description

Widen even-numbered source lanes (unsigned in a, signed in b) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) + widen(b.i32[0]);
dst.u64[1] = widen(a.u32[2]) + widen(b.i32[2]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_even_us_w2x_w((v4u32) a, (v4i32) b);

__m128i __msa2_vadd_hi_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.s.w2x.b
Builtin: __builtin_msa2_vadd_hi_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1733

Description

Widen upper-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[8]) + widen(b.i8[8]);
dst.i16[1] = widen(a.i8[9]) + widen(b.i8[9]);
dst.i16[2] = widen(a.i8[10]) + widen(b.i8[10]);
dst.i16[3] = widen(a.i8[11]) + widen(b.i8[11]);
dst.i16[4] = widen(a.i8[12]) + widen(b.i8[12]);
dst.i16[5] = widen(a.i8[13]) + widen(b.i8[13]);
dst.i16[6] = widen(a.i8[14]) + widen(b.i8[14]);
dst.i16[7] = widen(a.i8[15]) + widen(b.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vadd_hi_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.s.w2x.d
Builtin: __builtin_msa2_vadd_hi_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1796

Description

Widen upper-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[1]) + widen(b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vadd_hi_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.s.w2x.h
Builtin: __builtin_msa2_vadd_hi_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1754

Description

Widen upper-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[4]) + widen(b.i16[4]);
dst.i32[1] = widen(a.i16[5]) + widen(b.i16[5]);
dst.i32[2] = widen(a.i16[6]) + widen(b.i16[6]);
dst.i32[3] = widen(a.i16[7]) + widen(b.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vadd_hi_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.s.w2x.w
Builtin: __builtin_msa2_vadd_hi_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1775

Description

Widen upper-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[2]) + widen(b.i32[2]);
dst.i64[1] = widen(a.i32[3]) + widen(b.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vadd_hi_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.u.w2x.b
Builtin: __builtin_msa2_vadd_hi_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1817

Description

Widen upper-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[8]) + widen(b.u8[8]);
dst.u16[1] = widen(a.u8[9]) + widen(b.u8[9]);
dst.u16[2] = widen(a.u8[10]) + widen(b.u8[10]);
dst.u16[3] = widen(a.u8[11]) + widen(b.u8[11]);
dst.u16[4] = widen(a.u8[12]) + widen(b.u8[12]);
dst.u16[5] = widen(a.u8[13]) + widen(b.u8[13]);
dst.u16[6] = widen(a.u8[14]) + widen(b.u8[14]);
dst.u16[7] = widen(a.u8[15]) + widen(b.u8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vadd_hi_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.u.w2x.d
Builtin: __builtin_msa2_vadd_hi_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1880

Description

Widen upper-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[1]) + widen(b.u64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vadd_hi_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.u.w2x.h
Builtin: __builtin_msa2_vadd_hi_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1838

Description

Widen upper-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[4]) + widen(b.u16[4]);
dst.u32[1] = widen(a.u16[5]) + widen(b.u16[5]);
dst.u32[2] = widen(a.u16[6]) + widen(b.u16[6]);
dst.u32[3] = widen(a.u16[7]) + widen(b.u16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vadd_hi_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.u.w2x.w
Builtin: __builtin_msa2_vadd_hi_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1859

Description

Widen upper-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[2]) + widen(b.u32[2]);
dst.u64[1] = widen(a.u32[3]) + widen(b.u32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vadd_hi_us_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_us_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.us.w2x.b
Builtin: __builtin_msa2_vadd_hi_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1901

Description

Widen upper-half source lanes (unsigned in a, signed in b) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[8]) + widen(b.i8[8]);
dst.u16[1] = widen(a.u8[9]) + widen(b.i8[9]);
dst.u16[2] = widen(a.u8[10]) + widen(b.i8[10]);
dst.u16[3] = widen(a.u8[11]) + widen(b.i8[11]);
dst.u16[4] = widen(a.u8[12]) + widen(b.i8[12]);
dst.u16[5] = widen(a.u8[13]) + widen(b.i8[13]);
dst.u16[6] = widen(a.u8[14]) + widen(b.i8[14]);
dst.u16[7] = widen(a.u8[15]) + widen(b.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_us_w2x_b((v16u8) a, (v16i8) b);

__m128i __msa2_vadd_hi_us_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_us_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.us.w2x.d
Builtin: __builtin_msa2_vadd_hi_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1964

Description

Widen upper-half source lanes (unsigned in a, signed in b) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[1]) + widen(b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_us_w2x_d((v2u64) a, (v2i64) b);

__m128i __msa2_vadd_hi_us_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_us_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.us.w2x.h
Builtin: __builtin_msa2_vadd_hi_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1922

Description

Widen upper-half source lanes (unsigned in a, signed in b) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[4]) + widen(b.i16[4]);
dst.u32[1] = widen(a.u16[5]) + widen(b.i16[5]);
dst.u32[2] = widen(a.u16[6]) + widen(b.i16[6]);
dst.u32[3] = widen(a.u16[7]) + widen(b.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_us_w2x_h((v8u16) a, (v8i16) b);

__m128i __msa2_vadd_hi_us_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_hi_us_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.hi.us.w2x.w
Builtin: __builtin_msa2_vadd_hi_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1943

Description

Widen upper-half source lanes (unsigned in a, signed in b) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[2]) + widen(b.i32[2]);
dst.u64[1] = widen(a.u32[3]) + widen(b.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_hi_us_w2x_w((v4u32) a, (v4i32) b);

__m128i __msa2_vadd_lo_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.s.w2x.b
Builtin: __builtin_msa2_vadd_lo_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1481

Description

Widen lower-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[0]) + widen(b.i8[0]);
dst.i16[1] = widen(a.i8[1]) + widen(b.i8[1]);
dst.i16[2] = widen(a.i8[2]) + widen(b.i8[2]);
dst.i16[3] = widen(a.i8[3]) + widen(b.i8[3]);
dst.i16[4] = widen(a.i8[4]) + widen(b.i8[4]);
dst.i16[5] = widen(a.i8[5]) + widen(b.i8[5]);
dst.i16[6] = widen(a.i8[6]) + widen(b.i8[6]);
dst.i16[7] = widen(a.i8[7]) + widen(b.i8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vadd_lo_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.s.w2x.d
Builtin: __builtin_msa2_vadd_lo_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1544

Description

Widen lower-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[0]) + widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vadd_lo_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.s.w2x.h
Builtin: __builtin_msa2_vadd_lo_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1502

Description

Widen lower-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[0]) + widen(b.i16[0]);
dst.i32[1] = widen(a.i16[1]) + widen(b.i16[1]);
dst.i32[2] = widen(a.i16[2]) + widen(b.i16[2]);
dst.i32[3] = widen(a.i16[3]) + widen(b.i16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vadd_lo_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.s.w2x.w
Builtin: __builtin_msa2_vadd_lo_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1523

Description

Widen lower-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[0]) + widen(b.i32[0]);
dst.i64[1] = widen(a.i32[1]) + widen(b.i32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vadd_lo_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.u.w2x.b
Builtin: __builtin_msa2_vadd_lo_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1565

Description

Widen lower-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[0]) + widen(b.u8[0]);
dst.u16[1] = widen(a.u8[1]) + widen(b.u8[1]);
dst.u16[2] = widen(a.u8[2]) + widen(b.u8[2]);
dst.u16[3] = widen(a.u8[3]) + widen(b.u8[3]);
dst.u16[4] = widen(a.u8[4]) + widen(b.u8[4]);
dst.u16[5] = widen(a.u8[5]) + widen(b.u8[5]);
dst.u16[6] = widen(a.u8[6]) + widen(b.u8[6]);
dst.u16[7] = widen(a.u8[7]) + widen(b.u8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vadd_lo_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.u.w2x.d
Builtin: __builtin_msa2_vadd_lo_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1628

Description

Widen lower-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) + widen(b.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vadd_lo_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.u.w2x.h
Builtin: __builtin_msa2_vadd_lo_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1586

Description

Widen lower-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) + widen(b.u16[0]);
dst.u32[1] = widen(a.u16[1]) + widen(b.u16[1]);
dst.u32[2] = widen(a.u16[2]) + widen(b.u16[2]);
dst.u32[3] = widen(a.u16[3]) + widen(b.u16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vadd_lo_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.u.w2x.w
Builtin: __builtin_msa2_vadd_lo_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1607

Description

Widen lower-half source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) + widen(b.u32[0]);
dst.u64[1] = widen(a.u32[1]) + widen(b.u32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vadd_lo_us_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_us_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.us.w2x.b
Builtin: __builtin_msa2_vadd_lo_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1649

Description

Widen lower-half source lanes (unsigned in a, signed in b) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[0]) + widen(b.i8[0]);
dst.u16[1] = widen(a.u8[1]) + widen(b.i8[1]);
dst.u16[2] = widen(a.u8[2]) + widen(b.i8[2]);
dst.u16[3] = widen(a.u8[3]) + widen(b.i8[3]);
dst.u16[4] = widen(a.u8[4]) + widen(b.i8[4]);
dst.u16[5] = widen(a.u8[5]) + widen(b.i8[5]);
dst.u16[6] = widen(a.u8[6]) + widen(b.i8[6]);
dst.u16[7] = widen(a.u8[7]) + widen(b.i8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_us_w2x_b((v16u8) a, (v16i8) b);

__m128i __msa2_vadd_lo_us_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_us_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.us.w2x.d
Builtin: __builtin_msa2_vadd_lo_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1712

Description

Widen lower-half source lanes (unsigned in a, signed in b) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) + widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_us_w2x_d((v2u64) a, (v2i64) b);

__m128i __msa2_vadd_lo_us_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_us_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.us.w2x.h
Builtin: __builtin_msa2_vadd_lo_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1670

Description

Widen lower-half source lanes (a as unsigned, b as signed) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) + widen(b.i16[0]);
dst.u32[1] = widen(a.u16[1]) + widen(b.i16[1]);
dst.u32[2] = widen(a.u16[2]) + widen(b.i16[2]);
dst.u32[3] = widen(a.u16[3]) + widen(b.i16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_us_w2x_h((v8u16) a, (v8i16) b);

__m128i __msa2_vadd_lo_us_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_lo_us_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.lo.us.w2x.w
Builtin: __builtin_msa2_vadd_lo_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1691

Description

Widen lower-half source lanes (a as unsigned, b as signed) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) + widen(b.i32[0]);
dst.u64[1] = widen(a.u32[1]) + widen(b.i32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vadd_lo_us_w2x_w((v4u32) a, (v4i32) b);

__m128i __msa2_vadd_odd_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.s.w2x.b
Builtin: __builtin_msa2_vadd_odd_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1229

Description

Widen odd-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[1]) + widen(b.i8[1]);
dst.i16[1] = widen(a.i8[3]) + widen(b.i8[3]);
dst.i16[2] = widen(a.i8[5]) + widen(b.i8[5]);
dst.i16[3] = widen(a.i8[7]) + widen(b.i8[7]);
dst.i16[4] = widen(a.i8[9]) + widen(b.i8[9]);
dst.i16[5] = widen(a.i8[11]) + widen(b.i8[11]);
dst.i16[6] = widen(a.i8[13]) + widen(b.i8[13]);
dst.i16[7] = widen(a.i8[15]) + widen(b.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vadd_odd_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.s.w2x.d
Builtin: __builtin_msa2_vadd_odd_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1292

Description

Widen odd-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[1]) + widen(b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vadd_odd_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.s.w2x.h
Builtin: __builtin_msa2_vadd_odd_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1250

Description

Widen odd-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[1]) + widen(b.i16[1]);
dst.i32[1] = widen(a.i16[3]) + widen(b.i16[3]);
dst.i32[2] = widen(a.i16[5]) + widen(b.i16[5]);
dst.i32[3] = widen(a.i16[7]) + widen(b.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vadd_odd_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.s.w2x.w
Builtin: __builtin_msa2_vadd_odd_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1271

Description

Widen odd-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[1]) + widen(b.i32[1]);
dst.i64[1] = widen(a.i32[3]) + widen(b.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vadd_odd_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.u.w2x.b
Builtin: __builtin_msa2_vadd_odd_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1313

Description

Widen odd-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[1]) + widen(b.u8[1]);
dst.u16[1] = widen(a.u8[3]) + widen(b.u8[3]);
dst.u16[2] = widen(a.u8[5]) + widen(b.u8[5]);
dst.u16[3] = widen(a.u8[7]) + widen(b.u8[7]);
dst.u16[4] = widen(a.u8[9]) + widen(b.u8[9]);
dst.u16[5] = widen(a.u8[11]) + widen(b.u8[11]);
dst.u16[6] = widen(a.u8[13]) + widen(b.u8[13]);
dst.u16[7] = widen(a.u8[15]) + widen(b.u8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vadd_odd_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.u.w2x.d
Builtin: __builtin_msa2_vadd_odd_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1376

Description

Widen odd-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[1]) + widen(b.u64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vadd_odd_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.u.w2x.h
Builtin: __builtin_msa2_vadd_odd_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1334

Description

Widen odd-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[1]) + widen(b.u16[1]);
dst.u32[1] = widen(a.u16[3]) + widen(b.u16[3]);
dst.u32[2] = widen(a.u16[5]) + widen(b.u16[5]);
dst.u32[3] = widen(a.u16[7]) + widen(b.u16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vadd_odd_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.u.w2x.w
Builtin: __builtin_msa2_vadd_odd_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1355

Description

Widen odd-numbered source lanes, add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[1]) + widen(b.u32[1]);
dst.u64[1] = widen(a.u32[3]) + widen(b.u32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vadd_odd_us_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_us_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.us.w2x.b
Builtin: __builtin_msa2_vadd_odd_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1397

Description

Widen odd-numbered source lanes (a as unsigned, b as signed) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[1]) + widen(b.i8[1]);
dst.u16[1] = widen(a.u8[3]) + widen(b.i8[3]);
dst.u16[2] = widen(a.u8[5]) + widen(b.i8[5]);
dst.u16[3] = widen(a.u8[7]) + widen(b.i8[7]);
dst.u16[4] = widen(a.u8[9]) + widen(b.i8[9]);
dst.u16[5] = widen(a.u8[11]) + widen(b.i8[11]);
dst.u16[6] = widen(a.u8[13]) + widen(b.i8[13]);
dst.u16[7] = widen(a.u8[15]) + widen(b.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_us_w2x_b((v16u8) a, (v16i8) b);

__m128i __msa2_vadd_odd_us_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_us_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.us.w2x.d
Builtin: __builtin_msa2_vadd_odd_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1460

Description

Widen odd-numbered source lanes (a as unsigned, b as signed) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[1]) + widen(b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_us_w2x_d((v2u64) a, (v2i64) b);

__m128i __msa2_vadd_odd_us_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_us_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.us.w2x.h
Builtin: __builtin_msa2_vadd_odd_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1418

Description

Widen odd-numbered source lanes (a as unsigned, b as signed) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[1]) + widen(b.i16[1]);
dst.u32[1] = widen(a.u16[3]) + widen(b.i16[3]);
dst.u32[2] = widen(a.u16[5]) + widen(b.i16[5]);
dst.u32[3] = widen(a.u16[7]) + widen(b.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_us_w2x_h((v8u16) a, (v8i16) b);

__m128i __msa2_vadd_odd_us_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vadd_odd_us_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vadd.odd.us.w2x.w
Builtin: __builtin_msa2_vadd_odd_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1439

Description

Widen odd-numbered source lanes (a as unsigned, b as signed) and add them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[1]) + widen(b.i32[1]);
dst.u64[1] = widen(a.u32[3]) + widen(b.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vadd_odd_us_w2x_w((v4u32) a, (v4i32) b);

__m128i __msa2_vmuhp_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmuhp_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmuhp.d
Builtin: __builtin_msa2_vmuhp_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2006

Description

Multiply corresponding integer lanes of a and b (modular arithmetic) and keep the upper half of each product.

Operation

dst.u64[0] = high_part(vmuhp_lane(a.u64[0], b.u64[0]));
dst.u64[1] = high_part(vmuhp_lane(a.u64[1], b.u64[1]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmuhp_d((v2i64) a, (v2i64) b);

__m128i __msa2_vmuhp_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmuhp_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmuhp.w
Builtin: __builtin_msa2_vmuhp_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:1985

Description

Multiply corresponding integer lanes of a and b (modular arithmetic) and keep the upper half of each product.

Operation

dst.u32[0] = high_part(vmuhp_lane(a.u32[0], b.u32[0]));
dst.u32[1] = high_part(vmuhp_lane(a.u32[1], b.u32[1]));
dst.u32[2] = high_part(vmuhp_lane(a.u32[2], b.u32[2]));
dst.u32[3] = high_part(vmuhp_lane(a.u32[3], b.u32[3]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmuhp_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmuhp_xacc_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmuhp_xacc_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmuhp.xacc.d
Builtin: __builtin_msa2_vmuhp_xacc_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2088

Description

Compute the Loongson MSA2 high-part packed multiply and combine the result with accumulator a. This supports multiply-accumulate style integer kernels.

Operation

dst.u64[0] = a.u64[0] + high_part(vmuhp_lane(b.u64[0], c.u64[0]));
dst.u64[1] = a.u64[1] + high_part(vmuhp_lane(b.u64[1], c.u64[1]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmuhp_xacc_d((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_vmuhp_xacc_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmuhp_xacc_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmuhp.xacc.w
Builtin: __builtin_msa2_vmuhp_xacc_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2068

Description

Compute the Loongson MSA2 high-part packed multiply and combine the result with accumulator a. This supports multiply-accumulate style integer kernels.

Operation

dst.u32[0] = a.u32[0] + high_part(vmuhp_lane(b.u32[0], c.u32[0]));
dst.u32[1] = a.u32[1] + high_part(vmuhp_lane(b.u32[1], c.u32[1]));
dst.u32[2] = a.u32[2] + high_part(vmuhp_lane(b.u32[2], c.u32[2]));
dst.u32[3] = a.u32[3] + high_part(vmuhp_lane(b.u32[3], c.u32[3]));

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmuhp_xacc_w((v4i32) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vmulp_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmulp_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmulp.d
Builtin: __builtin_msa2_vmulp_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2048

Description

Compute the Loongson MSA2 packed multiply. This supports multiply-accumulate style integer kernels.

Operation

dst.u64[0] = vmulp_lane(a.u64[0], b.u64[0]);
dst.u64[1] = vmulp_lane(a.u64[1], b.u64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmulp_d((v2i64) a, (v2i64) b);

__m128i __msa2_vmulp_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmulp_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmulp.w
Builtin: __builtin_msa2_vmulp_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2027

Description

Compute the Loongson MSA2 packed multiply. This supports multiply-accumulate style integer kernels.

Operation

dst.u32[0] = vmulp_lane(a.u32[0], b.u32[0]);
dst.u32[1] = vmulp_lane(a.u32[1], b.u32[1]);
dst.u32[2] = vmulp_lane(a.u32[2], b.u32[2]);
dst.u32[3] = vmulp_lane(a.u32[3], b.u32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmulp_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmulp_xacc_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmulp_xacc_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmulp.xacc.d
Builtin: __builtin_msa2_vmulp_xacc_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2128

Description

Compute the Loongson MSA2 packed multiply and combine the result with accumulator a. This supports multiply-accumulate style integer kernels.

Operation

dst.u64[0] = a.u64[0] + vmulp_lane(b.u64[0], c.u64[0]);
dst.u64[1] = a.u64[1] + vmulp_lane(b.u64[1], c.u64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmulp_xacc_d((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_vmulp_xacc_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmulp_xacc_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmulp.xacc.w
Builtin: __builtin_msa2_vmulp_xacc_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2108

Description

Compute the Loongson MSA2 packed multiply and combine the result with accumulator a. This supports multiply-accumulate style integer kernels.

Operation

dst.u32[0] = a.u32[0] + vmulp_lane(b.u32[0], c.u32[0]);
dst.u32[1] = a.u32[1] + vmulp_lane(b.u32[1], c.u32[1]);
dst.u32[2] = a.u32[2] + vmulp_lane(b.u32[2], c.u32[2]);
dst.u32[3] = a.u32[3] + vmulp_lane(b.u32[3], c.u32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmulp_xacc_w((v4i32) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vmult_el0_acc_s_wx_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_el0_acc_s_wx_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.el0.acc.s.wx.d
Builtin: __builtin_msa2_vmult_el0_acc_s_wx_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2377

Description

Widen lane 0 of b and c, multiply them in a wider lane, and add the product to accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = a.i128[0] + widen(b.i64[0]) * widen(c.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_acc_s_wx_d((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_vmult_el0_acc_s_wx_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_el0_acc_s_wx_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.el0.acc.s.wx.h
Builtin: __builtin_msa2_vmult_el0_acc_s_wx_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2337

Description

Widen lane 0 of b and c, multiply them in wider lanes, and add the product to every lane of accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = a.i32[0] + widen(b.i16[0]) * widen(c.i16[0]);
dst.i32[1] = a.i32[1] + widen(b.i16[0]) * widen(c.i16[0]);
dst.i32[2] = a.i32[2] + widen(b.i16[0]) * widen(c.i16[0]);
dst.i32[3] = a.i32[3] + widen(b.i16[0]) * widen(c.i16[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_acc_s_wx_h((v2i64) a, (v8i16) b, (v8i16) c);

__m128i __msa2_vmult_el0_acc_s_wx_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_el0_acc_s_wx_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.el0.acc.s.wx.w
Builtin: __builtin_msa2_vmult_el0_acc_s_wx_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2357

Description

Widen lane 0 of b and c, multiply them in wider lanes, and add the product to every lane of accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = a.i64[0] + widen(b.i32[0]) * widen(c.i32[0]);
dst.i64[1] = a.i64[1] + widen(b.i32[0]) * widen(c.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_acc_s_wx_w((v2i64) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vmult_el0_acc_u_wx_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_el0_acc_u_wx_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.el0.acc.u.wx.d
Builtin: __builtin_msa2_vmult_el0_acc_u_wx_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2437

Description

Widen lane 0 of b and c, multiply them in a wider lane, and add the product to accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.u64[0]) * widen(c.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_acc_u_wx_d((v2u64) a, (v2u64) b, (v2u64) c);

__m128i __msa2_vmult_el0_acc_u_wx_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_el0_acc_u_wx_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.el0.acc.u.wx.h
Builtin: __builtin_msa2_vmult_el0_acc_u_wx_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2397

Description

Widen lane 0 of b and c, multiply them in wider lanes, and add the product to every lane of accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = a.u32[0] + widen(b.u16[0]) * widen(c.u16[0]);
dst.u32[1] = a.u32[1] + widen(b.u16[0]) * widen(c.u16[0]);
dst.u32[2] = a.u32[2] + widen(b.u16[0]) * widen(c.u16[0]);
dst.u32[3] = a.u32[3] + widen(b.u16[0]) * widen(c.u16[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_acc_u_wx_h((v2u64) a, (v8u16) b, (v8u16) c);

__m128i __msa2_vmult_el0_acc_u_wx_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_el0_acc_u_wx_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.el0.acc.u.wx.w
Builtin: __builtin_msa2_vmult_el0_acc_u_wx_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2417

Description

Widen lane 0 of b and c, multiply them in wider lanes, and add the product to every lane of accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = a.u64[0] + widen(b.u32[0]) * widen(c.u32[0]);
dst.u64[1] = a.u64[1] + widen(b.u32[0]) * widen(c.u32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_acc_u_wx_w((v2u64) a, (v4u32) b, (v4u32) c);

__m128i __msa2_vmult_el0_acc_us_wx_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_el0_acc_us_wx_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.el0.acc.us.wx.d
Builtin: __builtin_msa2_vmult_el0_acc_us_wx_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2497

Description

Widen lane 0 of b (as unsigned) and c (as signed), multiply them in a wider lane, and add the product to accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.u64[0]) * widen(c.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_acc_us_wx_d((v2u64) a, (v2u64) b, (v2i64) c);

__m128i __msa2_vmult_el0_acc_us_wx_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_el0_acc_us_wx_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.el0.acc.us.wx.h
Builtin: __builtin_msa2_vmult_el0_acc_us_wx_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2457

Description

Widen lane 0 of b (as unsigned) and c (as signed), multiply them in wider lanes, and add the product to every lane of accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = a.u32[0] + widen(b.u16[0]) * widen(c.i16[0]);
dst.u32[1] = a.u32[1] + widen(b.u16[0]) * widen(c.i16[0]);
dst.u32[2] = a.u32[2] + widen(b.u16[0]) * widen(c.i16[0]);
dst.u32[3] = a.u32[3] + widen(b.u16[0]) * widen(c.i16[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_acc_us_wx_h((v2u64) a, (v8u16) b, (v8i16) c);

__m128i __msa2_vmult_el0_acc_us_wx_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_el0_acc_us_wx_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.el0.acc.us.wx.w
Builtin: __builtin_msa2_vmult_el0_acc_us_wx_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2477

Description

Widen lane 0 of b (as unsigned) and c (as signed), multiply them in wider lanes, and add the product to every lane of accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = a.u64[0] + widen(b.u32[0]) * widen(c.i32[0]);
dst.u64[1] = a.u64[1] + widen(b.u32[0]) * widen(c.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_acc_us_wx_w((v2u64) a, (v4u32) b, (v4i32) c);

__m128i __msa2_vmult_el0_s_wx_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_el0_s_wx_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.el0.s.wx.d
Builtin: __builtin_msa2_vmult_el0_s_wx_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2191

Description

Widen lane 0 of a and b and multiply them in a wider lane. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[0]) * widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_s_wx_d((v2i64) a, (v2i64) b);

__m128i __msa2_vmult_el0_s_wx_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_el0_s_wx_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.el0.s.wx.h
Builtin: __builtin_msa2_vmult_el0_s_wx_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2149

Description

Widen lane 0 of a and b, multiply them in wider lanes, and write the product to every destination lane. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[0]) * widen(b.i16[0]);
dst.i32[1] = widen(a.i16[0]) * widen(b.i16[0]);
dst.i32[2] = widen(a.i16[0]) * widen(b.i16[0]);
dst.i32[3] = widen(a.i16[0]) * widen(b.i16[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_s_wx_h((v8i16) a, (v8i16) b);

__m128i __msa2_vmult_el0_s_wx_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_el0_s_wx_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.el0.s.wx.w
Builtin: __builtin_msa2_vmult_el0_s_wx_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2170

Description

Widen lane 0 of a and b, multiply them in wider lanes, and write the product to every destination lane. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[0]) * widen(b.i32[0]);
dst.i64[1] = widen(a.i32[0]) * widen(b.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_s_wx_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmult_el0_u_wx_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_el0_u_wx_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.el0.u.wx.d
Builtin: __builtin_msa2_vmult_el0_u_wx_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2254

Description

Widen lane 0 of a and b and multiply them in a wider lane. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) * widen(b.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_u_wx_d((v2u64) a, (v2u64) b);

__m128i __msa2_vmult_el0_u_wx_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_el0_u_wx_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.el0.u.wx.h
Builtin: __builtin_msa2_vmult_el0_u_wx_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2212

Description

Widen lane 0 of a and b, multiply them in wider lanes, and write the product to every destination lane. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) * widen(b.u16[0]);
dst.u32[1] = widen(a.u16[0]) * widen(b.u16[0]);
dst.u32[2] = widen(a.u16[0]) * widen(b.u16[0]);
dst.u32[3] = widen(a.u16[0]) * widen(b.u16[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_u_wx_h((v8u16) a, (v8u16) b);

__m128i __msa2_vmult_el0_u_wx_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_el0_u_wx_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.el0.u.wx.w
Builtin: __builtin_msa2_vmult_el0_u_wx_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2233

Description

Widen lane 0 of a and b, multiply them in wider lanes, and write the product to every destination lane. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) * widen(b.u32[0]);
dst.u64[1] = widen(a.u32[0]) * widen(b.u32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_u_wx_w((v4u32) a, (v4u32) b);

__m128i __msa2_vmult_el0_us_wx_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_el0_us_wx_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.el0.us.wx.d
Builtin: __builtin_msa2_vmult_el0_us_wx_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2317

Description

Widen lane 0 of a (as unsigned) and b (as signed) and multiply them in a wider lane. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) * widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_us_wx_d((v2u64) a, (v2i64) b);

__m128i __msa2_vmult_el0_us_wx_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_el0_us_wx_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.el0.us.wx.h
Builtin: __builtin_msa2_vmult_el0_us_wx_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2275

Description

Widen lane 0 of a (as unsigned) and b (as signed), multiply them in wider lanes, and write the product to every destination lane. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) * widen(b.i16[0]);
dst.u32[1] = widen(a.u16[0]) * widen(b.i16[0]);
dst.u32[2] = widen(a.u16[0]) * widen(b.i16[0]);
dst.u32[3] = widen(a.u16[0]) * widen(b.i16[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_us_wx_h((v8u16) a, (v8i16) b);

__m128i __msa2_vmult_el0_us_wx_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_el0_us_wx_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.el0.us.wx.w
Builtin: __builtin_msa2_vmult_el0_us_wx_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2296

Description

Widen lane 0 of a (as unsigned) and b (as signed), multiply them in wider lanes, and write the product to every destination lane. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) * widen(b.i32[0]);
dst.u64[1] = widen(a.u32[0]) * widen(b.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_el0_us_wx_w((v4u32) a, (v4i32) b);

__m128i __msa2_vmult_even_acc_s_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_s_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.s.w2x.b
Builtin: __builtin_msa2_vmult_even_acc_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3390

Description

Widen even-numbered source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = a.i16[0] + widen(b.i8[0]) * widen(c.i8[0]);
dst.i16[1] = a.i16[1] + widen(b.i8[2]) * widen(c.i8[2]);
dst.i16[2] = a.i16[2] + widen(b.i8[4]) * widen(c.i8[4]);
dst.i16[3] = a.i16[3] + widen(b.i8[6]) * widen(c.i8[6]);
dst.i16[4] = a.i16[4] + widen(b.i8[8]) * widen(c.i8[8]);
dst.i16[5] = a.i16[5] + widen(b.i8[10]) * widen(c.i8[10]);
dst.i16[6] = a.i16[6] + widen(b.i8[12]) * widen(c.i8[12]);
dst.i16[7] = a.i16[7] + widen(b.i8[14]) * widen(c.i8[14]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_s_w2x_b((v8i16) a, (v16i8) b, (v16i8) c);

__m128i __msa2_vmult_even_acc_s_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_s_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.s.w2x.d
Builtin: __builtin_msa2_vmult_even_acc_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3450

Description

Widen even-numbered source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = a.i128[0] + widen(b.i64[0]) * widen(c.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_s_w2x_d((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_vmult_even_acc_s_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_s_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.s.w2x.h
Builtin: __builtin_msa2_vmult_even_acc_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3410

Description

Widen the even-numbered signed lanes of b and c to twice their width, multiply them at that width, and add each product to the corresponding lane of a. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = a.i32[0] + widen(b.i16[0]) * widen(c.i16[0]);
dst.i32[1] = a.i32[1] + widen(b.i16[2]) * widen(c.i16[2]);
dst.i32[2] = a.i32[2] + widen(b.i16[4]) * widen(c.i16[4]);
dst.i32[3] = a.i32[3] + widen(b.i16[6]) * widen(c.i16[6]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_s_w2x_h((v4i32) a, (v8i16) b, (v8i16) c);

__m128i __msa2_vmult_even_acc_s_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_s_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.s.w2x.w
Builtin: __builtin_msa2_vmult_even_acc_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3430

Description

Widen the even-numbered signed lanes of b and c to twice their width, multiply them at that width, and add each product to the corresponding lane of a. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = a.i64[0] + widen(b.i32[0]) * widen(c.i32[0]);
dst.i64[1] = a.i64[1] + widen(b.i32[2]) * widen(c.i32[2]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_s_w2x_w((v2i64) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vmult_even_acc_u_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_u_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.u.w2x.b
Builtin: __builtin_msa2_vmult_even_acc_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3470

Description

Widen the even-numbered unsigned lanes of b and c to twice their width, multiply them at that width, and add each product to the corresponding lane of a. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = a.u16[0] + widen(b.u8[0]) * widen(c.u8[0]);
dst.u16[1] = a.u16[1] + widen(b.u8[2]) * widen(c.u8[2]);
dst.u16[2] = a.u16[2] + widen(b.u8[4]) * widen(c.u8[4]);
dst.u16[3] = a.u16[3] + widen(b.u8[6]) * widen(c.u8[6]);
dst.u16[4] = a.u16[4] + widen(b.u8[8]) * widen(c.u8[8]);
dst.u16[5] = a.u16[5] + widen(b.u8[10]) * widen(c.u8[10]);
dst.u16[6] = a.u16[6] + widen(b.u8[12]) * widen(c.u8[12]);
dst.u16[7] = a.u16[7] + widen(b.u8[14]) * widen(c.u8[14]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_u_w2x_b((v8u16) a, (v16u8) b, (v16u8) c);

__m128i __msa2_vmult_even_acc_u_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_u_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.u.w2x.d
Builtin: __builtin_msa2_vmult_even_acc_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3530

Description

Widen the even-numbered unsigned lanes of b and c to twice their width, multiply them at that width, and add each product to the corresponding lane of a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.u64[0]) * widen(c.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_u_w2x_d((v2u64) a, (v2u64) b, (v2u64) c);

__m128i __msa2_vmult_even_acc_u_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_u_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.u.w2x.h
Builtin: __builtin_msa2_vmult_even_acc_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3490

Description

Widen the even-numbered unsigned lanes of b and c to twice their width, multiply them at that width, and add each product to the corresponding lane of a. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = a.u32[0] + widen(b.u16[0]) * widen(c.u16[0]);
dst.u32[1] = a.u32[1] + widen(b.u16[2]) * widen(c.u16[2]);
dst.u32[2] = a.u32[2] + widen(b.u16[4]) * widen(c.u16[4]);
dst.u32[3] = a.u32[3] + widen(b.u16[6]) * widen(c.u16[6]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_u_w2x_h((v4u32) a, (v8u16) b, (v8u16) c);

__m128i __msa2_vmult_even_acc_u_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_u_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.u.w2x.w
Builtin: __builtin_msa2_vmult_even_acc_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3510

Description

Widen the even-numbered unsigned lanes of b and c to twice their width, multiply them at that width, and add each product to the corresponding lane of a. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = a.u64[0] + widen(b.u32[0]) * widen(c.u32[0]);
dst.u64[1] = a.u64[1] + widen(b.u32[2]) * widen(c.u32[2]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_u_w2x_w((v2u64) a, (v4u32) b, (v4u32) c);

__m128i __msa2_vmult_even_acc_us_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_us_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.us.w2x.b
Builtin: __builtin_msa2_vmult_even_acc_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3550

Description

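Widen the even-numbered unsigned lanes of b and the even-numbered signed lanes of c to twice their width, multiply them at that width, and add each product to the corresponding lane of a. This avoids overflow from narrow intermediates.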

Operation

dst.u16[0] = a.u16[0] + widen(b.u8[0]) * widen(c.i8[0]);
dst.u16[1] = a.u16[1] + widen(b.u8[2]) * widen(c.i8[2]);
dst.u16[2] = a.u16[2] + widen(b.u8[4]) * widen(c.i8[4]);
dst.u16[3] = a.u16[3] + widen(b.u8[6]) * widen(c.i8[6]);
dst.u16[4] = a.u16[4] + widen(b.u8[8]) * widen(c.i8[8]);
dst.u16[5] = a.u16[5] + widen(b.u8[10]) * widen(c.i8[10]);
dst.u16[6] = a.u16[6] + widen(b.u8[12]) * widen(c.i8[12]);
dst.u16[7] = a.u16[7] + widen(b.u8[14]) * widen(c.i8[14]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_us_w2x_b((v8u16) a, (v16u8) b, (v16i8) c);

__m128i __msa2_vmult_even_acc_us_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_us_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.us.w2x.d
Builtin: __builtin_msa2_vmult_even_acc_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3610

Description

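Widen the even-numbered unsigned lanes of b and the even-numbered signed lanes of c to twice their width, multiply them at that width, and add each product to the corresponding lane of a. This avoids overflow from narrow intermediates.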

Operation

dst.u128[0] = a.u128[0] + widen(b.u64[0]) * widen(c.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_us_w2x_d((v2u64) a, (v2u64) b, (v2i64) c);

__m128i __msa2_vmult_even_acc_us_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_us_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.us.w2x.h
Builtin: __builtin_msa2_vmult_even_acc_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3570

Description

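Widen the even-numbered unsigned lanes of b and the even-numbered signed lanes of c to twice their width, multiply them at that width, and add each product to the corresponding lane of a. This avoids overflow from narrow intermediates.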

Operation

dst.u32[0] = a.u32[0] + widen(b.u16[0]) * widen(c.i16[0]);
dst.u32[1] = a.u32[1] + widen(b.u16[2]) * widen(c.i16[2]);
dst.u32[2] = a.u32[2] + widen(b.u16[4]) * widen(c.i16[4]);
dst.u32[3] = a.u32[3] + widen(b.u16[6]) * widen(c.i16[6]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_us_w2x_h((v4u32) a, (v8u16) b, (v8i16) c);

__m128i __msa2_vmult_even_acc_us_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_even_acc_us_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.even.acc.us.w2x.w
Builtin: __builtin_msa2_vmult_even_acc_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3590

Description

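Widen the even-numbered unsigned lanes of b and the even-numbered signed lanes of c to twice their width, multiply them at that width, and add each product to the corresponding lane of a. This avoids overflow from narrow intermediates.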

Operation

dst.u64[0] = a.u64[0] + widen(b.u32[0]) * widen(c.i32[0]);
dst.u64[1] = a.u64[1] + widen(b.u32[2]) * widen(c.i32[2]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_acc_us_w2x_w((v2u64) a, (v4u32) b, (v4i32) c);

__m128i __msa2_vmult_even_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.s.w2x.b
Builtin: __builtin_msa2_vmult_even_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2887

Description

Widen the even-numbered signed lanes of a and b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[0]) * widen(b.i8[0]);
dst.i16[1] = widen(a.i8[2]) * widen(b.i8[2]);
dst.i16[2] = widen(a.i8[4]) * widen(b.i8[4]);
dst.i16[3] = widen(a.i8[6]) * widen(b.i8[6]);
dst.i16[4] = widen(a.i8[8]) * widen(b.i8[8]);
dst.i16[5] = widen(a.i8[10]) * widen(b.i8[10]);
dst.i16[6] = widen(a.i8[12]) * widen(b.i8[12]);
dst.i16[7] = widen(a.i8[14]) * widen(b.i8[14]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vmult_even_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.s.w2x.d
Builtin: __builtin_msa2_vmult_even_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2950

Description

Widen the even-numbered signed lanes of a and b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[0]) * widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vmult_even_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.s.w2x.h
Builtin: __builtin_msa2_vmult_even_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2908

Description

Widen the even-numbered signed lanes of a and b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[0]) * widen(b.i16[0]);
dst.i32[1] = widen(a.i16[2]) * widen(b.i16[2]);
dst.i32[2] = widen(a.i16[4]) * widen(b.i16[4]);
dst.i32[3] = widen(a.i16[6]) * widen(b.i16[6]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vmult_even_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.s.w2x.w
Builtin: __builtin_msa2_vmult_even_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2929

Description

Widen the even-numbered signed lanes of a and b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[0]) * widen(b.i32[0]);
dst.i64[1] = widen(a.i32[2]) * widen(b.i32[2]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmult_even_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.u.w2x.b
Builtin: __builtin_msa2_vmult_even_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2971

Description

Widen the even-numbered unsigned lanes of a and b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[0]) * widen(b.u8[0]);
dst.u16[1] = widen(a.u8[2]) * widen(b.u8[2]);
dst.u16[2] = widen(a.u8[4]) * widen(b.u8[4]);
dst.u16[3] = widen(a.u8[6]) * widen(b.u8[6]);
dst.u16[4] = widen(a.u8[8]) * widen(b.u8[8]);
dst.u16[5] = widen(a.u8[10]) * widen(b.u8[10]);
dst.u16[6] = widen(a.u8[12]) * widen(b.u8[12]);
dst.u16[7] = widen(a.u8[14]) * widen(b.u8[14]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vmult_even_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.u.w2x.d
Builtin: __builtin_msa2_vmult_even_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3034

Description

Widen the even-numbered unsigned lanes of a and b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) * widen(b.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vmult_even_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.u.w2x.h
Builtin: __builtin_msa2_vmult_even_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2992

Description

Widen the even-numbered unsigned lanes of a and b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) * widen(b.u16[0]);
dst.u32[1] = widen(a.u16[2]) * widen(b.u16[2]);
dst.u32[2] = widen(a.u16[4]) * widen(b.u16[4]);
dst.u32[3] = widen(a.u16[6]) * widen(b.u16[6]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vmult_even_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.u.w2x.w
Builtin: __builtin_msa2_vmult_even_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3013

Description

Widen the even-numbered unsigned lanes of a and b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) * widen(b.u32[0]);
dst.u64[1] = widen(a.u32[2]) * widen(b.u32[2]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vmult_even_us_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_us_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.us.w2x.b
Builtin: __builtin_msa2_vmult_even_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3055

Description

Widen the even-numbered unsigned lanes of a and the even-numbered signed lanes of b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[0]) * widen(b.i8[0]);
dst.u16[1] = widen(a.u8[2]) * widen(b.i8[2]);
dst.u16[2] = widen(a.u8[4]) * widen(b.i8[4]);
dst.u16[3] = widen(a.u8[6]) * widen(b.i8[6]);
dst.u16[4] = widen(a.u8[8]) * widen(b.i8[8]);
dst.u16[5] = widen(a.u8[10]) * widen(b.i8[10]);
dst.u16[6] = widen(a.u8[12]) * widen(b.i8[12]);
dst.u16[7] = widen(a.u8[14]) * widen(b.i8[14]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_us_w2x_b((v16u8) a, (v16i8) b);

__m128i __msa2_vmult_even_us_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_us_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.us.w2x.d
Builtin: __builtin_msa2_vmult_even_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3118

Description

Widen the even-numbered unsigned lanes of a and the even-numbered signed lanes of b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) * widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_us_w2x_d((v2u64) a, (v2i64) b);

__m128i __msa2_vmult_even_us_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_us_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.us.w2x.h
Builtin: __builtin_msa2_vmult_even_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3076

Description

Widen the even-numbered unsigned lanes of a and the even-numbered signed lanes of b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) * widen(b.i16[0]);
dst.u32[1] = widen(a.u16[2]) * widen(b.i16[2]);
dst.u32[2] = widen(a.u16[4]) * widen(b.i16[4]);
dst.u32[3] = widen(a.u16[6]) * widen(b.i16[6]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_us_w2x_h((v8u16) a, (v8i16) b);

__m128i __msa2_vmult_even_us_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_even_us_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.even.us.w2x.w
Builtin: __builtin_msa2_vmult_even_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3097

Description

Widen the even-numbered unsigned lanes of a and the even-numbered signed lanes of b to twice their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) * widen(b.i32[0]);
dst.u64[1] = widen(a.u32[2]) * widen(b.i32[2]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_even_us_w2x_w((v4u32) a, (v4i32) b);

__m128i __msa2_vmult_hadd_adj4_acc_s_w4x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hadd_adj4_acc_s_w4x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hadd.adj4.acc.s.w4x.b
Builtin: __builtin_msa2_vmult_hadd_adj4_acc_s_w4x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2706

Description

Widen corresponding signed lanes of b and c to four times their width, multiply them at that width, and add each product to the matching lane of a. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = a.i32[0] + widen(b.i8[0]) * widen(c.i8[0]);
dst.i32[1] = a.i32[1] + widen(b.i8[1]) * widen(c.i8[1]);
dst.i32[2] = a.i32[2] + widen(b.i8[2]) * widen(c.i8[2]);
dst.i32[3] = a.i32[3] + widen(b.i8[3]) * widen(c.i8[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_acc_s_w4x_b((v4i32) a, (v16i8) b, (v16i8) c);

__m128i __msa2_vmult_hadd_adj4_acc_s_w4x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hadd_adj4_acc_s_w4x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hadd.adj4.acc.s.w4x.h
Builtin: __builtin_msa2_vmult_hadd_adj4_acc_s_w4x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2726

Description

Widen corresponding signed lanes of b and c to four times their width, multiply them at that width, and add each product to the matching lane of a. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = a.i64[0] + widen(b.i16[0]) * widen(c.i16[0]);
dst.i64[1] = a.i64[1] + widen(b.i16[1]) * widen(c.i16[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_acc_s_w4x_h((v2i64) a, (v8i16) b, (v8i16) c);

__m128i __msa2_vmult_hadd_adj4_acc_s_w4x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hadd_adj4_acc_s_w4x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hadd.adj4.acc.s.w4x.w
Builtin: __builtin_msa2_vmult_hadd_adj4_acc_s_w4x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2746

Description

Widen corresponding signed lanes of b and c to four times their width, multiply them at that width, and add each product to the matching lane of a. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = a.i128[0] + widen(b.i32[0]) * widen(c.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_acc_s_w4x_w((v2i64) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vmult_hadd_adj4_acc_u_w4x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hadd_adj4_acc_u_w4x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hadd.adj4.acc.u.w4x.b
Builtin: __builtin_msa2_vmult_hadd_adj4_acc_u_w4x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2766

Description

Widen corresponding unsigned lanes of b and c to four times their width, multiply them at that width, and add each product to the matching lane of a. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = a.u32[0] + widen(b.u8[0]) * widen(c.u8[0]);
dst.u32[1] = a.u32[1] + widen(b.u8[1]) * widen(c.u8[1]);
dst.u32[2] = a.u32[2] + widen(b.u8[2]) * widen(c.u8[2]);
dst.u32[3] = a.u32[3] + widen(b.u8[3]) * widen(c.u8[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_acc_u_w4x_b((v4u32) a, (v16u8) b, (v16u8) c);

__m128i __msa2_vmult_hadd_adj4_acc_u_w4x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hadd_adj4_acc_u_w4x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hadd.adj4.acc.u.w4x.h
Builtin: __builtin_msa2_vmult_hadd_adj4_acc_u_w4x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2786

Description

Widen corresponding unsigned lanes of b and c to four times their width, multiply them at that width, and add each product to the matching lane of a. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = a.u64[0] + widen(b.u16[0]) * widen(c.u16[0]);
dst.u64[1] = a.u64[1] + widen(b.u16[1]) * widen(c.u16[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_acc_u_w4x_h((v2u64) a, (v8u16) b, (v8u16) c);

__m128i __msa2_vmult_hadd_adj4_acc_u_w4x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hadd_adj4_acc_u_w4x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hadd.adj4.acc.u.w4x.w
Builtin: __builtin_msa2_vmult_hadd_adj4_acc_u_w4x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2806

Description

Widen corresponding unsigned lanes of b and c to four times their width, multiply them at that width, and add each product to the matching lane of a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.u32[0]) * widen(c.u32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_acc_u_w4x_w((v2u64) a, (v4u32) b, (v4u32) c);

__m128i __msa2_vmult_hadd_adj4_acc_us_w4x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hadd_adj4_acc_us_w4x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hadd.adj4.acc.us.w4x.b
Builtin: __builtin_msa2_vmult_hadd_adj4_acc_us_w4x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2826

Description

Widen corresponding unsigned lanes of b and signed lanes of c to four times their width, multiply them at that width, and add each product to the matching lane of a. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = a.u32[0] + widen(b.u8[0]) * widen(c.i8[0]);
dst.u32[1] = a.u32[1] + widen(b.u8[1]) * widen(c.i8[1]);
dst.u32[2] = a.u32[2] + widen(b.u8[2]) * widen(c.i8[2]);
dst.u32[3] = a.u32[3] + widen(b.u8[3]) * widen(c.i8[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_acc_us_w4x_b((v4u32) a, (v16u8) b, (v16i8) c);

__m128i __msa2_vmult_hadd_adj4_acc_us_w4x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hadd_adj4_acc_us_w4x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hadd.adj4.acc.us.w4x.h
Builtin: __builtin_msa2_vmult_hadd_adj4_acc_us_w4x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2846

Description

Widen corresponding unsigned lanes of b and signed lanes of c to four times their width, multiply them at that width, and add each product to the matching lane of a. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = a.u64[0] + widen(b.u16[0]) * widen(c.i16[0]);
dst.u64[1] = a.u64[1] + widen(b.u16[1]) * widen(c.i16[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_acc_us_w4x_h((v2u64) a, (v8u16) b, (v8i16) c);

__m128i __msa2_vmult_hadd_adj4_acc_us_w4x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hadd_adj4_acc_us_w4x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hadd.adj4.acc.us.w4x.w
Builtin: __builtin_msa2_vmult_hadd_adj4_acc_us_w4x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2866

Description

Widen corresponding unsigned lanes of b and signed lanes of c to four times their width, multiply them at that width, and add each product to the matching lane of a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.u32[0]) * widen(c.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_acc_us_w4x_w((v2u64) a, (v4u32) b, (v4i32) c);

__m128i __msa2_vmult_hadd_adj4_s_w4x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hadd_adj4_s_w4x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hadd.adj4.s.w4x.b
Builtin: __builtin_msa2_vmult_hadd_adj4_s_w4x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2518

Description

Widen corresponding signed lanes of a and b to four times their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i8[0]) * widen(b.i8[0]);
dst.i32[1] = widen(a.i8[1]) * widen(b.i8[1]);
dst.i32[2] = widen(a.i8[2]) * widen(b.i8[2]);
dst.i32[3] = widen(a.i8[3]) * widen(b.i8[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_s_w4x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vmult_hadd_adj4_s_w4x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hadd_adj4_s_w4x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hadd.adj4.s.w4x.h
Builtin: __builtin_msa2_vmult_hadd_adj4_s_w4x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2539

Description

Widen corresponding signed lanes of a and b to four times their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i16[0]) * widen(b.i16[0]);
dst.i64[1] = widen(a.i16[1]) * widen(b.i16[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_s_w4x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vmult_hadd_adj4_s_w4x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hadd_adj4_s_w4x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hadd.adj4.s.w4x.w
Builtin: __builtin_msa2_vmult_hadd_adj4_s_w4x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2560

Description

Widen corresponding signed lanes of a and b to four times their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i32[0]) * widen(b.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_s_w4x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmult_hadd_adj4_u_w4x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hadd_adj4_u_w4x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hadd.adj4.u.w4x.b
Builtin: __builtin_msa2_vmult_hadd_adj4_u_w4x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2581

Description

Widen corresponding unsigned lanes of a and b to four times their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u8[0]) * widen(b.u8[0]);
dst.u32[1] = widen(a.u8[1]) * widen(b.u8[1]);
dst.u32[2] = widen(a.u8[2]) * widen(b.u8[2]);
dst.u32[3] = widen(a.u8[3]) * widen(b.u8[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_u_w4x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vmult_hadd_adj4_u_w4x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hadd_adj4_u_w4x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hadd.adj4.u.w4x.h
Builtin: __builtin_msa2_vmult_hadd_adj4_u_w4x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2602

Description

Widen corresponding unsigned lanes of a and b to four times their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u16[0]) * widen(b.u16[0]);
dst.u64[1] = widen(a.u16[1]) * widen(b.u16[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_u_w4x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vmult_hadd_adj4_u_w4x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hadd_adj4_u_w4x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hadd.adj4.u.w4x.w
Builtin: __builtin_msa2_vmult_hadd_adj4_u_w4x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2623

Description

Widen corresponding unsigned lanes of a and b to four times their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u32[0]) * widen(b.u32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_u_w4x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vmult_hadd_adj4_us_w4x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hadd_adj4_us_w4x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hadd.adj4.us.w4x.b
Builtin: __builtin_msa2_vmult_hadd_adj4_us_w4x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2644

Description

Widen corresponding unsigned lanes of a and signed lanes of b to four times their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u8[0]) * widen(b.i8[0]);
dst.u32[1] = widen(a.u8[1]) * widen(b.i8[1]);
dst.u32[2] = widen(a.u8[2]) * widen(b.i8[2]);
dst.u32[3] = widen(a.u8[3]) * widen(b.i8[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_us_w4x_b((v16u8) a, (v16i8) b);

__m128i __msa2_vmult_hadd_adj4_us_w4x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hadd_adj4_us_w4x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hadd.adj4.us.w4x.h
Builtin: __builtin_msa2_vmult_hadd_adj4_us_w4x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2665

Description

Widen corresponding unsigned lanes of a and signed lanes of b to four times their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u16[0]) * widen(b.i16[0]);
dst.u64[1] = widen(a.u16[1]) * widen(b.i16[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_us_w4x_h((v8u16) a, (v8i16) b);

__m128i __msa2_vmult_hadd_adj4_us_w4x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hadd_adj4_us_w4x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hadd.adj4.us.w4x.w
Builtin: __builtin_msa2_vmult_hadd_adj4_us_w4x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:2686

Description

Widen corresponding unsigned lanes of a and signed lanes of b to four times their width and multiply them at that width. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u32[0]) * widen(b.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hadd_adj4_us_w4x_w((v4u32) a, (v4i32) b);

__m128i __msa2_vmult_hi_acc_s_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_s_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.s.w2x.b
Builtin: __builtin_msa2_vmult_hi_acc_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4614

Description

Widen the upper-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = a.i16[0] + widen(b.i8[8]) * widen(c.i8[8]);
dst.i16[1] = a.i16[1] + widen(b.i8[9]) * widen(c.i8[9]);
dst.i16[2] = a.i16[2] + widen(b.i8[10]) * widen(c.i8[10]);
dst.i16[3] = a.i16[3] + widen(b.i8[11]) * widen(c.i8[11]);
dst.i16[4] = a.i16[4] + widen(b.i8[12]) * widen(c.i8[12]);
dst.i16[5] = a.i16[5] + widen(b.i8[13]) * widen(c.i8[13]);
dst.i16[6] = a.i16[6] + widen(b.i8[14]) * widen(c.i8[14]);
dst.i16[7] = a.i16[7] + widen(b.i8[15]) * widen(c.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_s_w2x_b((v8i16) a, (v16i8) b, (v16i8) c);

__m128i __msa2_vmult_hi_acc_s_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_s_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.s.w2x.d
Builtin: __builtin_msa2_vmult_hi_acc_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4674

Description

Widen the upper 64-bit lane of b and c to 128 bits, multiply them, and add the product to the 128-bit accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = a.i128[0] + widen(b.i64[1]) * widen(c.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_s_w2x_d((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_vmult_hi_acc_s_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_s_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.s.w2x.h
Builtin: __builtin_msa2_vmult_hi_acc_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4634

Description

Widen the upper-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = a.i32[0] + widen(b.i16[4]) * widen(c.i16[4]);
dst.i32[1] = a.i32[1] + widen(b.i16[5]) * widen(c.i16[5]);
dst.i32[2] = a.i32[2] + widen(b.i16[6]) * widen(c.i16[6]);
dst.i32[3] = a.i32[3] + widen(b.i16[7]) * widen(c.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_s_w2x_h((v4i32) a, (v8i16) b, (v8i16) c);

__m128i __msa2_vmult_hi_acc_s_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_s_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.s.w2x.w
Builtin: __builtin_msa2_vmult_hi_acc_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4654

Description

Widen the upper-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = a.i64[0] + widen(b.i32[2]) * widen(c.i32[2]);
dst.i64[1] = a.i64[1] + widen(b.i32[3]) * widen(c.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_s_w2x_w((v2i64) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vmult_hi_acc_u_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_u_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.u.w2x.b
Builtin: __builtin_msa2_vmult_hi_acc_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4694

Description

Widen the upper-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = a.u16[0] + widen(b.u8[8]) * widen(c.u8[8]);
dst.u16[1] = a.u16[1] + widen(b.u8[9]) * widen(c.u8[9]);
dst.u16[2] = a.u16[2] + widen(b.u8[10]) * widen(c.u8[10]);
dst.u16[3] = a.u16[3] + widen(b.u8[11]) * widen(c.u8[11]);
dst.u16[4] = a.u16[4] + widen(b.u8[12]) * widen(c.u8[12]);
dst.u16[5] = a.u16[5] + widen(b.u8[13]) * widen(c.u8[13]);
dst.u16[6] = a.u16[6] + widen(b.u8[14]) * widen(c.u8[14]);
dst.u16[7] = a.u16[7] + widen(b.u8[15]) * widen(c.u8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_u_w2x_b((v8u16) a, (v16u8) b, (v16u8) c);

__m128i __msa2_vmult_hi_acc_u_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_u_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.u.w2x.d
Builtin: __builtin_msa2_vmult_hi_acc_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4754

Description

Widen the upper 64-bit lane of b and c to 128 bits, multiply them, and add the product to the 128-bit accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.u64[1]) * widen(c.u64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_u_w2x_d((v2u64) a, (v2u64) b, (v2u64) c);

__m128i __msa2_vmult_hi_acc_u_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_u_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.u.w2x.h
Builtin: __builtin_msa2_vmult_hi_acc_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4714

Description

Widen the upper-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = a.u32[0] + widen(b.u16[4]) * widen(c.u16[4]);
dst.u32[1] = a.u32[1] + widen(b.u16[5]) * widen(c.u16[5]);
dst.u32[2] = a.u32[2] + widen(b.u16[6]) * widen(c.u16[6]);
dst.u32[3] = a.u32[3] + widen(b.u16[7]) * widen(c.u16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_u_w2x_h((v4u32) a, (v8u16) b, (v8u16) c);

__m128i __msa2_vmult_hi_acc_u_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_u_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.u.w2x.w
Builtin: __builtin_msa2_vmult_hi_acc_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4734

Description

Widen the upper-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = a.u64[0] + widen(b.u32[2]) * widen(c.u32[2]);
dst.u64[1] = a.u64[1] + widen(b.u32[3]) * widen(c.u32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_u_w2x_w((v2u64) a, (v4u32) b, (v4u32) c);

__m128i __msa2_vmult_hi_acc_us_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_us_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.us.w2x.b
Builtin: __builtin_msa2_vmult_hi_acc_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4774

Description

Widen the upper-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = a.u16[0] + widen(b.u8[8]) * widen(c.i8[8]);
dst.u16[1] = a.u16[1] + widen(b.u8[9]) * widen(c.i8[9]);
dst.u16[2] = a.u16[2] + widen(b.u8[10]) * widen(c.i8[10]);
dst.u16[3] = a.u16[3] + widen(b.u8[11]) * widen(c.i8[11]);
dst.u16[4] = a.u16[4] + widen(b.u8[12]) * widen(c.i8[12]);
dst.u16[5] = a.u16[5] + widen(b.u8[13]) * widen(c.i8[13]);
dst.u16[6] = a.u16[6] + widen(b.u8[14]) * widen(c.i8[14]);
dst.u16[7] = a.u16[7] + widen(b.u8[15]) * widen(c.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_us_w2x_b((v8u16) a, (v16u8) b, (v16i8) c);

__m128i __msa2_vmult_hi_acc_us_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_us_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.us.w2x.d
Builtin: __builtin_msa2_vmult_hi_acc_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4834

Description

Widen the upper 64-bit lane of b and c to 128 bits, multiply them, and add the product to the 128-bit accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.u64[1]) * widen(c.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_us_w2x_d((v2u64) a, (v2u64) b, (v2i64) c);

__m128i __msa2_vmult_hi_acc_us_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_us_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.us.w2x.h
Builtin: __builtin_msa2_vmult_hi_acc_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4794

Description

Widen the upper-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = a.u32[0] + widen(b.u16[4]) * widen(c.i16[4]);
dst.u32[1] = a.u32[1] + widen(b.u16[5]) * widen(c.i16[5]);
dst.u32[2] = a.u32[2] + widen(b.u16[6]) * widen(c.i16[6]);
dst.u32[3] = a.u32[3] + widen(b.u16[7]) * widen(c.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_us_w2x_h((v4u32) a, (v8u16) b, (v8i16) c);

__m128i __msa2_vmult_hi_acc_us_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_hi_acc_us_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.hi.acc.us.w2x.w
Builtin: __builtin_msa2_vmult_hi_acc_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4814

Description

Widen the upper-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = a.u64[0] + widen(b.u32[2]) * widen(c.i32[2]);
dst.u64[1] = a.u64[1] + widen(b.u32[3]) * widen(c.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_acc_us_w2x_w((v2u64) a, (v4u32) b, (v4i32) c);

__m128i __msa2_vmult_hi_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.s.w2x.b
Builtin: __builtin_msa2_vmult_hi_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4123

Description

Widen the upper-half lanes of a and b and multiply them in the wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[8]) * widen(b.i8[8]);
dst.i16[1] = widen(a.i8[9]) * widen(b.i8[9]);
dst.i16[2] = widen(a.i8[10]) * widen(b.i8[10]);
dst.i16[3] = widen(a.i8[11]) * widen(b.i8[11]);
dst.i16[4] = widen(a.i8[12]) * widen(b.i8[12]);
dst.i16[5] = widen(a.i8[13]) * widen(b.i8[13]);
dst.i16[6] = widen(a.i8[14]) * widen(b.i8[14]);
dst.i16[7] = widen(a.i8[15]) * widen(b.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vmult_hi_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.s.w2x.d
Builtin: __builtin_msa2_vmult_hi_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4186

Description

Widen the upper 64-bit lane of a and b to 128 bits and multiply them. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[1]) * widen(b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vmult_hi_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.s.w2x.h
Builtin: __builtin_msa2_vmult_hi_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4144

Description

Widen the upper-half lanes of a and b and multiply them in the wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[4]) * widen(b.i16[4]);
dst.i32[1] = widen(a.i16[5]) * widen(b.i16[5]);
dst.i32[2] = widen(a.i16[6]) * widen(b.i16[6]);
dst.i32[3] = widen(a.i16[7]) * widen(b.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vmult_hi_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.s.w2x.w
Builtin: __builtin_msa2_vmult_hi_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4165

Description

Widen the upper-half lanes of a and b and multiply them in the wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[2]) * widen(b.i32[2]);
dst.i64[1] = widen(a.i32[3]) * widen(b.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmult_hi_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.u.w2x.b
Builtin: __builtin_msa2_vmult_hi_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4207

Description

Widen the upper-half lanes of a and b and multiply them in the wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[8]) * widen(b.u8[8]);
dst.u16[1] = widen(a.u8[9]) * widen(b.u8[9]);
dst.u16[2] = widen(a.u8[10]) * widen(b.u8[10]);
dst.u16[3] = widen(a.u8[11]) * widen(b.u8[11]);
dst.u16[4] = widen(a.u8[12]) * widen(b.u8[12]);
dst.u16[5] = widen(a.u8[13]) * widen(b.u8[13]);
dst.u16[6] = widen(a.u8[14]) * widen(b.u8[14]);
dst.u16[7] = widen(a.u8[15]) * widen(b.u8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vmult_hi_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.u.w2x.d
Builtin: __builtin_msa2_vmult_hi_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4270

Description

Widen the upper 64-bit lane of a and b to 128 bits and multiply them. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[1]) * widen(b.u64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vmult_hi_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.u.w2x.h
Builtin: __builtin_msa2_vmult_hi_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4228

Description

Widen the upper-half lanes of a and b and multiply them in the wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[4]) * widen(b.u16[4]);
dst.u32[1] = widen(a.u16[5]) * widen(b.u16[5]);
dst.u32[2] = widen(a.u16[6]) * widen(b.u16[6]);
dst.u32[3] = widen(a.u16[7]) * widen(b.u16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vmult_hi_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.u.w2x.w
Builtin: __builtin_msa2_vmult_hi_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4249

Description

Widen the upper-half lanes of a and b and multiply them in the wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[2]) * widen(b.u32[2]);
dst.u64[1] = widen(a.u32[3]) * widen(b.u32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vmult_hi_us_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_us_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.us.w2x.b
Builtin: __builtin_msa2_vmult_hi_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4291

Description

Widen the upper-half lanes of a and b and multiply them in the wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[8]) * widen(b.i8[8]);
dst.u16[1] = widen(a.u8[9]) * widen(b.i8[9]);
dst.u16[2] = widen(a.u8[10]) * widen(b.i8[10]);
dst.u16[3] = widen(a.u8[11]) * widen(b.i8[11]);
dst.u16[4] = widen(a.u8[12]) * widen(b.i8[12]);
dst.u16[5] = widen(a.u8[13]) * widen(b.i8[13]);
dst.u16[6] = widen(a.u8[14]) * widen(b.i8[14]);
dst.u16[7] = widen(a.u8[15]) * widen(b.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_us_w2x_b((v16u8) a, (v16i8) b);

__m128i __msa2_vmult_hi_us_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_us_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.us.w2x.d
Builtin: __builtin_msa2_vmult_hi_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4354

Description

Widen the upper 64-bit lane of a and b to 128 bits and multiply them. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[1]) * widen(b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_us_w2x_d((v2u64) a, (v2i64) b);

__m128i __msa2_vmult_hi_us_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_us_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.us.w2x.h
Builtin: __builtin_msa2_vmult_hi_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4312

Description

Widen the upper-half lanes of a and b and multiply them in the wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[4]) * widen(b.i16[4]);
dst.u32[1] = widen(a.u16[5]) * widen(b.i16[5]);
dst.u32[2] = widen(a.u16[6]) * widen(b.i16[6]);
dst.u32[3] = widen(a.u16[7]) * widen(b.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_us_w2x_h((v8u16) a, (v8i16) b);

__m128i __msa2_vmult_hi_us_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_hi_us_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.hi.us.w2x.w
Builtin: __builtin_msa2_vmult_hi_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4333

Description

Widen the upper-half lanes of a and b and multiply them in the wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[2]) * widen(b.i32[2]);
dst.u64[1] = widen(a.u32[3]) * widen(b.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_hi_us_w2x_w((v4u32) a, (v4i32) b);

__m128i __msa2_vmult_lo_acc_s_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_s_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.s.w2x.b
Builtin: __builtin_msa2_vmult_lo_acc_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4374

Description

Widen the lower-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = a.i16[0] + widen(b.i8[0]) * widen(c.i8[0]);
dst.i16[1] = a.i16[1] + widen(b.i8[1]) * widen(c.i8[1]);
dst.i16[2] = a.i16[2] + widen(b.i8[2]) * widen(c.i8[2]);
dst.i16[3] = a.i16[3] + widen(b.i8[3]) * widen(c.i8[3]);
dst.i16[4] = a.i16[4] + widen(b.i8[4]) * widen(c.i8[4]);
dst.i16[5] = a.i16[5] + widen(b.i8[5]) * widen(c.i8[5]);
dst.i16[6] = a.i16[6] + widen(b.i8[6]) * widen(c.i8[6]);
dst.i16[7] = a.i16[7] + widen(b.i8[7]) * widen(c.i8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_s_w2x_b((v8i16) a, (v16i8) b, (v16i8) c);

__m128i __msa2_vmult_lo_acc_s_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_s_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.s.w2x.d
Builtin: __builtin_msa2_vmult_lo_acc_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4434

Description

Widen the lower 64-bit lane of b and c to 128 bits, multiply them, and add the product to the 128-bit accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = a.i128[0] + widen(b.i64[0]) * widen(c.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_s_w2x_d((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_vmult_lo_acc_s_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_s_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.s.w2x.h
Builtin: __builtin_msa2_vmult_lo_acc_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4394

Description

Widen the lower-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = a.i32[0] + widen(b.i16[0]) * widen(c.i16[0]);
dst.i32[1] = a.i32[1] + widen(b.i16[1]) * widen(c.i16[1]);
dst.i32[2] = a.i32[2] + widen(b.i16[2]) * widen(c.i16[2]);
dst.i32[3] = a.i32[3] + widen(b.i16[3]) * widen(c.i16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_s_w2x_h((v4i32) a, (v8i16) b, (v8i16) c);

__m128i __msa2_vmult_lo_acc_s_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_s_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.s.w2x.w
Builtin: __builtin_msa2_vmult_lo_acc_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4414

Description

Widen the lower-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = a.i64[0] + widen(b.i32[0]) * widen(c.i32[0]);
dst.i64[1] = a.i64[1] + widen(b.i32[1]) * widen(c.i32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_s_w2x_w((v2i64) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vmult_lo_acc_u_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_u_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.u.w2x.b
Builtin: __builtin_msa2_vmult_lo_acc_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4454

Description

Widen the lower-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = a.u16[0] + widen(b.u8[0]) * widen(c.u8[0]);
dst.u16[1] = a.u16[1] + widen(b.u8[1]) * widen(c.u8[1]);
dst.u16[2] = a.u16[2] + widen(b.u8[2]) * widen(c.u8[2]);
dst.u16[3] = a.u16[3] + widen(b.u8[3]) * widen(c.u8[3]);
dst.u16[4] = a.u16[4] + widen(b.u8[4]) * widen(c.u8[4]);
dst.u16[5] = a.u16[5] + widen(b.u8[5]) * widen(c.u8[5]);
dst.u16[6] = a.u16[6] + widen(b.u8[6]) * widen(c.u8[6]);
dst.u16[7] = a.u16[7] + widen(b.u8[7]) * widen(c.u8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_u_w2x_b((v8u16) a, (v16u8) b, (v16u8) c);

__m128i __msa2_vmult_lo_acc_u_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_u_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.u.w2x.d
Builtin: __builtin_msa2_vmult_lo_acc_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4514

Description

Widen the lower 64-bit lane of b and c to 128 bits, multiply them, and add the product to the 128-bit accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.u64[0]) * widen(c.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_u_w2x_d((v2u64) a, (v2u64) b, (v2u64) c);

__m128i __msa2_vmult_lo_acc_u_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_u_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.u.w2x.h
Builtin: __builtin_msa2_vmult_lo_acc_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4474

Description

Widen the lower-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = a.u32[0] + widen(b.u16[0]) * widen(c.u16[0]);
dst.u32[1] = a.u32[1] + widen(b.u16[1]) * widen(c.u16[1]);
dst.u32[2] = a.u32[2] + widen(b.u16[2]) * widen(c.u16[2]);
dst.u32[3] = a.u32[3] + widen(b.u16[3]) * widen(c.u16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_u_w2x_h((v4u32) a, (v8u16) b, (v8u16) c);

__m128i __msa2_vmult_lo_acc_u_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_u_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.u.w2x.w
Builtin: __builtin_msa2_vmult_lo_acc_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4494

Description

Widen the lower-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = a.u64[0] + widen(b.u32[0]) * widen(c.u32[0]);
dst.u64[1] = a.u64[1] + widen(b.u32[1]) * widen(c.u32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_u_w2x_w((v2u64) a, (v4u32) b, (v4u32) c);

__m128i __msa2_vmult_lo_acc_us_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_us_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.us.w2x.b
Builtin: __builtin_msa2_vmult_lo_acc_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4534

Description

Widen the lower-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = a.u16[0] + widen(b.u8[0]) * widen(c.i8[0]);
dst.u16[1] = a.u16[1] + widen(b.u8[1]) * widen(c.i8[1]);
dst.u16[2] = a.u16[2] + widen(b.u8[2]) * widen(c.i8[2]);
dst.u16[3] = a.u16[3] + widen(b.u8[3]) * widen(c.i8[3]);
dst.u16[4] = a.u16[4] + widen(b.u8[4]) * widen(c.i8[4]);
dst.u16[5] = a.u16[5] + widen(b.u8[5]) * widen(c.i8[5]);
dst.u16[6] = a.u16[6] + widen(b.u8[6]) * widen(c.i8[6]);
dst.u16[7] = a.u16[7] + widen(b.u8[7]) * widen(c.i8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_us_w2x_b((v8u16) a, (v16u8) b, (v16i8) c);

__m128i __msa2_vmult_lo_acc_us_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_us_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.us.w2x.d
Builtin: __builtin_msa2_vmult_lo_acc_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4594

Description

Widen the lower 64-bit lane of b and c to 128 bits, multiply them, and add the product to the 128-bit accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.u64[0]) * widen(c.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_us_w2x_d((v2u64) a, (v2u64) b, (v2i64) c);

__m128i __msa2_vmult_lo_acc_us_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_us_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.us.w2x.h
Builtin: __builtin_msa2_vmult_lo_acc_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4554

Description

Widen the lower-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = a.u32[0] + widen(b.u16[0]) * widen(c.i16[0]);
dst.u32[1] = a.u32[1] + widen(b.u16[1]) * widen(c.i16[1]);
dst.u32[2] = a.u32[2] + widen(b.u16[2]) * widen(c.i16[2]);
dst.u32[3] = a.u32[3] + widen(b.u16[3]) * widen(c.i16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_us_w2x_h((v4u32) a, (v8u16) b, (v8i16) c);

__m128i __msa2_vmult_lo_acc_us_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_lo_acc_us_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.lo.acc.us.w2x.w
Builtin: __builtin_msa2_vmult_lo_acc_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4574

Description

Widen the lower-half lanes of b and c, multiply them in the wider lanes, and add each product to the corresponding lane of the accumulator a. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = a.u64[0] + widen(b.u32[0]) * widen(c.i32[0]);
dst.u64[1] = a.u64[1] + widen(b.u32[1]) * widen(c.i32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_acc_us_w2x_w((v2u64) a, (v4u32) b, (v4i32) c);

__m128i __msa2_vmult_lo_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.s.w2x.b
Builtin: __builtin_msa2_vmult_lo_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3871

Description

Widen the lower-half lanes of a and b and multiply them in the wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[0]) * widen(b.i8[0]);
dst.i16[1] = widen(a.i8[1]) * widen(b.i8[1]);
dst.i16[2] = widen(a.i8[2]) * widen(b.i8[2]);
dst.i16[3] = widen(a.i8[3]) * widen(b.i8[3]);
dst.i16[4] = widen(a.i8[4]) * widen(b.i8[4]);
dst.i16[5] = widen(a.i8[5]) * widen(b.i8[5]);
dst.i16[6] = widen(a.i8[6]) * widen(b.i8[6]);
dst.i16[7] = widen(a.i8[7]) * widen(b.i8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vmult_lo_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.s.w2x.d
Builtin: __builtin_msa2_vmult_lo_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3934

Description

Widen the lower 64-bit lane of a and b to 128 bits and multiply them. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[0]) * widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vmult_lo_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.s.w2x.h
Builtin: __builtin_msa2_vmult_lo_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3892

Description

Widen the four lower-half signed 16-bit lanes of a and b to 32 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[0]) * widen(b.i16[0]);
dst.i32[1] = widen(a.i16[1]) * widen(b.i16[1]);
dst.i32[2] = widen(a.i16[2]) * widen(b.i16[2]);
dst.i32[3] = widen(a.i16[3]) * widen(b.i16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vmult_lo_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.s.w2x.w
Builtin: __builtin_msa2_vmult_lo_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3913

Description

Widen the two lower-half signed 32-bit lanes of a and b to 64 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[0]) * widen(b.i32[0]);
dst.i64[1] = widen(a.i32[1]) * widen(b.i32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmult_lo_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.u.w2x.b
Builtin: __builtin_msa2_vmult_lo_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3955

Description

Widen the eight lower-half unsigned 8-bit lanes of a and b to 16 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[0]) * widen(b.u8[0]);
dst.u16[1] = widen(a.u8[1]) * widen(b.u8[1]);
dst.u16[2] = widen(a.u8[2]) * widen(b.u8[2]);
dst.u16[3] = widen(a.u8[3]) * widen(b.u8[3]);
dst.u16[4] = widen(a.u8[4]) * widen(b.u8[4]);
dst.u16[5] = widen(a.u8[5]) * widen(b.u8[5]);
dst.u16[6] = widen(a.u8[6]) * widen(b.u8[6]);
dst.u16[7] = widen(a.u8[7]) * widen(b.u8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vmult_lo_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.u.w2x.d
Builtin: __builtin_msa2_vmult_lo_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4018

Description

Widen the lower-half unsigned 64-bit lane (lane 0) of a and b to 128 bits and multiply them. Forming the product at the wider width avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) * widen(b.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vmult_lo_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.u.w2x.h
Builtin: __builtin_msa2_vmult_lo_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3976

Description

Widen the four lower-half unsigned 16-bit lanes of a and b to 32 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) * widen(b.u16[0]);
dst.u32[1] = widen(a.u16[1]) * widen(b.u16[1]);
dst.u32[2] = widen(a.u16[2]) * widen(b.u16[2]);
dst.u32[3] = widen(a.u16[3]) * widen(b.u16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vmult_lo_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.u.w2x.w
Builtin: __builtin_msa2_vmult_lo_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3997

Description

Widen the two lower-half unsigned 32-bit lanes of a and b to 64 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) * widen(b.u32[0]);
dst.u64[1] = widen(a.u32[1]) * widen(b.u32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vmult_lo_us_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_us_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.us.w2x.b
Builtin: __builtin_msa2_vmult_lo_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4039

Description

Widen the eight lower-half 8-bit lanes of a (unsigned) and b (signed) to 16 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[0]) * widen(b.i8[0]);
dst.u16[1] = widen(a.u8[1]) * widen(b.i8[1]);
dst.u16[2] = widen(a.u8[2]) * widen(b.i8[2]);
dst.u16[3] = widen(a.u8[3]) * widen(b.i8[3]);
dst.u16[4] = widen(a.u8[4]) * widen(b.i8[4]);
dst.u16[5] = widen(a.u8[5]) * widen(b.i8[5]);
dst.u16[6] = widen(a.u8[6]) * widen(b.i8[6]);
dst.u16[7] = widen(a.u8[7]) * widen(b.i8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_us_w2x_b((v16u8) a, (v16i8) b);

__m128i __msa2_vmult_lo_us_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_us_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.us.w2x.d
Builtin: __builtin_msa2_vmult_lo_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4102

Description

Widen the lower-half 64-bit lane (lane 0) of a (unsigned) and b (signed) to 128 bits and multiply them. Forming the product at the wider width avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) * widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_us_w2x_d((v2u64) a, (v2i64) b);

__m128i __msa2_vmult_lo_us_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_us_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.us.w2x.h
Builtin: __builtin_msa2_vmult_lo_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4060

Description

Widen the four lower-half 16-bit lanes of a (unsigned) and b (signed) to 32 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) * widen(b.i16[0]);
dst.u32[1] = widen(a.u16[1]) * widen(b.i16[1]);
dst.u32[2] = widen(a.u16[2]) * widen(b.i16[2]);
dst.u32[3] = widen(a.u16[3]) * widen(b.i16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_us_w2x_h((v8u16) a, (v8i16) b);

__m128i __msa2_vmult_lo_us_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_lo_us_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.lo.us.w2x.w
Builtin: __builtin_msa2_vmult_lo_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4081

Description

Widen the two lower-half 32-bit lanes of a (unsigned) and b (signed) to 64 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) * widen(b.i32[0]);
dst.u64[1] = widen(a.u32[1]) * widen(b.i32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 5 1

Header Mapping

return (__m128i)__builtin_msa2_vmult_lo_us_w2x_w((v4u32) a, (v4i32) b);

__m128i __msa2_vmult_odd_acc_s_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_s_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.s.w2x.b
Builtin: __builtin_msa2_vmult_odd_acc_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3630

Description

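Widen the odd-numbered signed 8-bit lanes of b and c to 16 bits, multiply them lane by lane, and add each product to the corresponding 16-bit lane of a. Forming the products at the wider width avoids overflow from narrow intermediates.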

Operation

dst.i16[0] = a.i16[0] + widen(b.i8[1]) * widen(c.i8[1]);
dst.i16[1] = a.i16[1] + widen(b.i8[3]) * widen(c.i8[3]);
dst.i16[2] = a.i16[2] + widen(b.i8[5]) * widen(c.i8[5]);
dst.i16[3] = a.i16[3] + widen(b.i8[7]) * widen(c.i8[7]);
dst.i16[4] = a.i16[4] + widen(b.i8[9]) * widen(c.i8[9]);
dst.i16[5] = a.i16[5] + widen(b.i8[11]) * widen(c.i8[11]);
dst.i16[6] = a.i16[6] + widen(b.i8[13]) * widen(c.i8[13]);
dst.i16[7] = a.i16[7] + widen(b.i8[15]) * widen(c.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_s_w2x_b((v8i16) a, (v16i8) b, (v16i8) c);

__m128i __msa2_vmult_odd_acc_s_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_s_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.s.w2x.d
Builtin: __builtin_msa2_vmult_odd_acc_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3690

Description

Widen the odd-numbered signed 64-bit lane (lane 1) of b and c to 128 bits, multiply them, and add the product to the 128-bit value in a. Forming the product at the wider width avoids overflow from narrow intermediates.

Operation

dst.i128[0] = a.i128[0] + widen(b.i64[1]) * widen(c.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_s_w2x_d((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_vmult_odd_acc_s_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_s_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.s.w2x.h
Builtin: __builtin_msa2_vmult_odd_acc_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3650

Description

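Widen the odd-numbered signed 16-bit lanes of b and c to 32 bits, multiply them lane by lane, and add each product to the corresponding 32-bit lane of a. Forming the products at the wider width avoids overflow from narrow intermediates.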

Operation

dst.i32[0] = a.i32[0] + widen(b.i16[1]) * widen(c.i16[1]);
dst.i32[1] = a.i32[1] + widen(b.i16[3]) * widen(c.i16[3]);
dst.i32[2] = a.i32[2] + widen(b.i16[5]) * widen(c.i16[5]);
dst.i32[3] = a.i32[3] + widen(b.i16[7]) * widen(c.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_s_w2x_h((v4i32) a, (v8i16) b, (v8i16) c);

__m128i __msa2_vmult_odd_acc_s_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_s_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.s.w2x.w
Builtin: __builtin_msa2_vmult_odd_acc_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3670

Description

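Widen the odd-numbered signed 32-bit lanes of b and c to 64 bits, multiply them lane by lane, and add each product to the corresponding 64-bit lane of a. Forming the products at the wider width avoids overflow from narrow intermediates.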

Operation

dst.i64[0] = a.i64[0] + widen(b.i32[1]) * widen(c.i32[1]);
dst.i64[1] = a.i64[1] + widen(b.i32[3]) * widen(c.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_s_w2x_w((v2i64) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vmult_odd_acc_u_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_u_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.u.w2x.b
Builtin: __builtin_msa2_vmult_odd_acc_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3710

Description

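Widen the odd-numbered unsigned 8-bit lanes of b and c to 16 bits, multiply them lane by lane, and add each product to the corresponding 16-bit lane of a. Forming the products at the wider width avoids overflow from narrow intermediates.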

Operation

dst.u16[0] = a.u16[0] + widen(b.u8[1]) * widen(c.u8[1]);
dst.u16[1] = a.u16[1] + widen(b.u8[3]) * widen(c.u8[3]);
dst.u16[2] = a.u16[2] + widen(b.u8[5]) * widen(c.u8[5]);
dst.u16[3] = a.u16[3] + widen(b.u8[7]) * widen(c.u8[7]);
dst.u16[4] = a.u16[4] + widen(b.u8[9]) * widen(c.u8[9]);
dst.u16[5] = a.u16[5] + widen(b.u8[11]) * widen(c.u8[11]);
dst.u16[6] = a.u16[6] + widen(b.u8[13]) * widen(c.u8[13]);
dst.u16[7] = a.u16[7] + widen(b.u8[15]) * widen(c.u8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_u_w2x_b((v8u16) a, (v16u8) b, (v16u8) c);

__m128i __msa2_vmult_odd_acc_u_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_u_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.u.w2x.d
Builtin: __builtin_msa2_vmult_odd_acc_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3770

Description

Widen the odd-numbered unsigned 64-bit lane (lane 1) of b and c to 128 bits, multiply them, and add the product to the 128-bit value in a. Forming the product at the wider width avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.u64[1]) * widen(c.u64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_u_w2x_d((v2u64) a, (v2u64) b, (v2u64) c);

__m128i __msa2_vmult_odd_acc_u_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_u_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.u.w2x.h
Builtin: __builtin_msa2_vmult_odd_acc_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3730

Description

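Widen the odd-numbered unsigned 16-bit lanes of b and c to 32 bits, multiply them lane by lane, and add each product to the corresponding 32-bit lane of a. Forming the products at the wider width avoids overflow from narrow intermediates.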

Operation

dst.u32[0] = a.u32[0] + widen(b.u16[1]) * widen(c.u16[1]);
dst.u32[1] = a.u32[1] + widen(b.u16[3]) * widen(c.u16[3]);
dst.u32[2] = a.u32[2] + widen(b.u16[5]) * widen(c.u16[5]);
dst.u32[3] = a.u32[3] + widen(b.u16[7]) * widen(c.u16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_u_w2x_h((v4u32) a, (v8u16) b, (v8u16) c);

__m128i __msa2_vmult_odd_acc_u_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_u_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.u.w2x.w
Builtin: __builtin_msa2_vmult_odd_acc_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3750

Description

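Widen the odd-numbered unsigned 32-bit lanes of b and c to 64 bits, multiply them lane by lane, and add each product to the corresponding 64-bit lane of a. Forming the products at the wider width avoids overflow from narrow intermediates.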

Operation

dst.u64[0] = a.u64[0] + widen(b.u32[1]) * widen(c.u32[1]);
dst.u64[1] = a.u64[1] + widen(b.u32[3]) * widen(c.u32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_u_w2x_w((v2u64) a, (v4u32) b, (v4u32) c);

__m128i __msa2_vmult_odd_acc_us_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_us_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.us.w2x.b
Builtin: __builtin_msa2_vmult_odd_acc_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3790

Description

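Widen the odd-numbered 8-bit lanes of b (unsigned) and c (signed) to 16 bits, multiply them lane by lane, and add each product to the corresponding 16-bit lane of a. Forming the products at the wider width avoids overflow from narrow intermediates.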

Operation

dst.u16[0] = a.u16[0] + widen(b.u8[1]) * widen(c.i8[1]);
dst.u16[1] = a.u16[1] + widen(b.u8[3]) * widen(c.i8[3]);
dst.u16[2] = a.u16[2] + widen(b.u8[5]) * widen(c.i8[5]);
dst.u16[3] = a.u16[3] + widen(b.u8[7]) * widen(c.i8[7]);
dst.u16[4] = a.u16[4] + widen(b.u8[9]) * widen(c.i8[9]);
dst.u16[5] = a.u16[5] + widen(b.u8[11]) * widen(c.i8[11]);
dst.u16[6] = a.u16[6] + widen(b.u8[13]) * widen(c.i8[13]);
dst.u16[7] = a.u16[7] + widen(b.u8[15]) * widen(c.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_us_w2x_b((v8u16) a, (v16u8) b, (v16i8) c);

__m128i __msa2_vmult_odd_acc_us_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_us_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.us.w2x.d
Builtin: __builtin_msa2_vmult_odd_acc_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3850

Description

Widen the odd-numbered 64-bit lane (lane 1) of b (unsigned) and c (signed) to 128 bits, multiply them, and add the product to the 128-bit value in a. Forming the product at the wider width avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.u64[1]) * widen(c.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_us_w2x_d((v2u64) a, (v2u64) b, (v2i64) c);

__m128i __msa2_vmult_odd_acc_us_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_us_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.us.w2x.h
Builtin: __builtin_msa2_vmult_odd_acc_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3810

Description

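Widen the odd-numbered 16-bit lanes of b (unsigned) and c (signed) to 32 bits, multiply them lane by lane, and add each product to the corresponding 32-bit lane of a. Forming the products at the wider width avoids overflow from narrow intermediates.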

Operation

dst.u32[0] = a.u32[0] + widen(b.u16[1]) * widen(c.i16[1]);
dst.u32[1] = a.u32[1] + widen(b.u16[3]) * widen(c.i16[3]);
dst.u32[2] = a.u32[2] + widen(b.u16[5]) * widen(c.i16[5]);
dst.u32[3] = a.u32[3] + widen(b.u16[7]) * widen(c.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_us_w2x_h((v4u32) a, (v8u16) b, (v8i16) c);

__m128i __msa2_vmult_odd_acc_us_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmult_odd_acc_us_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmult.odd.acc.us.w2x.w
Builtin: __builtin_msa2_vmult_odd_acc_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3830

Description

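Widen the odd-numbered 32-bit lanes of b (unsigned) and c (signed) to 64 bits, multiply them lane by lane, and add each product to the corresponding 64-bit lane of a. Forming the products at the wider width avoids overflow from narrow intermediates.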

Operation

dst.u64[0] = a.u64[0] + widen(b.u32[1]) * widen(c.i32[1]);
dst.u64[1] = a.u64[1] + widen(b.u32[3]) * widen(c.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_acc_us_w2x_w((v2u64) a, (v4u32) b, (v4i32) c);

__m128i __msa2_vmult_odd_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.s.w2x.b
Builtin: __builtin_msa2_vmult_odd_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3139

Description

Widen the odd-numbered signed 8-bit lanes of a and b to 16 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[1]) * widen(b.i8[1]);
dst.i16[1] = widen(a.i8[3]) * widen(b.i8[3]);
dst.i16[2] = widen(a.i8[5]) * widen(b.i8[5]);
dst.i16[3] = widen(a.i8[7]) * widen(b.i8[7]);
dst.i16[4] = widen(a.i8[9]) * widen(b.i8[9]);
dst.i16[5] = widen(a.i8[11]) * widen(b.i8[11]);
dst.i16[6] = widen(a.i8[13]) * widen(b.i8[13]);
dst.i16[7] = widen(a.i8[15]) * widen(b.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vmult_odd_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.s.w2x.d
Builtin: __builtin_msa2_vmult_odd_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3202

Description

Widen the odd-numbered signed 64-bit lane (lane 1) of a and b to 128 bits and multiply them. Forming the product at the wider width avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[1]) * widen(b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vmult_odd_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.s.w2x.h
Builtin: __builtin_msa2_vmult_odd_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3160

Description

Widen the odd-numbered signed 16-bit lanes of a and b to 32 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[1]) * widen(b.i16[1]);
dst.i32[1] = widen(a.i16[3]) * widen(b.i16[3]);
dst.i32[2] = widen(a.i16[5]) * widen(b.i16[5]);
dst.i32[3] = widen(a.i16[7]) * widen(b.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vmult_odd_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.s.w2x.w
Builtin: __builtin_msa2_vmult_odd_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3181

Description

Widen the odd-numbered signed 32-bit lanes of a and b to 64 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[1]) * widen(b.i32[1]);
dst.i64[1] = widen(a.i32[3]) * widen(b.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmult_odd_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.u.w2x.b
Builtin: __builtin_msa2_vmult_odd_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3223

Description

Widen the odd-numbered unsigned 8-bit lanes of a and b to 16 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[1]) * widen(b.u8[1]);
dst.u16[1] = widen(a.u8[3]) * widen(b.u8[3]);
dst.u16[2] = widen(a.u8[5]) * widen(b.u8[5]);
dst.u16[3] = widen(a.u8[7]) * widen(b.u8[7]);
dst.u16[4] = widen(a.u8[9]) * widen(b.u8[9]);
dst.u16[5] = widen(a.u8[11]) * widen(b.u8[11]);
dst.u16[6] = widen(a.u8[13]) * widen(b.u8[13]);
dst.u16[7] = widen(a.u8[15]) * widen(b.u8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vmult_odd_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.u.w2x.d
Builtin: __builtin_msa2_vmult_odd_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3286

Description

Widen the odd-numbered unsigned 64-bit lane (lane 1) of a and b to 128 bits and multiply them. Forming the product at the wider width avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[1]) * widen(b.u64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vmult_odd_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.u.w2x.h
Builtin: __builtin_msa2_vmult_odd_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3244

Description

Widen the odd-numbered unsigned 16-bit lanes of a and b to 32 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[1]) * widen(b.u16[1]);
dst.u32[1] = widen(a.u16[3]) * widen(b.u16[3]);
dst.u32[2] = widen(a.u16[5]) * widen(b.u16[5]);
dst.u32[3] = widen(a.u16[7]) * widen(b.u16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vmult_odd_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.u.w2x.w
Builtin: __builtin_msa2_vmult_odd_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3265

Description

Widen the odd-numbered unsigned 32-bit lanes of a and b to 64 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[1]) * widen(b.u32[1]);
dst.u64[1] = widen(a.u32[3]) * widen(b.u32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vmult_odd_us_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_us_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.us.w2x.b
Builtin: __builtin_msa2_vmult_odd_us_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3307

Description

Widen the odd-numbered 8-bit lanes of a (unsigned) and b (signed) to 16 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[1]) * widen(b.i8[1]);
dst.u16[1] = widen(a.u8[3]) * widen(b.i8[3]);
dst.u16[2] = widen(a.u8[5]) * widen(b.i8[5]);
dst.u16[3] = widen(a.u8[7]) * widen(b.i8[7]);
dst.u16[4] = widen(a.u8[9]) * widen(b.i8[9]);
dst.u16[5] = widen(a.u8[11]) * widen(b.i8[11]);
dst.u16[6] = widen(a.u8[13]) * widen(b.i8[13]);
dst.u16[7] = widen(a.u8[15]) * widen(b.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_us_w2x_b((v16u8) a, (v16i8) b);

__m128i __msa2_vmult_odd_us_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_us_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.us.w2x.d
Builtin: __builtin_msa2_vmult_odd_us_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3370

Description

Widen the odd-numbered 64-bit lane (lane 1) of a (unsigned) and b (signed) to 128 bits and multiply them. Forming the product at the wider width avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[1]) * widen(b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_us_w2x_d((v2u64) a, (v2i64) b);

__m128i __msa2_vmult_odd_us_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_us_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.us.w2x.h
Builtin: __builtin_msa2_vmult_odd_us_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3328

Description

Widen the odd-numbered 16-bit lanes of a (unsigned) and b (signed) to 32 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[1]) * widen(b.i16[1]);
dst.u32[1] = widen(a.u16[3]) * widen(b.i16[3]);
dst.u32[2] = widen(a.u16[5]) * widen(b.i16[5]);
dst.u32[3] = widen(a.u16[7]) * widen(b.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_us_w2x_h((v8u16) a, (v8i16) b);

__m128i __msa2_vmult_odd_us_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmult_odd_us_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmult.odd.us.w2x.w
Builtin: __builtin_msa2_vmult_odd_us_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:3349

Description

Widen the odd-numbered 32-bit lanes of a (unsigned) and b (signed) to 64 bits and multiply them lane by lane. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[1]) * widen(b.i32[1]);
dst.u64[1] = widen(a.u32[3]) * widen(b.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 2

Header Mapping

return (__m128i)__builtin_msa2_vmult_odd_us_w2x_w((v4u32) a, (v4i32) b);

__m128i __msa2_vmultc_im_haddc_adjc2_acc_s_w4x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultc_im_haddc_adjc2_acc_s_w4x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultc.im.haddc.adjc2.acc.s.w4x.w
Builtin: __builtin_msa2_vmultc_im_haddc_adjc2_acc_s_w4x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4875

Description

Widen corresponding source lanes of b and c, multiply them in wider lanes, and accumulate the products into a. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.i128[0] = a.i128[0] + widen(b.i32[0]) * widen(c.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultc_im_haddc_adjc2_acc_s_w4x_w((v2i64) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vmultc_im_haddc_adjc2_s_w4x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultc_im_haddc_adjc2_s_w4x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultc.im.haddc.adjc2.s.w4x.w
Builtin: __builtin_msa2_vmultc_im_haddc_adjc2_s_w4x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4855

Description

Widen corresponding source lanes of a and b and multiply them in wider lanes. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i32[0]) * widen(b.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultc_im_haddc_adjc2_s_w4x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmultc_im_haddc_adjc4_acc_s_w4x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultc_im_haddc_adjc4_acc_s_w4x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultc.im.haddc.adjc4.acc.s.w4x.h
Builtin: __builtin_msa2_vmultc_im_haddc_adjc4_acc_s_w4x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4916

Description

Widen corresponding source lanes of b and c, multiply them in wider lanes, and accumulate the products into a. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.i64[0] = a.i64[0] + widen(b.i16[0]) * widen(c.i16[0]);
dst.i64[1] = a.i64[1] + widen(b.i16[1]) * widen(c.i16[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultc_im_haddc_adjc4_acc_s_w4x_h((v2i64) a, (v8i16) b, (v8i16) c);

__m128i __msa2_vmultc_im_haddc_adjc4_s_w4x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultc_im_haddc_adjc4_s_w4x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultc.im.haddc.adjc4.s.w4x.h
Builtin: __builtin_msa2_vmultc_im_haddc_adjc4_s_w4x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4896

Description

Widen corresponding source lanes of a and b and multiply them in wider lanes. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i16[0]) * widen(b.i16[0]);
dst.i64[1] = widen(a.i16[1]) * widen(b.i16[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultc_im_haddc_adjc4_s_w4x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vmultc_re_haddc_adjc2_acc_s_w4x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultc_re_haddc_adjc2_acc_s_w4x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultc.re.haddc.adjc2.acc.s.w4x.w
Builtin: __builtin_msa2_vmultc_re_haddc_adjc2_acc_s_w4x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4957

Description

Widen corresponding source lanes of b and c, multiply them in wider lanes, and accumulate the products into a. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.i128[0] = a.i128[0] + widen(b.i32[0]) * widen(c.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultc_re_haddc_adjc2_acc_s_w4x_w((v2i64) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vmultc_re_haddc_adjc2_s_w4x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultc_re_haddc_adjc2_s_w4x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultc.re.haddc.adjc2.s.w4x.w
Builtin: __builtin_msa2_vmultc_re_haddc_adjc2_s_w4x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4937

Description

Widen corresponding source lanes of a and b and multiply them in wider lanes. Forming the products at the wider width avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i32[0]) * widen(b.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultc_re_haddc_adjc2_s_w4x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmultc_re_haddc_adjc4_acc_s_w4x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultc_re_haddc_adjc4_acc_s_w4x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultc.re.haddc.adjc4.acc.s.w4x.h
Builtin: __builtin_msa2_vmultc_re_haddc_adjc4_acc_s_w4x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4998

Description

Widen corresponding source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = a.i64[0] + widen(b.i16[0]) * widen(c.i16[0]);
dst.i64[1] = a.i64[1] + widen(b.i16[1]) * widen(c.i16[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 1.16

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultc_re_haddc_adjc4_acc_s_w4x_h((v2i64) a, (v8i16) b, (v8i16) c);

__m128i __msa2_vmultc_re_haddc_adjc4_s_w4x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultc_re_haddc_adjc4_s_w4x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultc.re.haddc.adjc4.s.w4x.h
Builtin: __builtin_msa2_vmultc_re_haddc_adjc4_s_w4x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:4978

Description

Widen corresponding source lanes, multiply them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i16[0]) * widen(b.i16[0]);
dst.i64[1] = widen(a.i16[1]) * widen(b.i16[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 7 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultc_re_haddc_adjc4_s_w4x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vmultp_hi_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultp_hi_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultp.hi.w2x.b
Builtin: __builtin_msa2_vmultp_hi_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5144

Description

Widen upper-half source lanes, multiply them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.i8[8]) * widen(b.i8[8]);
dst.u16[1] = widen(a.i8[9]) * widen(b.i8[9]);
dst.u16[2] = widen(a.i8[10]) * widen(b.i8[10]);
dst.u16[3] = widen(a.i8[11]) * widen(b.i8[11]);
dst.u16[4] = widen(a.i8[12]) * widen(b.i8[12]);
dst.u16[5] = widen(a.i8[13]) * widen(b.i8[13]);
dst.u16[6] = widen(a.i8[14]) * widen(b.i8[14]);
dst.u16[7] = widen(a.i8[15]) * widen(b.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_hi_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vmultp_hi_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultp_hi_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultp.hi.w2x.d
Builtin: __builtin_msa2_vmultp_hi_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5207

Description

Widen upper-half source lanes, multiply them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.i64[1]) * widen(b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_hi_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vmultp_hi_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultp_hi_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultp.hi.w2x.h
Builtin: __builtin_msa2_vmultp_hi_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5165

Description

Widen upper-half source lanes, multiply them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.i16[4]) * widen(b.i16[4]);
dst.u32[1] = widen(a.i16[5]) * widen(b.i16[5]);
dst.u32[2] = widen(a.i16[6]) * widen(b.i16[6]);
dst.u32[3] = widen(a.i16[7]) * widen(b.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_hi_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vmultp_hi_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultp_hi_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultp.hi.w2x.w
Builtin: __builtin_msa2_vmultp_hi_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5186

Description

Widen upper-half source lanes, multiply them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.i32[2]) * widen(b.i32[2]);
dst.u64[1] = widen(a.i32[3]) * widen(b.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_hi_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmultp_hi_xacc_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultp_hi_xacc_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultp.hi.xacc.w2x.b
Builtin: __builtin_msa2_vmultp_hi_xacc_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5307

Description

Widen upper-half source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = a.u16[0] + widen(b.i8[8]) * widen(c.i8[8]);
dst.u16[1] = a.u16[1] + widen(b.i8[9]) * widen(c.i8[9]);
dst.u16[2] = a.u16[2] + widen(b.i8[10]) * widen(c.i8[10]);
dst.u16[3] = a.u16[3] + widen(b.i8[11]) * widen(c.i8[11]);
dst.u16[4] = a.u16[4] + widen(b.i8[12]) * widen(c.i8[12]);
dst.u16[5] = a.u16[5] + widen(b.i8[13]) * widen(c.i8[13]);
dst.u16[6] = a.u16[6] + widen(b.i8[14]) * widen(c.i8[14]);
dst.u16[7] = a.u16[7] + widen(b.i8[15]) * widen(c.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_hi_xacc_w2x_b((v8i16) a, (v16i8) b, (v16i8) c);

__m128i __msa2_vmultp_hi_xacc_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultp_hi_xacc_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultp.hi.xacc.w2x.d
Builtin: __builtin_msa2_vmultp_hi_xacc_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5367

Description

Widen upper-half source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.i64[1]) * widen(c.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_hi_xacc_w2x_d((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_vmultp_hi_xacc_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultp_hi_xacc_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultp.hi.xacc.w2x.h
Builtin: __builtin_msa2_vmultp_hi_xacc_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5327

Description

Widen upper-half source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = a.u32[0] + widen(b.i16[4]) * widen(c.i16[4]);
dst.u32[1] = a.u32[1] + widen(b.i16[5]) * widen(c.i16[5]);
dst.u32[2] = a.u32[2] + widen(b.i16[6]) * widen(c.i16[6]);
dst.u32[3] = a.u32[3] + widen(b.i16[7]) * widen(c.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_hi_xacc_w2x_h((v4i32) a, (v8i16) b, (v8i16) c);

__m128i __msa2_vmultp_hi_xacc_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultp_hi_xacc_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultp.hi.xacc.w2x.w
Builtin: __builtin_msa2_vmultp_hi_xacc_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5347

Description

Widen upper-half source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = a.u64[0] + widen(b.i32[2]) * widen(c.i32[2]);
dst.u64[1] = a.u64[1] + widen(b.i32[3]) * widen(c.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_hi_xacc_w2x_w((v2i64) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vmultp_hxor_adj2_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultp_hxor_adj2_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultp.hxor.adj2.w2x.d
Builtin: __builtin_msa2_vmultp_hxor_adj2_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5019

Description

Widen corresponding source lanes, multiply them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.i64[0]) * widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_hxor_adj2_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vmultp_hxor_adj2_xacc_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultp_hxor_adj2_xacc_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultp.hxor.adj2.xacc.w2x.d
Builtin: __builtin_msa2_vmultp_hxor_adj2_xacc_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5039

Description

Widen corresponding source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.i64[0]) * widen(c.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_hxor_adj2_xacc_w2x_d((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_vmultp_lo_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultp_lo_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultp.lo.w2x.b
Builtin: __builtin_msa2_vmultp_lo_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5060

Description

Widen lower-half source lanes, multiply them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.i8[0]) * widen(b.i8[0]);
dst.u16[1] = widen(a.i8[1]) * widen(b.i8[1]);
dst.u16[2] = widen(a.i8[2]) * widen(b.i8[2]);
dst.u16[3] = widen(a.i8[3]) * widen(b.i8[3]);
dst.u16[4] = widen(a.i8[4]) * widen(b.i8[4]);
dst.u16[5] = widen(a.i8[5]) * widen(b.i8[5]);
dst.u16[6] = widen(a.i8[6]) * widen(b.i8[6]);
dst.u16[7] = widen(a.i8[7]) * widen(b.i8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_lo_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vmultp_lo_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultp_lo_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultp.lo.w2x.d
Builtin: __builtin_msa2_vmultp_lo_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5123

Description

Widen lower-half source lanes, multiply them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.i64[0]) * widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_lo_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vmultp_lo_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultp_lo_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultp.lo.w2x.h
Builtin: __builtin_msa2_vmultp_lo_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5081

Description

Widen lower-half source lanes, multiply them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.i16[0]) * widen(b.i16[0]);
dst.u32[1] = widen(a.i16[1]) * widen(b.i16[1]);
dst.u32[2] = widen(a.i16[2]) * widen(b.i16[2]);
dst.u32[3] = widen(a.i16[3]) * widen(b.i16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_lo_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vmultp_lo_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vmultp_lo_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vmultp.lo.w2x.w
Builtin: __builtin_msa2_vmultp_lo_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5102

Description

Widen lower-half source lanes, multiply them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.i32[0]) * widen(b.i32[0]);
dst.u64[1] = widen(a.i32[1]) * widen(b.i32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_lo_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vmultp_lo_xacc_w2x_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultp_lo_xacc_w2x_b (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultp.lo.xacc.w2x.b
Builtin: __builtin_msa2_vmultp_lo_xacc_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5227

Description

Widen lower-half source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = a.u16[0] + widen(b.i8[0]) * widen(c.i8[0]);
dst.u16[1] = a.u16[1] + widen(b.i8[1]) * widen(c.i8[1]);
dst.u16[2] = a.u16[2] + widen(b.i8[2]) * widen(c.i8[2]);
dst.u16[3] = a.u16[3] + widen(b.i8[3]) * widen(c.i8[3]);
dst.u16[4] = a.u16[4] + widen(b.i8[4]) * widen(c.i8[4]);
dst.u16[5] = a.u16[5] + widen(b.i8[5]) * widen(c.i8[5]);
dst.u16[6] = a.u16[6] + widen(b.i8[6]) * widen(c.i8[6]);
dst.u16[7] = a.u16[7] + widen(b.i8[7]) * widen(c.i8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_lo_xacc_w2x_b((v8i16) a, (v16i8) b, (v16i8) c);

__m128i __msa2_vmultp_lo_xacc_w2x_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultp_lo_xacc_w2x_d (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultp.lo.xacc.w2x.d
Builtin: __builtin_msa2_vmultp_lo_xacc_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5287

Description

Widen lower-half source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = a.u128[0] + widen(b.i64[0]) * widen(c.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_lo_xacc_w2x_d((v2i64) a, (v2i64) b, (v2i64) c);

__m128i __msa2_vmultp_lo_xacc_w2x_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultp_lo_xacc_w2x_h (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultp.lo.xacc.w2x.h
Builtin: __builtin_msa2_vmultp_lo_xacc_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5247

Description

Widen lower-half source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = a.u32[0] + widen(b.i16[0]) * widen(c.i16[0]);
dst.u32[1] = a.u32[1] + widen(b.i16[1]) * widen(c.i16[1]);
dst.u32[2] = a.u32[2] + widen(b.i16[2]) * widen(c.i16[2]);
dst.u32[3] = a.u32[3] + widen(b.i16[3]) * widen(c.i16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_lo_xacc_w2x_h((v4i32) a, (v8i16) b, (v8i16) c);

__m128i __msa2_vmultp_lo_xacc_w2x_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __msa2_vmultp_lo_xacc_w2x_w (__m128i a, __m128i b, __m128i c)
#include <msa2.h>
Instruction: vmultp.lo.xacc.w2x.w
Builtin: __builtin_msa2_vmultp_lo_xacc_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5267

Description

Widen lower-half source lanes, multiply them in wider lanes and accumulate into a. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = a.u64[0] + widen(b.i32[0]) * widen(c.i32[0]);
dst.u64[1] = a.u64[1] + widen(b.i32[1]) * widen(c.i32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 4 1

Notes: 3A4000(GS464V): partial SIGILL.

Header Mapping

return (__m128i)__builtin_msa2_vmultp_lo_xacc_w2x_w((v2i64) a, (v4i32) b, (v4i32) c);

__m128i __msa2_vsub_el0_q (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_el0_q (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.el0.q
Builtin: __builtin_msa2_vsub_el0_q
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:398

Description

Subtract the integer lanes of b from the corresponding lanes of a using modular (wrap-around) arithmetic, operating on a single u128 lane; immediate forms subtract the scalar immediate.

Operation

dst.u128[0] = a.u128[0] - b.u128[0];

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i) __builtin_msa2_vsub_el0_q ((v2i64) a, (v2i64) b);

__m128i __msa2_vsub_el0_s_wx_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_el0_s_wx_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.el0.s.wx.d
Builtin: __builtin_msa2_vsub_el0_s_wx_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5430

Description

Widen lane 0 source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[0]) - widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_el0_s_wx_d((v2i64) a, (v2i64) b);

__m128i __msa2_vsub_el0_s_wx_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_el0_s_wx_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.el0.s.wx.h
Builtin: __builtin_msa2_vsub_el0_s_wx_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5388

Description

Widen lane 0 source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[0]) - widen(b.i16[0]);
dst.i32[1] = widen(a.i16[0]) - widen(b.i16[0]);
dst.i32[2] = widen(a.i16[0]) - widen(b.i16[0]);
dst.i32[3] = widen(a.i16[0]) - widen(b.i16[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_el0_s_wx_h((v8i16) a, (v8i16) b);

__m128i __msa2_vsub_el0_s_wx_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_el0_s_wx_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.el0.s.wx.w
Builtin: __builtin_msa2_vsub_el0_s_wx_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5409

Description

Widen lane 0 source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[0]) - widen(b.i32[0]);
dst.i64[1] = widen(a.i32[0]) - widen(b.i32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_el0_s_wx_w((v4i32) a, (v4i32) b);

__m128i __msa2_vsub_el0_u_wx_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_el0_u_wx_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.el0.u.wx.d
Builtin: __builtin_msa2_vsub_el0_u_wx_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5493

Description

Widen lane 0 source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) - widen(b.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_el0_u_wx_d((v2u64) a, (v2u64) b);

__m128i __msa2_vsub_el0_u_wx_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_el0_u_wx_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.el0.u.wx.h
Builtin: __builtin_msa2_vsub_el0_u_wx_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5451

Description

Widen lane 0 source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) - widen(b.u16[0]);
dst.u32[1] = widen(a.u16[0]) - widen(b.u16[0]);
dst.u32[2] = widen(a.u16[0]) - widen(b.u16[0]);
dst.u32[3] = widen(a.u16[0]) - widen(b.u16[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_el0_u_wx_h((v8u16) a, (v8u16) b);

__m128i __msa2_vsub_el0_u_wx_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_el0_u_wx_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.el0.u.wx.w
Builtin: __builtin_msa2_vsub_el0_u_wx_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5472

Description

Widen lane 0 source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) - widen(b.u32[0]);
dst.u64[1] = widen(a.u32[0]) - widen(b.u32[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_el0_u_wx_w((v4u32) a, (v4u32) b);

__m128i __msa2_vsub_even_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_even_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.even.s.w2x.b
Builtin: __builtin_msa2_vsub_even_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5514

Description

Widen even-numbered source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[0]) - widen(b.i8[0]);
dst.i16[1] = widen(a.i8[2]) - widen(b.i8[2]);
dst.i16[2] = widen(a.i8[4]) - widen(b.i8[4]);
dst.i16[3] = widen(a.i8[6]) - widen(b.i8[6]);
dst.i16[4] = widen(a.i8[8]) - widen(b.i8[8]);
dst.i16[5] = widen(a.i8[10]) - widen(b.i8[10]);
dst.i16[6] = widen(a.i8[12]) - widen(b.i8[12]);
dst.i16[7] = widen(a.i8[14]) - widen(b.i8[14]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_even_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vsub_even_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_even_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.even.s.w2x.d
Builtin: __builtin_msa2_vsub_even_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5577

Description

Widen even-numbered source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[0]) - widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_even_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vsub_even_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_even_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.even.s.w2x.h
Builtin: __builtin_msa2_vsub_even_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5535

Description

Widen even-numbered source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[0]) - widen(b.i16[0]);
dst.i32[1] = widen(a.i16[2]) - widen(b.i16[2]);
dst.i32[2] = widen(a.i16[4]) - widen(b.i16[4]);
dst.i32[3] = widen(a.i16[6]) - widen(b.i16[6]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_even_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vsub_even_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_even_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.even.s.w2x.w
Builtin: __builtin_msa2_vsub_even_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5556

Description

Widen even-numbered source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[0]) - widen(b.i32[0]);
dst.i64[1] = widen(a.i32[2]) - widen(b.i32[2]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_even_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vsub_even_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_even_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.even.u.w2x.b
Builtin: __builtin_msa2_vsub_even_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5598

Description

Widen even-numbered source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[0]) - widen(b.u8[0]);
dst.u16[1] = widen(a.u8[2]) - widen(b.u8[2]);
dst.u16[2] = widen(a.u8[4]) - widen(b.u8[4]);
dst.u16[3] = widen(a.u8[6]) - widen(b.u8[6]);
dst.u16[4] = widen(a.u8[8]) - widen(b.u8[8]);
dst.u16[5] = widen(a.u8[10]) - widen(b.u8[10]);
dst.u16[6] = widen(a.u8[12]) - widen(b.u8[12]);
dst.u16[7] = widen(a.u8[14]) - widen(b.u8[14]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_even_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vsub_even_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_even_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.even.u.w2x.d
Builtin: __builtin_msa2_vsub_even_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5661

Description

Widen even-numbered source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) - widen(b.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_even_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vsub_even_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_even_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.even.u.w2x.h
Builtin: __builtin_msa2_vsub_even_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5619

Description

Widen even-numbered source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) - widen(b.u16[0]);
dst.u32[1] = widen(a.u16[2]) - widen(b.u16[2]);
dst.u32[2] = widen(a.u16[4]) - widen(b.u16[4]);
dst.u32[3] = widen(a.u16[6]) - widen(b.u16[6]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_even_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vsub_even_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_even_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.even.u.w2x.w
Builtin: __builtin_msa2_vsub_even_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5640

Description

Widen even-numbered source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) - widen(b.u32[0]);
dst.u64[1] = widen(a.u32[2]) - widen(b.u32[2]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_even_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vsub_hi_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_hi_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.hi.s.w2x.b
Builtin: __builtin_msa2_vsub_hi_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:6018

Description

Widen upper-half source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[8]) - widen(b.i8[8]);
dst.i16[1] = widen(a.i8[9]) - widen(b.i8[9]);
dst.i16[2] = widen(a.i8[10]) - widen(b.i8[10]);
dst.i16[3] = widen(a.i8[11]) - widen(b.i8[11]);
dst.i16[4] = widen(a.i8[12]) - widen(b.i8[12]);
dst.i16[5] = widen(a.i8[13]) - widen(b.i8[13]);
dst.i16[6] = widen(a.i8[14]) - widen(b.i8[14]);
dst.i16[7] = widen(a.i8[15]) - widen(b.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_hi_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vsub_hi_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_hi_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.hi.s.w2x.d
Builtin: __builtin_msa2_vsub_hi_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:6081

Description

Widen upper-half source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[1]) - widen(b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_hi_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vsub_hi_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_hi_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.hi.s.w2x.h
Builtin: __builtin_msa2_vsub_hi_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:6039

Description

Widen upper-half source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[4]) - widen(b.i16[4]);
dst.i32[1] = widen(a.i16[5]) - widen(b.i16[5]);
dst.i32[2] = widen(a.i16[6]) - widen(b.i16[6]);
dst.i32[3] = widen(a.i16[7]) - widen(b.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_hi_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vsub_hi_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_hi_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.hi.s.w2x.w
Builtin: __builtin_msa2_vsub_hi_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:6060

Description

Widen upper-half source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[2]) - widen(b.i32[2]);
dst.i64[1] = widen(a.i32[3]) - widen(b.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_hi_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vsub_hi_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_hi_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.hi.u.w2x.b
Builtin: __builtin_msa2_vsub_hi_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:6102

Description

Widen upper-half source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[8]) - widen(b.u8[8]);
dst.u16[1] = widen(a.u8[9]) - widen(b.u8[9]);
dst.u16[2] = widen(a.u8[10]) - widen(b.u8[10]);
dst.u16[3] = widen(a.u8[11]) - widen(b.u8[11]);
dst.u16[4] = widen(a.u8[12]) - widen(b.u8[12]);
dst.u16[5] = widen(a.u8[13]) - widen(b.u8[13]);
dst.u16[6] = widen(a.u8[14]) - widen(b.u8[14]);
dst.u16[7] = widen(a.u8[15]) - widen(b.u8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_hi_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vsub_hi_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_hi_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.hi.u.w2x.d
Builtin: __builtin_msa2_vsub_hi_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:6165

Description

Widen upper-half source lanes, subtract them in wider lanes. This avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[1]) - widen(b.u64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_hi_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vsub_hi_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_hi_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.hi.u.w2x.h
Builtin: __builtin_msa2_vsub_hi_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:6123

Description

Widen the unsigned lanes in the upper half of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[4]) - widen(b.u16[4]);
dst.u32[1] = widen(a.u16[5]) - widen(b.u16[5]);
dst.u32[2] = widen(a.u16[6]) - widen(b.u16[6]);
dst.u32[3] = widen(a.u16[7]) - widen(b.u16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_hi_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vsub_hi_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_hi_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.hi.u.w2x.w
Builtin: __builtin_msa2_vsub_hi_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:6144

Description

Widen the unsigned lanes in the upper half of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[2]) - widen(b.u32[2]);
dst.u64[1] = widen(a.u32[3]) - widen(b.u32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_hi_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vsub_lo_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_lo_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.lo.s.w2x.b
Builtin: __builtin_msa2_vsub_lo_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5850

Description

Widen the signed lanes in the lower half of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[0]) - widen(b.i8[0]);
dst.i16[1] = widen(a.i8[1]) - widen(b.i8[1]);
dst.i16[2] = widen(a.i8[2]) - widen(b.i8[2]);
dst.i16[3] = widen(a.i8[3]) - widen(b.i8[3]);
dst.i16[4] = widen(a.i8[4]) - widen(b.i8[4]);
dst.i16[5] = widen(a.i8[5]) - widen(b.i8[5]);
dst.i16[6] = widen(a.i8[6]) - widen(b.i8[6]);
dst.i16[7] = widen(a.i8[7]) - widen(b.i8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_lo_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vsub_lo_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_lo_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.lo.s.w2x.d
Builtin: __builtin_msa2_vsub_lo_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5913

Description

Widen the lower signed 64-bit lane of each source to 128 bits, then subtract at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[0]) - widen(b.i64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_lo_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vsub_lo_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_lo_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.lo.s.w2x.h
Builtin: __builtin_msa2_vsub_lo_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5871

Description

Widen the signed lanes in the lower half of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[0]) - widen(b.i16[0]);
dst.i32[1] = widen(a.i16[1]) - widen(b.i16[1]);
dst.i32[2] = widen(a.i16[2]) - widen(b.i16[2]);
dst.i32[3] = widen(a.i16[3]) - widen(b.i16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_lo_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vsub_lo_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_lo_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.lo.s.w2x.w
Builtin: __builtin_msa2_vsub_lo_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5892

Description

Widen the signed lanes in the lower half of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[0]) - widen(b.i32[0]);
dst.i64[1] = widen(a.i32[1]) - widen(b.i32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_lo_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vsub_lo_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_lo_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.lo.u.w2x.b
Builtin: __builtin_msa2_vsub_lo_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5934

Description

Widen the unsigned lanes in the lower half of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[0]) - widen(b.u8[0]);
dst.u16[1] = widen(a.u8[1]) - widen(b.u8[1]);
dst.u16[2] = widen(a.u8[2]) - widen(b.u8[2]);
dst.u16[3] = widen(a.u8[3]) - widen(b.u8[3]);
dst.u16[4] = widen(a.u8[4]) - widen(b.u8[4]);
dst.u16[5] = widen(a.u8[5]) - widen(b.u8[5]);
dst.u16[6] = widen(a.u8[6]) - widen(b.u8[6]);
dst.u16[7] = widen(a.u8[7]) - widen(b.u8[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_lo_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vsub_lo_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_lo_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.lo.u.w2x.d
Builtin: __builtin_msa2_vsub_lo_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5997

Description

Widen the lower unsigned 64-bit lane of each source to 128 bits, then subtract at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[0]) - widen(b.u64[0]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_lo_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vsub_lo_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_lo_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.lo.u.w2x.h
Builtin: __builtin_msa2_vsub_lo_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5955

Description

Widen the unsigned lanes in the lower half of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[0]) - widen(b.u16[0]);
dst.u32[1] = widen(a.u16[1]) - widen(b.u16[1]);
dst.u32[2] = widen(a.u16[2]) - widen(b.u16[2]);
dst.u32[3] = widen(a.u16[3]) - widen(b.u16[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_lo_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vsub_lo_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_lo_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.lo.u.w2x.w
Builtin: __builtin_msa2_vsub_lo_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5976

Description

Widen the unsigned lanes in the lower half of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[0]) - widen(b.u32[0]);
dst.u64[1] = widen(a.u32[1]) - widen(b.u32[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 1

Header Mapping

return (__m128i)__builtin_msa2_vsub_lo_u_w2x_w((v4u32) a, (v4u32) b);

__m128i __msa2_vsub_odd_s_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_odd_s_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.odd.s.w2x.b
Builtin: __builtin_msa2_vsub_odd_s_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5682

Description

Widen the odd-numbered signed lanes of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.i16[0] = widen(a.i8[1]) - widen(b.i8[1]);
dst.i16[1] = widen(a.i8[3]) - widen(b.i8[3]);
dst.i16[2] = widen(a.i8[5]) - widen(b.i8[5]);
dst.i16[3] = widen(a.i8[7]) - widen(b.i8[7]);
dst.i16[4] = widen(a.i8[9]) - widen(b.i8[9]);
dst.i16[5] = widen(a.i8[11]) - widen(b.i8[11]);
dst.i16[6] = widen(a.i8[13]) - widen(b.i8[13]);
dst.i16[7] = widen(a.i8[15]) - widen(b.i8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_odd_s_w2x_b((v16i8) a, (v16i8) b);

__m128i __msa2_vsub_odd_s_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_odd_s_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.odd.s.w2x.d
Builtin: __builtin_msa2_vsub_odd_s_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5745

Description

Widen the odd-numbered signed 64-bit lane (lane 1) of each source to 128 bits, then subtract at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.i128[0] = widen(a.i64[1]) - widen(b.i64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_odd_s_w2x_d((v2i64) a, (v2i64) b);

__m128i __msa2_vsub_odd_s_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_odd_s_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.odd.s.w2x.h
Builtin: __builtin_msa2_vsub_odd_s_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5703

Description

Widen the odd-numbered signed lanes of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.i32[0] = widen(a.i16[1]) - widen(b.i16[1]);
dst.i32[1] = widen(a.i16[3]) - widen(b.i16[3]);
dst.i32[2] = widen(a.i16[5]) - widen(b.i16[5]);
dst.i32[3] = widen(a.i16[7]) - widen(b.i16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_odd_s_w2x_h((v8i16) a, (v8i16) b);

__m128i __msa2_vsub_odd_s_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_odd_s_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.odd.s.w2x.w
Builtin: __builtin_msa2_vsub_odd_s_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5724

Description

Widen the odd-numbered signed lanes of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.i64[0] = widen(a.i32[1]) - widen(b.i32[1]);
dst.i64[1] = widen(a.i32[3]) - widen(b.i32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_odd_s_w2x_w((v4i32) a, (v4i32) b);

__m128i __msa2_vsub_odd_u_w2x_b (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_odd_u_w2x_b (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.odd.u.w2x.b
Builtin: __builtin_msa2_vsub_odd_u_w2x_b
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5766

Description

Widen the odd-numbered unsigned lanes of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.u16[0] = widen(a.u8[1]) - widen(b.u8[1]);
dst.u16[1] = widen(a.u8[3]) - widen(b.u8[3]);
dst.u16[2] = widen(a.u8[5]) - widen(b.u8[5]);
dst.u16[3] = widen(a.u8[7]) - widen(b.u8[7]);
dst.u16[4] = widen(a.u8[9]) - widen(b.u8[9]);
dst.u16[5] = widen(a.u8[11]) - widen(b.u8[11]);
dst.u16[6] = widen(a.u8[13]) - widen(b.u8[13]);
dst.u16[7] = widen(a.u8[15]) - widen(b.u8[15]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_odd_u_w2x_b((v16u8) a, (v16u8) b);

__m128i __msa2_vsub_odd_u_w2x_d (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_odd_u_w2x_d (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.odd.u.w2x.d
Builtin: __builtin_msa2_vsub_odd_u_w2x_d
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5829

Description

Widen the odd-numbered unsigned 64-bit lane (lane 1) of each source to 128 bits, then subtract at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.u128[0] = widen(a.u64[1]) - widen(b.u64[1]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 3 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_odd_u_w2x_d((v2u64) a, (v2u64) b);

__m128i __msa2_vsub_odd_u_w2x_h (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_odd_u_w2x_h (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.odd.u.w2x.h
Builtin: __builtin_msa2_vsub_odd_u_w2x_h
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5787

Description

Widen the odd-numbered unsigned lanes of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.u32[0] = widen(a.u16[1]) - widen(b.u16[1]);
dst.u32[1] = widen(a.u16[3]) - widen(b.u16[3]);
dst.u32[2] = widen(a.u16[5]) - widen(b.u16[5]);
dst.u32[3] = widen(a.u16[7]) - widen(b.u16[7]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_odd_u_w2x_h((v8u16) a, (v8u16) b);

__m128i __msa2_vsub_odd_u_w2x_w (__m128i a, __m128i b)

Synopsis

__m128i __msa2_vsub_odd_u_w2x_w (__m128i a, __m128i b)
#include <msa2.h>
Instruction: vsub.odd.u.w2x.w
Builtin: __builtin_msa2_vsub_odd_u_w2x_w
CPU Flags: __mips_msa
Kind: function
Source: include/msa2.h:5808

Description

Widen the odd-numbered unsigned lanes of each source to twice their width, then subtract them lane by lane at the wider width. Widening before the subtraction avoids overflow from narrow intermediates.

Operation

dst.u64[0] = widen(a.u32[1]) - widen(b.u32[1]);
dst.u64[1] = widen(a.u32[3]) - widen(b.u32[3]);

Latency and Throughput

CPU µarch Latency Throughput (IPC)
3A4000 GS464V 2 2

Header Mapping

return (__m128i)__builtin_msa2_vsub_odd_u_w2x_w((v4u32) a, (v4u32) b);