Skip to content

Commit

Permalink
Implement relaxed simd in the interpreter
Browse files Browse the repository at this point in the history
Signed-off-by: Zoltan Herczeg [email protected]
  • Loading branch information
Zoltan Herczeg committed Oct 3, 2024
1 parent 8e3ae56 commit 3f3e275
Show file tree
Hide file tree
Showing 14 changed files with 1,103 additions and 75 deletions.
169 changes: 129 additions & 40 deletions src/interpreter/ByteCode.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ class FunctionType;
F(I64X2ExtmulHighI32X4S, (simdExtmulOperation<int32_t, int64_t, false>)) \
F(I64X2ExtmulLowI32X4U, (simdExtmulOperation<uint32_t, uint64_t, true>)) \
F(I64X2ExtmulHighI32X4U, (simdExtmulOperation<uint32_t, uint64_t, false>)) \
F(I32X4DotI16X8S, (simdDotOperation)) \
F(I32X4DotI16X8S, (simdDotOperation<int16_t, uint32_t>)) \
F(I8X16NarrowI16X8S, (simdNarrowOperation<int16_t, int8_t>)) \
F(I8X16NarrowI16X8U, (simdNarrowOperation<int16_t, uint8_t>)) \
F(I16X8NarrowI32X4S, (simdNarrowOperation<int32_t, int16_t>)) \
Expand Down Expand Up @@ -588,38 +588,79 @@ class FunctionType;
F(MemoryAtomicWait32) \
F(MemoryAtomicWait64) \
F(AtomicFence)

// Relaxed SIMD truncation opcodes that take one vector operand.
// Each entry is F(opcode name, (handler)); the handlers named here are the
// simdTruncSatOperation / simdTruncSatZeroOperation templates.
// This list is expanded with DEFINE_UNARY_BYTECODE and with
// SIMD_UNARY_OTHER_OPERATION.
// NOTE(review): the handler is parenthesized presumably so the comma inside
// the template argument list does not split the macro argument - confirm.
#define FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(F) \
F(I32X4RelaxedTruncF32X4S, (simdTruncSatOperation<float, int32_t>)) \
F(I32X4RelaxedTruncF32X4U, (simdTruncSatOperation<float, uint32_t>)) \
F(I32X4RelaxedTruncF64X2SZero, (simdTruncSatZeroOperation<double, int32_t>)) \
F(I32X4RelaxedTruncF64X2UZero, (simdTruncSatZeroOperation<double, uint32_t>))

// Relaxed SIMD opcodes that take two vector operands.
// Each entry carries four arguments: opcode name, helper function name and
// two scalar types.
// NOTE(review): the argument order appears to mirror
// SIMD_TERNARY_OPERATION(name, op, paramType, resultType), i.e. a per-lane
// helper followed by the parameter and result lane types - confirm against
// SIMD_BINARY_OPERATION.
// This list is expanded with DEFINE_BINARY_BYTECODE and SIMD_BINARY_OPERATION.
#define FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(F) \
F(F32X4RelaxedMin, floatMin, float, float) \
F(F32X4RelaxedMax, floatMax, float, float) \
F(F64X2RelaxedMin, floatMin, double, double) \
F(F64X2RelaxedMax, floatMax, double, double) \
F(I16X8RelaxedQ15mulrS, saturatingRoundingQMul, int16_t, int16_t)

// Relaxed SIMD opcodes with two vector operands that are served by a
// whole-vector handler function.
// Each entry is F(opcode name, (handler)); both entries reuse handler
// templates that the non-relaxed SIMD opcodes also use
// (simdSwizzleOperation, simdDotOperation).
// This list is expanded with DEFINE_BINARY_BYTECODE and
// SIMD_BINARY_OTHER_OPERATION.
#define FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(F) \
F(I8X16RelaxedSwizzle, (simdSwizzleOperation<uint8_t>)) \
F(I16X8DotI8X16I7X16S, (simdDotOperation<int8_t, uint16_t>))

// Relaxed SIMD multiply-add opcodes that take three vector operands.
// Each entry is F(opcode name, per-lane helper, parameter lane type,
// result lane type), matching the parameters of
// SIMD_TERNARY_OPERATION(name, op, paramType, resultType), which calls the
// helper once per lane as op(state, src0[i], src1[i], src2[i]).
// This list is also expanded with DEFINE_TERNARY_BYTECODE to generate the
// ByteCode classes.
#define FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(F) \
F(F32X4RelaxedMadd, floatMulAdd, float, float) \
F(F32X4RelaxedNmadd, floatNegMulAdd, float, float) \
F(F64X2RelaxedMadd, floatMulAdd, double, double) \
F(F64X2RelaxedNmadd, floatNegMulAdd, double, double)

// Relaxed SIMD opcodes with three vector operands that are served by a
// whole-vector handler function.
// Each entry is F(opcode name, (handler)); SIMD_TERNARY_OTHER_OPERATION
// invokes the handler as op(state, (TernaryOperation*)programCounter, bp).
// All four lane-select opcodes share simdBitSelectOperation, which selects
// bit by bit and is therefore independent of the lane width.
// This list is also expanded with DEFINE_TERNARY_BYTECODE to generate the
// ByteCode classes.
#define FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(F) \
F(I32X4DotI8X16I7X16AddS, (simdDotAddOperation)) \
F(I8X16RelaxedLaneSelect, (simdBitSelectOperation)) \
F(I16X8RelaxedLaneSelect, (simdBitSelectOperation)) \
F(I32X4RelaxedLaneSelect, (simdBitSelectOperation)) \
F(I64X2RelaxedLaneSelect, (simdBitSelectOperation))

#else // Extended Features
// Fallback definitions for the #else branch of the "Extended Features"
// conditional: every atomic and relaxed SIMD opcode list expands to nothing,
// so any macro applied to these lists generates no code.
#define FOR_EACH_BYTECODE_ATOMIC_LOAD_OP(F)
#define FOR_EACH_BYTECODE_ATOMIC_STORE_OP(F)
#define FOR_EACH_BYTECODE_ATOMIC_RMW_OP(F)
#define FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(F)
#define FOR_EACH_BYTECODE_ATOMIC_OTHER(F)
#define FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(F)
#define FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(F)
#define FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(F)
#define FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(F)
#define FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(F)
#endif // Extended Features

#define FOR_EACH_BYTECODE(F) \
FOR_EACH_BYTECODE_OP(F) \
FOR_EACH_BYTECODE_BINARY_OP(F) \
FOR_EACH_BYTECODE_UNARY_OP(F) \
FOR_EACH_BYTECODE_UNARY_OP_2(F) \
FOR_EACH_BYTECODE_LOAD_OP(F) \
FOR_EACH_BYTECODE_STORE_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_OP(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_SPLAT_OP(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_EXTEND_OP(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_STORE_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_EXTRACT_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_REPLACE_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_ETC_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_LOAD_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_STORE_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_RMW_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(F) \
#define FOR_EACH_BYTECODE(F) \
FOR_EACH_BYTECODE_OP(F) \
FOR_EACH_BYTECODE_BINARY_OP(F) \
FOR_EACH_BYTECODE_UNARY_OP(F) \
FOR_EACH_BYTECODE_UNARY_OP_2(F) \
FOR_EACH_BYTECODE_LOAD_OP(F) \
FOR_EACH_BYTECODE_STORE_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(F) \
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(F) \
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_OP(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(F) \
FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(F) \
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(F) \
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_SPLAT_OP(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_EXTEND_OP(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_STORE_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_EXTRACT_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_REPLACE_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_ETC_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_LOAD_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_STORE_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_RMW_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_OTHER(F)

class ByteCode {
Expand Down Expand Up @@ -734,6 +775,25 @@ class ByteCodeOffset2Value : public ByteCode {
uint32_t m_value;
};

// ByteCode that carries four stack offsets: three sources followed by one
// destination.
class ByteCodeOffset4 : public ByteCode {
public:
    ByteCodeOffset4(Opcode opcode, ByteCodeStackOffset src0Offset, ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset)
        : ByteCode(opcode)
        , m_stackOffsets{ src0Offset, src1Offset, src2Offset, dstOffset }
    {
    }

    // Pointer to the first source offset; the three source offsets are
    // stored back to back, so the result can be indexed with 0..2.
    const ByteCodeStackOffset* srcOffsets() const
    {
        return &m_stackOffsets[0];
    }

    ByteCodeStackOffset src0Offset() const
    {
        return srcOffsets()[0];
    }

    ByteCodeStackOffset src1Offset() const
    {
        return srcOffsets()[1];
    }

    ByteCodeStackOffset src2Offset() const
    {
        return srcOffsets()[2];
    }

    ByteCodeStackOffset dstOffset() const
    {
        return m_stackOffsets[3];
    }

protected:
    // Layout: [0], [1], [2] are the sources, [3] is the destination.
    ByteCodeStackOffset m_stackOffsets[4];
};


class ByteCodeOffset4Value : public ByteCode {
public:
ByteCodeOffset4Value(Opcode opcode, ByteCodeStackOffset src0Offset, ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset, uint32_t value)
Expand Down Expand Up @@ -931,15 +991,56 @@ class UnaryOperation : public ByteCodeOffset2 {
DEFINE_UNARY_BYTECODE_DUMP(name) \
};

// Dummy ByteCode for ternary operations: it adds no state of its own and only
// forwards the opcode and the four stack offsets to ByteCodeOffset4. The
// classes generated by DEFINE_TERNARY_BYTECODE derive from it.
class TernaryOperation : public ByteCodeOffset4 {
public:
    TernaryOperation(Opcode code,
                     ByteCodeStackOffset src0Offset,
                     ByteCodeStackOffset src1Offset,
                     ByteCodeStackOffset src2Offset,
                     ByteCodeStackOffset dstOffset)
        : ByteCodeOffset4(code, src0Offset, src1Offset, src2Offset, dstOffset)
    {
    }

#ifndef NDEBUG
    // Intentionally prints nothing; the generated subclasses provide the
    // real dump.
    void dump(size_t pos) {}
#endif
};

// Emits the debug dump() member for a ternary ByteCode class. The member
// prints the opcode name followed by the four entries of m_stackOffsets;
// the labels src1..src3 correspond to entries [0]..[2] and dst to entry [3].
// When NDEBUG is defined the macro expands to nothing.
#if defined(NDEBUG)
#define DEFINE_TERNARY_BYTECODE_DUMP(name)
#else
#define DEFINE_TERNARY_BYTECODE_DUMP(name)                                                    \
    void dump(size_t pos)                                                                     \
    {                                                                                         \
        printf(#name " src1: %" PRIu32 " src2: %" PRIu32 " src3: %" PRIu32 " dst: %" PRIu32, \
               (uint32_t)m_stackOffsets[0],                                                   \
               (uint32_t)m_stackOffsets[1],                                                   \
               (uint32_t)m_stackOffsets[2],                                                   \
               (uint32_t)m_stackOffsets[3]);                                                  \
    }
#endif

// Generates the ByteCode class of one ternary opcode. The class is called
// `name`, derives from TernaryOperation and passes Opcode::<name>Opcode plus
// the three source offsets and the destination offset to the base class.
// Extra arguments of the opcode list entry (handler, lane types) are accepted
// through `...` and ignored here.
#define DEFINE_TERNARY_BYTECODE(name, ...)                                                       \
    class name : public TernaryOperation {                                                       \
    public:                                                                                      \
        name(ByteCodeStackOffset src0Offset,                                                     \
             ByteCodeStackOffset src1Offset,                                                     \
             ByteCodeStackOffset src2Offset,                                                     \
             ByteCodeStackOffset dstOffset)                                                      \
            : TernaryOperation(Opcode::name##Opcode, src0Offset, src1Offset, src2Offset, dstOffset) \
        {                                                                                        \
        }                                                                                        \
        DEFINE_TERNARY_BYTECODE_DUMP(name)                                                       \
    };


// Expand every opcode list with the class-generating macro of its arity.
// The ternary lists go through DEFINE_TERNARY_BYTECODE, which emits one
// TernaryOperation subclass per entry.
// NOTE(review): DEFINE_BINARY_BYTECODE and DEFINE_UNARY_BYTECODE presumably do
// the same for BinaryOperation / UnaryOperation - confirm at their definitions.
FOR_EACH_BYTECODE_BINARY_OP(DEFINE_BINARY_BYTECODE)
FOR_EACH_BYTECODE_UNARY_OP(DEFINE_UNARY_BYTECODE)
FOR_EACH_BYTECODE_UNARY_OP_2(DEFINE_UNARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_BINARY_OP(DEFINE_BINARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(DEFINE_BINARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(DEFINE_BINARY_BYTECODE)
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(DEFINE_BINARY_BYTECODE)
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(DEFINE_BINARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_UNARY_OP(DEFINE_UNARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(DEFINE_UNARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(DEFINE_UNARY_BYTECODE)
FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(DEFINE_UNARY_BYTECODE)
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(DEFINE_TERNARY_BYTECODE)
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(DEFINE_TERNARY_BYTECODE)

#define DEFINE_MOVE_BYTECODE(name) \
class name : public ByteCodeOffset2 { \
Expand Down Expand Up @@ -1920,31 +2021,19 @@ FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(DEFINE_RMW_CMPXCHG_BYTECODE)
#undef DEFINE_RMW_BYTECODE

// FOR_EACH_BYTECODE_SIMD_ETC_OP
class V128BitSelect : public ByteCode {
class V128BitSelect : public ByteCodeOffset4 {
public:
V128BitSelect(ByteCodeStackOffset lhs, ByteCodeStackOffset rhs, ByteCodeStackOffset c, ByteCodeStackOffset dst)
: ByteCode(Opcode::V128BitSelectOpcode)
, m_srcOffsets{ lhs, rhs, c }
, m_dstOffset(dst)
: ByteCodeOffset4(Opcode::V128BitSelectOpcode, lhs, rhs, c, dst)
{
}

const ByteCodeStackOffset* srcOffsets() const
{
return m_srcOffsets;
}
ByteCodeStackOffset dstOffset() const { return m_dstOffset; }

#if !defined(NDEBUG)
void dump(size_t pos)
{
printf("v128.bitselect lhs: %" PRIu32 " rhs: %" PRIu32 " c: %" PRIu32 " dst: %" PRIu32, (uint32_t)m_srcOffsets[0], (uint32_t)m_srcOffsets[1], (uint32_t)m_srcOffsets[2], (uint32_t)m_dstOffset);
printf("v128.bitselect lhs: %" PRIu32 " rhs: %" PRIu32 " c: %" PRIu32 " dst: %" PRIu32, (uint32_t)m_stackOffsets[0], (uint32_t)m_stackOffsets[1], (uint32_t)m_stackOffsets[2], (uint32_t)m_stackOffsets[3]);
}
#endif

protected:
ByteCodeStackOffset m_srcOffsets[3];
ByteCodeStackOffset m_dstOffset;
};

class V128Load32Zero : public MemoryLoad {
Expand Down
87 changes: 75 additions & 12 deletions src/interpreter/Interpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,19 @@ inline static void simdSwizzleOperation(ExecutionState& state, BinaryOperation*
writeValue<Type>(bp, code->dstOffset(), result);
}

// Bitwise select on a 128-bit value handled as uint64_t lanes: every result
// bit is taken from the first source where the third source (the mask) has a
// set bit, and from the second source where the mask bit is clear. The result
// is written to the destination offset of `code`.
inline static void simdBitSelectOperation(ExecutionState& state, ByteCodeOffset4* code, uint8_t* bp)
{
    using Type = typename SIMDType<uint64_t>::Type;

    auto whenSet = readValue<Type>(bp, code->src0Offset());
    auto whenClear = readValue<Type>(bp, code->src1Offset());
    auto mask = readValue<Type>(bp, code->src2Offset());

    Type result;
    for (uint8_t lane = 0; lane < Type::Lanes; ++lane) {
        result[lane] = (whenSet[lane] & mask[lane]) | (whenClear[lane] & ~mask[lane]);
    }

    writeValue<Type>(bp, code->dstOffset(), result);
}

// FIXME optimize this function
template <typename P, typename R, bool Low>
inline static void simdExtmulOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp)
Expand All @@ -286,10 +299,11 @@ inline static void simdExtmulOperation(ExecutionState& state, BinaryOperation* c
writeValue<ResultType>(bp, code->dstOffset(), result);
}

template <typename P, typename R>
inline static void simdDotOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp)
{
using ParamType = typename SIMDType<int16_t>::Type;
using ResultType = typename SIMDType<uint32_t>::Type;
using ParamType = typename SIMDType<P>::Type;
using ResultType = typename SIMDType<R>::Type;
auto lhs = readValue<ParamType>(bp, code->srcOffset()[0]);
auto rhs = readValue<ParamType>(bp, code->srcOffset()[1]);
ResultType result;
Expand All @@ -302,6 +316,30 @@ inline static void simdDotOperation(ExecutionState& state, BinaryOperation* code
writeValue<ResultType>(bp, code->dstOffset(), result);
}

#if defined(ENABLE_EXTENDED_FEATURES)

// Dot product with accumulation: the first two sources are read as int8_t
// lanes and the third as int32_t lanes. For every int32_t result lane the
// four corresponding int8_t pairs are multiplied, the products are summed
// pairwise (each pair sum is narrowed to int16_t), the two pair sums are
// added as int32_t, and the matching lane of the third source is added
// through add().
inline static void simdDotAddOperation(ExecutionState& state, TernaryOperation* code, uint8_t* bp)
{
    using ParamType = typename SIMDType<int8_t>::Type;
    using ResultType = typename SIMDType<int32_t>::Type;

    auto lhs = readValue<ParamType>(bp, code->src0Offset());
    auto rhs = readValue<ParamType>(bp, code->src1Offset());
    auto accumulator = readValue<ResultType>(bp, code->src2Offset());

    ResultType result;
    for (uint8_t lane = 0; lane < ResultType::Lanes; lane++) {
        // Index of the first of the four int8_t lanes feeding this result lane.
        uint8_t firstInput = lane * 4;

        int16_t products[4];
        for (uint8_t k = 0; k < 4; k++) {
            products[k] = static_cast<int16_t>(lhs[firstInput + k]) * static_cast<int16_t>(rhs[firstInput + k]);
        }

        // Each pair sum is deliberately narrowed back to int16_t before widening.
        int16_t lowPair = static_cast<int16_t>(products[0] + products[1]);
        int16_t highPair = static_cast<int16_t>(products[2] + products[3]);
        int32_t pairSum = lowPair + highPair;

        result[lane] = add(state, pairSum, accumulator[lane]);
    }

    writeValue<ResultType>(bp, code->dstOffset(), result);
}

#endif

template <typename P, typename R>
inline static void simdNarrowOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp)
{
Expand Down Expand Up @@ -582,6 +620,35 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state,
NEXT_INSTRUCTION(); \
}

// Interpreter handler for a lane-wise ternary SIMD opcode.
//   name       - ByteCode class / opcode name
//   op         - helper called once per lane as op(state, a, b, c)
//   paramType  - scalar lane type of the three sources
//   resultType - scalar lane type of the result
// The source and result vectors must have the same number of lanes, which is
// checked at compile time. After writing the result the program counter is
// advanced past `name` and the next instruction is dispatched.
#define SIMD_TERNARY_OPERATION(name, op, paramType, resultType)                    \
    DEFINE_OPCODE(name)                                                            \
        :                                                                          \
    {                                                                              \
        using ParamType = typename SIMDType<paramType>::Type;                      \
        using ResultType = typename SIMDType<resultType>::Type;                    \
        COMPILE_ASSERT(ParamType::Lanes == ResultType::Lanes, "");                 \
        name* code = (name*)programCounter;                                        \
        auto firstSrc = readValue<ParamType>(bp, code->src0Offset());              \
        auto secondSrc = readValue<ParamType>(bp, code->src1Offset());             \
        auto thirdSrc = readValue<ParamType>(bp, code->src2Offset());              \
        ResultType result;                                                         \
        for (uint8_t lane = 0; lane < ParamType::Lanes; ++lane) {                  \
            result[lane] = op(state, firstSrc[lane], secondSrc[lane], thirdSrc[lane]); \
        }                                                                          \
        writeValue<ResultType>(bp, code->dstOffset(), result);                     \
        ADD_PROGRAM_COUNTER(name);                                                 \
        NEXT_INSTRUCTION();                                                        \
    }

// Interpreter handler for a ternary SIMD opcode implemented by a whole-vector
// function.
//   name - opcode name
//   op   - handler called as op(state, (TernaryOperation*)programCounter, bp)
// The instruction is decoded as a TernaryOperation, so the program counter
// must advance by the size of TernaryOperation as well. Advancing by
// BinaryOperation (one stack offset fewer) would resume decoding inside the
// current instruction wherever the two sizes differ.
#define SIMD_TERNARY_OTHER_OPERATION(name, op)            \
    DEFINE_OPCODE(name)                                   \
        :                                                 \
    {                                                     \
        op(state, (TernaryOperation*)programCounter, bp); \
        ADD_PROGRAM_COUNTER(TernaryOperation);            \
        NEXT_INSTRUCTION();                               \
    }

#define MEMORY_LOAD_OPERATION(opcodeName, readType, writeType) \
DEFINE_OPCODE(opcodeName) \
: \
Expand Down Expand Up @@ -880,9 +947,14 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state,
FOR_EACH_BYTECODE_SIMD_BINARY_OP(SIMD_BINARY_OPERATION)
FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(SIMD_BINARY_SHIFT_OPERATION)
FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(SIMD_BINARY_OTHER_OPERATION)
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(SIMD_BINARY_OPERATION)
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(SIMD_BINARY_OTHER_OPERATION)
FOR_EACH_BYTECODE_SIMD_UNARY_OP(SIMD_UNARY_OPERATION)
FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(SIMD_UNARY_CONVERT_OPERATION)
FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(SIMD_UNARY_OTHER_OPERATION)
FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(SIMD_UNARY_OTHER_OPERATION)
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(SIMD_TERNARY_OPERATION)
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(SIMD_TERNARY_OTHER_OPERATION)

DEFINE_OPCODE(Jump)
:
Expand Down Expand Up @@ -1090,16 +1162,7 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state,
DEFINE_OPCODE(V128BitSelect)
:
{
using Type = typename SIMDType<uint64_t>::Type;
V128BitSelect* code = (V128BitSelect*)programCounter;
auto lhs = readValue<Type>(bp, code->srcOffsets()[0]);
auto rhs = readValue<Type>(bp, code->srcOffsets()[1]);
auto c = readValue<Type>(bp, code->srcOffsets()[2]);
Type result;
for (uint8_t i = 0; i < Type::Lanes; i++) {
result[i] = (lhs[i] & c[i]) | (rhs[i] & ~c[i]);
}
writeValue<Type>(bp, code->dstOffset(), result);
simdBitSelectOperation(state, (ByteCodeOffset4*)programCounter, bp);
ADD_PROGRAM_COUNTER(V128BitSelect);
NEXT_INSTRUCTION();
}
Expand Down
Loading

0 comments on commit 3f3e275

Please sign in to comment.