Skip to content

[AMDGPU] Add support for v_tanh_f32 on gfx1250 #149360

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")

// Single-precision tanh builtin; only available with the "tanh-insts" feature.
TARGET_BUILTIN(__builtin_amdgcn_tanhf, "ff", "nc", "tanh-insts")
TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts")
Expand Down
1 change: 1 addition & 0 deletions clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
return Builder.CreateCall(F, { Src });
}
case AMDGPU::BI__builtin_amdgcn_tanhf:
case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_tanh);
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+transpose-load-f4f6-insts,+wavefrontsize32"
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+wavefrontsize32"

// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"

Expand Down
19 changes: 19 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,25 @@ void test_s_wait_tensorcnt() {
__builtin_amdgcn_s_wait_tensorcnt(0);
}

// CHECK-LABEL: @test_tanh_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.tanh.f32(float [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
// CHECK-NEXT: ret void
//
void test_tanh_f32(global float* out, float a)
{
// Applies the f32 tanh builtin to `a` and stores the result through `out`.
// The CHECK lines for this function expect a call to @llvm.amdgcn.tanh.f32.
*out = __builtin_amdgcn_tanhf(a);
}

// CHECK-LABEL: @test_tanh_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1118,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts",
"Has v_bitop3_b32/v_bitop3_b16 instructions"
>;

// Subtarget feature "tanh-insts": when enabled, the HasTanhInsts subtarget
// field is set to "true".
def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
"HasTanhInsts",
"true",
"Has v_tanh_f32/f16 instructions"
>;

def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
"HasTransposeLoadF4F6Insts",
"true",
Expand Down Expand Up @@ -1979,6 +1985,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
FeatureTanhInsts,
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
Expand Down Expand Up @@ -2703,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
AssemblerPredicate<(all_of FeatureBitOp3Insts)>;

// Predicate for instructions that require FeatureTanhInsts: code generation
// evaluates Subtarget->hasTanhInsts(), the assembler checks the feature bit.
def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
AssemblerPredicate<(all_of FeatureTanhInsts)>;

def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;

Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasRestrictedSOffset = false;
bool Has64BitLiterals = false;
bool HasBitOp3Insts = false;
bool HasTanhInsts = false;
bool HasTransposeLoadF4F6Insts = false;
bool HasPrngInst = false;
bool HasBVHDualAndBVH8Insts = false;
Expand Down Expand Up @@ -1380,6 +1381,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasMinimum3Maximum3F16;
}

/// \returns the value of the HasTanhInsts subtarget flag.
bool hasTanhInsts() const { return HasTanhInsts; }

bool hasAddPC64Inst() const { return GFX1250Insts; }

bool hasMinimum3Maximum3PKF16() const {
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>;
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;

// v_tanh_f32 is gated on HasTanhInsts and is defined with int_amdgcn_tanh as
// its pattern operator.
let SubtargetPredicate = HasTanhInsts in
defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>;
} // End TRANS = 1, SchedRW = [WriteTrans32]

defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
Expand Down Expand Up @@ -1138,6 +1141,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;

defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;

defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/TargetParser/TargetParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["gfx1250-insts"] = true;
Features["bitop3-insts"] = true;
Features["prng-inst"] = true;
Features["tanh-insts"] = true;
Features["transpose-load-f4f6-insts"] = true;
Features["bf16-trans-insts"] = true;
Features["fp8-conversion-insts"] = true;
Expand Down
84 changes: 84 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,92 @@
; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select.
; FIXME: GlobalISel does not work with bf16

declare float @llvm.amdgcn.tanh.f32(float) #0
declare bfloat @llvm.amdgcn.tanh.bf16(bfloat) #0

; Non-constant operand: the float kernel argument is passed to the intrinsic
; and the result is stored to %out. Both check prefixes expect one
; v_tanh_f32_e32 followed by a global_store_b32.
define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 {
; SDAG-REAL16-LABEL: tanh_f32:
; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, s2
; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1]
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: tanh_f32:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, s2
; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1]
; SDAG-FAKE16-NEXT: s_endpgm
%tanh = call float @llvm.amdgcn.tanh.f32(float %src) #0
store float %tanh, ptr addrspace(1) %out, align 4
ret void
}

; TODO: Really these should be constant folded
; Constant operand 4.0: the checks expect the instruction to still be emitted,
; with 4.0 appearing directly as the source operand.
define amdgpu_kernel void @tanh_f32_constant_4.0(ptr addrspace(1) %out) #1 {
; SDAG-REAL16-LABEL: tanh_f32_constant_4.0:
; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 4.0
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1]
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: tanh_f32_constant_4.0:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 4.0
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1]
; SDAG-FAKE16-NEXT: s_endpgm
%tanh = call float @llvm.amdgcn.tanh.f32(float 4.0) #0
store float %tanh, ptr addrspace(1) %out, align 4
ret void
}

; Constant operand 100.0: the checks expect the instruction to be emitted with
; the 32-bit literal 0x42c80000 (the IEEE-754 single-precision encoding of
; 100.0) as its source operand.
define amdgpu_kernel void @tanh_f32_constant_100.0(ptr addrspace(1) %out) #1 {
; SDAG-REAL16-LABEL: tanh_f32_constant_100.0:
; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 0x42c80000
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1]
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: tanh_f32_constant_100.0:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 0x42c80000
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1]
; SDAG-FAKE16-NEXT: s_endpgm
%tanh = call float @llvm.amdgcn.tanh.f32(float 100.0) #0
store float %tanh, ptr addrspace(1) %out, align 4
ret void
}

; Undef operand: the expected output for both prefixes is just s_endpgm, i.e.
; neither a v_tanh_f32 nor a store is emitted.
define amdgpu_kernel void @tanh_undef_f32(ptr addrspace(1) %out) #1 {
; SDAG-REAL16-LABEL: tanh_undef_f32:
; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: tanh_undef_f32:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_endpgm
%tanh = call float @llvm.amdgcn.tanh.f32(float undef)
store float %tanh, ptr addrspace(1) %out, align 4
ret void
}

define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
; SDAG-REAL16-LABEL: tanh_bf16:
; SDAG-REAL16: ; %bb.0:
Expand Down
45 changes: 45 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,51 @@ v_mov_b64 v[4:5], 0.5
v_mov_b64 v[254:255], 0xaf123456
// GFX1250: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]

v_tanh_f32 v5, v1
// GFX1250: v_tanh_f32_e32 v5, v1 ; encoding: [0x01,0x3d,0x0a,0x7e]

v_tanh_f32 v5, v255
// GFX1250: v_tanh_f32_e32 v5, v255 ; encoding: [0xff,0x3d,0x0a,0x7e]

v_tanh_f32 v5, s1
// GFX1250: v_tanh_f32_e32 v5, s1 ; encoding: [0x01,0x3c,0x0a,0x7e]

v_tanh_f32 v5, s105
// GFX1250: v_tanh_f32_e32 v5, s105 ; encoding: [0x69,0x3c,0x0a,0x7e]

v_tanh_f32 v5, vcc_lo
// GFX1250: v_tanh_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x3c,0x0a,0x7e]

v_tanh_f32 v5, vcc_hi
// GFX1250: v_tanh_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x3c,0x0a,0x7e]

v_tanh_f32 v5, ttmp15
// GFX1250: v_tanh_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x3c,0x0a,0x7e]

v_tanh_f32 v5, m0
// GFX1250: v_tanh_f32_e32 v5, m0 ; encoding: [0x7d,0x3c,0x0a,0x7e]

v_tanh_f32 v5, exec_lo
// GFX1250: v_tanh_f32_e32 v5, exec_lo ; encoding: [0x7e,0x3c,0x0a,0x7e]

v_tanh_f32 v5, exec_hi
// GFX1250: v_tanh_f32_e32 v5, exec_hi ; encoding: [0x7f,0x3c,0x0a,0x7e]

v_tanh_f32 v5, null
// GFX1250: v_tanh_f32_e32 v5, null ; encoding: [0x7c,0x3c,0x0a,0x7e]

v_tanh_f32 v5, -1
// GFX1250: v_tanh_f32_e32 v5, -1 ; encoding: [0xc1,0x3c,0x0a,0x7e]

v_tanh_f32 v5, 0.5
// GFX1250: v_tanh_f32_e32 v5, 0.5 ; encoding: [0xf0,0x3c,0x0a,0x7e]

v_tanh_f32 v5, src_scc
// GFX1250: v_tanh_f32_e32 v5, src_scc ; encoding: [0xfd,0x3c,0x0a,0x7e]

v_tanh_f32 v255, 0xaf123456
// GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf]

v_tanh_bf16 v5, v1
// GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e]

Expand Down
45 changes: 45 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,51 @@ v_mov_b64 v[4:5], 0.5
v_mov_b64 v[254:255], 0xaf123456
// GFX1250: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]

v_tanh_f32 v5, v1
// GFX1250: v_tanh_f32_e32 v5, v1 ; encoding: [0x01,0x3d,0x0a,0x7e]

v_tanh_f32 v5, v255
// GFX1250: v_tanh_f32_e32 v5, v255 ; encoding: [0xff,0x3d,0x0a,0x7e]

v_tanh_f32 v5, s1
// GFX1250: v_tanh_f32_e32 v5, s1 ; encoding: [0x01,0x3c,0x0a,0x7e]

v_tanh_f32 v5, s105
// GFX1250: v_tanh_f32_e32 v5, s105 ; encoding: [0x69,0x3c,0x0a,0x7e]

v_tanh_f32 v5, vcc_lo
// GFX1250: v_tanh_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x3c,0x0a,0x7e]

v_tanh_f32 v5, vcc_hi
// GFX1250: v_tanh_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x3c,0x0a,0x7e]

v_tanh_f32 v5, ttmp15
// GFX1250: v_tanh_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x3c,0x0a,0x7e]

v_tanh_f32 v5, m0
// GFX1250: v_tanh_f32_e32 v5, m0 ; encoding: [0x7d,0x3c,0x0a,0x7e]

v_tanh_f32 v5, exec_lo
// GFX1250: v_tanh_f32_e32 v5, exec_lo ; encoding: [0x7e,0x3c,0x0a,0x7e]

v_tanh_f32 v5, exec_hi
// GFX1250: v_tanh_f32_e32 v5, exec_hi ; encoding: [0x7f,0x3c,0x0a,0x7e]

v_tanh_f32 v5, null
// GFX1250: v_tanh_f32_e32 v5, null ; encoding: [0x7c,0x3c,0x0a,0x7e]

v_tanh_f32 v5, -1
// GFX1250: v_tanh_f32_e32 v5, -1 ; encoding: [0xc1,0x3c,0x0a,0x7e]

v_tanh_f32 v5, 0.5
// GFX1250: v_tanh_f32_e32 v5, 0.5 ; encoding: [0xf0,0x3c,0x0a,0x7e]

v_tanh_f32 v5, src_scc
// GFX1250: v_tanh_f32_e32 v5, src_scc ; encoding: [0xfd,0x3c,0x0a,0x7e]

v_tanh_f32 v255, 0xaf123456
// GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf]

v_tanh_bf16 v5, v1
// GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e]

Expand Down
56 changes: 56 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,62 @@
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s

v_tanh_f32 v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 quad_perm:[0,1,2,3]
// GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 row_mirror
// GFX1250: v_tanh_f32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x40,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 row_half_mirror
// GFX1250: v_tanh_f32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x41,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 row_shl:1
// GFX1250: v_tanh_f32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x01,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 row_shl:15
// GFX1250: v_tanh_f32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 row_shr:1
// GFX1250: v_tanh_f32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x11,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 row_shr:15
// GFX1250: v_tanh_f32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 row_ror:1
// GFX1250: v_tanh_f32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x21,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 row_ror:15
// GFX1250: v_tanh_f32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_tanh_f32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
// GFX1250: v_tanh_f32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
// GFX1250: v_tanh_f32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x60,0x09,0x13]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

v_tanh_bf16 v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
Expand Down
Loading
Loading