Diffstat (limited to 'test/CodeGen/NVPTX')
31 files changed, 4563 insertions, 182 deletions
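The bulk of this change is new llc + FileCheck coverage (most notably the added f16-instructions.ll and f16x2-instructions.ll below). For orientation, a minimal test in the same style is sketched here; the function name example_fadd and the CHECK lines are illustrative only, modeled on the test_fadd case in f16-instructions.ll rather than copied from the commit:

; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
; RUN:   | FileCheck %s

; On sm_53 with native fp16 math enabled, a half add should lower to add.rn.f16.
; CHECK-LABEL: example_fadd(
; CHECK: add.rn.f16
; CHECK: ret;
define half @example_fadd(half %a, half %b) {
  %r = fadd half %a, %b
  ret half %r
}

A file like this would sit in test/CodeGen/NVPTX/ and be picked up by llvm-lit, or can be checked by hand with the llc | FileCheck pipeline given in its RUN lines.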
diff --git a/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/test/CodeGen/NVPTX/LoadStoreVectorizer.ll index 1a4b0bad36e1..e84030f385c4 100644 --- a/test/CodeGen/NVPTX/LoadStoreVectorizer.ll +++ b/test/CodeGen/NVPTX/LoadStoreVectorizer.ll @@ -15,3 +15,37 @@ define i32 @f(i32* %p) { %sum = add i32 %v0, %v1 ret i32 %sum } + +define half @fh(half* %p) { + %p.1 = getelementptr half, half* %p, i32 1 + %p.2 = getelementptr half, half* %p, i32 2 + %p.3 = getelementptr half, half* %p, i32 3 + %p.4 = getelementptr half, half* %p, i32 4 + %v0 = load half, half* %p, align 64 + %v1 = load half, half* %p.1, align 4 + %v2 = load half, half* %p.2, align 4 + %v3 = load half, half* %p.3, align 4 + %v4 = load half, half* %p.4, align 4 + %sum1 = fadd half %v0, %v1 + %sum2 = fadd half %v2, %v3 + %sum3 = fadd half %sum1, %sum2 + %sum = fadd half %sum3, %v4 + ret half %sum +} + +define float @ff(float* %p) { + %p.1 = getelementptr float, float* %p, i32 1 + %p.2 = getelementptr float, float* %p, i32 2 + %p.3 = getelementptr float, float* %p, i32 3 + %p.4 = getelementptr float, float* %p, i32 4 + %v0 = load float, float* %p, align 64 + %v1 = load float, float* %p.1, align 4 + %v2 = load float, float* %p.2, align 4 + %v3 = load float, float* %p.3, align 4 + %v4 = load float, float* %p.4, align 4 + %sum1 = fadd float %v0, %v1 + %sum2 = fadd float %v2, %v3 + %sum3 = fadd float %sum1, %sum2 + %sum = fadd float %sum3, %v4 + ret float %sum +} diff --git a/test/CodeGen/NVPTX/access-non-generic.ll b/test/CodeGen/NVPTX/access-non-generic.ll index c4cbeca4e409..d5776d77b10d 100644 --- a/test/CodeGen/NVPTX/access-non-generic.ll +++ b/test/CodeGen/NVPTX/access-non-generic.ll @@ -1,6 +1,7 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix PTX ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX -; RUN: opt < %s -S -nvptx-infer-addrspace | FileCheck %s --check-prefix IR +; RUN: opt -mtriple=nvptx-- < %s -S -infer-address-spaces | FileCheck %s --check-prefix IR +; RUN: opt -mtriple=nvptx64-- < %s -S -infer-address-spaces | FileCheck %s --check-prefix IR @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4 @scalar = internal addrspace(3) global float 0.000000e+00, align 4 diff --git a/test/CodeGen/NVPTX/add-128bit.ll b/test/CodeGen/NVPTX/add-128bit.ll index 29e3cdffae7b..a077c3fcf891 100644 --- a/test/CodeGen/NVPTX/add-128bit.ll +++ b/test/CodeGen/NVPTX/add-128bit.ll @@ -8,7 +8,7 @@ define void @foo(i64 %a, i64 %add, i128* %retptr) { ; CHECK: add.s64 ; CHECK: setp.lt.u64 ; CHECK: setp.lt.u64 -; CHECK: selp.b64 +; CHECK: selp.u64 ; CHECK: selp.b64 ; CHECK: add.s64 %t1 = sext i64 %a to i128 diff --git a/test/CodeGen/NVPTX/aggregate-return.ll b/test/CodeGen/NVPTX/aggregate-return.ll index 527c5c9aa85d..785b4d6d90dc 100644 --- a/test/CodeGen/NVPTX/aggregate-return.ll +++ b/test/CodeGen/NVPTX/aggregate-return.ll @@ -1,21 +1,40 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s declare <2 x float> @barv(<2 x float> %input) +declare <3 x float> @barv3(<3 x float> %input) declare [2 x float] @bara([2 x float] %input) declare {float, float} @bars({float, float} %input) -define void @foov(<2 x float> %input, <2 x float>* %output) { -; CHECK-LABEL: @foov +define void @test_v2f32(<2 x float> %input, <2 x float>* %output) { +; CHECK-LABEL: @test_v2f32 %call = tail call <2 x float> @barv(<2 x float> %input) ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: ld.param.v2.f32 {[[ELEMV1:%f[0-9]+]], [[ELEMV2:%f[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.f32 
{[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0]; store <2 x float> %call, <2 x float>* %output, align 8 -; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEMV1]], [[ELEMV2]]} +; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]} ret void } -define void @fooa([2 x float] %input, [2 x float]* %output) { -; CHECK-LABEL: @fooa +define void @test_v3f32(<3 x float> %input, <3 x float>* %output) { +; CHECK-LABEL: @test_v3f32 +; + %call = tail call <3 x float> @barv3(<3 x float> %input) +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8]; +; Make sure we don't load more values than than we need to. +; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12]; + store <3 x float> %call, <3 x float>* %output, align 8 +; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8], +; -- This is suboptimal. We should do st.v2.f32 instead +; of combining 2xf32 info i64. +; CHECK-DAG: st.u64 [{{%rd[0-9]}}], +; CHECK: ret; + ret void +} + +define void @test_a2f32([2 x float] %input, [2 x float]* %output) { +; CHECK-LABEL: @test_a2f32 %call = tail call [2 x float] @bara([2 x float] %input) ; CHECK: .param .align 4 .b8 retval0[8]; ; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0+0]; @@ -28,8 +47,8 @@ define void @fooa([2 x float] %input, [2 x float]* %output) { ; CHECK: ret } -define void @foos({float, float} %input, {float, float}* %output) { -; CHECK-LABEL: @foos +define void @test_s2f32({float, float} %input, {float, float}* %output) { +; CHECK-LABEL: @test_s2f32 %call = tail call {float, float} @bars({float, float} %input) ; CHECK: .param .align 4 .b8 retval0[8]; ; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0+0]; diff --git a/test/CodeGen/NVPTX/bug22322.ll b/test/CodeGen/NVPTX/bug22322.ll index 0c4c30cf37ed..74133d3dcabd 100644 --- a/test/CodeGen/NVPTX/bug22322.ll +++ b/test/CodeGen/NVPTX/bug22322.ll @@ -17,7 +17,7 @@ _ZL11compute_vecRK6float3jb.exit: %4 = add nsw i32 %2, %3 %5 = zext i32 %4 to i64 %6 = bitcast float* %ret_vec.sroa.8.i to i8* - call void @llvm.lifetime.start(i64 4, i8* %6) + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) %7 = and i32 %4, 15 %8 = icmp eq i32 %7, 0 %9 = select i1 %8, float 0.000000e+00, float -1.000000e+00 @@ -26,7 +26,7 @@ _ZL11compute_vecRK6float3jb.exit: %10 = fcmp olt float %9, 0.000000e+00 %ret_vec.sroa.8.i.val = load float, float* %ret_vec.sroa.8.i, align 4 %11 = select i1 %10, float 0.000000e+00, float %ret_vec.sroa.8.i.val - call void @llvm.lifetime.end(i64 4, i8* %6) + call void @llvm.lifetime.end.p0i8(i64 4, i8* %6) %12 = getelementptr inbounds %class.float3, %class.float3* %dst, i64 %5, i32 0 store float 0.000000e+00, float* %12, align 4 %13 = getelementptr inbounds %class.float3, %class.float3* %dst, i64 %5, i32 1 @@ -46,10 +46,10 @@ declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 ; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #2 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2 ; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #2 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/NVPTX/combine-min-max.ll 
b/test/CodeGen/NVPTX/combine-min-max.ll index 64bb7a37ffd2..3de86be10a5c 100644 --- a/test/CodeGen/NVPTX/combine-min-max.ll +++ b/test/CodeGen/NVPTX/combine-min-max.ll @@ -21,20 +21,140 @@ define i64 @ba_ne_i64(i64 %a, i64 %b) { ret i64 %sel } -; PTX does have e.g. max.s16, but at least as of Kepler (sm_3x) that -; gets compiled to SASS that converts the 16 bit parameters to 32 bit -; before using a 32 bit instruction. That is probably not a win and -; NVCC 7.5 does not emit 16 bit min/max either, presumably for that -; reason. +; ************************************* +; * All variations with i16 + +; *** ab, unsigned, i16 define i16 @ab_ugt_i16(i16 %a, i16 %b) { ; LABEL: @ab_ugt_i16 -; CHECK-NOT: min -; CHECK-NOT: max +; CHECK: max.u16 %cmp = icmp ugt i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b ret i16 %sel } +define i16 @ab_uge_i16(i16 %a, i16 %b) { +; LABEL: @ab_uge_i16 +; CHECK: max.u16 + %cmp = icmp uge i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +define i16 @ab_ult_i16(i16 %a, i16 %b) { +; LABEL: @ab_ult_i16 +; CHECK: min.u16 + %cmp = icmp ult i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +define i16 @ab_ule_i16(i16 %a, i16 %b) { +; LABEL: @ab_ule_i16 +; CHECK: min.u16 + %cmp = icmp ule i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +; *** ab, signed, i16 +define i16 @ab_sgt_i16(i16 %a, i16 %b) { +; LABEL: @ab_ugt_i16 +; CHECK: max.s16 + %cmp = icmp sgt i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +define i16 @ab_sge_i16(i16 %a, i16 %b) { +; LABEL: @ab_sge_i16 +; CHECK: max.s16 + %cmp = icmp sge i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +define i16 @ab_slt_i16(i16 %a, i16 %b) { +; LABEL: @ab_slt_i16 +; CHECK: min.s16 + %cmp = icmp slt i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +define i16 @ab_sle_i16(i16 %a, i16 %b) { +; LABEL: @ab_sle_i16 +; CHECK: min.s16 + %cmp = icmp sle i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +; *** ba, unsigned, i16 +define i16 @ba_ugt_i16(i16 %a, i16 %b) { +; LABEL: @ba_ugt_i16 +; CHECK: min.u16 + %cmp = icmp ugt i16 %a, %b + %sel = select i1 %cmp, i16 %b, i16 %a + ret i16 %sel +} + +define i16 @ba_uge_i16(i16 %a, i16 %b) { +; LABEL: @ba_uge_i16 +; CHECK: min.u16 + %cmp = icmp uge i16 %a, %b + %sel = select i1 %cmp, i16 %b, i16 %a + ret i16 %sel +} + +define i16 @ba_ult_i16(i16 %a, i16 %b) { +; LABEL: @ba_ult_i16 +; CHECK: max.u16 + %cmp = icmp ult i16 %a, %b + %sel = select i1 %cmp, i16 %b, i16 %a + ret i16 %sel +} + +define i16 @ba_ule_i16(i16 %a, i16 %b) { +; LABEL: @ba_ule_i16 +; CHECK: max.u16 + %cmp = icmp ule i16 %a, %b + %sel = select i1 %cmp, i16 %b, i16 %a + ret i16 %sel +} + +; *** ba, signed, i16 +define i16 @ba_sgt_i16(i16 %a, i16 %b) { +; LBAEL: @ba_ugt_i16 +; CHECK: min.s16 + %cmp = icmp sgt i16 %a, %b + %sel = select i1 %cmp, i16 %b, i16 %a + ret i16 %sel +} + +define i16 @ba_sge_i16(i16 %a, i16 %b) { +; LABEL: @ba_sge_i16 +; CHECK: min.s16 + %cmp = icmp sge i16 %a, %b + %sel = select i1 %cmp, i16 %b, i16 %a + ret i16 %sel +} + +define i16 @ba_slt_i16(i16 %a, i16 %b) { +; LABEL: @ba_slt_i16 +; CHECK: max.s16 + %cmp = icmp slt i16 %a, %b + %sel = select i1 %cmp, i16 %b, i16 %a + ret i16 %sel +} + +define i16 @ba_sle_i16(i16 %a, i16 %b) { +; LABEL: @ba_sle_i16 +; CHECK: max.s16 + %cmp = icmp sle i16 %a, %b + %sel = select i1 %cmp, i16 %b, i16 %a + ret i16 %sel +} ; ************************************* ; * All variations with i32 diff --git 
a/test/CodeGen/NVPTX/convert-fp.ll b/test/CodeGen/NVPTX/convert-fp.ll index 4b5446e317f4..fd28a4f7cc67 100644 --- a/test/CodeGen/NVPTX/convert-fp.ll +++ b/test/CodeGen/NVPTX/convert-fp.ll @@ -1,44 +1,37 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s - -define i16 @cvt_i16_f32(float %x) { +define i16 @cvt_u16_f32(float %x) { ; CHECK: cvt.rzi.u16.f32 %rs{{[0-9]+}}, %f{{[0-9]+}}; ; CHECK: ret; %a = fptoui float %x to i16 ret i16 %a } - -define i16 @cvt_i16_f64(double %x) { +define i16 @cvt_u16_f64(double %x) { ; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fptoui double %x to i16 ret i16 %a } - -define i32 @cvt_i32_f32(float %x) { +define i32 @cvt_u32_f32(float %x) { ; CHECK: cvt.rzi.u32.f32 %r{{[0-9]+}}, %f{{[0-9]+}}; ; CHECK: ret; %a = fptoui float %x to i32 ret i32 %a } - -define i32 @cvt_i32_f64(double %x) { +define i32 @cvt_u32_f64(double %x) { ; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fptoui double %x to i32 ret i32 %a } - - -define i64 @cvt_i64_f32(float %x) { +define i64 @cvt_u64_f32(float %x) { ; CHECK: cvt.rzi.u64.f32 %rd{{[0-9]+}}, %f{{[0-9]+}}; ; CHECK: ret; %a = fptoui float %x to i64 ret i64 %a } - -define i64 @cvt_i64_f64(double %x) { +define i64 @cvt_u64_f64(double %x) { ; CHECK: cvt.rzi.u64.f64 %rd{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fptoui double %x to i64 @@ -51,63 +44,30 @@ define float @cvt_f32_i16(i16 %x) { %a = uitofp i16 %x to float ret float %a } - define float @cvt_f32_i32(i32 %x) { ; CHECK: cvt.rn.f32.u32 %f{{[0-9]+}}, %r{{[0-9]+}}; ; CHECK: ret; %a = uitofp i32 %x to float ret float %a } - define float @cvt_f32_i64(i64 %x) { ; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rd{{[0-9]+}}; ; CHECK: ret; %a = uitofp i64 %x to float ret float %a } - -define float @cvt_f32_f64(double %x) { -; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fd{{[0-9]+}}; -; CHECK: ret; - %a = fptrunc double %x to float - ret float %a -} - -define float @cvt_f32_s16(i16 %x) { -; CHECK: cvt.rn.f32.s16 %f{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: ret - %a = sitofp i16 %x to float - ret float %a -} - -define float @cvt_f32_s32(i32 %x) { -; CHECK: cvt.rn.f32.s32 %f{{[0-9]+}}, %r{{[0-9]+}} -; CHECK: ret - %a = sitofp i32 %x to float - ret float %a -} - -define float @cvt_f32_s64(i64 %x) { -; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rd{{[0-9]+}} -; CHECK: ret - %a = sitofp i64 %x to float - ret float %a -} - define double @cvt_f64_i16(i16 %x) { ; CHECK: cvt.rn.f64.u16 %fd{{[0-9]+}}, %rs{{[0-9]+}}; ; CHECK: ret; %a = uitofp i16 %x to double ret double %a } - define double @cvt_f64_i32(i32 %x) { ; CHECK: cvt.rn.f64.u32 %fd{{[0-9]+}}, %r{{[0-9]+}}; ; CHECK: ret; %a = uitofp i32 %x to double ret double %a } - define double @cvt_f64_i64(i64 %x) { ; CHECK: cvt.rn.f64.u64 %fd{{[0-9]+}}, %rd{{[0-9]+}}; ; CHECK: ret; @@ -115,6 +75,12 @@ define double @cvt_f64_i64(i64 %x) { ret double %a } +define float @cvt_f32_f64(double %x) { +; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fd{{[0-9]+}}; +; CHECK: ret; + %a = fptrunc double %x to float + ret float %a +} define double @cvt_f64_f32(float %x) { ; CHECK: cvt.f64.f32 %fd{{[0-9]+}}, %f{{[0-9]+}}; ; CHECK: ret; @@ -122,23 +88,76 @@ define double @cvt_f64_f32(float %x) { ret double %a } +define float @cvt_f32_s16(i16 %x) { +; CHECK: cvt.rn.f32.s16 %f{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: ret + %a = sitofp i16 %x to float + ret float %a +} +define float @cvt_f32_s32(i32 %x) { +; CHECK: cvt.rn.f32.s32 %f{{[0-9]+}}, %r{{[0-9]+}} +; CHECK: ret + %a = sitofp i32 %x to float 
+ ret float %a +} +define float @cvt_f32_s64(i64 %x) { +; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: ret + %a = sitofp i64 %x to float + ret float %a +} define double @cvt_f64_s16(i16 %x) { ; CHECK: cvt.rn.f64.s16 %fd{{[0-9]+}}, %rs{{[0-9]+}} ; CHECK: ret %a = sitofp i16 %x to double ret double %a } - define double @cvt_f64_s32(i32 %x) { ; CHECK: cvt.rn.f64.s32 %fd{{[0-9]+}}, %r{{[0-9]+}} ; CHECK: ret %a = sitofp i32 %x to double ret double %a } - define double @cvt_f64_s64(i64 %x) { ; CHECK: cvt.rn.f64.s64 %fd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %a = sitofp i64 %x to double ret double %a } + +define i16 @cvt_s16_f32(float %x) { +; CHECK: cvt.rzi.s16.f32 %rs{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: ret; + %a = fptosi float %x to i16 + ret i16 %a +} +define i16 @cvt_s16_f64(double %x) { +; CHECK: cvt.rzi.s16.f64 %rs{{[0-9]+}}, %fd{{[0-9]+}}; +; CHECK: ret; + %a = fptosi double %x to i16 + ret i16 %a +} +define i32 @cvt_s32_f32(float %x) { +; CHECK: cvt.rzi.s32.f32 %r{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: ret; + %a = fptosi float %x to i32 + ret i32 %a +} +define i32 @cvt_s32_f64(double %x) { +; CHECK: cvt.rzi.s32.f64 %r{{[0-9]+}}, %fd{{[0-9]+}}; +; CHECK: ret; + %a = fptosi double %x to i32 + ret i32 %a +} +define i64 @cvt_s64_f32(float %x) { +; CHECK: cvt.rzi.s64.f32 %rd{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: ret; + %a = fptosi float %x to i64 + ret i64 %a +} +define i64 @cvt_s64_f64(double %x) { +; CHECK: cvt.rzi.s64.f64 %rd{{[0-9]+}}, %fd{{[0-9]+}}; +; CHECK: ret; + %a = fptosi double %x to i64 + ret i64 %a +} diff --git a/test/CodeGen/NVPTX/ctlz.ll b/test/CodeGen/NVPTX/ctlz.ll index bed15a9f6a54..005958bd938a 100644 --- a/test/CodeGen/NVPTX/ctlz.ll +++ b/test/CodeGen/NVPTX/ctlz.ll @@ -6,39 +6,127 @@ declare i16 @llvm.ctlz.i16(i16, i1) readnone declare i32 @llvm.ctlz.i32(i32, i1) readnone declare i64 @llvm.ctlz.i64(i64, i1) readnone -define i32 @myctpop(i32 %a) { -; CHECK: clz.b32 +; There should be no difference between llvm.ctlz.i32(%a, true) and +; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 0. + +; CHECK-LABEL: myctlz( +define i32 @myctlz(i32 %a) { +; CHECK: ld.param. +; CHECK-NEXT: clz.b32 +; CHECK-NEXT: st.param. +; CHECK-NEXT: ret; %val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone ret i32 %val } - -define i16 @myctpop16(i16 %a) { -; CHECK: clz.b32 - %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone - ret i16 %val +; CHECK-LABEL: myctlz_2( +define i32 @myctlz_2(i32 %a) { +; CHECK: ld.param. +; CHECK-NEXT: clz.b32 +; CHECK-NEXT: st.param. +; CHECK-NEXT: ret; + %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone + ret i32 %val } -define i64 @myctpop64(i64 %a) { -; CHECK: clz.b64 +; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit +; value, so here we have to zero-extend it. +; CHECK-LABEL: myctlz64( +define i64 @myctlz64(i64 %a) { +; CHECK: ld.param. +; CHECK-NEXT: clz.b64 +; CHECK-NEXT: cvt.u64.u32 +; CHECK-NEXT: st.param. +; CHECK-NEXT: ret; %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone ret i64 %val } +; CHECK-LABEL: myctlz64_2( +define i64 @myctlz64_2(i64 %a) { +; CHECK: ld.param. +; CHECK-NEXT: clz.b64 +; CHECK-NEXT: cvt.u64.u32 +; CHECK-NEXT: st.param. 
+; CHECK-NEXT: ret; + %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone + ret i64 %val +} - -define i32 @myctpop_2(i32 %a) { -; CHECK: clz.b32 - %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone - ret i32 %val +; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the +; natural return width of ptx's clz.b64 instruction. No conversions should be +; necessary in the PTX. +; CHECK-LABEL: myctlz64_as_32( +define i32 @myctlz64_as_32(i64 %a) { +; CHECK: ld.param. +; CHECK-NEXT: clz.b64 +; CHECK-NEXT: st.param. +; CHECK-NEXT: ret; + %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone + %trunc = trunc i64 %val to i32 + ret i32 %trunc +} +; CHECK-LABEL: myctlz64_as_32_2( +define i32 @myctlz64_as_32_2(i64 %a) { +; CHECK: ld.param. +; CHECK-NEXT: clz.b64 +; CHECK-NEXT: st.param. +; CHECK-NEXT: ret; + %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone + %trunc = trunc i64 %val to i32 + ret i32 %trunc } -define i16 @myctpop16_2(i16 %a) { -; CHECK: clz.b32 +; ctlz.i16 is implemented by extending the input to i32, computing the result, +; and then truncating the result back down to i16. But the NVPTX ABI +; zero-extends i16 return values to i32, so the final truncation doesn't appear +; in this function. +; CHECK-LABEL: myctlz_ret16( +define i16 @myctlz_ret16(i16 %a) { +; CHECK: ld.param. +; CHECK-NEXT: cvt.u32.u16 +; CHECK-NEXT: clz.b32 +; CHECK-NEXT: sub. +; CHECK-NEXT: st.param. +; CHECK-NEXT: ret; + %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone + ret i16 %val +} +; CHECK-LABEL: myctlz_ret16_2( +define i16 @myctlz_ret16_2(i16 %a) { +; CHECK: ld.param. +; CHECK-NEXT: cvt.u32.u16 +; CHECK-NEXT: clz.b32 +; CHECK-NEXT: sub. +; CHECK-NEXT: st.param. +; CHECK-NEXT: ret; %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone ret i16 %val } -define i64 @myctpop64_2(i64 %a) { -; CHECK: clz.b64 - %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone - ret i64 %val +; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should +; remain. +; CHECK-LABEL: myctlz_store16( +define void @myctlz_store16(i16 %a, i16* %b) { +; CHECK: ld.param. +; CHECK-NEXT: cvt.u32.u16 +; CHECK-NET: clz.b32 +; CHECK-DAG: cvt.u16.u32 +; CHECK-DAG: sub. +; CHECK: st.{{[a-z]}}16 +; CHECK: ret; + %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone + store i16 %val, i16* %b + ret void +} +; CHECK-LABEL: myctlz_store16_2( +define void @myctlz_store16_2(i16 %a, i16* %b) { +; CHECK: ld.param. +; CHECK-NEXT: cvt.u32.u16 +; CHECK-NET: clz.b32 +; CHECK-DAG: cvt.u16.u32 +; CHECK-DAG: sub. +; CHECK: st.{{[a-z]}}16 +; CHECK: ret; + %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone + store i16 %val, i16* %b + ret void } diff --git a/test/CodeGen/NVPTX/f16-instructions.ll b/test/CodeGen/NVPTX/f16-instructions.ll new file mode 100644 index 000000000000..403a67f02f80 --- /dev/null +++ b/test/CodeGen/NVPTX/f16-instructions.ll @@ -0,0 +1,1063 @@ +; ## Full FP16 support enabled by default. +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -disable-fp-elim \ +; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s +; ## FP16 support explicitly disabled. +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \ +; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s +; ## FP16 is not supported by hardware. 
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ +; RUN: -disable-post-ra -disable-fp-elim \ +; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: test_ret_const( +; CHECK: mov.b16 [[R:%h[0-9]+]], 0x3C00; +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_ret_const() #0 { + ret half 1.0 +} + +; CHECK-LABEL: test_fadd( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_param_1]; +; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] +; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_fadd(half %a, half %b) #0 { + %r = fadd half %a, %b + ret half %r +} + +; Check that we can lower fadd with immediate arguments. +; CHECK-LABEL: test_fadd_imm_0( +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_0_param_0]; +; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00; +; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] +; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_fadd_imm_0(half %b) #0 { + %r = fadd half 1.0, %b + ret half %r +} + +; CHECK-LABEL: test_fadd_imm_1( +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_1_param_0]; +; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00; +; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] +; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_fadd_imm_1(half %a) #0 { + %r = fadd half %a, 1.0 + ret half %r +} + +; CHECK-LABEL: test_fsub( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fsub_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fsub_param_1]; +; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] +; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_fsub(half %a, half %b) #0 { + %r = fsub half %a, %b + ret half %r +} + +; CHECK-LABEL: test_fneg( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fneg_param_0]; +; CHECK-F16-NEXT: mov.b16 [[Z:%h[0-9]+]], 0x0000 +; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[Z]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] +; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000; +; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[Z]], [[A32]]; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_fneg(half %a) #0 { + %r = fsub half 0.0, %a + ret half %r +} + +; CHECK-LABEL: test_fmul( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmul_param_0]; +; 
CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmul_param_1]; +; CHECK-F16-NEXT: mul.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] +; CHECK-NOF16-NEXT: mul.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_fmul(half %a, half %b) #0 { + %r = fmul half %a, %b + ret half %r +} + +; CHECK-LABEL: test_fdiv( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fdiv_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fdiv_param_1]; +; CHECK-DAG: cvt.f32.f16 [[F0:%f[0-9]+]], [[A]]; +; CHECK-DAG: cvt.f32.f16 [[F1:%f[0-9]+]], [[B]]; +; CHECK-NEXT: div.rn.f32 [[FR:%f[0-9]+]], [[F0]], [[F1]]; +; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[FR]]; +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_fdiv(half %a, half %b) #0 { + %r = fdiv half %a, %b + ret half %r +} + +; CHECK-LABEL: test_frem( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_frem_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_frem_param_1]; +; CHECK-DAG: cvt.f32.f16 [[FA:%f[0-9]+]], [[A]]; +; CHECK-DAG: cvt.f32.f16 [[FB:%f[0-9]+]], [[B]]; +; CHECK-NEXT: div.rn.f32 [[D:%f[0-9]+]], [[FA]], [[FB]]; +; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]]; +; CHECK-NEXT: mul.f32 [[RI:%f[0-9]+]], [[DI]], [[FB]]; +; CHECK-NEXT: sub.f32 [[RF:%f[0-9]+]], [[FA]], [[RI]]; +; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_frem(half %a, half %b) #0 { + %r = frem half %a, %b + ret half %r +} + +; CHECK-LABEL: test_store( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_store_param_0]; +; CHECK-DAG: ld.param.u64 %[[PTR:rd[0-9]+]], [test_store_param_1]; +; CHECK-NEXT: st.b16 [%[[PTR]]], [[A]]; +; CHECK-NEXT: ret; +define void @test_store(half %a, half* %b) #0 { + store half %a, half* %b + ret void +} + +; CHECK-LABEL: test_load( +; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0]; +; CHECK-NEXT: ld.b16 [[R:%h[0-9]+]], [%[[PTR]]]; +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_load(half* %a) #0 { + %r = load half, half* %a + ret half %r +} + +; CHECK-LABEL: .visible .func test_halfp0a1( +; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0]; +; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1]; +; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] +; CHECK-DAG: st.u8 [%[[TO]]], [[B0]] +; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] +; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]] +; CHECK: ret +define void @test_halfp0a1(half * noalias readonly %from, half * %to) { + %1 = load half, half * %from , align 1 + store half %1, half * %to , align 1 + ret void +} + +declare half @test_callee(half %a, half %b) #0 + +; CHECK-LABEL: test_call( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_param_1]; +; CHECK: { +; CHECK-DAG: .param .b32 param0; +; CHECK-DAG: .param .b32 param1; +; CHECK-DAG: st.param.b16 [param0+0], [[A]]; +; CHECK-DAG: st.param.b16 [param1+0], [[B]]; +; CHECK-DAG: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; 
+define half @test_call(half %a, half %b) #0 { + %r = call half @test_callee(half %a, half %b) + ret half %r +} + +; CHECK-LABEL: test_call_flipped( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_flipped_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_flipped_param_1]; +; CHECK: { +; CHECK-DAG: .param .b32 param0; +; CHECK-DAG: .param .b32 param1; +; CHECK-DAG: st.param.b16 [param0+0], [[B]]; +; CHECK-DAG: st.param.b16 [param1+0], [[A]]; +; CHECK-DAG: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_call_flipped(half %a, half %b) #0 { + %r = call half @test_callee(half %b, half %a) + ret half %r +} + +; CHECK-LABEL: test_tailcall_flipped( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_tailcall_flipped_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_tailcall_flipped_param_1]; +; CHECK: { +; CHECK-DAG: .param .b32 param0; +; CHECK-DAG: .param .b32 param1; +; CHECK-DAG: st.param.b16 [param0+0], [[B]]; +; CHECK-DAG: st.param.b16 [param1+0], [[A]]; +; CHECK-DAG: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_tailcall_flipped(half %a, half %b) #0 { + %r = tail call half @test_callee(half %b, half %a) + ret half %r +} + +; CHECK-LABEL: test_select( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1]; +; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; +; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_select(half %a, half %b, i1 zeroext %c) #0 { + %r = select i1 %c, half %a, half %b + ret half %r +} + +; CHECK-LABEL: test_select_cc( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_param_1]; +; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_param_2]; +; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_param_3]; +; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]]; +; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]] +; CHECK: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_select_cc(half %a, half %b, half %c, half %d) #0 { + %cc = fcmp une half %c, %d + %r = select i1 %cc, half %a, half %b + ret half %r +} + +; CHECK-LABEL: test_select_cc_f32_f16( +; CHECK-DAG: ld.param.f32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0]; +; CHECK-DAG: ld.param.f32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1]; +; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_f32_f16_param_2]; +; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_f32_f16_param_3]; +; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]]; +; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]] +; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]]; +; 
CHECK-NEXT: st.param.f32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 { + %cc = fcmp une half %c, %d + %r = select i1 %cc, float %a, float %b + ret float %r +} + +; CHECK-LABEL: test_select_cc_f16_f32( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_f16_f32_param_0]; +; CHECK-DAG: ld.param.f32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2]; +; CHECK-DAG: ld.param.f32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3]; +; CHECK-DAG: setp.neu.f32 [[PRED:%p[0-9]+]], [[C]], [[D]] +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_f16_f32_param_1]; +; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; +; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 { + %cc = fcmp une float %c, %d + %r = select i1 %cc, half %a, half %b + ret half %r +} + +; CHECK-LABEL: test_fcmp_une( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_une_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_une_param_1]; +; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_une(half %a, half %b) #0 { + %r = fcmp une half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ueq( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ueq_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ueq_param_1]; +; CHECK-F16: setp.equ.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.equ.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_ueq(half %a, half %b) #0 { + %r = fcmp ueq half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ugt( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ugt_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ugt_param_1]; +; CHECK-F16: setp.gtu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.gtu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_ugt(half %a, half %b) #0 { + %r = fcmp ugt half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_uge( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uge_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uge_param_1]; +; CHECK-F16: setp.geu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.geu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_uge(half %a, half %b) #0 { + %r = fcmp uge half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ult( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ult_param_0]; +; 
CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ult_param_1]; +; CHECK-F16: setp.ltu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.ltu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_ult(half %a, half %b) #0 { + %r = fcmp ult half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ule( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ule_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ule_param_1]; +; CHECK-F16: setp.leu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.leu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_ule(half %a, half %b) #0 { + %r = fcmp ule half %a, %b + ret i1 %r +} + + +; CHECK-LABEL: test_fcmp_uno( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uno_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uno_param_1]; +; CHECK-F16: setp.nan.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.nan.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_uno(half %a, half %b) #0 { + %r = fcmp uno half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_one( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_one_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_one_param_1]; +; CHECK-F16: setp.ne.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.ne.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_one(half %a, half %b) #0 { + %r = fcmp one half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_oeq( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oeq_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oeq_param_1]; +; CHECK-F16: setp.eq.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.eq.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_oeq(half %a, half %b) #0 { + %r = fcmp oeq half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ogt( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ogt_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ogt_param_1]; +; CHECK-F16: setp.gt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.gt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 
@test_fcmp_ogt(half %a, half %b) #0 { + %r = fcmp ogt half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_oge( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oge_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oge_param_1]; +; CHECK-F16: setp.ge.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.ge.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_oge(half %a, half %b) #0 { + %r = fcmp oge half %a, %b + ret i1 %r +} + +; XCHECK-LABEL: test_fcmp_olt( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_olt_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_olt_param_1]; +; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_olt(half %a, half %b) #0 { + %r = fcmp olt half %a, %b + ret i1 %r +} + +; XCHECK-LABEL: test_fcmp_ole( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ole_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ole_param_1]; +; CHECK-F16: setp.le.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.le.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_ole(half %a, half %b) #0 { + %r = fcmp ole half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ord( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ord_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ord_param_1]; +; CHECK-F16: setp.num.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.num.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i1 @test_fcmp_ord(half %a, half %b) #0 { + %r = fcmp ord half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_br_cc( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_br_cc_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_br_cc_param_1]; +; CHECK-DAG: ld.param.u64 %[[C:rd[0-9]+]], [test_br_cc_param_2]; +; CHECK-DAG: ld.param.u64 %[[D:rd[0-9]+]], [test_br_cc_param_3]; +; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] +; CHECK-NEXT: @[[PRED]] bra [[LABEL:LBB.*]]; +; CHECK: st.u32 [%[[C]]], +; CHECK: [[LABEL]]: +; CHECK: st.u32 [%[[D]]], +; CHECK: ret; +define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 { + %c = fcmp uge half %a, %b + br i1 %c, label %then, label %else +then: + store i32 0, i32* %p1 + ret void +else: + store i32 0, i32* %p2 + ret void +} + +; CHECK-LABEL: test_phi( +; CHECK: ld.param.u64 %[[P1:rd[0-9]+]], [test_phi_param_0]; +; CHECK: 
ld.b16 {{%h[0-9]+}}, [%[[P1]]]; +; CHECK: [[LOOP:LBB[0-9_]+]]: +; CHECK: mov.b16 [[R:%h[0-9]+]], [[AB:%h[0-9]+]]; +; CHECK: ld.b16 [[AB:%h[0-9]+]], [%[[P1]]]; +; CHECK: { +; CHECK: st.param.b64 [param0+0], %[[P1]]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_dummy +; CHECK: } +; CHECK: setp.eq.b32 [[PRED:%p[0-9]+]], %r{{[0-9]+}}, 1; +; CHECK: @[[PRED]] bra [[LOOP]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_phi(half* %p1) #0 { +entry: + %a = load half, half* %p1 + br label %loop +loop: + %r = phi half [%a, %entry], [%b, %loop] + %b = load half, half* %p1 + %c = call i1 @test_dummy(half* %p1) + br i1 %c, label %loop, label %return +return: + ret half %r +} +declare i1 @test_dummy(half* %p1) #0 + +; CHECK-LABEL: test_fptosi_i32( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i32_param_0]; +; CHECK: cvt.rzi.s32.f16 [[R:%r[0-9]+]], [[A]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define i32 @test_fptosi_i32(half %a) #0 { + %r = fptosi half %a to i32 + ret i32 %r +} + +; CHECK-LABEL: test_fptosi_i64( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i64_param_0]; +; CHECK: cvt.rzi.s64.f16 [[R:%rd[0-9]+]], [[A]]; +; CHECK: st.param.b64 [func_retval0+0], [[R]]; +; CHECK: ret; +define i64 @test_fptosi_i64(half %a) #0 { + %r = fptosi half %a to i64 + ret i64 %r +} + +; CHECK-LABEL: test_fptoui_i32( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i32_param_0]; +; CHECK: cvt.rzi.u32.f16 [[R:%r[0-9]+]], [[A]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define i32 @test_fptoui_i32(half %a) #0 { + %r = fptoui half %a to i32 + ret i32 %r +} + +; CHECK-LABEL: test_fptoui_i64( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i64_param_0]; +; CHECK: cvt.rzi.u64.f16 [[R:%rd[0-9]+]], [[A]]; +; CHECK: st.param.b64 [func_retval0+0], [[R]]; +; CHECK: ret; +define i64 @test_fptoui_i64(half %a) #0 { + %r = fptoui half %a to i64 + ret i64 %r +} + +; CHECK-LABEL: test_uitofp_i32( +; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0]; +; CHECK: cvt.rn.f16.u32 [[R:%h[0-9]+]], [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_uitofp_i32(i32 %a) #0 { + %r = uitofp i32 %a to half + ret half %r +} + +; CHECK-LABEL: test_uitofp_i64( +; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0]; +; CHECK: cvt.rn.f16.u64 [[R:%h[0-9]+]], [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_uitofp_i64(i64 %a) #0 { + %r = uitofp i64 %a to half + ret half %r +} + +; CHECK-LABEL: test_sitofp_i32( +; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0]; +; CHECK: cvt.rn.f16.s32 [[R:%h[0-9]+]], [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_sitofp_i32(i32 %a) #0 { + %r = sitofp i32 %a to half + ret half %r +} + +; CHECK-LABEL: test_sitofp_i64( +; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0]; +; CHECK: cvt.rn.f16.s64 [[R:%h[0-9]+]], [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_sitofp_i64(i64 %a) #0 { + %r = sitofp i64 %a to half + ret half %r +} + +; CHECK-LABEL: test_uitofp_i32_fadd( +; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0]; +; CHECK-DAG: cvt.rn.f16.u32 [[C:%h[0-9]+]], [[A]]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_uitofp_i32_fadd_param_1]; +; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 
[[C32:%f[0-9]+]], [[C]] +; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]]; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 { + %c = uitofp i32 %a to half + %r = fadd half %b, %c + ret half %r +} + +; CHECK-LABEL: test_sitofp_i32_fadd( +; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0]; +; CHECK-DAG: cvt.rn.f16.s32 [[C:%h[0-9]+]], [[A]]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_sitofp_i32_fadd_param_1]; +; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]]; +; XCHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] +; XCHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]] +; XCHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]]; +; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 { + %c = sitofp i32 %a to half + %r = fadd half %b, %c + ret half %r +} + +; CHECK-LABEL: test_fptrunc_float( +; CHECK: ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0]; +; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_fptrunc_float(float %a) #0 { + %r = fptrunc float %a to half + ret half %r +} + +; CHECK-LABEL: test_fptrunc_double( +; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0]; +; CHECK: cvt.rn.f16.f64 [[R:%h[0-9]+]], [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_fptrunc_double(double %a) #0 { + %r = fptrunc double %a to half + ret half %r +} + +; CHECK-LABEL: test_fpext_float( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_float_param_0]; +; CHECK: cvt.f32.f16 [[R:%f[0-9]+]], [[A]]; +; CHECK: st.param.f32 [func_retval0+0], [[R]]; +; CHECK: ret; +define float @test_fpext_float(half %a) #0 { + %r = fpext half %a to float + ret float %r +} + +; CHECK-LABEL: test_fpext_double( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_double_param_0]; +; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]]; +; CHECK: st.param.f64 [func_retval0+0], [[R]]; +; CHECK: ret; +define double @test_fpext_double(half %a) #0 { + %r = fpext half %a to double + ret double %r +} + + +; CHECK-LABEL: test_bitcast_halftoi16( +; CHECK: ld.param.b16 [[AH:%h[0-9]+]], [test_bitcast_halftoi16_param_0]; +; CHECK: mov.b16 [[AS:%rs[0-9]+]], [[AH]] +; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[AS]] +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define i16 @test_bitcast_halftoi16(half %a) #0 { + %r = bitcast half %a to i16 + ret i16 %r +} + +; CHECK-LABEL: test_bitcast_i16tohalf( +; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0]; +; CHECK: mov.b16 [[AH:%h[0-9]+]], [[AS]] +; CHECK: st.param.b16 [func_retval0+0], [[AH]]; +; CHECK: ret; +define half @test_bitcast_i16tohalf(i16 %a) #0 { + %r = bitcast i16 %a to half + ret half %r +} + + +declare half @llvm.sqrt.f16(half %a) #0 +declare half @llvm.powi.f16(half %a, i32 %b) #0 +declare half @llvm.sin.f16(half %a) #0 +declare half @llvm.cos.f16(half %a) #0 +declare half @llvm.pow.f16(half %a, half %b) #0 +declare half @llvm.exp.f16(half %a) #0 +declare half @llvm.exp2.f16(half %a) #0 +declare half @llvm.log.f16(half %a) #0 +declare half @llvm.log10.f16(half %a) #0 +declare half @llvm.log2.f16(half %a) #0 +declare half @llvm.fma.f16(half %a, half %b, half %c) #0 +declare half @llvm.fabs.f16(half %a) #0 +declare half 
@llvm.minnum.f16(half %a, half %b) #0 +declare half @llvm.maxnum.f16(half %a, half %b) #0 +declare half @llvm.copysign.f16(half %a, half %b) #0 +declare half @llvm.floor.f16(half %a) #0 +declare half @llvm.ceil.f16(half %a) #0 +declare half @llvm.trunc.f16(half %a) #0 +declare half @llvm.rint.f16(half %a) #0 +declare half @llvm.nearbyint.f16(half %a) #0 +declare half @llvm.round.f16(half %a) #0 +declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0 + +; CHECK-LABEL: test_sqrt( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sqrt_param_0]; +; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK: sqrt.rn.f32 [[RF:%f[0-9]+]], [[AF]]; +; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_sqrt(half %a) #0 { + %r = call half @llvm.sqrt.f16(half %a) + ret half %r +} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_powi( +;define half @test_powi(half %a, i32 %b) #0 { +; %r = call half @llvm.powi.f16(half %a, i32 %b) +; ret half %r +;} + +; CHECK-LABEL: test_sin( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sin_param_0]; +; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK: sin.approx.f32 [[RF:%f[0-9]+]], [[AF]]; +; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_sin(half %a) #0 #1 { + %r = call half @llvm.sin.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_cos( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_cos_param_0]; +; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK: cos.approx.f32 [[RF:%f[0-9]+]], [[AF]]; +; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_cos(half %a) #0 #1 { + %r = call half @llvm.cos.f16(half %a) + ret half %r +} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_pow( +;define half @test_pow(half %a, half %b) #0 { +; %r = call half @llvm.pow.f16(half %a, half %b) +; ret half %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp( +;define half @test_exp(half %a) #0 { +; %r = call half @llvm.exp.f16(half %a) +; ret half %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp2( +;define half @test_exp2(half %a) #0 { +; %r = call half @llvm.exp2.f16(half %a) +; ret half %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log( +;define half @test_log(half %a) #0 { +; %r = call half @llvm.log.f16(half %a) +; ret half %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log10( +;define half @test_log10(half %a) #0 { +; %r = call half @llvm.log10.f16(half %a) +; ret half %r +;} + +;;; Can't do this yet: requires libcall. 
+; XCHECK-LABEL: test_log2( +;define half @test_log2(half %a) #0 { +; %r = call half @llvm.log2.f16(half %a) +; ret half %r +;} + +; CHECK-LABEL: test_fma( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fma_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fma_param_1]; +; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fma_param_2]; +; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]] +; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]]; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret +define half @test_fma(half %a, half %b, half %c) #0 { + %r = call half @llvm.fma.f16(half %a, half %b, half %c) + ret half %r +} + +; CHECK-LABEL: test_fabs( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fabs_param_0]; +; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK: abs.f32 [[RF:%f[0-9]+]], [[AF]]; +; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_fabs(half %a) #0 { + %r = call half @llvm.fabs.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_minnum( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_minnum_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_minnum_param_1]; +; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK: min.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]]; +; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_minnum(half %a, half %b) #0 { + %r = call half @llvm.minnum.f16(half %a, half %b) + ret half %r +} + +; CHECK-LABEL: test_maxnum( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_maxnum_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_maxnum_param_1]; +; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; +; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; +; CHECK: max.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]]; +; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_maxnum(half %a, half %b) #0 { + %r = call half @llvm.maxnum.f16(half %a, half %b) + ret half %r +} + +; CHECK-LABEL: test_copysign( +; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_param_0]; +; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_param_1]; +; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]]; +; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]]; +; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767; +; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768; +; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]]; +; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_copysign(half %a, half %b) #0 { + %r = call half @llvm.copysign.f16(half %a, half %b) + ret half %r +} + +; CHECK-LABEL: test_copysign_f32( +; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f32_param_0]; +; CHECK-DAG: ld.param.f32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1]; +; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]]; +; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]]; +; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767; +; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648; +; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16; +; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]]; +; CHECK: or.b16 
[[RX:%rs[0-9]+]], [[AX]], [[BX2]]; +; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_copysign_f32(half %a, float %b) #0 { + %tb = fptrunc float %b to half + %r = call half @llvm.copysign.f16(half %a, half %tb) + ret half %r +} + +; CHECK-LABEL: test_copysign_f64( +; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f64_param_0]; +; CHECK-DAG: ld.param.f64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1]; +; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]]; +; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]]; +; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767; +; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808; +; CHECK-DAG: shr.u64 [[BX1:%rd[0-9]+]], [[BX0]], 48; +; CHECK-DAG: cvt.u16.u64 [[BX2:%rs[0-9]+]], [[BX1]]; +; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]]; +; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_copysign_f64(half %a, double %b) #0 { + %tb = fptrunc double %b to half + %r = call half @llvm.copysign.f16(half %a, half %tb) + ret half %r +} + +; CHECK-LABEL: test_copysign_extended( +; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_extended_param_0]; +; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_extended_param_1]; +; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]]; +; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]]; +; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767; +; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768; +; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]]; +; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]]; +; CHECK: cvt.f32.f16 [[XR:%f[0-9]+]], [[R]]; +; CHECK: st.param.f32 [func_retval0+0], [[XR]]; +; CHECK: ret; +define float @test_copysign_extended(half %a, half %b) #0 { + %r = call half @llvm.copysign.f16(half %a, half %b) + %xr = fpext half %r to float + ret float %xr +} + +; CHECK-LABEL: test_floor( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_floor_param_0]; +; CHECK: cvt.rmi.f16.f16 [[R:%h[0-9]+]], [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_floor(half %a) #0 { + %r = call half @llvm.floor.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_ceil( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_ceil_param_0]; +; CHECK: cvt.rpi.f16.f16 [[R:%h[0-9]+]], [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_ceil(half %a) #0 { + %r = call half @llvm.ceil.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_trunc( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_trunc_param_0]; +; CHECK: cvt.rzi.f16.f16 [[R:%h[0-9]+]], [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_trunc(half %a) #0 { + %r = call half @llvm.trunc.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_rint( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_rint_param_0]; +; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_rint(half %a) #0 { + %r = call half @llvm.rint.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_nearbyint( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_nearbyint_param_0]; +; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_nearbyint(half %a) #0 { + %r = call half @llvm.nearbyint.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_round( +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_round_param_0]; +; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], 
[[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_round(half %a) #0 { + %r = call half @llvm.round.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_fmuladd( +; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmuladd_param_0]; +; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmuladd_param_1]; +; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fmuladd_param_2]; +; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]] +; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]]; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_fmuladd(half %a, half %b, half %c) #0 { + %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c) + ret half %r +} + +attributes #0 = { nounwind } +attributes #1 = { "unsafe-fp-math" = "true" } diff --git a/test/CodeGen/NVPTX/f16x2-instructions.ll b/test/CodeGen/NVPTX/f16x2-instructions.ll new file mode 100644 index 000000000000..33bb616d895c --- /dev/null +++ b/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -0,0 +1,1426 @@ +; ## Full FP16 support enabled by default. +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -disable-fp-elim \ +; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s +; ## FP16 support explicitly disabled. +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \ +; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s +; ## FP16 is not supported by hardware. 
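+; (sm_52 predates native f16 arithmetic, so this configuration is expected to
+; take the same f32-promotion path as the flag-disabled run above and shares
+; the CHECK-NOF16 prefixes.)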
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ +; RUN: -disable-post-ra -disable-fp-elim \ +; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: test_ret_const( +; CHECK: mov.u32 [[T:%r[0-9+]]], 1073757184; +; CHECK: mov.b32 [[R:%hh[0-9+]]], [[T]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_ret_const() #0 { + ret <2 x half> <half 1.0, half 2.0> +} + +; CHECK-LABEL: test_extract_0( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_0_param_0]; +; CHECK: mov.b32 {[[R:%h[0-9]+]], %tmp_hi}, [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_extract_0(<2 x half> %a) #0 { + %e = extractelement <2 x half> %a, i32 0 + ret half %e +} + +; CHECK-LABEL: test_extract_1( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_1_param_0]; +; CHECK: mov.b32 {%tmp_lo, [[R:%h[0-9]+]]}, [[A]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_extract_1(<2 x half> %a) #0 { + %e = extractelement <2 x half> %a, i32 1 + ret half %e +} + +; CHECK-LABEL: test_extract_i( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_i_param_0]; +; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1]; +; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0; +; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]]; +; CHECK: selp.b16 [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ret; +define half @test_extract_i(<2 x half> %a, i64 %idx) #0 { + %e = extractelement <2 x half> %a, i64 %idx + ret half %e +} + +; CHECK-LABEL: test_fadd( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_param_1]; +; +; CHECK-F16-NEXT: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 { + %r = fadd <2 x half> %a, %b + ret <2 x half> %r +} + +; Check that we can lower fadd with immediate arguments. 
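+; (For reference: the constant <half 1.0, half 2.0> packs into the 32-bit
+; immediate 0x40003C00 = 1073757184, element 0 in the low 16 bits, which is
+; the same value checked in test_ret_const above.)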
+; CHECK-LABEL: test_fadd_imm_0( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0]; +; +; CHECK-F16: mov.u32 [[I:%r[0-9+]]], 1073757184; +; CHECK-F16: mov.b32 [[IHH:%hh[0-9+]]], [[I]]; +; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 { + %r = fadd <2 x half> <half 1.0, half 2.0>, %a + ret <2 x half> %r +} + +; CHECK-LABEL: test_fadd_imm_1( +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0]; +; +; CHECK-F16: mov.u32 [[I:%r[0-9+]]], 1073757184; +; CHECK-F16: mov.b32 [[IHH:%hh[0-9+]]], [[I]]; +; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 { + %r = fadd <2 x half> %a, <half 1.0, half 2.0> + ret <2 x half> %r +} + +; CHECK-LABEL: test_fsub( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fsub_param_0]; +; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fsub_param_1]; +; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; +; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 { + %r = fsub <2 x half> %a, %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_fneg( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0]; +; +; CHECK-F16: mov.u32 [[I0:%r[0-9+]]], 0; +; CHECK-F16: mov.b32 [[IHH0:%hh[0-9+]]], [[I0]]; +; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000; +; 
CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]]; +; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fneg(<2 x half> %a) #0 { + %r = fsub <2 x half> <half 0.0, half 0.0>, %a + ret <2 x half> %r +} + +; CHECK-LABEL: test_fmul( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmul_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmul_param_1]; +; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; +; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 { + %r = fmul <2 x half> %a, %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_fdiv( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fdiv_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fdiv_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]; +; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]; +; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; +; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]; +; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 { + %r = fdiv <2 x half> %a, %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_frem( +; -- Load two 16x2 inputs and split them into f16 elements +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_frem_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_frem_param_1]; +; -- Split into elements +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; -- promote to f32. +; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]; +; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]; +; -- frem(a[0],b[0]). +; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]]; +; CHECK-DAG: cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]]; +; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]]; +; CHECK-DAG: sub.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RI0]]; +; -- frem(a[1],b[1]). 
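+; (The checks below repeat the same a - cvt.rmi(a/b) * b expansion for the
+; second element.)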
+; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]]; +; CHECK-DAG: cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]]; +; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]]; +; CHECK-DAG: sub.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RI1]]; +; -- convert back to f16. +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; -- merge into f16x2 and return it. +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { + %r = frem <2 x half> %a, %b + ret <2 x half> %r +} + +; CHECK-LABEL: .func test_ldst_v2f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1]; +; CHECK-DAG: ld.b32 [[E:%hh[0-9]+]], [%[[A]]] +; CHECK: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]]; +; CHECK-DAG: st.v2.b16 [%[[B]]], {[[E0]], [[E1]]}; +; CHECK: ret; +define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) { + %t1 = load <2 x half>, <2 x half>* %a + store <2 x half> %t1, <2 x half>* %b, align 16 + ret void +} + +; CHECK-LABEL: .func test_ldst_v3f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1]; +; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair +; number of bitshifting instructions that may change at llvm's whim. +; So we only verify that we only issue correct number of writes using +; correct offset, but not the values we write. +; CHECK-DAG: ld.u64 +; CHECK-DAG: st.u32 [%[[B]]], +; CHECK-DAG: st.b16 [%[[B]]+4], +; CHECK: ret; +define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) { + %t1 = load <3 x half>, <3 x half>* %a + store <3 x half> %t1, <3 x half>* %b, align 16 + ret void +} + +; CHECK-LABEL: .func test_ldst_v4f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1]; +; CHECK-DAG: ld.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]]; +; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: ret; +define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) { + %t1 = load <4 x half>, <4 x half>* %a + store <4 x half> %t1, <4 x half>* %b, align 16 + ret void +} + +; CHECK-LABEL: .func test_ldst_v8f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1]; +; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]]; +; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: ret; +define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) { + %t1 = load <8 x half>, <8 x half>* %a + store <8 x half> %t1, <8 x half>* %b, align 16 + ret void +} + +declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0 + +; CHECK-LABEL: test_call( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[A]]; +; CHECK-DAG: st.param.b32 [param1+0], [[B]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; 
+; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_call_flipped( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_flipped_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_flipped_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[B]]; +; CHECK-DAG: st.param.b32 [param1+0], [[A]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_tailcall_flipped( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[B]]; +; CHECK-DAG: st.param.b32 [param1+0], [[A]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 { + %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_select( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_param_1]; +; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] +; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; +; CHECK-NEXT: selp.b32 [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 { + %r = select i1 %c, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_select_cc( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_param_2]; +; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_param_3]; +; +; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] +; +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] +; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; +; 
CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 { + %cc = fcmp une <2 x half> %c, %d + %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_select_cc_f32_f16( +; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0]; +; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2]; +; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3]; +; +; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] +; +; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, + <2 x half> %c, <2 x half> %d) #0 { + %cc = fcmp une <2 x half> %c, %d + %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b + ret <2 x float> %r +} + +; CHECK-LABEL: test_select_cc_f16_f32( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1]; +; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2]; +; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3]; +; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]] +; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]] +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, + <2 x float> %c, <2 x float> %d) #0 { + %cc = fcmp une <2 x float> %c, %d + %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_fcmp_une( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_une_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_une_param_1]; +; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 
[[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp une <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ueq( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1]; +; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ueq <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ugt( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ugt_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1]; +; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ugt <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_uge( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uge_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uge_param_1]; +; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; 
CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp uge <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ult( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ult_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ult_param_1]; +; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ult <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ule( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ule_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ule_param_1]; +; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ule <2 x half> %a, %b + ret <2 x i1> %r +} + + +; CHECK-LABEL: test_fcmp_uno( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uno_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uno_param_1]; +; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp uno <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_one( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_one_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_one_param_1]; +; CHECK-F16: setp.ne.f16x2 
[[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp one <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_oeq( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1]; +; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp oeq <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ogt( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1]; +; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ogt <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_oge( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oge_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oge_param_1]; +; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; 
CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp oge <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_olt( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_olt_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_olt_param_1]; +; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp olt <2 x half> %a, %b + ret <2 x i1> %r +} + +; XCHECK-LABEL: test_fcmp_ole( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ole_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ole_param_1]; +; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ole <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ord( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ord_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ord_param_1]; +; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], 
[[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ord <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fptosi_i32( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i32_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { + %r = fptosi <2 x half> %a to <2 x i32> + ret <2 x i32> %r +} + +; CHECK-LABEL: test_fptosi_i64( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i64_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { + %r = fptosi <2 x half> %a to <2 x i64> + ret <2 x i64> %r +} + +; CHECK-LABEL: test_fptoui_2xi32( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { + %r = fptoui <2 x half> %a to <2 x i32> + ret <2 x i32> %r +} + +; CHECK-LABEL: test_fptoui_2xi64( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { + %r = fptoui <2 x half> %a to <2 x i64> + ret <2 x i64> %r +} + +; CHECK-LABEL: test_uitofp_2xi32( +; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0]; +; CHECK-DAG: cvt.rn.f16.u32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.u32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 { + %r = uitofp <2 x i32> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_uitofp_2xi64( +; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0]; +; CHECK-DAG: cvt.rn.f32.u64 [[F0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f32.u64 [[F1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 { + %r = uitofp <2 x i64> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_sitofp_2xi32( +; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0]; +; CHECK-DAG: cvt.rn.f16.s32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.s32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 { + %r = sitofp <2 x i32> %a to <2 x half> + ret <2 x half> %r 
+} + +; CHECK-LABEL: test_sitofp_2xi64( +; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0]; +; CHECK-DAG: cvt.rn.f32.s64 [[F0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f32.s64 [[F1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 { + %r = sitofp <2 x i64> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_uitofp_2xi32_fadd( +; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1]; +; CHECK-DAG: cvt.rn.f16.u32 [[C0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.u32 [[C1:%h[0-9]+]], [[A1]]; + +; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} +; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { + %c = uitofp <2 x i32> %a to <2 x half> + %r = fadd <2 x half> %b, %c + ret <2 x half> %r +} + +; CHECK-LABEL: test_sitofp_2xi32_fadd( +; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1]; +; CHECK-DAG: cvt.rn.f16.s32 [[C0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.s32 [[C1:%h[0-9]+]], [[A1]]; +; +; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} +; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { + %c = sitofp <2 x i32> %a to <2 x half> + %r = fadd <2 x half> %b, %c + ret <2 x half> %r +} + +; CHECK-LABEL: test_fptrunc_2xfloat( +; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define 
<2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { + %r = fptrunc <2 x float> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_fptrunc_2xdouble( +; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0]; +; CHECK-DAG: cvt.rn.f16.f64 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.f64 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 { + %r = fptrunc <2 x double> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_fpext_2xfloat( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]]; +; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK: ret; +define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { + %r = fpext <2 x half> %a to <2 x float> + ret <2 x float> %r +} + +; CHECK-LABEL: test_fpext_2xdouble( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]]; +; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK: ret; +define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { + %r = fpext <2 x half> %a to <2 x double> + ret <2 x double> %r +} + + +; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( +; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0]; +; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]] +; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16 +; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]] +; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { + %r = bitcast <2 x half> %a to <2 x i16> + ret <2 x i16> %r +} + +; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf( +; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0]; +; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]]; +; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]]; +; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16; +; CHECK-DAG: or.b32 [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], [[R1H0L]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 { + %r = bitcast <2 x i16> %a to <2 x half> + ret <2 x half> %r +} + + +declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0 +declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 +declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.copysign.f16(<2 x 
half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.round.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 + +; CHECK-LABEL: test_sqrt( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sqrt_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sqrt(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a) + ret <2 x half> %r +} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_powi( +;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 { +; %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) +; ret <2 x half> %r +;} + +; CHECK-LABEL: test_sin( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sin_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sin(<2 x half> %a) #0 #1 { + %r = call <2 x half> @llvm.sin.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_cos( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_cos_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_cos(<2 x half> %a) #0 #1 { + %r = call <2 x half> @llvm.cos.f16(<2 x half> %a) + ret <2 x half> %r +} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_pow( +;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 { +; %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp( +;define <2 x half> @test_exp(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.exp.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp2( +;define <2 x half> @test_exp2(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. 
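+; (XCHECK is not among the prefixes passed to FileCheck by the RUN lines
+; above, so these commented-out tests stay inert until libcall lowering is
+; available.)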
+; XCHECK-LABEL: test_log( +;define <2 x half> @test_log(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.log.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log10( +;define <2 x half> @test_log10(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.log10.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log2( +;define <2 x half> @test_log2(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.log2.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +; CHECK-LABEL: test_fma( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fma_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fma_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fma_param_2]; +; +; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} + +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret +define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { + %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) + ret <2 x half> %r +} + +; CHECK-LABEL: test_fabs( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fabs_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fabs(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_minnum( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_minnum_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_minnum_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; +; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; +; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; +; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> 
@test_minnum(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_maxnum( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_maxnum_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_maxnum_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; +; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; +; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; +; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; +; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_f32( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f32_param_0]; +; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; +; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; +; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16; +; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16; +; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]]; +; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]]; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; 
CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { + %tb = fptrunc <2 x float> %b to <2 x half> + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_f64( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f64_param_0]; +; CHECK-DAG: ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808; +; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808; +; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48; +; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48; +; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]]; +; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]]; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { + %tb = fptrunc <2 x double> %b to <2 x half> + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_extended( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_extended_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_extended_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; +; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: mov.b32 {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]] +; CHECK-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[RX0]]; +; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[RX1]]; +; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]}; +; CHECK: ret; +define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) + %xr = fpext <2 x half> %r to <2 x float> + ret <2 x float> %xr +} + +; CHECK-LABEL: test_floor( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_floor_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], 
{[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_floor(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.floor.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_ceil( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_ceil_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_ceil(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_trunc( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_trunc_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_trunc(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_rint( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_rint_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_rint(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.rint.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_nearbyint( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_nearbyint_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_nearbyint(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_round( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_round(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.round.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_fmuladd( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmuladd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmuladd_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fmuladd_param_2]; +; +; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 
[[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+ %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+ ret <2 x half> %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "unsafe-fp-math" = "true" }
diff --git a/test/CodeGen/NVPTX/fast-math.ll b/test/CodeGen/NVPTX/fast-math.ll
index d0a333d369ca..56b1f88f3b2e 100644
--- a/test/CodeGen/NVPTX/fast-math.ll
+++ b/test/CodeGen/NVPTX/fast-math.ll
@@ -1,25 +1,91 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-declare float @llvm.nvvm.sqrt.f(float)
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.sqrt.f64(double)
-; CHECK-LABEL: sqrt_div
+; CHECK-LABEL: sqrt_div(
; CHECK: sqrt.rn.f32
; CHECK: div.rn.f32
define float @sqrt_div(float %a, float %b) {
- %t1 = tail call float @llvm.nvvm.sqrt.f(float %a)
+ %t1 = tail call float @llvm.sqrt.f32(float %a)
 %t2 = fdiv float %t1, %b
 ret float %t2
}
-; CHECK-LABEL: sqrt_div_fast
+; CHECK-LABEL: sqrt_div_fast(
; CHECK: sqrt.approx.f32
; CHECK: div.approx.f32
define float @sqrt_div_fast(float %a, float %b) #0 {
- %t1 = tail call float @llvm.nvvm.sqrt.f(float %a)
+ %t1 = tail call float @llvm.sqrt.f32(float %a)
 %t2 = fdiv float %t1, %b
 ret float %t2
}
+; CHECK-LABEL: sqrt_div_ftz(
+; CHECK: sqrt.rn.ftz.f32
+; CHECK: div.rn.ftz.f32
+define float @sqrt_div_ftz(float %a, float %b) #1 {
+ %t1 = tail call float @llvm.sqrt.f32(float %a)
+ %t2 = fdiv float %t1, %b
+ ret float %t2
+}
+
+; CHECK-LABEL: sqrt_div_fast_ftz(
+; CHECK: sqrt.approx.ftz.f32
+; CHECK: div.approx.ftz.f32
+define float @sqrt_div_fast_ftz(float %a, float %b) #0 #1 {
+ %t1 = tail call float @llvm.sqrt.f32(float %a)
+ %t2 = fdiv float %t1, %b
+ ret float %t2
+}
+
+; There are no fast-math or ftz versions of sqrt and div for f64. We use
+; reciprocal(rsqrt(x)) for sqrt(x), and emit a vanilla divide.
+
+; CHECK-LABEL: sqrt_div_fast_ftz_f64(
+; CHECK: rsqrt.approx.f64
+; CHECK: rcp.approx.ftz.f64
+; CHECK: div.rn.f64
+define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 {
+ %t1 = tail call double @llvm.sqrt.f64(double %a)
+ %t2 = fdiv double %t1, %b
+ ret double %t2
+}
+
+; CHECK-LABEL: rsqrt(
+; CHECK-NOT: rsqrt.approx
+; CHECK: sqrt.rn.f32
+; CHECK-NOT: rsqrt.approx
+define float @rsqrt(float %a) {
+ %b = tail call float @llvm.sqrt.f32(float %a)
+ %ret = fdiv float 1.0, %b
+ ret float %ret
+}
+
+; CHECK-LABEL: rsqrt_fast(
+; CHECK-NOT: div.
+; CHECK-NOT: sqrt.
+; CHECK: rsqrt.approx.f32
+; CHECK-NOT: div.
+; CHECK-NOT: sqrt.
+define float @rsqrt_fast(float %a) #0 {
+ %b = tail call float @llvm.sqrt.f32(float %a)
+ %ret = fdiv float 1.0, %b
+ ret float %ret
+}
+
+; CHECK-LABEL: rsqrt_fast_ftz(
+; CHECK-NOT: div.
+; CHECK-NOT: sqrt.
+; CHECK: rsqrt.approx.ftz.f32
+; CHECK-NOT: div.
+; CHECK-NOT: sqrt.
+define float @rsqrt_fast_ftz(float %a) #0 #1 { + %b = tail call float @llvm.sqrt.f32(float %a) + %ret = fdiv float 1.0, %b + ret float %ret +} + ; CHECK-LABEL: fadd ; CHECK: add.rn.f32 define float @fadd(float %a, float %b) { @@ -34,5 +100,66 @@ define float @fadd_ftz(float %a, float %b) #1 { ret float %t1 } +declare float @llvm.sin.f32(float) +declare float @llvm.cos.f32(float) + +; CHECK-LABEL: fsin_approx +; CHECK: sin.approx.f32 +define float @fsin_approx(float %a) #0 { + %r = tail call float @llvm.sin.f32(float %a) + ret float %r +} + +; CHECK-LABEL: fcos_approx +; CHECK: cos.approx.f32 +define float @fcos_approx(float %a) #0 { + %r = tail call float @llvm.cos.f32(float %a) + ret float %r +} + +; CHECK-LABEL: repeated_div_recip_allowed +define float @repeated_div_recip_allowed(i1 %pred, float %a, float %b, float %divisor) { +; CHECK: rcp.rn.f32 +; CHECK: mul.rn.f32 +; CHECK: mul.rn.f32 + %x = fdiv arcp float %a, %divisor + %y = fdiv arcp float %b, %divisor + %z = select i1 %pred, float %x, float %y + ret float %z +} + +; CHECK-LABEL: repeated_div_recip_allowed_ftz +define float @repeated_div_recip_allowed_ftz(i1 %pred, float %a, float %b, float %divisor) #1 { +; CHECK: rcp.rn.ftz.f32 +; CHECK: mul.rn.ftz.f32 +; CHECK: mul.rn.ftz.f32 + %x = fdiv arcp float %a, %divisor + %y = fdiv arcp float %b, %divisor + %z = select i1 %pred, float %x, float %y + ret float %z +} + +; CHECK-LABEL: repeated_div_fast +define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) #0 { +; CHECK: rcp.approx.f32 +; CHECK: mul.f32 +; CHECK: mul.f32 + %x = fdiv float %a, %divisor + %y = fdiv float %b, %divisor + %z = select i1 %pred, float %x, float %y + ret float %z +} + +; CHECK-LABEL: repeated_div_fast_ftz +define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor) #0 #1 { +; CHECK: rcp.approx.ftz.f32 +; CHECK: mul.ftz.f32 +; CHECK: mul.ftz.f32 + %x = fdiv float %a, %divisor + %y = fdiv float %b, %divisor + %z = select i1 %pred, float %x, float %y + ret float %z +} + attributes #0 = { "unsafe-fp-math" = "true" } attributes #1 = { "nvptx-f32ftz" = "true" } diff --git a/test/CodeGen/NVPTX/fcos-no-fast-math.ll b/test/CodeGen/NVPTX/fcos-no-fast-math.ll new file mode 100644 index 000000000000..d435c1d14fee --- /dev/null +++ b/test/CodeGen/NVPTX/fcos-no-fast-math.ll @@ -0,0 +1,14 @@ +; RUN: not llc < %s -march=nvptx -mcpu=sm_20 2>&1 | FileCheck %s + +; Check that we fail to select fcos without fast-math enabled + +declare float @llvm.cos.f32(float) + +; CHECK: LLVM ERROR: Cannot select: {{.*}}: f32 = fcos +; CHECK: In function: test_fcos_safe +define float @test_fcos_safe(float %a) #0 { + %r = tail call float @llvm.cos.f32(float %a) + ret float %r +} + +attributes #0 = { "unsafe-fp-math" = "false" } diff --git a/test/CodeGen/NVPTX/fsin-no-fast-math.ll b/test/CodeGen/NVPTX/fsin-no-fast-math.ll new file mode 100644 index 000000000000..56396b849250 --- /dev/null +++ b/test/CodeGen/NVPTX/fsin-no-fast-math.ll @@ -0,0 +1,14 @@ +; RUN: not llc < %s -march=nvptx -mcpu=sm_20 2>&1 | FileCheck %s + +; Check that we fail to select fsin without fast-math enabled + +declare float @llvm.sin.f32(float) + +; CHECK: LLVM ERROR: Cannot select: {{.*}}: f32 = fsin +; CHECK: In function: test_fsin_safe +define float @test_fsin_safe(float %a) #0 { + %r = tail call float @llvm.sin.f32(float %a) + ret float %r +} + +attributes #0 = { "unsafe-fp-math" = "false" } diff --git a/test/CodeGen/NVPTX/global-variable-big.ll b/test/CodeGen/NVPTX/global-variable-big.ll new file mode 100644 index 
000000000000..0c769a856080 --- /dev/null +++ b/test/CodeGen/NVPTX/global-variable-big.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; Check that we can handle global variables of large integer type. + +; (lsb) 0x0102'0304'0506...0F10 (msb) +@gv = addrspace(1) externally_initialized global i128 21345817372864405881847059188222722561, align 16 +; CHECK: .visible .global .align 16 .b8 gv[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; diff --git a/test/CodeGen/NVPTX/half.ll b/test/CodeGen/NVPTX/half.ll index b99524162e65..6b8d01e0ed1b 100644 --- a/test/CodeGen/NVPTX/half.ll +++ b/test/CodeGen/NVPTX/half.ll @@ -2,8 +2,8 @@ define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) { ; CHECK-LABEL: @test_load_store -; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}] -; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]] +; CHECK: ld.global.b16 [[TMP:%h[0-9]+]], [{{%r[0-9]+}}] +; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]] %val = load half, half addrspace(1)* %in store half %val, half addrspace(1) * %out ret void @@ -11,8 +11,8 @@ define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) { define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) { ; CHECK-LABEL: @test_bitcast_from_half -; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}] -; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]] +; CHECK: ld.global.b16 [[TMP:%h[0-9]+]], [{{%r[0-9]+}}] +; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]] %val = load half, half addrspace(1) * %in %val_int = bitcast half %val to i16 store i16 %val_int, i16 addrspace(1)* %out diff --git a/test/CodeGen/NVPTX/idioms.ll b/test/CodeGen/NVPTX/idioms.ll new file mode 100644 index 000000000000..047325c85165 --- /dev/null +++ b/test/CodeGen/NVPTX/idioms.ll @@ -0,0 +1,31 @@ +; Check that various LLVM idioms get lowered to NVPTX as expected. 
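+; (The abs_* functions below use the canonical sub/icmp/select form that the
+; mid-level optimizer produces for integer absolute value; each one is expected
+; to lower to a single abs.s16/abs.s32/abs.s64 instruction.)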
+
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+; CHECK-LABEL: abs_i16(
+define i16 @abs_i16(i16 %a) {
+; CHECK: abs.s16
+ %neg = sub i16 0, %a
+ %abs.cond = icmp sge i16 %a, 0
+ %abs = select i1 %abs.cond, i16 %a, i16 %neg
+ ret i16 %abs
+}
+
+; CHECK-LABEL: abs_i32(
+define i32 @abs_i32(i32 %a) {
+; CHECK: abs.s32
+ %neg = sub i32 0, %a
+ %abs.cond = icmp sge i32 %a, 0
+ %abs = select i1 %abs.cond, i32 %a, i32 %neg
+ ret i32 %abs
+}
+
+; CHECK-LABEL: abs_i64(
+define i64 @abs_i64(i64 %a) {
+; CHECK: abs.s64
+ %neg = sub i64 0, %a
+ %abs.cond = icmp sge i64 %a, 0
+ %abs = select i1 %abs.cond, i64 %a, i64 %neg
+ ret i64 %abs
+}
diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll
index 06a8712c2102..668de8a994bc 100644
--- a/test/CodeGen/NVPTX/intrinsics.ll
+++ b/test/CodeGen/NVPTX/intrinsics.ll
@@ -1,28 +1,105 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-define ptx_device float @test_fabsf(float %f) {
-; CHECK: abs.f32 %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %x = call float @llvm.fabs.f32(float %f)
- ret float %x
+; CHECK-LABEL: test_fabsf(
+define float @test_fabsf(float %f) {
+; CHECK: abs.f32
+ %x = call float @llvm.fabs.f32(float %f)
+ ret float %x
}
-define ptx_device double @test_fabs(double %d) {
-; CHECK: abs.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %x = call double @llvm.fabs.f64(double %d)
- ret double %x
+; CHECK-LABEL: test_fabs(
+define double @test_fabs(double %d) {
+; CHECK: abs.f64
+ %x = call double @llvm.fabs.f64(double %d)
+ ret double %x
}
+; CHECK-LABEL: test_nvvm_sqrt(
define float @test_nvvm_sqrt(float %a) {
-; CHECK: sqrt.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
+; CHECK: sqrt.rn.f32
 %val = call float @llvm.nvvm.sqrt.f(float %a)
 ret float %val
}
+; CHECK-LABEL: test_llvm_sqrt(
+define float @test_llvm_sqrt(float %a) {
+; CHECK: sqrt.rn.f32
+ %val = call float @llvm.sqrt.f32(float %a)
+ ret float %val
+}
+
+; CHECK-LABEL: test_bitreverse32(
+define i32 @test_bitreverse32(i32 %a) {
+; CHECK: brev.b32
+ %val = call i32 @llvm.bitreverse.i32(i32 %a)
+ ret i32 %val
+}
+
+; CHECK-LABEL: test_bitreverse64(
+define i64 @test_bitreverse64(i64 %a) {
+; CHECK: brev.b64
+ %val = call i64 @llvm.bitreverse.i64(i64 %a)
+ ret i64 %val
+}
+
+; CHECK-LABEL: test_popc32(
+define i32 @test_popc32(i32 %a) {
+; CHECK: popc.b32
+ %val = call i32 @llvm.ctpop.i32(i32 %a)
+ ret i32 %val
+}
+
+; CHECK-LABEL: test_popc64
+define i64 @test_popc64(i64 %a) {
+; CHECK: popc.b64
+; CHECK: cvt.u64.u32
+ %val = call i64 @llvm.ctpop.i64(i64 %a)
+ ret i64 %val
+}
+
+; NVPTX popc.b64 returns an i32 even though @llvm.ctpop.i64 returns an i64, so
+; if this function returns an i32, there's no need to do any type conversions
+; in the ptx.
+; CHECK-LABEL: test_popc64_trunc
+define i32 @test_popc64_trunc(i64 %a) {
+; CHECK: popc.b64
+; CHECK-NOT: cvt.
+ %val = call i64 @llvm.ctpop.i64(i64 %a)
+ %trunc = trunc i64 %val to i32
+ ret i32 %trunc
+}
+
+; llvm.ctpop.i16 is implemented by converting to i32, running popc.b32, and
+; then converting back to i16.
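+; (PTX only provides popc.b32 and popc.b64, so the i16 operand is expected to
+; be widened with cvt.u32.u16 before the count and narrowed again afterwards,
+; as the CHECK lines below reflect.)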
+; CHECK-LABEL: test_popc16 +define void @test_popc16(i16 %a, i16* %b) { +; CHECK: cvt.u32.u16 +; CHECK: popc.b32 +; CHECK: cvt.u16.u32 + %val = call i16 @llvm.ctpop.i16(i16 %a) + store i16 %val, i16* %b + ret void +} + +; If we call llvm.ctpop.i16 and then zext the result to i32, we shouldn't need +; to do any conversions after calling popc.b32, because that returns an i32. +; CHECK-LABEL: test_popc16_to_32 +define i32 @test_popc16_to_32(i16 %a) { +; CHECK: cvt.u32.u16 +; CHECK: popc.b32 +; CHECK-NOT: cvt. + %val = call i16 @llvm.ctpop.i16(i16 %a) + %zext = zext i16 %val to i32 + ret i32 %zext +} declare float @llvm.fabs.f32(float) declare double @llvm.fabs.f64(double) declare float @llvm.nvvm.sqrt.f(float) +declare float @llvm.sqrt.f32(float) +declare i32 @llvm.bitreverse.i32(i32) +declare i64 @llvm.bitreverse.i64(i64) +declare i16 @llvm.ctpop.i16(i16) +declare i32 @llvm.ctpop.i32(i32) +declare i64 @llvm.ctpop.i64(i64) diff --git a/test/CodeGen/NVPTX/ldg-invariant.ll b/test/CodeGen/NVPTX/ldg-invariant.ll index 40dad1f1769b..311bea6f4164 100644 --- a/test/CodeGen/NVPTX/ldg-invariant.ll +++ b/test/CodeGen/NVPTX/ldg-invariant.ll @@ -10,6 +10,30 @@ define i32 @ld_global(i32 addrspace(1)* %ptr) { ret i32 %a } +; CHECK-LABEL: @ld_global_v2i32 +define i32 @ld_global_v2i32(<2 x i32> addrspace(1)* %ptr) { +; CHECK: ld.global.nc.v2.{{[a-z]}}32 + %a = load <2 x i32>, <2 x i32> addrspace(1)* %ptr, !invariant.load !0 + %v1 = extractelement <2 x i32> %a, i32 0 + %v2 = extractelement <2 x i32> %a, i32 1 + %sum = add i32 %v1, %v2 + ret i32 %sum +} + +; CHECK-LABEL: @ld_global_v4i32 +define i32 @ld_global_v4i32(<4 x i32> addrspace(1)* %ptr) { +; CHECK: ld.global.nc.v4.{{[a-z]}}32 + %a = load <4 x i32>, <4 x i32> addrspace(1)* %ptr, !invariant.load !0 + %v1 = extractelement <4 x i32> %a, i32 0 + %v2 = extractelement <4 x i32> %a, i32 1 + %v3 = extractelement <4 x i32> %a, i32 2 + %v4 = extractelement <4 x i32> %a, i32 3 + %sum1 = add i32 %v1, %v2 + %sum2 = add i32 %v3, %v4 + %sum3 = add i32 %sum1, %sum2 + ret i32 %sum3 +} + ; CHECK-LABEL: @ld_not_invariant define i32 @ld_not_invariant(i32 addrspace(1)* %ptr) { ; CHECK: ld.global.{{[a-z]}}32 diff --git a/test/CodeGen/NVPTX/ldparam-v4.ll b/test/CodeGen/NVPTX/ldparam-v4.ll index ec306aafe854..4d082f6e9a58 100644 --- a/test/CodeGen/NVPTX/ldparam-v4.ll +++ b/test/CodeGen/NVPTX/ldparam-v4.ll @@ -2,8 +2,11 @@ declare <4 x float> @bar() +; CHECK-LABEL: .func foo( define void @foo(<4 x float>* %ptr) { -; CHECK: ld.param.v4.f32 +; CHECK: ld.param.u32 %[[PTR:r[0-9]+]], [foo_param_0]; +; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0+0]; +; CHECK: st.v4.f32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]} %val = tail call <4 x float> @bar() store <4 x float> %val, <4 x float>* %ptr ret void diff --git a/test/CodeGen/NVPTX/lower-aggr-copies.ll b/test/CodeGen/NVPTX/lower-aggr-copies.ll index ef570982b808..192d4becb059 100644 --- a/test/CodeGen/NVPTX/lower-aggr-copies.ll +++ b/test/CodeGen/NVPTX/lower-aggr-copies.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to @@ -27,9 +27,9 @@ entry: ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: ld.u8 %rs[[REG:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; 
PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 +; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] } define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 { @@ -45,9 +45,9 @@ entry: ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: ld.volatile.u8 %rs[[REG:[0-9]+]] ; PTX: st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 +; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] } define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 { @@ -78,12 +78,26 @@ entry: ; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]] ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memset_caller( -; PTX: ld.param.u8 %rs[[REG:[0-9]+]] +; PTX: ld.param.u32 %r[[C:[0-9]+]] +; PTX: cvt.u16.u32 %rs[[REG:[0-9]+]], %r[[C]]; ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 +; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] +} + +define i8* @volatile_memset_caller(i8* %dst, i32 %c, i64 %n) #0 { +entry: + %0 = trunc i32 %c to i8 + tail call void @llvm.memset.p0i8.i64(i8* %dst, i8 %0, i64 %n, i32 1, i1 true) + ret i8* %dst + +; IR-LABEL: @volatile_memset_caller +; IR: [[VAL:%[0-9]+]] = trunc i32 %c to i8 +; IR: loadstoreloop: +; IR: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 +; IR-NEXT: store volatile i8 [[VAL]], i8* [[STOREPTR]] } define i8* @memmove_caller(i8* %dst, i8* %src, i64 %n) #0 { @@ -100,12 +114,12 @@ entry: ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memmove_caller( ; PTX: ld.param.u64 %rd[[N:[0-9]+]] -; PTX: setp.eq.s64 %p[[NEQ0:[0-9]+]], %rd[[N]], 0 -; PTX: setp.ge.u64 %p[[SRC_GT_THAN_DST:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; PTX-DAG: setp.eq.s64 %p[[NEQ0:[0-9]+]], %rd[[N]], 0 +; PTX-DAG: setp.ge.u64 %p[[SRC_GT_THAN_DST:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} ; PTX-NEXT: @%p[[SRC_GT_THAN_DST]] bra LBB[[FORWARD_BB:[0-9_]+]] ; -- this is the backwards copying BB ; PTX: @%p[[NEQ0]] bra LBB[[EXIT:[0-9_]+]] -; PTX: add.s64 %rd[[N]], %rd[[N]], -1 +; PTX: add.s64 %rd{{[0-9]}}, %rd{{[0-9]}}, -1 ; PTX: ld.u8 %rs[[ELEMENT:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]] ; -- this is the forwards copying BB @@ -113,7 +127,7 @@ entry: ; PTX: @%p[[NEQ0]] bra LBB[[EXIT]] ; PTX: ld.u8 %rs[[ELEMENT2:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]] -; PTX: add.s64 %rd[[INDEX:[0-9]+]], %rd[[INDEX]], 1 +; PTX: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 1 ; -- exit block ; PTX: LBB[[EXIT]]: ; PTX-NEXT: st.param.b64 [func_retval0 diff --git a/test/CodeGen/NVPTX/lower-alloca.ll b/test/CodeGen/NVPTX/lower-alloca.ll index 4177cd1fe977..3db225ef0e75 100644 --- a/test/CodeGen/NVPTX/lower-alloca.ll +++ b/test/CodeGen/NVPTX/lower-alloca.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -nvptx-lower-alloca -nvptx-infer-addrspace | FileCheck %s +; RUN: opt < %s -S -nvptx-lower-alloca -infer-address-spaces | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/test/CodeGen/NVPTX/math-intrins.ll b/test/CodeGen/NVPTX/math-intrins.ll index de911d050755..828a8807dcfa 100644 --- a/test/CodeGen/NVPTX/math-intrins.ll +++ b/test/CodeGen/NVPTX/math-intrins.ll @@ -21,6 +21,8 @@ declare float @llvm.minnum.f32(float, float) #0 declare double @llvm.minnum.f64(double, double) #0 declare float @llvm.maxnum.f32(float, float) #0 declare double @llvm.maxnum.f64(double, double) #0 +declare float @llvm.fma.f32(float, float, float) #0 +declare double @llvm.fma.f64(double, double, double) #0 ; ---- ceil ---- @@ -257,5 +259,28 @@ define double @max_double(double %a, double %b) { ret double %x } +; ---- fma ---- + +; CHECK-LABEL: @fma_float +define float @fma_float(float %a, float %b, float %c) { + ; CHECK: fma.rn.f32 + %x = call float @llvm.fma.f32(float %a, float %b, float %c) + ret float %x +} + +; CHECK-LABEL: @fma_float_ftz +define float @fma_float_ftz(float %a, float %b, float %c) #1 { + ; CHECK: fma.rn.ftz.f32 + %x = call float @llvm.fma.f32(float %a, float %b, float %c) + ret float %x +} + +; CHECK-LABEL: @fma_double +define double @fma_double(double %a, double %b, double %c) { + ; CHECK: fma.rn.f64 + %x = call double @llvm.fma.f64(double %a, double %b, double %c) + ret double %x +} + attributes #0 = { nounwind readnone } attributes #1 = { "nvptx-f32ftz" = "true" } diff --git a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll index 2ad72b018851..036d9638ceac 100644 --- a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll +++ b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll @@ -41,6 +41,64 @@ define <4 x float> @t4(i8* %p1) { ret <4 x float> %r } +; CHECK-LABEL: .visible .func test_v1halfp0a1( +; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v1halfp0a1_param_0]; +; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v1halfp0a1_param_1]; +; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] +; CHECK-DAG: st.u8 [%[[TO]]], [[B0]] +; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] +; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]] +; CHECK: ret +define void @test_v1halfp0a1(<1 x half> * noalias readonly %from, <1 x half> * %to) { + %1 = load <1 x half>, <1 x half> * %from , align 1 + store <1 x half> %1, <1 x half> * %to , align 1 + ret void +} + +; CHECK-LABEL: .visible .func test_v2halfp0a1( +; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v2halfp0a1_param_0]; +; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v2halfp0a1_param_1]; +; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] +; CHECK-DAG: st.u8 [%[[TO]]], +; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] +; CHECK-DAG: st.u8 [%[[TO]]+1], +; CHECK-DAG: ld.u8 [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2] +; CHECK-DAG: st.u8 [%[[TO]]+2], +; CHECK-DAG: ld.u8 [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3] +; CHECK-DAG: st.u8 [%[[TO]]+3], +; CHECK: ret +define void @test_v2halfp0a1(<2 x half> * noalias readonly %from, <2 x half> * %to) { + %1 = load <2 x half>, <2 x half> * %from , align 1 + store <2 x half> %1, <2 x half> * %to , align 1 + ret void +} + +; CHECK-LABEL: .visible .func test_v4halfp0a1( +; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v4halfp0a1_param_0]; +; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v4halfp0a1_param_1]; +; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] +; CHECK-DAG: st.u8 [%[[TO]]], [[B0]] +; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] +; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]] +; CHECK-DAG: 
ld.u8 [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2] +; CHECK-DAG: st.u8 [%[[TO]]+2], [[B2]] +; CHECK-DAG: ld.u8 [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3] +; CHECK-DAG: st.u8 [%[[TO]]+3], [[B3]] +; CHECK-DAG: ld.u8 [[B4:%r[sd]?[0-9]+]], [%[[FROM]]+4] +; CHECK-DAG: st.u8 [%[[TO]]+4], [[B4]] +; CHECK-DAG: ld.u8 [[B5:%r[sd]?[0-9]+]], [%[[FROM]]+5] +; CHECK-DAG: st.u8 [%[[TO]]+5], [[B5]] +; CHECK-DAG: ld.u8 [[B6:%r[sd]?[0-9]+]], [%[[FROM]]+6] +; CHECK-DAG: st.u8 [%[[TO]]+6], [[B6]] +; CHECK-DAG: ld.u8 [[B7:%r[sd]?[0-9]+]], [%[[FROM]]+7] +; CHECK-DAG: st.u8 [%[[TO]]+7], [[B7]] +; CHECK: ret +define void @test_v4halfp0a1(<4 x half> * noalias readonly %from, <4 x half> * %to) { + %1 = load <4 x half>, <4 x half> * %from , align 1 + store <4 x half> %1, <4 x half> * %to , align 1 + ret void +} + ; CHECK-LABEL: s1 define void @s1(<4 x float>* %p1, <4 x float> %v) { diff --git a/test/CodeGen/NVPTX/named-barriers.ll b/test/CodeGen/NVPTX/named-barriers.ll new file mode 100644 index 000000000000..accc0fd6fef7 --- /dev/null +++ b/test/CodeGen/NVPTX/named-barriers.ll @@ -0,0 +1,40 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s + +; Use bar.sync to arrive at a pre-computed barrier number and +; wait for all threads in CTA to also arrive: +define ptx_device void @test_barrier_named_cta() { +; CHECK: mov.u32 %r[[REG0:[0-9]+]], 0; +; CHECK: bar.sync %r[[REG0]]; +; CHECK: mov.u32 %r[[REG1:[0-9]+]], 10; +; CHECK: bar.sync %r[[REG1]]; +; CHECK: mov.u32 %r[[REG2:[0-9]+]], 15; +; CHECK: bar.sync %r[[REG2]]; +; CHECK: ret; + call void @llvm.nvvm.barrier.n(i32 0) + call void @llvm.nvvm.barrier.n(i32 10) + call void @llvm.nvvm.barrier.n(i32 15) + ret void +} + +; Use bar.sync to arrive at a pre-computed barrier number and +; wait for fixed number of cooperating threads to arrive: +define ptx_device void @test_barrier_named() { +; CHECK: mov.u32 %r[[REG0A:[0-9]+]], 32; +; CHECK: mov.u32 %r[[REG0B:[0-9]+]], 0; +; CHECK: bar.sync %r[[REG0B]], %r[[REG0A]]; +; CHECK: mov.u32 %r[[REG1A:[0-9]+]], 352; +; CHECK: mov.u32 %r[[REG1B:[0-9]+]], 10; +; CHECK: bar.sync %r[[REG1B]], %r[[REG1A]]; +; CHECK: mov.u32 %r[[REG2A:[0-9]+]], 992; +; CHECK: mov.u32 %r[[REG2B:[0-9]+]], 15; +; CHECK: bar.sync %r[[REG2B]], %r[[REG2A]]; +; CHECK: ret; + call void @llvm.nvvm.barrier(i32 0, i32 32) + call void @llvm.nvvm.barrier(i32 10, i32 352) + call void @llvm.nvvm.barrier(i32 15, i32 992) + ret void +} + +declare void @llvm.nvvm.barrier(i32, i32) +declare void @llvm.nvvm.barrier.n(i32) diff --git a/test/CodeGen/NVPTX/nvvm-reflect.ll b/test/CodeGen/NVPTX/nvvm-reflect.ll index 8c75dfc30a56..165597d6baff 100644 --- a/test/CodeGen/NVPTX/nvvm-reflect.ll +++ b/test/CodeGen/NVPTX/nvvm-reflect.ll @@ -1,30 +1,38 @@ -; RUN: opt < %s -S -nvvm-reflect -nvvm-reflect-list USE_MUL=0 -O2 | FileCheck %s --check-prefix=USE_MUL_0 -; RUN: opt < %s -S -nvvm-reflect -nvvm-reflect-list USE_MUL=1 -O2 | FileCheck %s --check-prefix=USE_MUL_1 +; We run nvvm-reflect (and then optimize) this module twice, once with metadata +; that enables FTZ, and again with metadata that disables it. 
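+; (Each RUN line below appends a module flag such as
+;   !{i32 4, !"nvvm-reflect-ftz", i32 1}
+; to a copy of this file before invoking opt, so each FileCheck prefix sees a
+; differently configured module.)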
-@str = private unnamed_addr addrspace(4) constant [8 x i8] c"USE_MUL\00" +; RUN: cat %s > %t.noftz +; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz +; RUN: opt %t.noftz -S -nvvm-reflect -O2 \ +; RUN: | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK + +; RUN: cat %s > %t.ftz +; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz +; RUN: opt %t.ftz -S -nvvm-reflect -O2 \ +; RUN: | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK + +@str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00" declare i32 @__nvvm_reflect(i8*) declare i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)*) +; CHECK-LABEL: @foo define float @foo(float %a, float %b) { -; USE_MUL_0: define float @foo -; USE_MUL_0-NOT: call i32 @__nvvm_reflect -; USE_MUL_1: define float @foo -; USE_MUL_1-NOT: call i32 @__nvvm_reflect - %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0)) +; CHECK-NOT: call i32 @__nvvm_reflect + %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(4)* @str, i32 0, i32 0)) %reflect = tail call i32 @__nvvm_reflect(i8* %ptr) %cmp = icmp ugt i32 %reflect, 0 br i1 %cmp, label %use_mul, label %use_add use_mul: -; USE_MUL_1: fmul float %a, %b -; USE_MUL_0-NOT: fadd float %a, %b +; USE_FTZ_1: fmul float %a, %b +; USE_FTZ_0-NOT: fadd float %a, %b %ret1 = fmul float %a, %b br label %exit use_add: -; USE_MUL_0: fadd float %a, %b -; USE_MUL_1-NOT: fmul float %a, %b +; USE_FTZ_0: fadd float %a, %b +; USE_FTZ_1-NOT: fmul float %a, %b %ret2 = fadd float %a, %b br label %exit @@ -35,14 +43,12 @@ exit: declare i32 @llvm.nvvm.reflect.p0i8(i8*) -; USE_MUL_0: define i32 @intrinsic -; USE_MUL_1: define i32 @intrinsic +; CHECK-LABEL: define i32 @intrinsic define i32 @intrinsic() { -; USE_MUL_0-NOT: call i32 @llvm.nvvm.reflect -; USE_MUL_0: ret i32 0 -; USE_MUL_1-NOT: call i32 @llvm.nvvm.reflect -; USE_MUL_1: ret i32 1 - %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0)) +; CHECK-NOT: call i32 @llvm.nvvm.reflect +; USE_FTZ_0: ret i32 0 +; USE_FTZ_1: ret i32 1 + %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(4)* @str, i32 0, i32 0)) %reflect = tail call i32 @llvm.nvvm.reflect.p0i8(i8* %ptr) ret i32 %reflect } @@ -50,26 +56,24 @@ define i32 @intrinsic() { ; CUDA-7.0 passes __nvvm_reflect argument slightly differently. 
; Verify that it works, too -@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00" +@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00" +; CHECK-LABEL: @bar define float @bar(float %a, float %b) { -; USE_MUL_0: define float @bar -; USE_MUL_0-NOT: call i32 @__nvvm_reflect -; USE_MUL_1: define float @bar -; USE_MUL_1-NOT: call i32 @__nvvm_reflect - %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) +; CHECK-NOT: call i32 @__nvvm_reflect + %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) %cmp = icmp ne i32 %reflect, 0 br i1 %cmp, label %use_mul, label %use_add use_mul: -; USE_MUL_1: fmul float %a, %b -; USE_MUL_0-NOT: fadd float %a, %b +; USE_FTZ_1: fmul float %a, %b +; USE_FTZ_0-NOT: fadd float %a, %b %ret1 = fmul float %a, %b br label %exit use_add: -; USE_MUL_0: fadd float %a, %b -; USE_MUL_1-NOT: fmul float %a, %b +; USE_FTZ_0: fadd float %a, %b +; USE_FTZ_1-NOT: fmul float %a, %b %ret2 = fadd float %a, %b br label %exit @@ -77,3 +81,6 @@ exit: %ret = phi float [%ret1, %use_mul], [%ret2, %use_add] ret float %ret } + +!llvm.module.flags = !{!0} +; A module flag is added to the end of this file by the RUN lines at the top. diff --git a/test/CodeGen/NVPTX/param-load-store.ll b/test/CodeGen/NVPTX/param-load-store.ll new file mode 100644 index 000000000000..8a67567acc96 --- /dev/null +++ b/test/CodeGen/NVPTX/param-load-store.ll @@ -0,0 +1,939 @@ +; Verifies correctness of load/store of parameters and return values. +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s + +%s_i1 = type { i1 } +%s_i8 = type { i8 } +%s_i16 = type { i16 } +%s_f16 = type { half } +%s_i32 = type { i32 } +%s_f32 = type { float } +%s_i64 = type { i64 } +%s_f64 = type { double } + +; More complicated types. i64 is used to increase natural alignment +; requirement for the type. +%s_i32x4 = type { i32, i32, i32, i32, i64} +%s_i32f32 = type { i32, float, i32, float, i64} +%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64} +%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}> +%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]} +; All scalar parameters must be at least 32 bits in size. +; i1 is loaded/stored as i8. + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1( +; CHECK-NEXT: .param .b32 test_i1_param_0 +; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0]; +; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]] +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK-NEXT: test_i1, +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define i1 @test_i1(i1 %a) { + %r = tail call i1 @test_i1(i1 %a); + ret i1 %r; +} + +; Signed i1 is a somewhat special case. We only care about one bit and +; then us neg.s32 to convert it to 32-bit -1 if it's set. 
+; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1s( +; CHECK-NEXT: .param .b32 test_i1s_param_0 +; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0]; +; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1; +; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1; +; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i1 @test_i1s(i1 signext %a) { + %r = tail call signext i1 @test_i1s(i1 signext %a); + ret i1 %r; +} + +; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment. +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v3i1( +; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4] +; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; +; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i1, +; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; +; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]} +; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i1> @test_v3i1(<3 x i1> %a) { + %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a); + ret <3 x i1> %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v4i1( +; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4] +; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK: test_v4i1, +; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}; +; CHECK-NEXT: ret; +define <4 x i1> @test_v4i1(<4 x i1> %a) { + %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a); + ret <4 x i1> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v5i1( +; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8] +; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; +; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0] +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v5i1, +; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; +; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; +; CHECK-NEXT: ret; +define <5 x i1> @test_v5i1(<5 x i1> %a) { + %r = tail 
call <5 x i1> @test_v5i1(<5 x i1> %a); + ret <5 x i1> %r; +} + +; Unsigned i8 is loaded directly into 32-bit register. +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i8( +; CHECK-NEXT: .param .b32 test_i8_param_0 +; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0]; +; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK: test_i8, +; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i8 @test_i8(i8 %a) { + %r = tail call i8 @test_i8(i8 %a); + ret i8 %r; +} + +; signed i8 is loaded into 16-bit register which is then sign-extended to i32. +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i8s( +; CHECK-NEXT: .param .b32 test_i8s_param_0 +; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0]; +; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK: test_i8s, +; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; +; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ? +; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]]; +; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i8 @test_i8s(i8 signext %a) { + %r = tail call signext i8 @test_i8s(i8 signext %a); + ret i8 %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v3i8( +; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4] +; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2]; +; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0]; +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b8 [param0+2], [[E2]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i8, +; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; +; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i8> @test_v3i8(<3 x i8> %a) { + %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a); + ret <3 x i8> %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v4i8( +; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4] +; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v4i8, +; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-NEXT: ret; +define <4 x i8> @test_v4i8(<4 x i8> %a) { + %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a); + ret <4 x i8> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v5i8( +; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8] +; CHECK-DAG: ld.param.u8 
[[E4:%rs[0-9]+]], [test_v5i8_param_0+4]; +; CHECK-DAG ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0] +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v5i8, +; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; +; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; +; CHECK-NEXT: ret; +define <5 x i8> @test_v5i8(<5 x i8> %a) { + %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a); + ret <5 x i8> %r; +} + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i16( +; CHECK-NEXT: .param .b32 test_i16_param_0 +; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0]; +; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[E32]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_i16, +; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i16 @test_i16(i16 %a) { + %r = tail call i16 @test_i16(i16 %a); + ret i16 %r; +} + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i16s( +; CHECK-NEXT: .param .b32 test_i16s_param_0 +; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0]; +; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[E32]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_i16s, +; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0]; +; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i16 @test_i16s(i16 signext %a) { + %r = tail call signext i16 @test_i16s(i16 signext %a); + ret i16 %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v3i16( +; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8] +; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4]; +; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0]; +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b16 [param0+4], [[E2]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i16, +; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4]; +; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i16> @test_v3i16(<3 x i16> %a) { + %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a); + ret <3 x i16> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v4i16( +; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8] +; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0] +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 8 .b8 
retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v4i16, +; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-NEXT: ret; +define <4 x i16> @test_v4i16(<4 x i16> %a) { + %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a); + ret <4 x i16> %r; +} + +; CHECK: .func (.param .align 16 .b8 func_retval0[16]) +; CHECK-LABEL: test_v5i16( +; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16] +; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8]; +; CHECK-DAG ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0] +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v5i16, +; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8]; +; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]]; +; CHECK-NEXT: ret; +define <5 x i16> @test_v5i16(<5 x i16> %a) { + %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a); + ret <5 x i16> %r; +} + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_f16( +; CHECK-NEXT: .param .b32 test_f16_param_0 +; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_f16_param_0]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b16 [param0+0], [[E]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_f16, +; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; +; CHECK: st.param.b16 [func_retval0+0], [[R]] +; CHECK-NEXT: ret; +define half @test_f16(half %a) { + %r = tail call half @test_f16(half %a); + ret half %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v2f16( +; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4] +; CHECK: ld.param.b32 [[E:%hh[0-9]+]], [test_v2f16_param_0]; +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v2f16, +; CHECK: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; CHECK: st.param.b32 [func_retval0+0], [[R]] +; CHECK-NEXT: ret; +define <2 x half> @test_v2f16(<2 x half> %a) { + %r = tail call <2 x half> @test_v2f16(<2 x half> %a); + ret <2 x half> %r; +} + +; CHECK:.func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v3f16( +; CHECK: .param .align 8 .b8 test_v3f16_param_0[8] +; CHECK-DAG: ld.param.b32 [[HH01:%hh[0-9]+]], [test_v3f16_param_0]; +; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]]; +; CHECK-DAG: ld.param.b16 [[E2:%h[0-9]+]], [test_v3f16_param_0+4]; +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b16 [param0+4], [[E2]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK: test_v3f16, +; CHECK-DAG: ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b16 [[R2:%h[0-9]+]], [retval0+4]; +; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]]; +; CHECK: ret; +define <3 x half> @test_v3f16(<3 x half> %a) { 
+ %r = tail call <3 x half> @test_v3f16(<3 x half> %a);
+ ret <3 x half> %r;
+}
+
+; CHECK:.func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4f16(
+; CHECK: .param .align 8 .b8 test_v4f16_param_0[8]
+; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
+; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
+; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]};
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK: test_v4f16,
+; CHECK: ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]};
+; CHECK: ret;
+define <4 x half> @test_v4f16(<4 x half> %a) {
+ %r = tail call <4 x half> @test_v4f16(<4 x half> %a);
+ ret <4 x half> %r;
+}
+
+; CHECK:.func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5f16(
+; CHECK: .param .align 16 .b8 test_v5f16_param_0[16]
+; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0];
+; CHECK-DAG: ld.param.b16 [[E4:%h[0-9]+]], [test_v5f16_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b16 [param0+0],
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK: test_v5f16,
+; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[R4:%h[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
+; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]];
+; CHECK: ret;
+define <5 x half> @test_v5f16(<5 x half> %a) {
+ %r = tail call <5 x half> @test_v5f16(<5 x half> %a);
+ ret <5 x half> %r;
+}
+
+; CHECK:.func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v8f16(
+; CHECK: .param .align 16 .b8 test_v8f16_param_0[16]
+; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
+; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
+; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
+; CHECK-DAG: mov.b32 [[HH45:%hh[0-9]+]], [[R45]];
+; CHECK-DAG: mov.b32 [[HH67:%hh[0-9]+]], [[R67]];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]};
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK: test_v8f16,
+; CHECK: ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]};
+; CHECK: ret;
+define <8 x half> @test_v8f16(<8 x half> %a) {
+ %r = tail call <8 x half> @test_v8f16(<8 x half> %a);
+ ret <8 x half> %r;
+}
+
+; CHECK:.func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v9f16(
+; CHECK: .param .align 32 .b8 test_v9f16_param_0[32]
+; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0];
+; CHECK-DAG: ld.param.v4.b16 {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8];
+; CHECK-DAG: ld.param.b16 [[E8:%h[0-9]+]], [test_v9f16_param_0+16];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK-DAG: st.param.v4.b16 [param0+0],
+; CHECK-DAG: st.param.v4.b16 [param0+8],
+; CHECK-DAG: st.param.b16 [param0+16], [[E8]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK: test_v9f16,
+; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8];
+; CHECK-DAG: ld.param.b16 [[R8:%h[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
+; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]};
+; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]];
+; CHECK: ret;
+define <9 x half> @test_v9f16(<9 x half> %a) {
+ %r = tail call <9 x half> @test_v9f16(<9 x half> %a);
+ ret <9 x half> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i32(
+; CHECK-NEXT: .param .b32 test_i32_param_0
+; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i32,
+; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i32 @test_i32(i32 %a) {
+ %r = tail call i32 @test_i32(i32 %a);
+ ret i32 %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v3i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b32 [param0+8], [[E2]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i32,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i32> @test_v3i32(<3 x i32> %a) {
+ %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a);
+ ret <3 x i32> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v4i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
+; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i32,
+; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i32> @test_v4i32(<4 x i32> %a) {
+ %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a);
+ ret <4 x i32> %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v5i32(
+; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
+; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i32,
+; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i32> @test_v5i32(<5 x i32> %a) {
+ %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a);
+ ret <5 x i32> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_f32(
+; CHECK-NEXT: .param .b32 test_f32_param_0
+; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.f32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_f32,
+; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_f32(float %a) {
+ %r = tail call float @test_f32(float %a);
+ ret float %r;
+}
+
+; CHECK: .func (.param .b64 func_retval0)
+; CHECK-LABEL: test_i64(
+; CHECK-NEXT: .param .b64 test_i64_param_0
+; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0];
+; CHECK: .param .b64 param0;
+; CHECK: st.param.b64 [param0+0], [[E]];
+; CHECK: .param .b64 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i64,
+; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i64 @test_i64(i64 %a) {
+ %r = tail call i64 @test_i64(i64 %a);
+ ret i64 %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v3i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
+; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b64 [param0+16], [[E2]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i64> @test_v3i64(<3 x i64> %a) {
+ %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a);
+ ret <3 x i64> %r;
+}
+
+; For i64, vector loads are limited by PTX to 2 elements.
+; CHECK: .func (.param .align 32 .b8 func_retval0[32]) +; CHECK-LABEL: test_v4i64( +; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32] +; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16]; +; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0]; +; CHECK: .param .align 32 .b8 param0[32]; +; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; +; CHECK: .param .align 32 .b8 retval0[32]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v4i64, +; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16]; +; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]}; +; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-NEXT: ret; +define <4 x i64> @test_v4i64(<4 x i64> %a) { + %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a); + ret <4 x i64> %r; +} + +; Aggregates, on the other hand, do not get extended. + +; CHECK: .func (.param .align 1 .b8 func_retval0[1]) +; CHECK-LABEL: test_s_i1( +; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1] +; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0]; +; CHECK: .param .align 1 .b8 param0[1]; +; CHECK: st.param.b8 [param0+0], [[A]] +; CHECK: .param .align 1 .b8 retval0[1]; +; CHECK: call.uni +; CHECK-NEXT: test_s_i1, +; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; +; CHECK: st.param.b8 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i1 @test_s_i1(%s_i1 %a) { + %r = tail call %s_i1 @test_s_i1(%s_i1 %a); + ret %s_i1 %r; +} + +; CHECK: .func (.param .align 1 .b8 func_retval0[1]) +; CHECK-LABEL: test_s_i8( +; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1] +; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0]; +; CHECK: .param .align 1 .b8 param0[1]; +; CHECK: st.param.b8 [param0+0], [[A]] +; CHECK: .param .align 1 .b8 retval0[1]; +; CHECK: call.uni +; CHECK-NEXT: test_s_i8, +; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; +; CHECK: st.param.b8 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i8 @test_s_i8(%s_i8 %a) { + %r = tail call %s_i8 @test_s_i8(%s_i8 %a); + ret %s_i8 %r; +} + +; CHECK: .func (.param .align 2 .b8 func_retval0[2]) +; CHECK-LABEL: test_s_i16( +; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2] +; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0]; +; CHECK: .param .align 2 .b8 param0[2]; +; CHECK: st.param.b16 [param0+0], [[A]] +; CHECK: .param .align 2 .b8 retval0[2]; +; CHECK: call.uni +; CHECK-NEXT: test_s_i16, +; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i16 @test_s_i16(%s_i16 %a) { + %r = tail call %s_i16 @test_s_i16(%s_i16 %a); + ret %s_i16 %r; +} + +; CHECK: .func (.param .align 2 .b8 func_retval0[2]) +; CHECK-LABEL: test_s_f16( +; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2] +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0]; +; CHECK: .param .align 2 .b8 param0[2]; +; CHECK: st.param.b16 [param0+0], [[A]] +; CHECK: .param .align 2 .b8 retval0[2]; +; CHECK: call.uni +; CHECK-NEXT: test_s_f16, +; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_f16 @test_s_f16(%s_f16 %a) { + %r = tail call %s_f16 @test_s_f16(%s_f16 %a); + ret %s_f16 %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_s_i32( +; CHECK-NEXT: .param 
.align 4 .b8 test_s_i32_param_0[4] +; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0]; +; CHECK: .param .align 4 .b8 param0[4] +; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i32, +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i32 @test_s_i32(%s_i32 %a) { + %r = tail call %s_i32 @test_s_i32(%s_i32 %a); + ret %s_i32 %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_s_f32( +; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4] +; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0]; +; CHECK: .param .align 4 .b8 param0[4] +; CHECK: st.param.f32 [param0+0], [[E]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_f32, +; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; +; CHECK: st.param.f32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_f32 @test_s_f32(%s_f32 %a) { + %r = tail call %s_f32 @test_s_f32(%s_f32 %a); + ret %s_f32 %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_s_i64( +; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8] +; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK: st.param.b64 [param0+0], [[E]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i64, +; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; +; CHECK: st.param.b64 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i64 @test_s_i64(%s_i64 %a) { + %r = tail call %s_i64 @test_s_i64(%s_i64 %a); + ret %s_i64 %r; +} + +; Fields that have different types, but identical sizes are not vectorized. +; CHECK: .func (.param .align 8 .b8 func_retval0[24]) +; CHECK-LABEL: test_s_i32f32( +; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24] +; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16]; +; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12]; +; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8]; +; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4]; +; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0]; +; CHECK: .param .align 8 .b8 param0[24]; +; CHECK-DAG: st.param.b32 [param0+0], [[E0]]; +; CHECK-DAG: st.param.f32 [param0+4], [[E1]]; +; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; +; CHECK-DAG: st.param.f32 [param0+12], [[E3]]; +; CHECK-DAG: st.param.b64 [param0+16], [[E4]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i32f32, +; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4]; +; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; +; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12]; +; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; +; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]]; +; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]]; +; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; +; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]]; +; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; +; CHECK: ret; +define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) { + %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a); + ret %s_i32f32 %r; +} + +; We do vectorize consecutive fields with matching types. 
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32x4(
+; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32x4_param_0+16];
+; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32x4,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
+; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+
+define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
+ %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a);
+ ret %s_i32x4 %r;
+}
+
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i1i32x4(
+; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32]
+; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
+; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
+; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
+; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
+; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[32];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b8 [param0+8], [[E2]];
+; CHECK: st.param.b32 [param0+12], [[E3]];
+; CHECK: st.param.b32 [param0+16], [[E4]];
+; CHECK: st.param.b64 [param0+24], [[E5]];
+; CHECK: .param .align 8 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK: test_s_i1i32x4,
+; CHECK: (
+; CHECK: param0
+; CHECK: );
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8];
+; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12];
+; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK: st.param.b8 [func_retval0+8], [[RE2]];
+; CHECK: st.param.b32 [func_retval0+12], [[RE3]];
+; CHECK: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK: st.param.b64 [func_retval0+24], [[RE5]];
+; CHECK: ret;
+
+define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
+ %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a);
+ ret %s_i8i32x4 %r;
+}
+
+; -- All loads/stores from parameters aligned by one must be done one
+; -- byte at a time.
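The %s_i8i32x4 and %s_i8i32x4p struct types used above and below are defined near the top of the test file and are not part of this hunk. Layouts along the following lines are an assumption, shown only for illustration, but they do reproduce the 32-byte naturally-aligned and 25-byte packed parameter sizes and offsets that the checks rely on:

; illustrative sketch only -- the real type definitions live earlier in the test file
%s_i8i32x4  = type  { i32, i32, i8, i32, i32, i64 }   ; naturally aligned: offsets 0,4,8,12,16,24 -> 32 bytes, align 8
%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64 }>  ; packed: offsets 0,4,8,9,13,17 -> 25 bytes, align 1

With the packed variant nothing is known to be more than 1-byte aligned, which is why every one of the 25 parameter bytes is loaded with a separate ld.param.u8 in the next test.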
+; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25]) +; CHECK-LABEL: test_s_i1i32x4p( +; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25] +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; +; --- TODO +; --- Unaligned parameter store/ return value load is broken in both nvcc +; --- and llvm and needs to be fixed. +; CHECK: .param .align 1 .b8 param0[25]; +; CHECK-DAG: st.param.b32 [param0+0], +; CHECK-DAG: st.param.b32 [param0+4], +; CHECK-DAG: st.param.b8 [param0+8], +; CHECK-DAG: st.param.b32 [param0+9], +; CHECK-DAG: st.param.b32 [param0+13], +; CHECK-DAG: st.param.b64 [param0+17], +; CHECK: .param .align 1 .b8 retval0[25]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i1i32x4p, +; CHECK-DAG: ld.param.b32 %r41, [retval0+0]; +; CHECK-DAG: ld.param.b32 %r42, [retval0+4]; +; CHECK-DAG: ld.param.b8 %rs2, [retval0+8]; +; CHECK-DAG: ld.param.b32 %r43, [retval0+9]; +; CHECK-DAG: ld.param.b32 %r44, [retval0+13]; +; CHECK-DAG: ld.param.b64 %rd23, [retval0+17]; +; CHECK-DAG: st.param.b32 [func_retval0+0], +; CHECK-DAG: st.param.b32 [func_retval0+4], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; CHECK-DAG: st.param.b32 [func_retval0+9], +; CHECK-DAG: st.param.b32 [func_retval0+13], +; CHECK-DAG: st.param.b64 [func_retval0+17], + +define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { + %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a); + ret %s_i8i32x4p %r; +} + +; Check that we can vectorize loads that span multiple aggregate fields. 
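The %s_crossfield definition is likewise outside this hunk. A layout such as the one below is only an assumption, checked against the offsets in the CHECK lines, but it yields the 80-byte, 16-byte-aligned parameter these checks expect and makes the first ld.param.v2.u32 span the leading i32 plus the first element of the array that follows it, which is the cross-field vectorization being tested:

; illustrative sketch only -- the real definition appears earlier in the test file
%s_crossfield = type { i32, [2 x i32], <4 x i32>, [2 x <4 x i32>], i32 }
; field offsets: 0, 4, 16, 32, 64 -- 16 i32-sized elements, size 80, align 16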
+; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80]) +; CHECK-LABEL: test_s_crossfield( +; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80] +; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; +; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; +; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; +; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; +; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; +; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; +; CHECK: .param .align 16 .b8 param0[80]; +; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b32 [param0+8], [[E2]]; +; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; +; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; +; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; +; CHECK: st.param.b32 [param0+64], [[E15]]; +; CHECK: .param .align 16 .b8 retval0[80]; +; CHECK: call.uni (retval0), +; CHECK: test_s_crossfield, +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; +; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; +; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32]; +; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48]; +; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK: st.param.b32 [func_retval0+8], [[RE2]]; +; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]}; +; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]}; +; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]}; +; CHECK: st.param.b32 [func_retval0+64], [[RE15]]; +; CHECK: ret; + +define %s_crossfield @test_s_crossfield(%s_crossfield %a) { + %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a); + ret %s_crossfield %r; +} diff --git a/test/CodeGen/NVPTX/rsqrt.ll b/test/CodeGen/NVPTX/rsqrt.ll deleted file mode 100644 index 3a52a493abdd..000000000000 --- a/test/CodeGen/NVPTX/rsqrt.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=1 -nvptx-prec-sqrtf32=0 | FileCheck %s - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" - -declare float @llvm.nvvm.sqrt.f(float) - -define float @foo(float %a) { -; CHECK: rsqrt.approx.f32 - %val = tail call float @llvm.nvvm.sqrt.f(float %a) - %ret = fdiv float 1.0, %val - ret float %ret -} - diff --git a/test/CodeGen/NVPTX/sqrt-approx.ll b/test/CodeGen/NVPTX/sqrt-approx.ll new file mode 100644 index 000000000000..1e28db44b804 --- /dev/null +++ b/test/CodeGen/NVPTX/sqrt-approx.ll @@ -0,0 +1,150 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \ +; RUN: | FileCheck %s + +target datalayout = 
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +declare float @llvm.sqrt.f32(float) +declare double @llvm.sqrt.f64(double) + +; -- reciprocal sqrt -- + +; CHECK-LABEL test_rsqrt32 +define float @test_rsqrt32(float %a) #0 { +; CHECK: rsqrt.approx.f32 + %val = tail call float @llvm.sqrt.f32(float %a) + %ret = fdiv float 1.0, %val + ret float %ret +} + +; CHECK-LABEL test_rsqrt_ftz +define float @test_rsqrt_ftz(float %a) #0 #1 { +; CHECK: rsqrt.approx.ftz.f32 + %val = tail call float @llvm.sqrt.f32(float %a) + %ret = fdiv float 1.0, %val + ret float %ret +} + +; CHECK-LABEL test_rsqrt64 +define double @test_rsqrt64(double %a) #0 { +; CHECK: rsqrt.approx.f64 + %val = tail call double @llvm.sqrt.f64(double %a) + %ret = fdiv double 1.0, %val + ret double %ret +} + +; CHECK-LABEL test_rsqrt64_ftz +define double @test_rsqrt64_ftz(double %a) #0 #1 { +; There's no rsqrt.approx.ftz.f64 instruction; we just use the non-ftz version. +; CHECK: rsqrt.approx.f64 + %val = tail call double @llvm.sqrt.f64(double %a) + %ret = fdiv double 1.0, %val + ret double %ret +} + +; -- sqrt -- + +; CHECK-LABEL test_sqrt32 +define float @test_sqrt32(float %a) #0 { +; CHECK: sqrt.approx.f32 + %ret = tail call float @llvm.sqrt.f32(float %a) + ret float %ret +} + +; CHECK-LABEL test_sqrt_ftz +define float @test_sqrt_ftz(float %a) #0 #1 { +; CHECK: sqrt.approx.ftz.f32 + %ret = tail call float @llvm.sqrt.f32(float %a) + ret float %ret +} + +; CHECK-LABEL test_sqrt64 +define double @test_sqrt64(double %a) #0 { +; There's no sqrt.approx.f64 instruction; we emit +; reciprocal(rsqrt.approx.f64(x)). There's no non-ftz approximate reciprocal, +; so we just use the ftz version. +; CHECK: rsqrt.approx.f64 +; CHECK: rcp.approx.ftz.f64 + %ret = tail call double @llvm.sqrt.f64(double %a) + ret double %ret +} + +; CHECK-LABEL test_sqrt64_ftz +define double @test_sqrt64_ftz(double %a) #0 #1 { +; There's no sqrt.approx.ftz.f64 instruction; we just use the non-ftz version. +; CHECK: rsqrt.approx.f64 +; CHECK: rcp.approx.ftz.f64 + %ret = tail call double @llvm.sqrt.f64(double %a) + ret double %ret +} + +; -- refined sqrt and rsqrt -- +; +; The sqrt and rsqrt refinement algorithms both emit an rsqrt.approx, followed +; by some math. 
+ +; CHECK-LABEL: test_rsqrt32_refined +define float @test_rsqrt32_refined(float %a) #0 #2 { +; CHECK: rsqrt.approx.f32 + %val = tail call float @llvm.sqrt.f32(float %a) + %ret = fdiv float 1.0, %val + ret float %ret +} + +; CHECK-LABEL: test_sqrt32_refined +define float @test_sqrt32_refined(float %a) #0 #2 { +; CHECK: rsqrt.approx.f32 + %ret = tail call float @llvm.sqrt.f32(float %a) + ret float %ret +} + +; CHECK-LABEL: test_rsqrt64_refined +define double @test_rsqrt64_refined(double %a) #0 #2 { +; CHECK: rsqrt.approx.f64 + %val = tail call double @llvm.sqrt.f64(double %a) + %ret = fdiv double 1.0, %val + ret double %ret +} + +; CHECK-LABEL: test_sqrt64_refined +define double @test_sqrt64_refined(double %a) #0 #2 { +; CHECK: rsqrt.approx.f64 + %ret = tail call double @llvm.sqrt.f64(double %a) + ret double %ret +} + +; -- refined sqrt and rsqrt with ftz enabled -- + +; CHECK-LABEL: test_rsqrt32_refined_ftz +define float @test_rsqrt32_refined_ftz(float %a) #0 #1 #2 { +; CHECK: rsqrt.approx.ftz.f32 + %val = tail call float @llvm.sqrt.f32(float %a) + %ret = fdiv float 1.0, %val + ret float %ret +} + +; CHECK-LABEL: test_sqrt32_refined_ftz +define float @test_sqrt32_refined_ftz(float %a) #0 #1 #2 { +; CHECK: rsqrt.approx.ftz.f32 + %ret = tail call float @llvm.sqrt.f32(float %a) + ret float %ret +} + +; CHECK-LABEL: test_rsqrt64_refined_ftz +define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 { +; There's no rsqrt.approx.ftz.f64, so we just use the non-ftz version. +; CHECK: rsqrt.approx.f64 + %val = tail call double @llvm.sqrt.f64(double %a) + %ret = fdiv double 1.0, %val + ret double %ret +} + +; CHECK-LABEL: test_sqrt64_refined_ftz +define double @test_sqrt64_refined_ftz(double %a) #0 #1 #2 { +; CHECK: rsqrt.approx.f64 + %ret = tail call double @llvm.sqrt.f64(double %a) + ret double %ret +} + +attributes #0 = { "unsafe-fp-math" = "true" } +attributes #1 = { "nvptx-f32ftz" = "true" } +attributes #2 = { "reciprocal-estimates" = "rsqrtf:1,rsqrtd:1,sqrtf:1,sqrtd:1" } diff --git a/test/CodeGen/NVPTX/vec-param-load.ll b/test/CodeGen/NVPTX/vec-param-load.ll index 4193ac4085cc..bf26e5ff1bdb 100644 --- a/test/CodeGen/NVPTX/vec-param-load.ll +++ b/test/CodeGen/NVPTX/vec-param-load.ll @@ -2,12 +2,81 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" - -define <16 x float> @foo(<16 x float> %a) { -; Make sure we index into vectors properly -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+48]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+32]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+16]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0]; +define <16 x float> @test_v16f32(<16 x float> %a) { +; CHECK-LABEL: test_v16f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48]; +; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32]; +; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16]; +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+32], {[[V_8_11]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+48], 
{[[V_12_15]]} +; CHECK: ret; ret <16 x float> %a } + +define <8 x float> @test_v8f32(<8 x float> %a) { +; CHECK-LABEL: test_v8f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16]; +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} +; CHECK: ret; + ret <8 x float> %a +} + +define <4 x float> @test_v4f32(<4 x float> %a) { +; CHECK-LABEL: test_v4f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK: ret; + ret <4 x float> %a +} + +define <2 x float> @test_v2f32(<2 x float> %a) { +; CHECK-LABEL: test_v2f32( +; CHECK-DAG: ld.param.v2.f32 {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0]; +; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK: ret; + ret <2 x float> %a +} + +; Oddly shaped vectors should not load any extra elements. +define <3 x float> @test_v3f32(<3 x float> %a) { +; CHECK-LABEL: test_v3f32( +; CHECK-DAG: ld.param.f32 [[V_2:%f[0-9]+]], [test_v3f32_param_0+8]; +; CHECK-DAG: ld.param.v2.f32 {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0]; +; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_1]]} +; CHECK-DAG: st.param.f32 [func_retval0+8], [[V_2]] +; CHECK: ret; + ret <3 x float> %a +} + +define <8 x i64> @test_v8i64(<8 x i64> %a) { +; CHECK-LABEL: test_v8i64( +; CHECK-DAG: ld.param.v2.u64 {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48]; +; CHECK-DAG: ld.param.v2.u64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32]; +; CHECK-DAG: ld.param.v2.u64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16]; +; CHECK-DAG: ld.param.v2.u64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0]; +; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[V_0_1]]} +; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_2_3]]} +; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_4_5]]} +; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_6_7]]} +; CHECK: ret; + ret <8 x i64> %a +} + +define <16 x i16> @test_v16i16(<16 x i16> %a) { +; CHECK-LABEL: test_v16i16( +; CHECK-DAG: ld.param.v4.u16 {[[V_12_15:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+24]; +; CHECK-DAG: ld.param.v4.u16 {[[V_8_11:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16]; +; CHECK-DAG: ld.param.v4.u16 {[[V_4_7:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+8]; +; CHECK-DAG: ld.param.v4.u16 {[[V_0_3:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0]; +; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[V_4_7]]} +; CHECK-DAG: st.param.v4.b16 [func_retval0+16], {[[V_8_11]]} +; CHECK-DAG: st.param.v4.b16 [func_retval0+24], {[[V_12_15]]} +; CHECK: ret; + ret <16 x i16> %a +} diff --git a/test/CodeGen/NVPTX/vec8.ll b/test/CodeGen/NVPTX/vec8.ll index 03f5cfc6cb01..a86ba1e29d5c 100644 --- a/test/CodeGen/NVPTX/vec8.ll +++ b/test/CodeGen/NVPTX/vec8.ll @@ -4,10 +4,15 @@ target triple = "nvptx-unknown-cuda" ; CHECK: .visible .func foo define void @foo(<8 x i8> %a, i8* %b) { - %t0 = extractelement <8 x i8> %a, i32 0 -; CHECK-DAG: ld.param.v4.u8 -; CHECK-DAG: ld.param.u32 - store i8 %t0, i8* %b +; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0] +; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4] +; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], 
[foo_param_1]
+; CHECK: add.s16 [[T:%rs[0-9]+]], [[E1]], [[E6]];
+; CHECK: st.u8 [%[[B]]], [[T]];
+ %t0 = extractelement <8 x i8> %a, i32 1
+ %t1 = extractelement <8 x i8> %a, i32 6
+ %t = add i8 %t0, %t1
+ store i8 %t, i8* %b
 ret void
 }
diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll
index 968d1d4a5f51..bf7b931a5758 100644
--- a/test/CodeGen/NVPTX/vector-call.ll
+++ b/test/CodeGen/NVPTX/vector-call.ll
@@ -4,9 +4,27 @@ target triple = "nvptx-unknown-cuda"
 declare void @bar(<4 x i32>)
-; CHECK-LABEL: @foo
+; CHECK-LABEL: .func foo(
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: call.uni
+; CHECK: ret;
 define void @foo(<4 x i32> %a) {
-; CHECK: st.param.v4.b32
 tail call void @bar(<4 x i32> %a)
 ret void
 }
+
+; CHECK-LABEL: .func foo3(
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK: call.uni
+; CHECK: ret;
+declare void @bar3(<3 x i32>)
+define void @foo3(<3 x i32> %a) {
+ tail call void @bar3(<3 x i32> %a)
+ ret void
+}