Diffstat (limited to 'test/CodeGen/NVPTX')
44 files changed, 328 insertions, 148 deletions
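Note: most of the churn below is mechanical. LLVM changed `load` and `getelementptr` to spell out the pointee type as an explicit first operand instead of deriving it from the pointer operand's type. A minimal sketch of the before/after forms (the function and value names here are illustrative, not taken from any test in this diff):

    define float @example(float* %p) {
      ; old form: %q = getelementptr inbounds float* %p, i64 1
      %q = getelementptr inbounds float, float* %p, i64 1
      ; old form: %v = load float* %q, align 4
      %v = load float, float* %q, align 4
      ret float %v
    }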
diff --git a/test/CodeGen/NVPTX/access-non-generic.ll b/test/CodeGen/NVPTX/access-non-generic.ll
index c225abf0fd85..e709302918f5 100644
--- a/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/test/CodeGen/NVPTX/access-non-generic.ll
@@ -18,7 +18,7 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
 ; IR-NOT: addrspacecast
 ; PTX-LABEL: ld_st_shared_f32(
 ; load cast
-  %1 = load float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
+  %1 = load float, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
 ; store cast
   store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
@@ -29,7 +29,7 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
 ; cast; load
   %2 = addrspacecast float addrspace(3)* @scalar to float*
-  %3 = load float* %2, align 4
+  %3 = load float, float* %2, align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
 ; cast; store
   store float %v, float* %2, align 4
@@ -38,17 +38,17 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
 ; PTX: bar.sync 0;
 ; load gep cast
-  %4 = load float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+  %4 = load float, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
 ; store gep cast
-  store float %v, float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+  store float %v, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
 ; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
   call void @llvm.cuda.syncthreads()
 ; PTX: bar.sync 0;
 ; gep cast; load
-  %5 = getelementptr inbounds [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
-  %6 = load float* %5, align 4
+  %5 = getelementptr inbounds [10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
+  %6 = load float, float* %5, align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
 ; gep cast; store
   store float %v, float* %5, align 4
@@ -58,8 +58,8 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
 ; cast; gep; load
   %7 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float]*
-  %8 = getelementptr inbounds [10 x float]* %7, i32 0, i32 %i
-  %9 = load float* %8, align 4
+  %8 = getelementptr inbounds [10 x float], [10 x float]* %7, i32 0, i32 %i
+  %9 = load float, float* %8, align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
 ; cast; gep; store
   store float %v, float* %8, align 4
@@ -78,10 +78,10 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
 ; addrspacecast with a bitcast.
 define i32 @ld_int_from_float() {
 ; IR-LABEL: @ld_int_from_float
-; IR: load i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*)
+; IR: load i32, i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*)
 ; PTX-LABEL: ld_int_from_float(
 ; PTX: ld.shared.u{{(32|64)}}
-  %1 = load i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
+  %1 = load i32, i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
   ret i32 %1
 }
diff --git a/test/CodeGen/NVPTX/addrspacecast-gvar.ll b/test/CodeGen/NVPTX/addrspacecast-gvar.ll
index 6afbdb8a429f..1e2fde4b858a 100644
--- a/test/CodeGen/NVPTX/addrspacecast-gvar.ll
+++ b/test/CodeGen/NVPTX/addrspacecast-gvar.ll
@@ -3,7 +3,11 @@
 ; CHECK: .visible .global .align 4 .u32 g = 42;
 ; CHECK: .visible .global .align 4 .u32 g2 = generic(g);
 ; CHECK: .visible .global .align 4 .u32 g3 = g;
+; CHECK: .visible .global .align 8 .u32 g4[2] = {0, generic(g)};
+; CHECK: .visible .global .align 8 .u32 g5[2] = {0, generic(g)+8};
 
 @g = addrspace(1) global i32 42
 @g2 = addrspace(1) global i32* addrspacecast (i32 addrspace(1)* @g to i32*)
 @g3 = addrspace(1) global i32 addrspace(1)* @g
+@g4 = constant {i32*, i32*} {i32* null, i32* addrspacecast (i32 addrspace(1)* @g to i32*)}
+@g5 = constant {i32*, i32*} {i32* null, i32* addrspacecast (i32 addrspace(1)* getelementptr (i32, i32 addrspace(1)* @g, i32 2) to i32*)}
diff --git a/test/CodeGen/NVPTX/addrspacecast.ll b/test/CodeGen/NVPTX/addrspacecast.ll
index 03b9a9844752..42e67ca8ce9b 100644
--- a/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/test/CodeGen/NVPTX/addrspacecast.ll
@@ -10,7 +10,7 @@ define i32 @conv1(i32 addrspace(1)* %ptr) {
 ; PTX64: cvta.global.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(1)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
@@ -22,7 +22,7 @@ define i32 @conv2(i32 addrspace(3)* %ptr) {
 ; PTX64: cvta.shared.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(3)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
@@ -34,7 +34,7 @@ define i32 @conv3(i32 addrspace(4)* %ptr) {
 ; PTX64: cvta.const.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(4)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
@@ -46,7 +46,7 @@ define i32 @conv4(i32 addrspace(5)* %ptr) {
 ; PTX64: cvta.local.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(5)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
@@ -58,7 +58,7 @@ define i32 @conv5(i32* %ptr) {
 ; PTX64: cvta.to.global.u64
 ; PTX64: ld.global.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(1)*
-  %val = load i32 addrspace(1)* %specptr
+  %val = load i32, i32 addrspace(1)* %specptr
   ret i32 %val
 }
@@ -70,7 +70,7 @@ define i32 @conv6(i32* %ptr) {
 ; PTX64: cvta.to.shared.u64
 ; PTX64: ld.shared.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(3)*
-  %val = load i32 addrspace(3)* %specptr
+  %val = load i32, i32 addrspace(3)* %specptr
   ret i32 %val
 }
@@ -82,7 +82,7 @@ define i32 @conv7(i32* %ptr) {
 ; PTX64: cvta.to.const.u64
 ; PTX64: ld.const.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(4)*
-  %val = load i32 addrspace(4)* %specptr
+  %val = load i32, i32 addrspace(4)* %specptr
   ret i32 %val
 }
@@ -94,6 +94,6 @@ define i32 @conv8(i32* %ptr) {
 ; PTX64: cvta.to.local.u64
 ; PTX64: ld.local.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(5)*
-  %val = load i32 addrspace(5)* %specptr
+  %val = load i32, i32 addrspace(5)* %specptr
   ret i32 %val
 }
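Note: addrspacecast-gvar.ll above now also covers aggregate initializers, where a global-space pointer cast to generic prints as generic(g) with an optional byte offset, and addrspacecast.ll pins the cvta/cvta.to lowering in both directions for each address space. A minimal sketch of the pattern one of those tests exercises (illustrative function name; the CHECK instructions are taken from the conv1 hunk above):

    ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
    define i32 @global_to_generic(i32 addrspace(1)* %ptr) {
    ; CHECK: cvta.global.u64
    ; CHECK: ld.u32
      %genptr = addrspacecast i32 addrspace(1)* %ptr to i32*
      %val = load i32, i32* %genptr
      ret i32 %val
    }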
diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll
index cacffceac517..76af386c6516 100644
--- a/test/CodeGen/NVPTX/bug21465.ll
+++ b/test/CodeGen/NVPTX/bug21465.ll
@@ -11,8 +11,8 @@ entry:
 ; CHECK-LABEL @_Z22TakesStruct1SPi
 ; CHECK: bitcast %struct.S* %input to i8*
 ; CHECK: call i8 addrspace(101)* @llvm.nvvm.ptr.gen.to.param.p101i8.p0i8
-  %b = getelementptr inbounds %struct.S* %input, i64 0, i32 1
-  %0 = load i32* %b, align 4
+  %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
+  %0 = load i32, i32* %b, align 4
   store i32 %0, i32* %output, align 4
   ret void
 }
diff --git a/test/CodeGen/NVPTX/bug22246.ll b/test/CodeGen/NVPTX/bug22246.ll
new file mode 100644
index 000000000000..70e7e12336e7
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug22246.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: _Z3foobbbPb
+define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, i8* nocapture %output) {
+entry:
+; CHECK: selp.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}}
+  %.sink.v = select i1 %p1, i1 %p2, i1 %p3
+  %frombool5 = zext i1 %.sink.v to i8
+  store i8 %frombool5, i8* %output, align 1
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/bug22322.ll b/test/CodeGen/NVPTX/bug22322.ll
new file mode 100644
index 000000000000..97863b9ea546
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug22322.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%class.float3 = type { float, float, float }
+
+; Function Attrs: nounwind
+; CHECK-LABEL: some_kernel
+define void @some_kernel(%class.float3* nocapture %dst) #0 {
+_ZL11compute_vecRK6float3jb.exit:
+  %ret_vec.sroa.8.i = alloca float, align 4
+  %0 = tail call i32 @llvm.ptx.read.ctaid.x()
+  %1 = tail call i32 @llvm.ptx.read.ntid.x()
+  %2 = mul nsw i32 %1, %0
+  %3 = tail call i32 @llvm.ptx.read.tid.x()
+  %4 = add nsw i32 %2, %3
+  %5 = zext i32 %4 to i64
+  %6 = bitcast float* %ret_vec.sroa.8.i to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %6)
+  %7 = and i32 %4, 15
+  %8 = icmp eq i32 %7, 0
+  %9 = select i1 %8, float 0.000000e+00, float -1.000000e+00
+  store float %9, float* %ret_vec.sroa.8.i, align 4
+; CHECK: setp.lt.f32 %p{{[0-9]+}}, %f{{[0-9]+}}, 0f00000000
+  %10 = fcmp olt float %9, 0.000000e+00
+  %ret_vec.sroa.8.i.val = load float, float* %ret_vec.sroa.8.i, align 4
+  %11 = select i1 %10, float 0.000000e+00, float %ret_vec.sroa.8.i.val
+  call void @llvm.lifetime.end(i64 4, i8* %6)
+  %12 = getelementptr inbounds %class.float3, %class.float3* %dst, i64 %5, i32 0
+  store float 0.000000e+00, float* %12, align 4
+  %13 = getelementptr inbounds %class.float3, %class.float3* %dst, i64 %5, i32 1
+  store float %11, float* %13, align 4
+  %14 = getelementptr inbounds %class.float3, %class.float3* %dst, i64 %5, i32 2
+  store float 0.000000e+00, float* %14, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ptx.read.ctaid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ptx.read.ntid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ptx.read.tid.x() #1
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #2
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
+!nvvm.annotations = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{void (%class.float3*)* @some_kernel, !"kernel", i32 1}
+!1 = !{!"clang version 3.5.1 (tags/RELEASE_351/final)"}
diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 8483112381f1..58b191129917 100644
--- a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -27,22 +27,22 @@ entry:
 ; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rd[[A_REG]]]
 ; CHECK: st.f32 [%SP+0], %f[[A0_REG]]
-  %0 = load float* %a, align 4
+  %0 = load float, float* %a, align 4
   %1 = bitcast [16 x i8]* %buf to float*
   store float %0, float* %1, align 4
-  %arrayidx2 = getelementptr inbounds float* %a, i64 1
-  %2 = load float* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 1
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 1
+  %2 = load float, float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i64 0, i64 1
   %3 = bitcast i8* %arrayidx3 to float*
   store float %2, float* %3, align 4
-  %arrayidx4 = getelementptr inbounds float* %a, i64 2
-  %4 = load float* %arrayidx4, align 4
-  %arrayidx5 = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 2
+  %arrayidx4 = getelementptr inbounds float, float* %a, i64 2
+  %4 = load float, float* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i64 0, i64 2
   %5 = bitcast i8* %arrayidx5 to float*
   store float %4, float* %5, align 4
-  %arrayidx6 = getelementptr inbounds float* %a, i64 3
-  %6 = load float* %arrayidx6, align 4
-  %arrayidx7 = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 3
+  %arrayidx6 = getelementptr inbounds float, float* %a, i64 3
+  %6 = load float, float* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i64 0, i64 3
   %7 = bitcast i8* %arrayidx7 to float*
   store float %6, float* %7, align 4
@@ -54,7 +54,7 @@ entry:
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: callee,
-  %arraydecay = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 0
+  %arraydecay = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i64 0, i64 0
   call void @callee(float* %a, i8* %arraydecay) #2
   ret void
 }
diff --git a/test/CodeGen/NVPTX/fp16.ll b/test/CodeGen/NVPTX/fp16.ll
index 8770399f2ec9..b85eed0f6c7f 100644
--- a/test/CodeGen/NVPTX/fp16.ll
+++ b/test/CodeGen/NVPTX/fp16.ll
@@ -8,7 +8,7 @@ declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
 ; CHECK-LABEL: @test_convert_fp16_to_fp32
 ; CHECK: cvt.f32.f16
 define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
-  %val = load i16 addrspace(1)* %in, align 2
+  %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
   store float %cvt, float addrspace(1)* %out, align 4
   ret void
@@ -18,7 +18,7 @@ define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 add
 ; CHECK-LABEL: @test_convert_fp16_to_fp64
 ; CHECK: cvt.f64.f16
 define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
-  %val = load i16 addrspace(1)* %in, align 2
+  %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
   store double %cvt, double addrspace(1)* %out, align 4
   ret void
@@ -28,7 +28,7 @@ define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 ad
 ; CHECK-LABEL: @test_convert_fp32_to_fp16
 ; CHECK: cvt.rn.f16.f32
 define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
-  %val = load float addrspace(1)* %in, align 2
+  %val = load float, float addrspace(1)* %in, align 2
   %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
   store i16 %cvt, i16 addrspace(1)* %out, align 4
   ret void
@@ -38,7 +38,7 @@ define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float add
 ; CHECK-LABEL: @test_convert_fp64_to_fp16
 ; CHECK: cvt.rn.f16.f64
 define void @test_convert_fp64_to_fp16(i16 addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
-  %val = load double addrspace(1)* %in, align 2
+  %val = load double, double addrspace(1)* %in, align 2
   %cvt = call i16 @llvm.convert.to.fp16.f64(double %val) nounwind readnone
   store i16 %cvt, i16 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/NVPTX/function-align.ll b/test/CodeGen/NVPTX/function-align.ll
new file mode 100644
index 000000000000..e7abfb128f58
--- /dev/null
+++ b/test/CodeGen/NVPTX/function-align.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; CHECK-NOT: .align 2
+define ptx_device void @foo() align 2 {
+; CHECK-LABEL: .func foo
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/generic-to-nvvm.ll b/test/CodeGen/NVPTX/generic-to-nvvm.ll
index fb63d6ed575f..66917d5cb182 100644
--- a/test/CodeGen/NVPTX/generic-to-nvvm.ll
+++ b/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -13,9 +13,9 @@ target triple = "nvptx-nvidia-cuda"
 
 define void @foo(i32* %a, i32* %b) {
 ; CHECK: cvta.global.u32
-  %ld1 = load i32* @myglobal
+  %ld1 = load i32, i32* @myglobal
 ; CHECK: cvta.global.u32
-  %ld2 = load i32* @myconst
+  %ld2 = load i32, i32* @myconst
   store i32 %ld1, i32* %a
   store i32 %ld2, i32* %b
   ret void
diff --git a/test/CodeGen/NVPTX/half.ll b/test/CodeGen/NVPTX/half.ll
index aa08cc78e91a..b99524162e65 100644
--- a/test/CodeGen/NVPTX/half.ll
+++ b/test/CodeGen/NVPTX/half.ll
@@ -4,7 +4,7 @@ define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) {
 ; CHECK-LABEL: @test_load_store
 ; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
-  %val = load half addrspace(1)* %in
+  %val = load half, half addrspace(1)* %in
   store half %val, half addrspace(1) * %out
   ret void
 }
@@ -13,7 +13,7 @@ define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %o
 ; CHECK-LABEL: @test_bitcast_from_half
 ; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
-  %val = load half addrspace(1) * %in
+  %val = load half, half addrspace(1) * %in
   %val_int = bitcast half %val to i16
   store i16 %val_int, i16 addrspace(1)* %out
   ret void
@@ -23,7 +23,7 @@ define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in
 ; CHECK-LABEL: @test_bitcast_to_half
 ; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
-  %val = load i16 addrspace(1)* %in
+  %val = load i16, i16 addrspace(1)* %in
   %val_fp = bitcast i16 %val to half
   store half %val_fp, half addrspace(1)* %out
   ret void
@@ -33,7 +33,7 @@ define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) {
 ; CHECK-LABEL: @test_extend32
 ; CHECK: cvt.f32.f16
-  %val16 = load half addrspace(1)* %in
+  %val16 = load half, half addrspace(1)* %in
   %val32 = fpext half %val16 to float
   store float %val32, float addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@ define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) {
 ; CHECK-LABEL: @test_extend64
 ; CHECK: cvt.f64.f16
-  %val16 = load half addrspace(1)* %in
+  %val16 = load half, half addrspace(1)* %in
   %val64 = fpext half %val16 to double
   store double %val64, double addrspace(1)* %out
   ret void
@@ -53,7 +53,7 @@ define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) {
 ; CHECK-LABEL: test_trunc32
 ; CHECK: cvt.rn.f16.f32
-  %val32 = load float addrspace(1)* %in
+  %val32 = load float, float addrspace(1)* %in
   %val16 = fptrunc float %val32 to half
   store half %val16, half addrspace(1)* %out
   ret void
@@ -63,7 +63,7 @@ define void @test_trunc64(double addrspace(1)* %in, half addrspace(1)* %out) {
 ; CHECK-LABEL: @test_trunc64
 ; CHECK: cvt.rn.f16.f64
-  %val32 = load double addrspace(1)* %in
+  %val32 = load double, double addrspace(1)* %in
   %val16 = fptrunc double %val32 to half
   store half %val16, half addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/NVPTX/i1-global.ll b/test/CodeGen/NVPTX/i1-global.ll
index e3fe08e5f874..35d77b4b44d2 100644
--- a/test/CodeGen/NVPTX/i1-global.ll
+++ b/test/CodeGen/NVPTX/i1-global.ll
@@ -8,7 +8,7 @@ target triple = "nvptx-nvidia-cuda"
 
 define void @foo(i1 %p, i32* %out) {
-  %ld = load i1 addrspace(1)* @mypred
+  %ld = load i1, i1 addrspace(1)* @mypred
   %val = zext i1 %ld to i32
   store i32 %val, i32* %out
   ret void
diff --git a/test/CodeGen/NVPTX/i8-param.ll b/test/CodeGen/NVPTX/i8-param.ll
index 84daa9f66316..6a1e3a0e1a0d 100644
--- a/test/CodeGen/NVPTX/i8-param.ll
+++ b/test/CodeGen/NVPTX/i8-param.ll
@@ -13,7 +13,7 @@ define i8 @callee(i8 %a) {
 ; CHECK: .visible .func caller
 define void @caller(i8* %a) {
 ; CHECK: ld.u8
-  %val = load i8* %a
+  %val = load i8, i8* %a
   %ret = tail call i8 @callee(i8 %val)
 ; CHECK: ld.param.b32
   store i8 %ret, i8* %a
diff --git a/test/CodeGen/NVPTX/ld-addrspace.ll b/test/CodeGen/NVPTX/ld-addrspace.ll
index f33659c92e84..0018e6177be8 100644
--- a/test/CodeGen/NVPTX/ld-addrspace.ll
+++ b/test/CodeGen/NVPTX/ld-addrspace.ll
@@ -8,7 +8,7 @@ define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(1)* %ptr
+  %a = load i8, i8 addrspace(1)* %ptr
   ret i8 %a
 }
@@ -17,7 +17,7 @@ define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(3)* %ptr
+  %a = load i8, i8 addrspace(3)* %ptr
   ret i8 %a
 }
@@ -26,7 +26,7 @@ define i8 @ld_local_i8(i8 addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(5)* %ptr
+  %a = load i8, i8 addrspace(5)* %ptr
   ret i8 %a
 }
@@ -36,7 +36,7 @@ define i16 @ld_global_i16(i16 addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(1)* %ptr
+  %a = load i16, i16 addrspace(1)* %ptr
   ret i16 %a
 }
@@ -45,7 +45,7 @@ define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(3)* %ptr
+  %a = load i16, i16 addrspace(3)* %ptr
   ret i16 %a
 }
@@ -54,7 +54,7 @@ define i16 @ld_local_i16(i16 addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(5)* %ptr
+  %a = load i16, i16 addrspace(5)* %ptr
   ret i16 %a
 }
@@ -64,7 +64,7 @@ define i32 @ld_global_i32(i32 addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(1)* %ptr
+  %a = load i32, i32 addrspace(1)* %ptr
   ret i32 %a
 }
@@ -73,7 +73,7 @@ define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(3)* %ptr
+  %a = load i32, i32 addrspace(3)* %ptr
   ret i32 %a
 }
@@ -82,7 +82,7 @@ define i32 @ld_local_i32(i32 addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(5)* %ptr
+  %a = load i32, i32 addrspace(5)* %ptr
   ret i32 %a
 }
@@ -92,7 +92,7 @@ define i64 @ld_global_i64(i64 addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(1)* %ptr
+  %a = load i64, i64 addrspace(1)* %ptr
   ret i64 %a
 }
@@ -101,7 +101,7 @@ define i64 @ld_shared_i64(i64 addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(3)* %ptr
+  %a = load i64, i64 addrspace(3)* %ptr
   ret i64 %a
 }
@@ -110,7 +110,7 @@ define i64 @ld_local_i64(i64 addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(5)* %ptr
+  %a = load i64, i64 addrspace(5)* %ptr
   ret i64 %a
 }
@@ -120,7 +120,7 @@ define float @ld_global_f32(float addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(1)* %ptr
+  %a = load float, float addrspace(1)* %ptr
   ret float %a
 }
@@ -129,7 +129,7 @@ define float @ld_shared_f32(float addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(3)* %ptr
+  %a = load float, float addrspace(3)* %ptr
   ret float %a
 }
@@ -138,7 +138,7 @@ define float @ld_local_f32(float addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(5)* %ptr
+  %a = load float, float addrspace(5)* %ptr
   ret float %a
 }
@@ -148,7 +148,7 @@ define double @ld_global_f64(double addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(1)* %ptr
+  %a = load double, double addrspace(1)* %ptr
   ret double %a
 }
@@ -157,7 +157,7 @@ define double @ld_shared_f64(double addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(3)* %ptr
+  %a = load double, double addrspace(3)* %ptr
   ret double %a
 }
@@ -166,6 +166,6 @@ define double @ld_local_f64(double addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(5)* %ptr
+  %a = load double, double addrspace(5)* %ptr
   ret double %a
 }
diff --git a/test/CodeGen/NVPTX/ld-generic.ll b/test/CodeGen/NVPTX/ld-generic.ll
index d629e0ecc647..44cfe6551b99 100644
--- a/test/CodeGen/NVPTX/ld-generic.ll
+++ b/test/CodeGen/NVPTX/ld-generic.ll
@@ -8,7 +8,7 @@ define i8 @ld_global_i8(i8 addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(0)* %ptr
+  %a = load i8, i8 addrspace(0)* %ptr
   ret i8 %a
 }
@@ -18,7 +18,7 @@ define i16 @ld_global_i16(i16 addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(0)* %ptr
+  %a = load i16, i16 addrspace(0)* %ptr
   ret i16 %a
 }
@@ -28,7 +28,7 @@ define i32 @ld_global_i32(i32 addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(0)* %ptr
+  %a = load i32, i32 addrspace(0)* %ptr
   ret i32 %a
 }
@@ -38,7 +38,7 @@ define i64 @ld_global_i64(i64 addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(0)* %ptr
+  %a = load i64, i64 addrspace(0)* %ptr
   ret i64 %a
 }
@@ -48,7 +48,7 @@ define float @ld_global_f32(float addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(0)* %ptr
+  %a = load float, float addrspace(0)* %ptr
   ret float %a
 }
@@ -58,6 +58,6 @@ define double @ld_global_f64(double addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(0)* %ptr
+  %a = load double, double addrspace(0)* %ptr
   ret double %a
 }
diff --git a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
index fd35a7503901..ec96a493021a 100644
--- a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
+++ b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
@@ -6,9 +6,9 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define void @reg_plus_offset(i32* %a) {
 ; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+32];
 ; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+36];
-  %p2 = getelementptr i32* %a, i32 8
+  %p2 = getelementptr i32, i32* %a, i32 8
   %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2, i32 4)
-  %p3 = getelementptr i32* %a, i32 9
+  %p3 = getelementptr i32, i32* %a, i32 9
   %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3, i32 4)
   %t3 = mul i32 %t1, %t2
   store i32 %t3, i32* %a
diff --git a/test/CodeGen/NVPTX/load-sext-i1.ll b/test/CodeGen/NVPTX/load-sext-i1.ll
index d836740eed94..9fc98a45f59a 100644
--- a/test/CodeGen/NVPTX/load-sext-i1.ll
+++ b/test/CodeGen/NVPTX/load-sext-i1.ll
@@ -6,8 +6,8 @@ target triple = "nvptx-nvidia-cuda"
 define void @main(i1* %a1, i32 %a2, i32* %arg3) {
 ; CHECK: ld.u8
 ; CHECK-NOT: ld.u1
-  %t1 = getelementptr i1* %a1, i32 %a2
-  %t2 = load i1* %t1
+  %t1 = getelementptr i1, i1* %a1, i32 %a2
+  %t2 = load i1, i1* %t1
   %t3 = sext i1 %t2 to i32
   store i32 %t3, i32* %arg3
   ret void
diff --git a/test/CodeGen/NVPTX/machine-sink.ll b/test/CodeGen/NVPTX/machine-sink.ll
index 3614bea16534..65ba141c41d9 100644
--- a/test/CodeGen/NVPTX/machine-sink.ll
+++ b/test/CodeGen/NVPTX/machine-sink.ll
@@ -14,8 +14,8 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define float @post_dominate(float %x, i1 %cond) {
 ; CHECK-LABEL: post_dominate(
 entry:
-  %0 = load float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4
-  %1 = load float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4
+  %0 = load float, float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4
+  %1 = load float, float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4
 ; CHECK: ld.shared.f32
 ; CHECK: ld.shared.f32
   %2 = fmul float %0, %0
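Note: ld-addrspace.ll and ld-generic.ll together pin the mapping from IR address spaces to PTX load qualifiers: addrspace(1) → ld.global, addrspace(3) → ld.shared, addrspace(5) → ld.local, and addrspace(0) (generic) → unqualified ld. A minimal sketch of one such pairing (the RUN configuration is assumed from the tests' PTX64 prefix; the CHECK line is copied from the ld_shared_i32 hunk above):

    ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
    define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) {
    ; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
      %a = load i32, i32 addrspace(3)* %ptr
      ret i32 %a
    }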
diff --git a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
index 90c9c4306de7..2ad72b018851 100644
--- a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
+++ b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -10,7 +10,7 @@ define <4 x float> @t1(i8* %p1) {
 ; CHECK-NOT: ld.f32
 ; CHECK: ld.u8
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 1
+  %r = load <4 x float>, <4 x float>* %cast, align 1
   ret <4 x float> %r
 }
@@ -20,7 +20,7 @@ define <4 x float> @t2(i8* %p1) {
 ; CHECK-NOT: ld.v2
 ; CHECK: ld.f32
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 4
+  %r = load <4 x float>, <4 x float>* %cast, align 4
   ret <4 x float> %r
 }
@@ -29,7 +29,7 @@ define <4 x float> @t3(i8* %p1) {
 ; CHECK-NOT: ld.v4
 ; CHECK: ld.v2
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 8
+  %r = load <4 x float>, <4 x float>* %cast, align 8
   ret <4 x float> %r
 }
@@ -37,7 +37,7 @@ define <4 x float> @t3(i8* %p1) {
 define <4 x float> @t4(i8* %p1) {
 ; CHECK: ld.v4
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 16
+  %r = load <4 x float>, <4 x float>* %cast, align 16
   ret <4 x float> %r
 }
diff --git a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
index 841bbc3a517c..2fec31b3791d 100644
--- a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
+++ b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -11,16 +11,16 @@ define void @foo(float* %output) #1 {
 entry:
   %output.addr = alloca float*, align 8
   store float* %output, float** %output.addr, align 8
-  %0 = load float** %output.addr, align 8
-  %arrayidx = getelementptr inbounds float* %0, i64 0
-  %1 = load float* %arrayidx, align 4
+  %0 = load float*, float** %output.addr, align 8
+  %arrayidx = getelementptr inbounds float, float* %0, i64 0
+  %1 = load float, float* %arrayidx, align 4
   %conv = fpext float %1 to double
   %cmp = fcmp olt double %conv, 1.000000e+01
   br i1 %cmp, label %if.then, label %if.else
 
 if.then: ; preds = %entry
-  %2 = load float** %output.addr, align 8
-  %3 = load float* %2, align 4
+  %2 = load float*, float** %output.addr, align 8
+  %3 = load float, float* %2, align 4
   %conv1 = fpext float %3 to double
   %add = fadd double %conv1, 1.000000e+00
   %conv2 = fptrunc double %add to float
@@ -28,8 +28,8 @@ if.then: ; preds = %entry
   br label %if.end
 
 if.else: ; preds = %entry
-  %4 = load float** %output.addr, align 8
-  %5 = load float* %4, align 4
+  %4 = load float*, float** %output.addr, align 8
+  %5 = load float, float* %4, align 4
   %conv3 = fpext float %5 to double
   %add4 = fadd double %conv3, 2.000000e+00
   %conv5 = fptrunc double %add4 to float
@@ -38,16 +38,16 @@ if.else: ; preds = %entry
 
 if.end: ; preds = %if.else, %if.then
   call void @llvm.cuda.syncthreads()
-  %6 = load float** %output.addr, align 8
-  %arrayidx6 = getelementptr inbounds float* %6, i64 0
-  %7 = load float* %arrayidx6, align 4
+  %6 = load float*, float** %output.addr, align 8
+  %arrayidx6 = getelementptr inbounds float, float* %6, i64 0
+  %7 = load float, float* %arrayidx6, align 4
   %conv7 = fpext float %7 to double
   %cmp8 = fcmp olt double %conv7, 1.000000e+01
   br i1 %cmp8, label %if.then9, label %if.else13
 
 if.then9: ; preds = %if.end
-  %8 = load float** %output.addr, align 8
-  %9 = load float* %8, align 4
+  %8 = load float*, float** %output.addr, align 8
+  %9 = load float, float* %8, align 4
   %conv10 = fpext float %9 to double
   %add11 = fadd double %conv10, 3.000000e+00
   %conv12 = fptrunc double %add11 to float
@@ -55,8 +55,8 @@ if.then9: ; preds = %if.end
   br label %if.end17
 
 if.else13: ; preds = %if.end
-  %10 = load float** %output.addr, align 8
-  %11 = load float* %10, align 4
+  %10 = load float*, float** %output.addr, align 8
+  %11 = load float, float* %10, align 4
   %conv14 = fpext float %11 to double
   %add15 = fadd double %conv14, 4.000000e+00
   %conv16 = fptrunc double %add15 to float
diff --git a/test/CodeGen/NVPTX/nounroll.ll b/test/CodeGen/NVPTX/nounroll.ll
new file mode 100644
index 000000000000..e80a4a21f161
--- /dev/null
+++ b/test/CodeGen/NVPTX/nounroll.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; Compiled from the following CUDA code:
+;
+; #pragma nounroll
+; for (int i = 0; i < 2; ++i)
+;   output[i] = input[i];
+define void @nounroll(float* %input, float* %output) {
+; CHECK-LABEL: .visible .func nounroll(
+entry:
+  br label %for.body
+
+for.body:
+; CHECK: .pragma "nounroll"
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %idxprom = sext i32 %i.06 to i64
+  %arrayidx = getelementptr inbounds float, float* %input, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+; CHECK: ld.f32
+  %arrayidx2 = getelementptr inbounds float, float* %output, i64 %idxprom
+  store float %0, float* %arrayidx2, align 4
+; CHECK: st.f32
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 2
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+; CHECK-NOT: ld.f32
+; CHECK-NOT: st.f32
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.unroll.disable"}
diff --git a/test/CodeGen/NVPTX/nvvm-reflect.ll b/test/CodeGen/NVPTX/nvvm-reflect.ll
index 21e9c69e657a..8c75dfc30a56 100644
--- a/test/CodeGen/NVPTX/nvvm-reflect.ll
+++ b/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -11,7 +11,7 @@ define float @foo(float %a, float %b) {
 ; USE_MUL_0-NOT: call i32 @__nvvm_reflect
 ; USE_MUL_1: define float @foo
 ; USE_MUL_1-NOT: call i32 @__nvvm_reflect
-  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8] addrspace(4)* @str, i32 0, i32 0))
+  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
   %reflect = tail call i32 @__nvvm_reflect(i8* %ptr)
   %cmp = icmp ugt i32 %reflect, 0
   br i1 %cmp, label %use_mul, label %use_add
@@ -42,7 +42,38 @@ define i32 @intrinsic() {
 ; USE_MUL_0: ret i32 0
 ; USE_MUL_1-NOT: call i32 @llvm.nvvm.reflect
 ; USE_MUL_1: ret i32 1
-  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8] addrspace(4)* @str, i32 0, i32 0))
+  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
   %reflect = tail call i32 @llvm.nvvm.reflect.p0i8(i8* %ptr)
   ret i32 %reflect
 }
+
+; CUDA-7.0 passes __nvvm_reflect argument slightly differently.
+; Verify that it works, too
+
+@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00"
+
+define float @bar(float %a, float %b) {
+; USE_MUL_0: define float @bar
+; USE_MUL_0-NOT: call i32 @__nvvm_reflect
+; USE_MUL_1: define float @bar
+; USE_MUL_1-NOT: call i32 @__nvvm_reflect
+  %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
+  %cmp = icmp ne i32 %reflect, 0
+  br i1 %cmp, label %use_mul, label %use_add
+
+use_mul:
+; USE_MUL_1: fmul float %a, %b
+; USE_MUL_0-NOT: fadd float %a, %b
+  %ret1 = fmul float %a, %b
+  br label %exit
+
+use_add:
+; USE_MUL_0: fadd float %a, %b
+; USE_MUL_1-NOT: fmul float %a, %b
+  %ret2 = fadd float %a, %b
+  br label %exit
+
+exit:
+  %ret = phi float [%ret1, %use_mul], [%ret2, %use_add]
+  ret float %ret
+}
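Note: the new bar() test above documents the CUDA 7.0 calling convention for __nvvm_reflect: the key string lives in addrspace(1) and is addrspacecast to a generic i8* at the call site, whereas foo() reaches its addrspace(4) string through llvm.nvvm.ptr.constant.to.gen. Reduced to just the new call shape (the declaration is an assumption inferred from the call; the constant and call expression are copied from the bar hunk):

    @"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00"

    declare i32 @__nvvm_reflect(i8*)

    define i32 @reflect_cuda70() {
      %r = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
      ret i32 %r
    }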
diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll
index cc67a6fff8e4..d4f7c3bd210a 100644
--- a/test/CodeGen/NVPTX/pr13291-i1-store.ll
+++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -19,7 +19,7 @@ define ptx_kernel void @t2(i1* %a, i8* %b) {
 ; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 ; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 
-  %t1 = load i1* %a
+  %t1 = load i1, i1* %a
   %t2 = select i1 %t1, i8 1, i8 2
   store i8 %t2, i8* %b
   ret void
diff --git a/test/CodeGen/NVPTX/pr16278.ll b/test/CodeGen/NVPTX/pr16278.ll
index 5432a848442c..a836eaf2e51f 100644
--- a/test/CodeGen/NVPTX/pr16278.ll
+++ b/test/CodeGen/NVPTX/pr16278.ll
@@ -5,6 +5,6 @@
 define float @foo() {
 ; CHECK: ld.const.f32
-  %val = load float addrspace(4)* @one_f
+  %val = load float, float addrspace(4)* @one_f
   ret float %val
 }
diff --git a/test/CodeGen/NVPTX/pr17529.ll b/test/CodeGen/NVPTX/pr17529.ll
index a16214225674..a7519776f526 100644
--- a/test/CodeGen/NVPTX/pr17529.ll
+++ b/test/CodeGen/NVPTX/pr17529.ll
@@ -11,7 +11,7 @@ entry:
 
 vector.body: ; preds = %vector.body, %entry
   %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
-  %scevgep9 = getelementptr i8* %dst, i64 %index
+  %scevgep9 = getelementptr i8, i8* %dst, i64 %index
   %scevgep910 = bitcast i8* %scevgep9 to <4 x i8>*
   store <4 x i8> undef, <4 x i8>* %scevgep910, align 1
   %index.next = add i64 %index, 4
@@ -22,13 +22,13 @@ middle.block: ; preds = %vector.body
   br i1 undef, label %for.end, label %for.body.preheader1
 
 for.body.preheader1: ; preds = %middle.block
-  %scevgep2 = getelementptr i8* %dst, i64 0
+  %scevgep2 = getelementptr i8, i8* %dst, i64 0
   br label %for.body
 
 for.body: ; preds = %for.body, %for.body.preheader1
   %lsr.iv3 = phi i8* [ %scevgep2, %for.body.preheader1 ], [ %scevgep4, %for.body ]
   store i8 undef, i8* %lsr.iv3, align 1
-  %scevgep4 = getelementptr i8* %lsr.iv3, i64 1
+  %scevgep4 = getelementptr i8, i8* %lsr.iv3, i64 1
   br label %for.body
 
 for.end: ; preds = %middle.block, %entry
diff --git a/test/CodeGen/NVPTX/ptx-version-30.ll b/test/CodeGen/NVPTX/ptx-version-30.ll
deleted file mode 100644
index 0422b01f4ee3..000000000000
--- a/test/CodeGen/NVPTX/ptx-version-30.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
-
-
-; CHECK: .version 3.0
-
diff --git a/test/CodeGen/NVPTX/ptx-version-31.ll b/test/CodeGen/NVPTX/ptx-version-31.ll
deleted file mode 100644
index d6e57301a371..000000000000
--- a/test/CodeGen/NVPTX/ptx-version-31.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
-
-
-; CHECK: .version 3.1
-
diff --git a/test/CodeGen/NVPTX/refl1.ll b/test/CodeGen/NVPTX/refl1.ll
index e8782ea3aa27..0432b67535c0 100644
--- a/test/CodeGen/NVPTX/refl1.ll
+++ b/test/CodeGen/NVPTX/refl1.ll
@@ -5,7 +5,7 @@ target triple = "nvptx-nvidia-cuda"
 ; Function Attrs: nounwind
 ; CHECK: .entry foo
 define void @foo(float* nocapture %a) #0 {
-  %val = load float* %a
+  %val = load float, float* %a
   %tan = tail call fastcc float @__nv_fast_tanf(float %val)
   store float %tan, float* %a
   ret void
diff --git a/test/CodeGen/NVPTX/sched1.ll b/test/CodeGen/NVPTX/sched1.ll
index 03ab635e73b9..fb01eb262adc 100644
--- a/test/CodeGen/NVPTX/sched1.ll
+++ b/test/CodeGen/NVPTX/sched1.ll
@@ -11,14 +11,14 @@ define void @foo(i32* %a) {
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
-  %ptr0 = getelementptr i32* %a, i32 0
-  %val0 = load i32* %ptr0
-  %ptr1 = getelementptr i32* %a, i32 1
-  %val1 = load i32* %ptr1
-  %ptr2 = getelementptr i32* %a, i32 2
-  %val2 = load i32* %ptr2
-  %ptr3 = getelementptr i32* %a, i32 3
-  %val3 = load i32* %ptr3
+  %ptr0 = getelementptr i32, i32* %a, i32 0
+  %val0 = load i32, i32* %ptr0
+  %ptr1 = getelementptr i32, i32* %a, i32 1
+  %val1 = load i32, i32* %ptr1
+  %ptr2 = getelementptr i32, i32* %a, i32 2
+  %val2 = load i32, i32* %ptr2
+  %ptr3 = getelementptr i32, i32* %a, i32 3
+  %val3 = load i32, i32* %ptr3
   %t0 = add i32 %val0, %val1
   %t1 = add i32 %t0, %val2
diff --git a/test/CodeGen/NVPTX/sched2.ll b/test/CodeGen/NVPTX/sched2.ll
index 71a9a4963faf..91ed77878f81 100644
--- a/test/CodeGen/NVPTX/sched2.ll
+++ b/test/CodeGen/NVPTX/sched2.ll
@@ -12,14 +12,14 @@ define void @foo(<2 x i32>* %a) {
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
-  %ptr0 = getelementptr <2 x i32>* %a, i32 0
-  %val0 = load <2 x i32>* %ptr0
-  %ptr1 = getelementptr <2 x i32>* %a, i32 1
-  %val1 = load <2 x i32>* %ptr1
-  %ptr2 = getelementptr <2 x i32>* %a, i32 2
-  %val2 = load <2 x i32>* %ptr2
-  %ptr3 = getelementptr <2 x i32>* %a, i32 3
-  %val3 = load <2 x i32>* %ptr3
+  %ptr0 = getelementptr <2 x i32>, <2 x i32>* %a, i32 0
+  %val0 = load <2 x i32>, <2 x i32>* %ptr0
+  %ptr1 = getelementptr <2 x i32>, <2 x i32>* %a, i32 1
+  %val1 = load <2 x i32>, <2 x i32>* %ptr1
+  %ptr2 = getelementptr <2 x i32>, <2 x i32>* %a, i32 2
+  %val2 = load <2 x i32>, <2 x i32>* %ptr2
+  %ptr3 = getelementptr <2 x i32>, <2 x i32>* %a, i32 3
+  %val3 = load <2 x i32>, <2 x i32>* %ptr3
   %t0 = add <2 x i32> %val0, %val1
   %t1 = add <2 x i32> %t0, %val2
diff --git a/test/CodeGen/NVPTX/shift-parts.ll b/test/CodeGen/NVPTX/shift-parts.ll
index 748297caf339..b4d408ff5972 100644
--- a/test/CodeGen/NVPTX/shift-parts.ll
+++ b/test/CodeGen/NVPTX/shift-parts.ll
@@ -12,8 +12,8 @@ define void @shift_parts_left_128(i128* %val, i128* %amtptr) {
 ; CHECK: setp.gt.s32
 ; CHECK: selp.b64
 ; CHECK: shl.b64
-  %amt = load i128* %amtptr
-  %a = load i128* %val
+  %amt = load i128, i128* %amtptr
+  %a = load i128, i128* %val
   %val0 = shl i128 %a, %amt
   store i128 %val0, i128* %val
   ret void
@@ -30,8 +30,8 @@ define void @shift_parts_right_128(i128* %val, i128* %amtptr) {
 ; CHECK: setp.gt.s32
 ; CHECK: selp.b64
 ; CHECK: shr.s64
-  %amt = load i128* %amtptr
-  %a = load i128* %val
+  %amt = load i128, i128* %amtptr
+  %a = load i128, i128* %val
   %val0 = ashr i128 %a, %amt
   store i128 %val0, i128* %val
   ret void
diff --git a/test/CodeGen/NVPTX/simple-call.ll b/test/CodeGen/NVPTX/simple-call.ll
index 1b41361cf7ed..da6568685fe6 100644
--- a/test/CodeGen/NVPTX/simple-call.ll
+++ b/test/CodeGen/NVPTX/simple-call.ll
@@ -11,7 +11,7 @@ define float @device_func(float %a) noinline {
 ; CHECK: .entry kernel_func
 define void @kernel_func(float* %a) {
-  %val = load float* %a
+  %val = load float, float* %a
 ; CHECK: call.uni (retval0),
 ; CHECK: device_func,
   %mul = call float @device_func(float %val)
diff --git a/test/CodeGen/NVPTX/sm-version-30.ll b/test/CodeGen/NVPTX/sm-version-30.ll
index 692b49a0d6b3..4f35cf04c63b 100644
--- a/test/CodeGen/NVPTX/sm-version-30.ll
+++ b/test/CodeGen/NVPTX/sm-version-30.ll
@@ -2,5 +2,6 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
 
+; CHECK: .version 3.2
 ; CHECK: .target sm_30
 
diff --git a/test/CodeGen/NVPTX/sm-version-32.ll b/test/CodeGen/NVPTX/sm-version-32.ll
new file mode 100644
index 000000000000..d6a5082c5267
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-32.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_32 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_32 | FileCheck %s
+
+
+; CHECK: .version 4.0
+; CHECK: .target sm_32
+
diff --git a/test/CodeGen/NVPTX/sm-version-35.ll b/test/CodeGen/NVPTX/sm-version-35.ll
index 25368a01335e..8456c666677d 100644
--- a/test/CodeGen/NVPTX/sm-version-35.ll
+++ b/test/CodeGen/NVPTX/sm-version-35.ll
@@ -2,5 +2,6 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
 
+; CHECK: .version 3.2
 ; CHECK: .target sm_35
 
diff --git a/test/CodeGen/NVPTX/sm-version-37.ll b/test/CodeGen/NVPTX/sm-version-37.ll
new file mode 100644
index 000000000000..fd51a9c7063f
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-37.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_37 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_37 | FileCheck %s
+
+
+; CHECK: .version 4.1
+; CHECK: .target sm_37
+
diff --git a/test/CodeGen/NVPTX/sm-version-50.ll b/test/CodeGen/NVPTX/sm-version-50.ll
new file mode 100644
index 000000000000..374c6ea057ae
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-50.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_50 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
+
+
+; CHECK: .version 4.0
+; CHECK: .target sm_50
+
diff --git a/test/CodeGen/NVPTX/sm-version-52.ll b/test/CodeGen/NVPTX/sm-version-52.ll
new file mode 100644
index 000000000000..18881b2e98cc
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-52.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_52 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 | FileCheck %s
+
+
+; CHECK: .version 4.1
+; CHECK: .target sm_52
+
diff --git a/test/CodeGen/NVPTX/sm-version-53.ll b/test/CodeGen/NVPTX/sm-version-53.ll
new file mode 100644
index 000000000000..50d2dec11bc5
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-53.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_53 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_53 | FileCheck %s
+
+
+; CHECK: .version 4.2
+; CHECK: .target sm_53
+
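Note: read together, the sm-version tests fix the PTX ISA version llc now emits per target: sm_30 and sm_35 stay at .version 3.2, sm_32 and sm_50 get 4.0, sm_37 and sm_52 get 4.1, and sm_53 gets 4.2; the old fixed-version ptx30/ptx31 -mattr tests are deleted above. Every file shares the same three-line shape, shown here instantiated for sm_53 (copied from sm-version-53.ll):

    ; RUN: llc < %s -march=nvptx64 -mcpu=sm_53 | FileCheck %s
    ; CHECK: .version 4.2
    ; CHECK: .target sm_53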
diff --git a/test/CodeGen/NVPTX/symbol-naming.ll b/test/CodeGen/NVPTX/symbol-naming.ll
index bd1333f1c4e6..0f176934ca39 100644
--- a/test/CodeGen/NVPTX/symbol-naming.ll
+++ b/test/CodeGen/NVPTX/symbol-naming.ll
@@ -24,7 +24,7 @@ target triple = "nvptx64-unknown-unknown"
 ; Function Attrs: nounwind
 define void @foo(i32 %a, float %b, i8 signext %c, i32 %e) {
 entry:
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0))
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0))
   ret void
 }
diff --git a/test/CodeGen/NVPTX/vector-compare.ll b/test/CodeGen/NVPTX/vector-compare.ll
index 218049995233..2992b0e62c56 100644
--- a/test/CodeGen/NVPTX/vector-compare.ll
+++ b/test/CodeGen/NVPTX/vector-compare.ll
@@ -6,8 +6,8 @@
 ; tried to promote <2 x i1> to <2 x i8> and instruction selection failed.
 define void @foo(<2 x i32>* %a, <2 x i32>* %b, i32* %r1, i32* %r2) {
-  %aval = load <2 x i32>* %a
-  %bval = load <2 x i32>* %b
+  %aval = load <2 x i32>, <2 x i32>* %a
+  %bval = load <2 x i32>, <2 x i32>* %b
   %res = icmp slt <2 x i32> %aval, %bval
   %t1 = extractelement <2 x i1> %res, i32 0
   %t2 = extractelement <2 x i1> %res, i32 1
diff --git a/test/CodeGen/NVPTX/vector-loads.ll b/test/CodeGen/NVPTX/vector-loads.ll
index 58882bf16668..d70348942200 100644
--- a/test/CodeGen/NVPTX/vector-loads.ll
+++ b/test/CodeGen/NVPTX/vector-loads.ll
@@ -10,7 +10,7 @@ define void @foo(<2 x float>* %a) {
 ; CHECK: .func foo
 ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
-  %t1 = load <2 x float>* %a
+  %t1 = load <2 x float>, <2 x float>* %a
   %t2 = fmul <2 x float> %t1, %t1
   store <2 x float> %t2, <2 x float>* %a
   ret void
@@ -19,7 +19,7 @@ define void @foo(<2 x float>* %a) {
 define void @foo2(<4 x float>* %a) {
 ; CHECK: .func foo2
 ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
-  %t1 = load <4 x float>* %a
+  %t1 = load <4 x float>, <4 x float>* %a
   %t2 = fmul <4 x float> %t1, %t1
   store <4 x float> %t2, <4 x float>* %a
   ret void
@@ -29,7 +29,7 @@ define void @foo3(<8 x float>* %a) {
 ; CHECK: .func foo3
 ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
 ; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
-  %t1 = load <8 x float>* %a
+  %t1 = load <8 x float>, <8 x float>* %a
   %t2 = fmul <8 x float> %t1, %t1
   store <8 x float> %t2, <8 x float>* %a
   ret void
@@ -40,7 +40,7 @@ define void @foo3(<8 x float>* %a) {
 define void @foo4(<2 x i32>* %a) {
 ; CHECK: .func foo4
 ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
-  %t1 = load <2 x i32>* %a
+  %t1 = load <2 x i32>, <2 x i32>* %a
   %t2 = mul <2 x i32> %t1, %t1
   store <2 x i32> %t2, <2 x i32>* %a
   ret void
@@ -49,7 +49,7 @@ define void @foo4(<2 x i32>* %a) {
 define void @foo5(<4 x i32>* %a) {
 ; CHECK: .func foo5
 ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
-  %t1 = load <4 x i32>* %a
+  %t1 = load <4 x i32>, <4 x i32>* %a
   %t2 = mul <4 x i32> %t1, %t1
   store <4 x i32> %t2, <4 x i32>* %a
   ret void
@@ -59,7 +59,7 @@ define void @foo6(<8 x i32>* %a) {
 ; CHECK: .func foo6
 ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
 ; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
-  %t1 = load <8 x i32>* %a
+  %t1 = load <8 x i32>, <8 x i32>* %a
   %t2 = mul <8 x i32> %t1, %t1
   store <8 x i32> %t2, <8 x i32>* %a
   ret void
diff --git a/test/CodeGen/NVPTX/vector-select.ll b/test/CodeGen/NVPTX/vector-select.ll
index 11893df10329..1e81031c685a 100644
--- a/test/CodeGen/NVPTX/vector-select.ll
+++ b/test/CodeGen/NVPTX/vector-select.ll
@@ -6,9 +6,9 @@
 define void @foo(<2 x i32> addrspace(1)* %def_a, <2 x i32> addrspace(1)* %def_b, <2 x i32> addrspace(1)* %def_c) {
 entry:
-  %tmp4 = load <2 x i32> addrspace(1)* %def_a
-  %tmp6 = load <2 x i32> addrspace(1)* %def_c
-  %tmp8 = load <2 x i32> addrspace(1)* %def_b
+  %tmp4 = load <2 x i32>, <2 x i32> addrspace(1)* %def_a
+  %tmp6 = load <2 x i32>, <2 x i32> addrspace(1)* %def_c
+  %tmp8 = load <2 x i32>, <2 x i32> addrspace(1)* %def_b
   %0 = icmp sge <2 x i32> %tmp4, zeroinitializer
   %cond = select <2 x i1> %0, <2 x i32> %tmp6, <2 x i32> %tmp8
   store <2 x i32> %cond, <2 x i32> addrspace(1)* %def_c
diff --git a/test/CodeGen/NVPTX/weak-global.ll b/test/CodeGen/NVPTX/weak-global.ll
index 2bef4c5228a9..a64f9f48b26f 100644
--- a/test/CodeGen/NVPTX/weak-global.ll
+++ b/test/CodeGen/NVPTX/weak-global.ll
@@ -4,6 +4,6 @@
 @g = common addrspace(1) global i32 zeroinitializer
 
 define i32 @func0() {
-  %val = load i32 addrspace(1)* @g
+  %val = load i32, i32 addrspace(1)* @g
   ret i32 %val
 }
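Note: every file in this diff follows the same lit conventions: a RUN line pipes the module through llc, and FileCheck matches the emitted PTX against CHECK lines. A minimal self-contained test in that style (illustrative function name; add.s32 is the opcode the existing sched1.ll CHECK lines expect for a 32-bit integer add, and the unqualified `.func name` match follows the pattern of vector-loads.ll):

    ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s

    ; CHECK: .func store_sum
    define void @store_sum(i32* %out, i32 %a, i32 %b) {
    ; CHECK: add.s32
      %r = add i32 %a, %b
      store i32 %r, i32* %out
      ret void
    }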