Diffstat (limited to 'test/CodeGen/NVPTX')
44 files changed, 328 insertions, 148 deletions
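Note: most of the churn below is mechanical. LLVM changed `load` and `getelementptr` to spell out the pointee type as an explicit first operand instead of deriving it from the pointer operand's type. A minimal sketch of the before/after forms (the function and value names here are illustrative, not taken from any test in this diff):

    define float @example(float* %p) {
      ; old form: %q = getelementptr inbounds float* %p, i64 1
      %q = getelementptr inbounds float, float* %p, i64 1
      ; old form: %v = load float* %q, align 4
      %v = load float, float* %q, align 4
      ret float %v
    }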
diff --git a/test/CodeGen/NVPTX/access-non-generic.ll b/test/CodeGen/NVPTX/access-non-generic.ll
index c225abf0fd85..e709302918f5 100644
--- a/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/test/CodeGen/NVPTX/access-non-generic.ll
@@ -18,7 +18,7 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
 ; IR-NOT: addrspacecast
 ; PTX-LABEL: ld_st_shared_f32(
 ; load cast
-  %1 = load float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
+  %1 = load float, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
 ; store cast
   store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
@@ -29,7 +29,7 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
 ; cast; load
   %2 = addrspacecast float addrspace(3)* @scalar to float*
-  %3 = load float* %2, align 4
+  %3 = load float, float* %2, align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
 ; cast; store
   store float %v, float* %2, align 4
@@ -38,17 +38,17 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
 ; PTX: bar.sync 0;
 ; load gep cast
-  %4 = load float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+  %4 = load float, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
 ; store gep cast
-  store float %v, float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+  store float %v, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
 ; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
   call void @llvm.cuda.syncthreads()
 ; PTX: bar.sync 0;
 ; gep cast; load
-  %5 = getelementptr inbounds [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
-  %6 = load float* %5, align 4
+  %5 = getelementptr inbounds [10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
+  %6 = load float, float* %5, align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
 ; gep cast; store
   store float %v, float* %5, align 4
@@ -58,8 +58,8 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
 ; cast; gep; load
   %7 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float]*
-  %8 = getelementptr inbounds [10 x float]* %7, i32 0, i32 %i
-  %9 = load float* %8, align 4
+  %8 = getelementptr inbounds [10 x float], [10 x float]* %7, i32 0, i32 %i
+  %9 = load float, float* %8, align 4
 ; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
 ; cast; gep; store
   store float %v, float* %8, align 4
@@ -78,10 +78,10 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
 ; addrspacecast with a bitcast.
 define i32 @ld_int_from_float() {
 ; IR-LABEL: @ld_int_from_float
-; IR: load i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*)
+; IR: load i32, i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*)
 ; PTX-LABEL: ld_int_from_float(
 ; PTX: ld.shared.u{{(32|64)}}
-  %1 = load i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
+  %1 = load i32, i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
   ret i32 %1
 }
diff --git a/test/CodeGen/NVPTX/addrspacecast-gvar.ll b/test/CodeGen/NVPTX/addrspacecast-gvar.ll
index 6afbdb8a429f..1e2fde4b858a 100644
--- a/test/CodeGen/NVPTX/addrspacecast-gvar.ll
+++ b/test/CodeGen/NVPTX/addrspacecast-gvar.ll
@@ -3,7 +3,11 @@
 ; CHECK: .visible .global .align 4 .u32 g = 42;
 ; CHECK: .visible .global .align 4 .u32 g2 = generic(g);
 ; CHECK: .visible .global .align 4 .u32 g3 = g;
+; CHECK: .visible .global .align 8 .u32 g4[2] = {0, generic(g)};
+; CHECK: .visible .global .align 8 .u32 g5[2] = {0, generic(g)+8};
 
 @g = addrspace(1) global i32 42
 @g2 = addrspace(1) global i32* addrspacecast (i32 addrspace(1)* @g to i32*)
 @g3 = addrspace(1) global i32 addrspace(1)* @g
+@g4 = constant {i32*, i32*} {i32* null, i32* addrspacecast (i32 addrspace(1)* @g to i32*)}
+@g5 = constant {i32*, i32*} {i32* null, i32* addrspacecast (i32 addrspace(1)* getelementptr (i32, i32 addrspace(1)* @g, i32 2) to i32*)}
diff --git a/test/CodeGen/NVPTX/addrspacecast.ll b/test/CodeGen/NVPTX/addrspacecast.ll
index 03b9a9844752..42e67ca8ce9b 100644
--- a/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/test/CodeGen/NVPTX/addrspacecast.ll
@@ -10,7 +10,7 @@ define i32 @conv1(i32 addrspace(1)* %ptr) {
 ; PTX64: cvta.global.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(1)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
@@ -22,7 +22,7 @@ define i32 @conv2(i32 addrspace(3)* %ptr) {
 ; PTX64: cvta.shared.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(3)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
@@ -34,7 +34,7 @@ define i32 @conv3(i32 addrspace(4)* %ptr) {
 ; PTX64: cvta.const.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(4)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
@@ -46,7 +46,7 @@ define i32 @conv4(i32 addrspace(5)* %ptr) {
 ; PTX64: cvta.local.u64
 ; PTX64: ld.u32
   %genptr = addrspacecast i32 addrspace(5)* %ptr to i32*
-  %val = load i32* %genptr
+  %val = load i32, i32* %genptr
   ret i32 %val
 }
@@ -58,7 +58,7 @@ define i32 @conv5(i32* %ptr) {
 ; PTX64: cvta.to.global.u64
 ; PTX64: ld.global.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(1)*
-  %val = load i32 addrspace(1)* %specptr
+  %val = load i32, i32 addrspace(1)* %specptr
   ret i32 %val
 }
@@ -70,7 +70,7 @@ define i32 @conv6(i32* %ptr) {
 ; PTX64: cvta.to.shared.u64
 ; PTX64: ld.shared.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(3)*
-  %val = load i32 addrspace(3)* %specptr
+  %val = load i32, i32 addrspace(3)* %specptr
   ret i32 %val
 }
@@ -82,7 +82,7 @@ define i32 @conv7(i32* %ptr) {
 ; PTX64: cvta.to.const.u64
 ; PTX64: ld.const.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(4)*
-  %val = load i32 addrspace(4)* %specptr
+  %val = load i32, i32 addrspace(4)* %specptr
   ret i32 %val
 }
@@ -94,6 +94,6 @@ define i32 @conv8(i32* %ptr) {
 ; PTX64: cvta.to.local.u64
 ; PTX64: ld.local.u32
   %specptr = addrspacecast i32* %ptr to i32 addrspace(5)*
-  %val = load i32 addrspace(5)* %specptr
+  %val = load i32, i32 addrspace(5)* %specptr
   ret i32 %val
 }
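Note: addrspacecast-gvar.ll above now also covers aggregate initializers, where a global-space pointer cast to generic prints as generic(g) with an optional byte offset, and addrspacecast.ll pins the cvta/cvta.to lowering in both directions for each address space. A minimal sketch of the pattern one of those tests exercises (illustrative function name; the CHECK instructions are taken from the conv1 hunk above):

    ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
    define i32 @global_to_generic(i32 addrspace(1)* %ptr) {
    ; CHECK: cvta.global.u64
    ; CHECK: ld.u32
      %genptr = addrspacecast i32 addrspace(1)* %ptr to i32*
      %val = load i32, i32* %genptr
      ret i32 %val
    }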
diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll
index cacffceac517..76af386c6516 100644
--- a/test/CodeGen/NVPTX/bug21465.ll
+++ b/test/CodeGen/NVPTX/bug21465.ll
@@ -11,8 +11,8 @@ entry:
 ; CHECK-LABEL @_Z22TakesStruct1SPi
 ; CHECK: bitcast %struct.S* %input to i8*
 ; CHECK: call i8 addrspace(101)* @llvm.nvvm.ptr.gen.to.param.p101i8.p0i8
-  %b = getelementptr inbounds %struct.S* %input, i64 0, i32 1
-  %0 = load i32* %b, align 4
+  %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
+  %0 = load i32, i32* %b, align 4
   store i32 %0, i32* %output, align 4
   ret void
 }
diff --git a/test/CodeGen/NVPTX/bug22246.ll b/test/CodeGen/NVPTX/bug22246.ll
new file mode 100644
index 000000000000..70e7e12336e7
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug22246.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: _Z3foobbbPb
+define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, i8* nocapture %output) {
+entry:
+; CHECK: selp.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}}
+  %.sink.v = select i1 %p1, i1 %p2, i1 %p3
+  %frombool5 = zext i1 %.sink.v to i8
+  store i8 %frombool5, i8* %output, align 1
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/bug22322.ll b/test/CodeGen/NVPTX/bug22322.ll
new file mode 100644
index 000000000000..97863b9ea546
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug22322.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%class.float3 = type { float, float, float }
+
+; Function Attrs: nounwind
+; CHECK-LABEL: some_kernel
+define void @some_kernel(%class.float3* nocapture %dst) #0 {
+_ZL11compute_vecRK6float3jb.exit:
+  %ret_vec.sroa.8.i = alloca float, align 4
+  %0 = tail call i32 @llvm.ptx.read.ctaid.x()
+  %1 = tail call i32 @llvm.ptx.read.ntid.x()
+  %2 = mul nsw i32 %1, %0
+  %3 = tail call i32 @llvm.ptx.read.tid.x()
+  %4 = add nsw i32 %2, %3
+  %5 = zext i32 %4 to i64
+  %6 = bitcast float* %ret_vec.sroa.8.i to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %6)
+  %7 = and i32 %4, 15
+  %8 = icmp eq i32 %7, 0
+  %9 = select i1 %8, float 0.000000e+00, float -1.000000e+00
+  store float %9, float* %ret_vec.sroa.8.i, align 4
+; CHECK: setp.lt.f32 %p{{[0-9]+}}, %f{{[0-9]+}}, 0f00000000
+  %10 = fcmp olt float %9, 0.000000e+00
+  %ret_vec.sroa.8.i.val = load float, float* %ret_vec.sroa.8.i, align 4
+  %11 = select i1 %10, float 0.000000e+00, float %ret_vec.sroa.8.i.val
+  call void @llvm.lifetime.end(i64 4, i8* %6)
+  %12 = getelementptr inbounds %class.float3, %class.float3* %dst, i64 %5, i32 0
+  store float 0.000000e+00, float* %12, align 4
+  %13 = getelementptr inbounds %class.float3, %class.float3* %dst, i64 %5, i32 1
+  store float %11, float* %13, align 4
+  %14 = getelementptr inbounds %class.float3, %class.float3* %dst, i64 %5, i32 2
+  store float 0.000000e+00, float* %14, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ptx.read.ctaid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ptx.read.ntid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ptx.read.tid.x() #1
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #2
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
+!nvvm.annotations = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{void (%class.float3*)* @some_kernel, !"kernel", i32 1}
+!1 = !{!"clang version 3.5.1 (tags/RELEASE_351/final)"}
diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 8483112381f1..58b191129917 100644
--- a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -27,22 +27,22 @@ entry:
 ; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rd[[A_REG]]]
 ; CHECK: st.f32 [%SP+0], %f[[A0_REG]]
-  %0 = load float* %a, align 4
+  %0 = load float, float* %a, align 4
   %1 = bitcast [16 x i8]* %buf to float*
   store float %0, float* %1, align 4
-  %arrayidx2 = getelementptr inbounds float* %a, i64 1
-  %2 = load float* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 1
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 1
+  %2 = load float, float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i64 0, i64 1
   %3 = bitcast i8* %arrayidx3 to float*
   store float %2, float* %3, align 4
-  %arrayidx4 = getelementptr inbounds float* %a, i64 2
-  %4 = load float* %arrayidx4, align 4
-  %arrayidx5 = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 2
+  %arrayidx4 = getelementptr inbounds float, float* %a, i64 2
+  %4 = load float, float* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i64 0, i64 2
   %5 = bitcast i8* %arrayidx5 to float*
   store float %4, float* %5, align 4
-  %arrayidx6 = getelementptr inbounds float* %a, i64 3
-  %6 = load float* %arrayidx6, align 4
-  %arrayidx7 = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 3
+  %arrayidx6 = getelementptr inbounds float, float* %a, i64 3
+  %6 = load float, float* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i64 0, i64 3
   %7 = bitcast i8* %arrayidx7 to float*
   store float %6, float* %7, align 4
@@ -54,7 +54,7 @@ entry:
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: callee,
-  %arraydecay = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 0
+  %arraydecay = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i64 0, i64 0
   call void @callee(float* %a, i8* %arraydecay) #2
   ret void
 }
diff --git a/test/CodeGen/NVPTX/fp16.ll b/test/CodeGen/NVPTX/fp16.ll
index 8770399f2ec9..b85eed0f6c7f 100644
--- a/test/CodeGen/NVPTX/fp16.ll
+++ b/test/CodeGen/NVPTX/fp16.ll
@@ -8,7 +8,7 @@ declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
 ; CHECK-LABEL: @test_convert_fp16_to_fp32
 ; CHECK: cvt.f32.f16
 define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
-  %val = load i16 addrspace(1)* %in, align 2
+  %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
   store float %cvt, float addrspace(1)* %out, align 4
   ret void
@@ -18,7 +18,7 @@ define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 add
 ; CHECK-LABEL: @test_convert_fp16_to_fp64
 ; CHECK: cvt.f64.f16
 define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
-  %val = load i16 addrspace(1)* %in, align 2
+  %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
   store double %cvt, double addrspace(1)* %out, align 4
   ret void
@@ -28,7 +28,7 @@ define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 ad
 ; CHECK-LABEL: @test_convert_fp32_to_fp16
 ; CHECK: cvt.rn.f16.f32
 define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
-  %val = load float addrspace(1)* %in, align 2
+  %val = load float, float addrspace(1)* %in, align 2
   %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
   store i16 %cvt, i16 addrspace(1)* %out, align 4
   ret void
@@ -38,7 +38,7 @@ define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float add
 ; CHECK-LABEL: @test_convert_fp64_to_fp16
 ; CHECK: cvt.rn.f16.f64
 define void @test_convert_fp64_to_fp16(i16 addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
-  %val = load double addrspace(1)* %in, align 2
+  %val = load double, double addrspace(1)* %in, align 2
   %cvt = call i16 @llvm.convert.to.fp16.f64(double %val) nounwind readnone
   store i16 %cvt, i16 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/NVPTX/function-align.ll b/test/CodeGen/NVPTX/function-align.ll
new file mode 100644
index 000000000000..e7abfb128f58
--- /dev/null
+++ b/test/CodeGen/NVPTX/function-align.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; CHECK-NOT: .align 2
+define ptx_device void @foo() align 2 {
+; CHECK-LABEL: .func foo
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/generic-to-nvvm.ll b/test/CodeGen/NVPTX/generic-to-nvvm.ll
index fb63d6ed575f..66917d5cb182 100644
--- a/test/CodeGen/NVPTX/generic-to-nvvm.ll
+++ b/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -13,9 +13,9 @@ target triple = "nvptx-nvidia-cuda"
 
 define void @foo(i32* %a, i32* %b) {
 ; CHECK: cvta.global.u32
-  %ld1 = load i32* @myglobal
+  %ld1 = load i32, i32* @myglobal
 ; CHECK: cvta.global.u32
-  %ld2 = load i32* @myconst
+  %ld2 = load i32, i32* @myconst
   store i32 %ld1, i32* %a
   store i32 %ld2, i32* %b
   ret void
diff --git a/test/CodeGen/NVPTX/half.ll b/test/CodeGen/NVPTX/half.ll
index aa08cc78e91a..b99524162e65 100644
--- a/test/CodeGen/NVPTX/half.ll
+++ b/test/CodeGen/NVPTX/half.ll
@@ -4,7 +4,7 @@ define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) {
 ; CHECK-LABEL: @test_load_store
 ; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
-  %val = load half addrspace(1)* %in
+  %val = load half, half addrspace(1)* %in
   store half %val, half addrspace(1) * %out
   ret void
 }
@@ -13,7 +13,7 @@ define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %o
 ; CHECK-LABEL: @test_bitcast_from_half
 ; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
-  %val = load half addrspace(1) * %in
+  %val = load half, half addrspace(1) * %in
   %val_int = bitcast half %val to i16
   store i16 %val_int, i16 addrspace(1)* %out
   ret void
@@ -23,7 +23,7 @@ define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in
 ; CHECK-LABEL: @test_bitcast_to_half
 ; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
 ; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
-  %val = load i16 addrspace(1)* %in
+  %val = load i16, i16 addrspace(1)* %in
   %val_fp = bitcast i16 %val to half
   store half %val_fp, half addrspace(1)* %out
   ret void
@@ -33,7 +33,7 @@ define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) {
 ; CHECK-LABEL: @test_extend32
 ; CHECK: cvt.f32.f16
-  %val16 = load half addrspace(1)* %in
+  %val16 = load half, half addrspace(1)* %in
   %val32 = fpext half %val16 to float
   store float %val32, float addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@ define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) {
 ; CHECK-LABEL: @test_extend64
 ; CHECK: cvt.f64.f16
-  %val16 = load half addrspace(1)* %in
+  %val16 = load half, half addrspace(1)* %in
   %val64 = fpext half %val16 to double
   store double %val64, double addrspace(1)* %out
   ret void
@@ -53,7 +53,7 @@ define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) {
 ; CHECK-LABEL: test_trunc32
 ; CHECK: cvt.rn.f16.f32
-  %val32 = load float addrspace(1)* %in
+  %val32 = load float, float addrspace(1)* %in
   %val16 = fptrunc float %val32 to half
   store half %val16, half addrspace(1)* %out
   ret void
@@ -63,7 +63,7 @@ define void @test_trunc64(double addrspace(1)* %in, half addrspace(1)* %out) {
 ; CHECK-LABEL: @test_trunc64
 ; CHECK: cvt.rn.f16.f64
-  %val32 = load double addrspace(1)* %in
+  %val32 = load double, double addrspace(1)* %in
   %val16 = fptrunc double %val32 to half
   store half %val16, half addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/NVPTX/i1-global.ll b/test/CodeGen/NVPTX/i1-global.ll
index e3fe08e5f874..35d77b4b44d2 100644
--- a/test/CodeGen/NVPTX/i1-global.ll
+++ b/test/CodeGen/NVPTX/i1-global.ll
@@ -8,7 +8,7 @@ target triple = "nvptx-nvidia-cuda"
 
 define void @foo(i1 %p, i32* %out) {
-  %ld = load i1 addrspace(1)* @mypred
+  %ld = load i1, i1 addrspace(1)* @mypred
   %val = zext i1 %ld to i32
   store i32 %val, i32* %out
   ret void
diff --git a/test/CodeGen/NVPTX/i8-param.ll b/test/CodeGen/NVPTX/i8-param.ll
index 84daa9f66316..6a1e3a0e1a0d 100644
--- a/test/CodeGen/NVPTX/i8-param.ll
+++ b/test/CodeGen/NVPTX/i8-param.ll
@@ -13,7 +13,7 @@ define i8 @callee(i8 %a) {
 ; CHECK: .visible .func caller
 define void @caller(i8* %a) {
 ; CHECK: ld.u8
-  %val = load i8* %a
+  %val = load i8, i8* %a
   %ret = tail call i8 @callee(i8 %val)
 ; CHECK: ld.param.b32
   store i8 %ret, i8* %a
diff --git a/test/CodeGen/NVPTX/ld-addrspace.ll b/test/CodeGen/NVPTX/ld-addrspace.ll
index f33659c92e84..0018e6177be8 100644
--- a/test/CodeGen/NVPTX/ld-addrspace.ll
+++ b/test/CodeGen/NVPTX/ld-addrspace.ll
@@ -8,7 +8,7 @@ define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(1)* %ptr
+  %a = load i8, i8 addrspace(1)* %ptr
   ret i8 %a
 }
@@ -17,7 +17,7 @@ define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(3)* %ptr
+  %a = load i8, i8 addrspace(3)* %ptr
   ret i8 %a
 }
@@ -26,7 +26,7 @@ define i8 @ld_local_i8(i8 addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(5)* %ptr
+  %a = load i8, i8 addrspace(5)* %ptr
   ret i8 %a
 }
@@ -36,7 +36,7 @@ define i16 @ld_global_i16(i16 addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(1)* %ptr
+  %a = load i16, i16 addrspace(1)* %ptr
   ret i16 %a
 }
@@ -45,7 +45,7 @@ define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(3)* %ptr
+  %a = load i16, i16 addrspace(3)* %ptr
   ret i16 %a
 }
@@ -54,7 +54,7 @@ define i16 @ld_local_i16(i16 addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(5)* %ptr
+  %a = load i16, i16 addrspace(5)* %ptr
   ret i16 %a
 }
@@ -64,7 +64,7 @@ define i32 @ld_global_i32(i32 addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(1)* %ptr
+  %a = load i32, i32 addrspace(1)* %ptr
   ret i32 %a
 }
@@ -73,7 +73,7 @@ define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(3)* %ptr
+  %a = load i32, i32 addrspace(3)* %ptr
   ret i32 %a
 }
@@ -82,7 +82,7 @@ define i32 @ld_local_i32(i32 addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(5)* %ptr
+  %a = load i32, i32 addrspace(5)* %ptr
   ret i32 %a
 }
@@ -92,7 +92,7 @@ define i64 @ld_global_i64(i64 addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(1)* %ptr
+  %a = load i64, i64 addrspace(1)* %ptr
   ret i64 %a
 }
@@ -101,7 +101,7 @@ define i64 @ld_shared_i64(i64 addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(3)* %ptr
+  %a = load i64, i64 addrspace(3)* %ptr
   ret i64 %a
 }
@@ -110,7 +110,7 @@ define i64 @ld_local_i64(i64 addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(5)* %ptr
+  %a = load i64, i64 addrspace(5)* %ptr
   ret i64 %a
 }
@@ -120,7 +120,7 @@ define float @ld_global_f32(float addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(1)* %ptr
+  %a = load float, float addrspace(1)* %ptr
   ret float %a
 }
@@ -129,7 +129,7 @@ define float @ld_shared_f32(float addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(3)* %ptr
+  %a = load float, float addrspace(3)* %ptr
   ret float %a
 }
@@ -138,7 +138,7 @@ define float @ld_local_f32(float addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(5)* %ptr
+  %a = load float, float addrspace(5)* %ptr
   ret float %a
 }
@@ -148,7 +148,7 @@ define double @ld_global_f64(double addrspace(1)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(1)* %ptr
+  %a = load double, double addrspace(1)* %ptr
   ret double %a
 }
@@ -157,7 +157,7 @@ define double @ld_shared_f64(double addrspace(3)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(3)* %ptr
+  %a = load double, double addrspace(3)* %ptr
   ret double %a
 }
@@ -166,6 +166,6 @@ define double @ld_local_f64(double addrspace(5)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(5)* %ptr
+  %a = load double, double addrspace(5)* %ptr
   ret double %a
 }
diff --git a/test/CodeGen/NVPTX/ld-generic.ll b/test/CodeGen/NVPTX/ld-generic.ll
index d629e0ecc647..44cfe6551b99 100644
--- a/test/CodeGen/NVPTX/ld-generic.ll
+++ b/test/CodeGen/NVPTX/ld-generic.ll
@@ -8,7 +8,7 @@ define i8 @ld_global_i8(i8 addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i8 addrspace(0)* %ptr
+  %a = load i8, i8 addrspace(0)* %ptr
   ret i8 %a
 }
@@ -18,7 +18,7 @@ define i16 @ld_global_i16(i16 addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i16 addrspace(0)* %ptr
+  %a = load i16, i16 addrspace(0)* %ptr
   ret i16 %a
 }
@@ -28,7 +28,7 @@ define i32 @ld_global_i32(i32 addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i32 addrspace(0)* %ptr
+  %a = load i32, i32 addrspace(0)* %ptr
   ret i32 %a
 }
@@ -38,7 +38,7 @@ define i64 @ld_global_i64(i64 addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load i64 addrspace(0)* %ptr
+  %a = load i64, i64 addrspace(0)* %ptr
   ret i64 %a
 }
@@ -48,7 +48,7 @@ define float @ld_global_f32(float addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load float addrspace(0)* %ptr
+  %a = load float, float addrspace(0)* %ptr
   ret float %a
 }
@@ -58,6 +58,6 @@ define double @ld_global_f64(double addrspace(0)* %ptr) {
 ; PTX32: ret
 ; PTX64: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
-  %a = load double addrspace(0)* %ptr
+  %a = load double, double addrspace(0)* %ptr
   ret double %a
 }
diff --git a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
index fd35a7503901..ec96a493021a 100644
--- a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
+++ b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
@@ -6,9 +6,9 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define void @reg_plus_offset(i32* %a) {
 ; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+32];
 ; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+36];
-  %p2 = getelementptr i32* %a, i32 8
+  %p2 = getelementptr i32, i32* %a, i32 8
   %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2, i32 4)
-  %p3 = getelementptr i32* %a, i32 9
+  %p3 = getelementptr i32, i32* %a, i32 9
   %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3, i32 4)
   %t3 = mul i32 %t1, %t2
   store i32 %t3, i32* %a
diff --git a/test/CodeGen/NVPTX/load-sext-i1.ll b/test/CodeGen/NVPTX/load-sext-i1.ll
index d836740eed94..9fc98a45f59a 100644
--- a/test/CodeGen/NVPTX/load-sext-i1.ll
+++ b/test/CodeGen/NVPTX/load-sext-i1.ll
@@ -6,8 +6,8 @@ target triple = "nvptx-nvidia-cuda"
 define void @main(i1* %a1, i32 %a2, i32* %arg3) {
 ; CHECK: ld.u8
 ; CHECK-NOT: ld.u1
-  %t1 = getelementptr i1* %a1, i32 %a2
-  %t2 = load i1* %t1
+  %t1 = getelementptr i1, i1* %a1, i32 %a2
+  %t2 = load i1, i1* %t1
   %t3 = sext i1 %t2 to i32
   store i32 %t3, i32* %arg3
   ret void
diff --git a/test/CodeGen/NVPTX/machine-sink.ll b/test/CodeGen/NVPTX/machine-sink.ll
index 3614bea16534..65ba141c41d9 100644
--- a/test/CodeGen/NVPTX/machine-sink.ll
+++ b/test/CodeGen/NVPTX/machine-sink.ll
@@ -14,8 +14,8 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define float @post_dominate(float %x, i1 %cond) {
 ; CHECK-LABEL: post_dominate(
 entry:
-  %0 = load float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4
-  %1 = load float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4
+  %0 = load float, float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4
+  %1 = load float, float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4
 ; CHECK: ld.shared.f32
 ; CHECK: ld.shared.f32
   %2 = fmul float %0, %0
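Note: ld-addrspace.ll and ld-generic.ll together pin the mapping from IR address spaces to PTX load qualifiers: addrspace(1) → ld.global, addrspace(3) → ld.shared, addrspace(5) → ld.local, and addrspace(0) (generic) → unqualified ld. A minimal sketch of one such pairing (the RUN configuration is assumed from the tests' PTX64 prefix; the CHECK line is copied from the ld_shared_i32 hunk above):

    ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
    define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) {
    ; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
      %a = load i32, i32 addrspace(3)* %ptr
      ret i32 %a
    }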
diff --git a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
index 90c9c4306de7..2ad72b018851 100644
--- a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
+++ b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -10,7 +10,7 @@ define <4 x float> @t1(i8* %p1) {
 ; CHECK-NOT: ld.f32
 ; CHECK: ld.u8
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 1
+  %r = load <4 x float>, <4 x float>* %cast, align 1
   ret <4 x float> %r
 }
@@ -20,7 +20,7 @@ define <4 x float> @t2(i8* %p1) {
 ; CHECK-NOT: ld.v2
 ; CHECK: ld.f32
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 4
+  %r = load <4 x float>, <4 x float>* %cast, align 4
   ret <4 x float> %r
 }
@@ -29,7 +29,7 @@ define <4 x float> @t3(i8* %p1) {
 ; CHECK-NOT: ld.v4
 ; CHECK: ld.v2
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 8
+  %r = load <4 x float>, <4 x float>* %cast, align 8
   ret <4 x float> %r
 }
@@ -37,7 +37,7 @@ define <4 x float> @t3(i8* %p1) {
 define <4 x float> @t4(i8* %p1) {
 ; CHECK: ld.v4
   %cast = bitcast i8* %p1 to <4 x float>*
-  %r = load <4 x float>* %cast, align 16
+  %r = load <4 x float>, <4 x float>* %cast, align 16
   ret <4 x float> %r
 }
diff --git a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
index 841bbc3a517c..2fec31b3791d 100644
--- a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
+++ b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -11,16 +11,16 @@ define void @foo(float* %output) #1 {
 entry:
   %output.addr = alloca float*, align 8
   store float* %output, float** %output.addr, align 8
-  %0 = load float** %output.addr, align 8
-  %arrayidx = getelementptr inbounds float* %0, i64 0
-  %1 = load float* %arrayidx, align 4
+  %0 = load float*, float** %output.addr, align 8
+  %arrayidx = getelementptr inbounds float, float* %0, i64 0
+  %1 = load float, float* %arrayidx, align 4
   %conv = fpext float %1 to double
   %cmp = fcmp olt double %conv, 1.000000e+01
   br i1 %cmp, label %if.then, label %if.else
 
 if.then: ; preds = %entry
-  %2 = load float** %output.addr, align 8
-  %3 = load float* %2, align 4
+  %2 = load float*, float** %output.addr, align 8
+  %3 = load float, float* %2, align 4
   %conv1 = fpext float %3 to double
   %add = fadd double %conv1, 1.000000e+00
   %conv2 = fptrunc double %add to float
@@ -28,8 +28,8 @@ if.then: ; preds = %entry
   br label %if.end
 
 if.else: ; preds = %entry
-  %4 = load float** %output.addr, align 8
-  %5 = load float* %4, align 4
+  %4 = load float*, float** %output.addr, align 8
+  %5 = load float, float* %4, align 4
   %conv3 = fpext float %5 to double
   %add4 = fadd double %conv3, 2.000000e+00
   %conv5 = fptrunc double %add4 to float
@@ -38,16 +38,16 @@ if.else: ; preds = %entry
 
 if.end: ; preds = %if.else, %if.then
   call void @llvm.cuda.syncthreads()
-  %6 = load float** %output.addr, align 8
-  %arrayidx6 = getelementptr inbounds float* %6, i64 0
-  %7 = load float* %arrayidx6, align 4
+  %6 = load float*, float** %output.addr, align 8
+  %arrayidx6 = getelementptr inbounds float, float* %6, i64 0
+  %7 = load float, float* %arrayidx6, align 4
   %conv7 = fpext float %7 to double
   %cmp8 = fcmp olt double %conv7, 1.000000e+01
   br i1 %cmp8, label %if.then9, label %if.else13
 
 if.then9: ; preds = %if.end
-  %8 = load float** %output.addr, align 8
-  %9 = load float* %8, align 4
+  %8 = load float*, float** %output.addr, align 8
+  %9 = load float, float* %8, align 4
   %conv10 = fpext float %9 to double
   %add11 = fadd double %conv10, 3.000000e+00
   %conv12 = fptrunc double %add11 to float
@@ -55,8 +55,8 @@ if.then9: ; preds = %if.end
   br label %if.end17
 
 if.else13: ; preds = %if.end
-  %10 = load float** %output.addr, align 8
-  %11 = load float* %10, align 4
+  %10 = load float*, float** %output.addr, align 8
+  %11 = load float, float* %10, align 4
   %conv14 = fpext float %11 to double
   %add15 = fadd double %conv14, 4.000000e+00
   %conv16 = fptrunc double %add15 to float
diff --git a/test/CodeGen/NVPTX/nounroll.ll b/test/CodeGen/NVPTX/nounroll.ll
new file mode 100644
index 000000000000..e80a4a21f161
--- /dev/null
+++ b/test/CodeGen/NVPTX/nounroll.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; Compiled from the following CUDA code:
+;
+; #pragma nounroll
+; for (int i = 0; i < 2; ++i)
+;   output[i] = input[i];
+define void @nounroll(float* %input, float* %output) {
+; CHECK-LABEL: .visible .func nounroll(
+entry:
+  br label %for.body
+
+for.body:
+; CHECK: .pragma "nounroll"
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %idxprom = sext i32 %i.06 to i64
+  %arrayidx = getelementptr inbounds float, float* %input, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+; CHECK: ld.f32
+  %arrayidx2 = getelementptr inbounds float, float* %output, i64 %idxprom
+  store float %0, float* %arrayidx2, align 4
+; CHECK: st.f32
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 2
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+; CHECK-NOT: ld.f32
+; CHECK-NOT: st.f32
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.unroll.disable"}
diff --git a/test/CodeGen/NVPTX/nvvm-reflect.ll b/test/CodeGen/NVPTX/nvvm-reflect.ll
index 21e9c69e657a..8c75dfc30a56 100644
--- a/test/CodeGen/NVPTX/nvvm-reflect.ll
+++ b/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -11,7 +11,7 @@ define float @foo(float %a, float %b) {
 ; USE_MUL_0-NOT: call i32 @__nvvm_reflect
 ; USE_MUL_1: define float @foo
 ; USE_MUL_1-NOT: call i32 @__nvvm_reflect
-  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8] addrspace(4)* @str, i32 0, i32 0))
+  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
   %reflect = tail call i32 @__nvvm_reflect(i8* %ptr)
   %cmp = icmp ugt i32 %reflect, 0
   br i1 %cmp, label %use_mul, label %use_add
@@ -42,7 +42,38 @@ define i32 @intrinsic() {
 ; USE_MUL_0: ret i32 0
 ; USE_MUL_1-NOT: call i32 @llvm.nvvm.reflect
 ; USE_MUL_1: ret i32 1
-  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8] addrspace(4)* @str, i32 0, i32 0))
+  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
   %reflect = tail call i32 @llvm.nvvm.reflect.p0i8(i8* %ptr)
   ret i32 %reflect
 }
+
+; CUDA-7.0 passes __nvvm_reflect argument slightly differently.
+; Verify that it works, too
+
+@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00"
+
+define float @bar(float %a, float %b) {
+; USE_MUL_0: define float @bar
+; USE_MUL_0-NOT: call i32 @__nvvm_reflect
+; USE_MUL_1: define float @bar
+; USE_MUL_1-NOT: call i32 @__nvvm_reflect
+  %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
+  %cmp = icmp ne i32 %reflect, 0
+  br i1 %cmp, label %use_mul, label %use_add
+
+use_mul:
+; USE_MUL_1: fmul float %a, %b
+; USE_MUL_0-NOT: fadd float %a, %b
+  %ret1 = fmul float %a, %b
+  br label %exit
+
+use_add:
+; USE_MUL_0: fadd float %a, %b
+; USE_MUL_1-NOT: fmul float %a, %b
+  %ret2 = fadd float %a, %b
+  br label %exit
+
+exit:
+  %ret = phi float [%ret1, %use_mul], [%ret2, %use_add]
+  ret float %ret
+}
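Note: the new bar() test above documents the CUDA 7.0 calling convention for __nvvm_reflect: the key string lives in addrspace(1) and is addrspacecast to a generic i8* at the call site, whereas foo() reaches its addrspace(4) string through llvm.nvvm.ptr.constant.to.gen. Reduced to just the new call shape (the declaration is an assumption inferred from the call; the constant and call expression are copied from the bar hunk):

    @"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00"

    declare i32 @__nvvm_reflect(i8*)

    define i32 @reflect_cuda70() {
      %r = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
      ret i32 %r
    }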
diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll
index cc67a6fff8e4..d4f7c3bd210a 100644
--- a/test/CodeGen/NVPTX/pr13291-i1-store.ll
+++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -19,7 +19,7 @@ define ptx_kernel void @t2(i1* %a, i8* %b) {
 ; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 ; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 
-  %t1 = load i1* %a
+  %t1 = load i1, i1* %a
   %t2 = select i1 %t1, i8 1, i8 2
   store i8 %t2, i8* %b
   ret void
diff --git a/test/CodeGen/NVPTX/pr16278.ll b/test/CodeGen/NVPTX/pr16278.ll
index 5432a848442c..a836eaf2e51f 100644
--- a/test/CodeGen/NVPTX/pr16278.ll
+++ b/test/CodeGen/NVPTX/pr16278.ll
@@ -5,6 +5,6 @@
 define float @foo() {
 ; CHECK: ld.const.f32
-  %val = load float addrspace(4)* @one_f
+  %val = load float, float addrspace(4)* @one_f
   ret float %val
 }
diff --git a/test/CodeGen/NVPTX/pr17529.ll b/test/CodeGen/NVPTX/pr17529.ll
index a16214225674..a7519776f526 100644
--- a/test/CodeGen/NVPTX/pr17529.ll
+++ b/test/CodeGen/NVPTX/pr17529.ll
@@ -11,7 +11,7 @@ entry:
 
 vector.body: ; preds = %vector.body, %entry
   %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
-  %scevgep9 = getelementptr i8* %dst, i64 %index
+  %scevgep9 = getelementptr i8, i8* %dst, i64 %index
   %scevgep910 = bitcast i8* %scevgep9 to <4 x i8>*
   store <4 x i8> undef, <4 x i8>* %scevgep910, align 1
   %index.next = add i64 %index, 4
@@ -22,13 +22,13 @@ middle.block: ; preds = %vector.body
   br i1 undef, label %for.end, label %for.body.preheader1
 
 for.body.preheader1: ; preds = %middle.block
-  %scevgep2 = getelementptr i8* %dst, i64 0
+  %scevgep2 = getelementptr i8, i8* %dst, i64 0
   br label %for.body
 
 for.body: ; preds = %for.body, %for.body.preheader1
   %lsr.iv3 = phi i8* [ %scevgep2, %for.body.preheader1 ], [ %scevgep4, %for.body ]
   store i8 undef, i8* %lsr.iv3, align 1
-  %scevgep4 = getelementptr i8* %lsr.iv3, i64 1
+  %scevgep4 = getelementptr i8, i8* %lsr.iv3, i64 1
   br label %for.body
 
 for.end: ; preds = %middle.block, %entry
diff --git a/test/CodeGen/NVPTX/ptx-version-30.ll b/test/CodeGen/NVPTX/ptx-version-30.ll
deleted file mode 100644
index 0422b01f4ee3..000000000000
--- a/test/CodeGen/NVPTX/ptx-version-30.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
-
-
-; CHECK: .version 3.0
-
diff --git a/test/CodeGen/NVPTX/ptx-version-31.ll b/test/CodeGen/NVPTX/ptx-version-31.ll
deleted file mode 100644
index d6e57301a371..000000000000
--- a/test/CodeGen/NVPTX/ptx-version-31.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
-
-
-; CHECK: .version 3.1
-
diff --git a/test/CodeGen/NVPTX/refl1.ll b/test/CodeGen/NVPTX/refl1.ll
index e8782ea3aa27..0432b67535c0 100644
--- a/test/CodeGen/NVPTX/refl1.ll
+++ b/test/CodeGen/NVPTX/refl1.ll
@@ -5,7 +5,7 @@ target triple = "nvptx-nvidia-cuda"
 ; Function Attrs: nounwind
 ; CHECK: .entry foo
 define void @foo(float* nocapture %a) #0 {
-  %val = load float* %a
+  %val = load float, float* %a
   %tan = tail call fastcc float @__nv_fast_tanf(float %val)
   store float %tan, float* %a
   ret void
diff --git a/test/CodeGen/NVPTX/sched1.ll b/test/CodeGen/NVPTX/sched1.ll
index 03ab635e73b9..fb01eb262adc 100644
--- a/test/CodeGen/NVPTX/sched1.ll
+++ b/test/CodeGen/NVPTX/sched1.ll
@@ -11,14 +11,14 @@ define void @foo(i32* %a) {
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
-  %ptr0 = getelementptr i32* %a, i32 0
-  %val0 = load i32* %ptr0
-  %ptr1 = getelementptr i32* %a, i32 1
-  %val1 = load i32* %ptr1
-  %ptr2 = getelementptr i32* %a, i32 2
-  %val2 = load i32* %ptr2
-  %ptr3 = getelementptr i32* %a, i32 3
-  %val3 = load i32* %ptr3
+  %ptr0 = getelementptr i32, i32* %a, i32 0
+  %val0 = load i32, i32* %ptr0
+  %ptr1 = getelementptr i32, i32* %a, i32 1
+  %val1 = load i32, i32* %ptr1
+  %ptr2 = getelementptr i32, i32* %a, i32 2
+  %val2 = load i32, i32* %ptr2
+  %ptr3 = getelementptr i32, i32* %a, i32 3
+  %val3 = load i32, i32* %ptr3
   %t0 = add i32 %val0, %val1
   %t1 = add i32 %t0, %val2
diff --git a/test/CodeGen/NVPTX/sched2.ll b/test/CodeGen/NVPTX/sched2.ll
index 71a9a4963faf..91ed77878f81 100644
--- a/test/CodeGen/NVPTX/sched2.ll
+++ b/test/CodeGen/NVPTX/sched2.ll
@@ -12,14 +12,14 @@ define void @foo(<2 x i32>* %a) {
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
-  %ptr0 = getelementptr <2 x i32>* %a, i32 0
-  %val0 = load <2 x i32>* %ptr0
-  %ptr1 = getelementptr <2 x i32>* %a, i32 1
-  %val1 = load <2 x i32>* %ptr1
-  %ptr2 = getelementptr <2 x i32>* %a, i32 2
-  %val2 = load <2 x i32>* %ptr2
-  %ptr3 = getelementptr <2 x i32>* %a, i32 3
-  %val3 = load <2 x i32>* %ptr3
+  %ptr0 = getelementptr <2 x i32>, <2 x i32>* %a, i32 0
+  %val0 = load <2 x i32>, <2 x i32>* %ptr0
+  %ptr1 = getelementptr <2 x i32>, <2 x i32>* %a, i32 1
+  %val1 = load <2 x i32>, <2 x i32>* %ptr1
+  %ptr2 = getelementptr <2 x i32>, <2 x i32>* %a, i32 2
+  %val2 = load <2 x i32>, <2 x i32>* %ptr2
+  %ptr3 = getelementptr <2 x i32>, <2 x i32>* %a, i32 3
+  %val3 = load <2 x i32>, <2 x i32>* %ptr3
   %t0 = add <2 x i32> %val0, %val1
   %t1 = add <2 x i32> %t0, %val2
diff --git a/test/CodeGen/NVPTX/shift-parts.ll b/test/CodeGen/NVPTX/shift-parts.ll
index 748297caf339..b4d408ff5972 100644
--- a/test/CodeGen/NVPTX/shift-parts.ll
+++ b/test/CodeGen/NVPTX/shift-parts.ll
@@ -12,8 +12,8 @@ define void @shift_parts_left_128(i128* %val, i128* %amtptr) {
 ; CHECK: setp.gt.s32
 ; CHECK: selp.b64
 ; CHECK: shl.b64
-  %amt = load i128* %amtptr
-  %a = load i128* %val
+  %amt = load i128, i128* %amtptr
+  %a = load i128, i128* %val
   %val0 = shl i128 %a, %amt
   store i128 %val0, i128* %val
   ret void
@@ -30,8 +30,8 @@ define void @shift_parts_right_128(i128* %val, i128* %amtptr) {
 ; CHECK: setp.gt.s32
 ; CHECK: selp.b64
 ; CHECK: shr.s64
-  %amt = load i128* %amtptr
-  %a = load i128* %val
+  %amt = load i128, i128* %amtptr
+  %a = load i128, i128* %val
   %val0 = ashr i128 %a, %amt
   store i128 %val0, i128* %val
   ret void
diff --git a/test/CodeGen/NVPTX/simple-call.ll b/test/CodeGen/NVPTX/simple-call.ll
index 1b41361cf7ed..da6568685fe6 100644
--- a/test/CodeGen/NVPTX/simple-call.ll
+++ b/test/CodeGen/NVPTX/simple-call.ll
@@ -11,7 +11,7 @@ define float @device_func(float %a) noinline {
 ; CHECK: .entry kernel_func
 define void @kernel_func(float* %a) {
-  %val = load float* %a
+  %val = load float, float* %a
 ; CHECK: call.uni (retval0),
 ; CHECK: device_func,
   %mul = call float @device_func(float %val)
diff --git a/test/CodeGen/NVPTX/sm-version-30.ll b/test/CodeGen/NVPTX/sm-version-30.ll
index 692b49a0d6b3..4f35cf04c63b 100644
--- a/test/CodeGen/NVPTX/sm-version-30.ll
+++ b/test/CodeGen/NVPTX/sm-version-30.ll
@@ -2,5 +2,6 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
 
+; CHECK: .version 3.2
 ; CHECK: .target sm_30
 
diff --git a/test/CodeGen/NVPTX/sm-version-32.ll b/test/CodeGen/NVPTX/sm-version-32.ll
new file mode 100644
index 000000000000..d6a5082c5267
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-32.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_32 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_32 | FileCheck %s
+
+
+; CHECK: .version 4.0
+; CHECK: .target sm_32
+
diff --git a/test/CodeGen/NVPTX/sm-version-35.ll b/test/CodeGen/NVPTX/sm-version-35.ll
index 25368a01335e..8456c666677d 100644
--- a/test/CodeGen/NVPTX/sm-version-35.ll
+++ b/test/CodeGen/NVPTX/sm-version-35.ll
@@ -2,5 +2,6 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
 
+; CHECK: .version 3.2
 ; CHECK: .target sm_35
 
diff --git a/test/CodeGen/NVPTX/sm-version-37.ll b/test/CodeGen/NVPTX/sm-version-37.ll
new file mode 100644
index 000000000000..fd51a9c7063f
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-37.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_37 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_37 | FileCheck %s
+
+
+; CHECK: .version 4.1
+; CHECK: .target sm_37
+
diff --git a/test/CodeGen/NVPTX/sm-version-50.ll b/test/CodeGen/NVPTX/sm-version-50.ll
new file mode 100644
index 000000000000..374c6ea057ae
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-50.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_50 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
+
+
+; CHECK: .version 4.0
+; CHECK: .target sm_50
+
diff --git a/test/CodeGen/NVPTX/sm-version-52.ll b/test/CodeGen/NVPTX/sm-version-52.ll
new file mode 100644
index 000000000000..18881b2e98cc
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-52.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_52 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 | FileCheck %s
+
+
+; CHECK: .version 4.1
+; CHECK: .target sm_52
+
diff --git a/test/CodeGen/NVPTX/sm-version-53.ll b/test/CodeGen/NVPTX/sm-version-53.ll
new file mode 100644
index 000000000000..50d2dec11bc5
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-53.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_53 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_53 | FileCheck %s
+
+
+; CHECK: .version 4.2
+; CHECK: .target sm_53
+
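Note: read together, the sm-version tests fix the PTX ISA version llc now emits per target: sm_30 and sm_35 stay at .version 3.2, sm_32 and sm_50 get 4.0, sm_37 and sm_52 get 4.1, and sm_53 gets 4.2; the old fixed-version ptx30/ptx31 -mattr tests are deleted above. Every file shares the same three-line shape, shown here instantiated for sm_53 (copied from sm-version-53.ll):

    ; RUN: llc < %s -march=nvptx64 -mcpu=sm_53 | FileCheck %s
    ; CHECK: .version 4.2
    ; CHECK: .target sm_53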
diff --git a/test/CodeGen/NVPTX/symbol-naming.ll b/test/CodeGen/NVPTX/symbol-naming.ll
index bd1333f1c4e6..0f176934ca39 100644
--- a/test/CodeGen/NVPTX/symbol-naming.ll
+++ b/test/CodeGen/NVPTX/symbol-naming.ll
@@ -24,7 +24,7 @@ target triple = "nvptx64-unknown-unknown"
 ; Function Attrs: nounwind
 define void @foo(i32 %a, float %b, i8 signext %c, i32 %e) {
 entry:
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0))
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0))
   ret void
 }
diff --git a/test/CodeGen/NVPTX/vector-compare.ll b/test/CodeGen/NVPTX/vector-compare.ll
index 218049995233..2992b0e62c56 100644
--- a/test/CodeGen/NVPTX/vector-compare.ll
+++ b/test/CodeGen/NVPTX/vector-compare.ll
@@ -6,8 +6,8 @@
 ; tried to promote <2 x i1> to <2 x i8> and instruction selection failed.
 define void @foo(<2 x i32>* %a, <2 x i32>* %b, i32* %r1, i32* %r2) {
-  %aval = load <2 x i32>* %a
-  %bval = load <2 x i32>* %b
+  %aval = load <2 x i32>, <2 x i32>* %a
+  %bval = load <2 x i32>, <2 x i32>* %b
   %res = icmp slt <2 x i32> %aval, %bval
   %t1 = extractelement <2 x i1> %res, i32 0
   %t2 = extractelement <2 x i1> %res, i32 1
diff --git a/test/CodeGen/NVPTX/vector-loads.ll b/test/CodeGen/NVPTX/vector-loads.ll
index 58882bf16668..d70348942200 100644
--- a/test/CodeGen/NVPTX/vector-loads.ll
+++ b/test/CodeGen/NVPTX/vector-loads.ll
@@ -10,7 +10,7 @@ define void @foo(<2 x float>* %a) {
 ; CHECK: .func foo
 ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
-  %t1 = load <2 x float>* %a
+  %t1 = load <2 x float>, <2 x float>* %a
   %t2 = fmul <2 x float> %t1, %t1
   store <2 x float> %t2, <2 x float>* %a
   ret void
@@ -19,7 +19,7 @@ define void @foo(<2 x float>* %a) {
 define void @foo2(<4 x float>* %a) {
 ; CHECK: .func foo2
 ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
-  %t1 = load <4 x float>* %a
+  %t1 = load <4 x float>, <4 x float>* %a
   %t2 = fmul <4 x float> %t1, %t1
   store <4 x float> %t2, <4 x float>* %a
   ret void
@@ -29,7 +29,7 @@ define void @foo3(<8 x float>* %a) {
 ; CHECK: .func foo3
 ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
 ; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
-  %t1 = load <8 x float>* %a
+  %t1 = load <8 x float>, <8 x float>* %a
   %t2 = fmul <8 x float> %t1, %t1
   store <8 x float> %t2, <8 x float>* %a
   ret void
@@ -40,7 +40,7 @@ define void @foo3(<8 x float>* %a) {
 define void @foo4(<2 x i32>* %a) {
 ; CHECK: .func foo4
 ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
-  %t1 = load <2 x i32>* %a
+  %t1 = load <2 x i32>, <2 x i32>* %a
   %t2 = mul <2 x i32> %t1, %t1
   store <2 x i32> %t2, <2 x i32>* %a
   ret void
@@ -49,7 +49,7 @@ define void @foo4(<2 x i32>* %a) {
 define void @foo5(<4 x i32>* %a) {
 ; CHECK: .func foo5
 ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
-  %t1 = load <4 x i32>* %a
+  %t1 = load <4 x i32>, <4 x i32>* %a
   %t2 = mul <4 x i32> %t1, %t1
   store <4 x i32> %t2, <4 x i32>* %a
   ret void
@@ -59,7 +59,7 @@ define void @foo6(<8 x i32>* %a) {
 ; CHECK: .func foo6
 ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
 ; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
-  %t1 = load <8 x i32>* %a
+  %t1 = load <8 x i32>, <8 x i32>* %a
   %t2 = mul <8 x i32> %t1, %t1
   store <8 x i32> %t2, <8 x i32>* %a
   ret void
diff --git a/test/CodeGen/NVPTX/vector-select.ll b/test/CodeGen/NVPTX/vector-select.ll
index 11893df10329..1e81031c685a 100644
--- a/test/CodeGen/NVPTX/vector-select.ll
+++ b/test/CodeGen/NVPTX/vector-select.ll
@@ -6,9 +6,9 @@
 define void @foo(<2 x i32> addrspace(1)* %def_a, <2 x i32> addrspace(1)* %def_b, <2 x i32> addrspace(1)* %def_c) {
 entry:
-  %tmp4 = load <2 x i32> addrspace(1)* %def_a
-  %tmp6 = load <2 x i32> addrspace(1)* %def_c
-  %tmp8 = load <2 x i32> addrspace(1)* %def_b
+  %tmp4 = load <2 x i32>, <2 x i32> addrspace(1)* %def_a
+  %tmp6 = load <2 x i32>, <2 x i32> addrspace(1)* %def_c
+  %tmp8 = load <2 x i32>, <2 x i32> addrspace(1)* %def_b
   %0 = icmp sge <2 x i32> %tmp4, zeroinitializer
   %cond = select <2 x i1> %0, <2 x i32> %tmp6, <2 x i32> %tmp8
   store <2 x i32> %cond, <2 x i32> addrspace(1)* %def_c
diff --git a/test/CodeGen/NVPTX/weak-global.ll b/test/CodeGen/NVPTX/weak-global.ll
index 2bef4c5228a9..a64f9f48b26f 100644
--- a/test/CodeGen/NVPTX/weak-global.ll
+++ b/test/CodeGen/NVPTX/weak-global.ll
@@ -4,6 +4,6 @@
 @g = common addrspace(1) global i32 zeroinitializer
 
 define i32 @func0() {
-  %val = load i32 addrspace(1)* @g
+  %val = load i32, i32 addrspace(1)* @g
   ret i32 %val
 }
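Note: every file in this diff follows the same lit conventions: a RUN line pipes the module through llc, and FileCheck matches the emitted PTX against CHECK lines. A minimal self-contained test in that style (illustrative function name; add.s32 is the opcode the existing sched1.ll CHECK lines expect for a 32-bit integer add, and the unqualified `.func name` match follows the pattern of vector-loads.ll):

    ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s

    ; CHECK: .func store_sum
    define void @store_sum(i32* %out, i32 %a, i32 %b) {
    ; CHECK: add.s32
      %r = add i32 %a, %b
      store i32 %r, i32* %out
      ret void
    }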